In [23]:
# Idea: use the pandas read_html function to match each table with the correct name, using the values that are the same in both tables, and create a name mapping similar to the one for the statements.

In [None]:
# Take entry 0 of the accession-number series and fetch the statement soup for it.
# NOTE(review): `accession_number_series`, `ticker` and `statement_name` are not
# defined in the visible cells — presumably set earlier in the notebook; confirm
# this cell still works after Restart Kernel -> Run All.
accession_number = accession_number_series[0]
soup = get_statement_soup(ticker, accession_number, statement_name)

In [24]:
# Print the links to the requested statement for this ticker; the captured
# output below shows one sec.gov R*.htm / R*.xml URL per line.
print_links_to_desired_statment(ticker, statement_name)

https://www.sec.gov/Archives/edgar/data/0000719955/000162828023009175/R6.htm
https://www.sec.gov/Archives/edgar/data/0000719955/000162828022007494/R6.htm
https://www.sec.gov/Archives/edgar/data/0000719955/000119312521100319/R5.htm
https://www.sec.gov/Archives/edgar/data/0000719955/000119312520088937/R5.htm
https://www.sec.gov/Archives/edgar/data/0000719955/000119312519097973/R5.htm
https://www.sec.gov/Archives/edgar/data/0000719955/000119312518102232/R5.htm
https://www.sec.gov/Archives/edgar/data/0000719955/000119312517104341/R4.htm
https://www.sec.gov/Archives/edgar/data/0000719955/000119312516525847/R4.htm
https://www.sec.gov/Archives/edgar/data/0000719955/000119312515118009/R4.htm
https://www.sec.gov/Archives/edgar/data/0000719955/000119312514129974/R4.htm
https://www.sec.gov/Archives/edgar/data/0000719955/000119312513142309/R4.htm
https://www.sec.gov/Archives/edgar/data/0000719955/000119312512140070/R3.htm
https://www.sec.gov/Archives/edgar/data/0000719955/000119312511085046/R3.xml

In [16]:
def create_column_name_mapping_to_statement(soup: BeautifulSoup) -> tuple:
    """
    Build two lookup tables between displayed row labels and their tags.

    Every selected table row (``tr.re``, ``tr.ro``, ``tr.reu``, ``tr.rou``) is
    expected to contain a label link whose ``onclick`` attribute embeds the tag
    after the ``defref_`` marker. The link text is stripped and passed through
    ``keep_letters_and_numbers_only_in_string`` to obtain the common name.

    Parameters:
    - soup (BeautifulSoup): The HTML soup object containing the statement table.

    Returns:
    - tuple: ``(common_as_key, gaap_as_key)``. ``common_as_key`` maps the
      cleaned displayed name to the tag; ``gaap_as_key`` maps the tag to the
      cleaned displayed name. When a key occurs in several rows, the last row
      wins.
    """
    column_name_mapping_common_as_key = {}
    column_name_mapping_gaap_as_key = {}

    # Iterate through each data row of the statement table
    for row in soup.select("tr.re, tr.ro, tr.reu, tr.rou"):
        # Look the label link up once; it provides both the tag and the name.
        label_link = row.select_one("td.pl a, td.pl.custom a")
        onclick_attr = label_link.get("onclick") if label_link is not None else None
        if onclick_attr is None:
            # A row without a label link (or without an onclick handler) has no
            # tag to map; skip it rather than failing the whole statement.
            logging.warning("Skipping statement row without a tagged label link")
            continue

        # The tag sits between "defref_" and the closing "'," of the onclick call
        gaap_tag = onclick_attr.split("defref_")[-1].split("',")[0]

        # Extract and clean the displayed name from the same link
        common_name = keep_letters_and_numbers_only_in_string(label_link.text.strip())

        column_name_mapping_common_as_key[common_name] = gaap_tag
        column_name_mapping_gaap_as_key[gaap_tag] = common_name

    return column_name_mapping_common_as_key, column_name_mapping_gaap_as_key

In [17]:
def create_dynamic_mapping(ticker, statement_type):
    """
    Build a combined name mapping for one statement type across a ticker's filings.

    For every accession number returned by ``get_10K_accessionNumbers_for_ticker``
    the statement soup is fetched and its per-statement mappings are merged.
    Filings whose soup cannot be fetched (exception or ``None``) are logged or
    skipped and do not contribute.

    Parameters:
    - ticker: Ticker passed through to the accession-number and soup helpers.
    - statement_type: Statement name passed through to ``get_statement_soup``.

    Returns:
    - dict: Maps a canonical key to a list of aliases. If a displayed name was
      seen with more than one tag, the key is the displayed name and the list
      holds those tags. Otherwise the key is the single tag and the list holds
      every displayed name seen for that tag. Lists are de-duplicated and keep
      first-seen order, so the result is reproducible between runs.
    """
    accession_number_series = get_10K_accessionNumbers_for_ticker(ticker)
    common_as_key_mapping = defaultdict(list)
    gaap_as_key_mapping = defaultdict(list)

    for accession_number in accession_number_series:
        try:
            soup = get_statement_soup(ticker, accession_number, statement_type)
        except Exception as e:
            logging.error(
                f"Failed to get statement soup: {e} for accession number: {accession_number}"
            )
            continue
        if soup is None:
            continue

        mapping_common, mapping_gaap = create_column_name_mapping_to_statement(soup)
        for common_name, gaap_tag in mapping_common.items():
            common_as_key_mapping[common_name].append(gaap_tag)
        for gaap_tag, common_name in mapping_gaap.items():
            gaap_as_key_mapping[gaap_tag].append(common_name)

    final_mapping = {}
    for common_name, gaap_tags in common_as_key_mapping.items():
        # dict.fromkeys de-duplicates like set() but keeps first-seen order,
        # which does not depend on the interpreter's string hash seed.
        unique_tags = list(dict.fromkeys(gaap_tags))
        if len(unique_tags) > 1:
            # The same label was filed under several tags: unify on the label.
            final_mapping[common_name] = unique_tags
        else:
            # One tag only: unify every label ever shown for it on the tag.
            gaap_tag = unique_tags[0]
            final_mapping[gaap_tag] = list(dict.fromkeys(gaap_as_key_mapping[gaap_tag]))

    return final_mapping

In [None]:
# Build the cross-filing name mapping; create_dynamic_mapping makes one
# get_statement_soup call per accession number of `ticker`.
dynamic_mapping = create_dynamic_mapping(ticker, statement_name)

In [None]:
# Extract the column labels and values of the single statement held in `soup`
# so the mapping can be tried out on them.
columns_test, values_test = extract_columns_and_values_from_statement(soup)

In [18]:
def update_column_names_using_mapping(dynamic_mapping: dict, gaap_tags: list) -> list:
    """
    Translate each label to the canonical key of the mapping it belongs to.

    Parameters:
    - dynamic_mapping (dict): Canonical key -> collection of aliases.
    - gaap_tags (list): Labels (tags or displayed names) to normalise.

    Returns:
    - list: One entry per input label, in the same order. A label that is
      itself a key is kept; a label found among the aliases of a key is
      replaced by the first such key in mapping order; any other label is
      returned unchanged.
    """

    def _canonical(label):
        # A label that is already a canonical key needs no translation.
        if label in dynamic_mapping:
            return label
        # Otherwise take the first key listing it as an alias, or fall back
        # to the label itself when no key claims it.
        return next(
            (
                canonical
                for canonical, aliases in dynamic_mapping.items()
                if label in aliases
            ),
            label,
        )

    return [_canonical(label) for label in gaap_tags]

In [None]:
# Normalise the test statement's column labels to the canonical keys of the
# dynamic mapping (labels unknown to the mapping are left unchanged).
updated_columns = update_column_names_using_mapping(dynamic_mapping, columns_test)

In [19]:
def process_singular_balance_sheet(
    ticker,
    accession_number,
    statement_type,
    name_mapping: dict = None,
    first: bool = True,
):
    """
    Fetch one filing's statement and return it as a DataFrame.

    Parameters:
    - ticker: Ticker passed through to ``get_statement_soup``.
    - accession_number: Accession number of the filing to process.
    - statement_type: Statement name passed through to ``get_statement_soup``.
    - name_mapping (dict, optional): If given, column labels are normalised
      with ``update_column_names_using_mapping`` before the frame is built.
    - first (bool): When truthy the whole frame is returned; otherwise only
      its last row (``df.tail(1)``).

    Returns:
    - pd.DataFrame or None: The statement frame with duplicated column labels
      removed (first occurrence kept). ``None`` when the soup cannot be
      fetched (the error is logged), the soup is ``None``, or the resulting
      frame is empty.
    """
    try:
        soup = get_statement_soup(ticker, accession_number, statement_type)
    except Exception as e:
        logging.error(
            f"Failed to get statement soup: {e} for accession number: {accession_number}"
        )
        return None
    if soup is None:
        return None

    columns, values_set = extract_columns_and_values_from_statement(soup)
    if name_mapping is not None:
        columns = update_column_names_using_mapping(name_mapping, columns)
    index_dates = get_two_index_dates_of_statement(soup)
    df = create_dataframe_of_statement_values_columns_dates(
        values_set, columns, index_dates
    )
    # Normalising labels can make two columns share a name; keep the first.
    df = df.loc[:, ~df.columns.duplicated()]
    if df.empty:
        return None

    # Later filings contribute only their last row; the first contributes all.
    return df if first else df.tail(1)

In [20]:
def concatenate_all_sheets(ticker, statement_type, blank: bool = False):
    """
    Combine the statements of all filings of a ticker into one transposed frame.

    Each accession number returned by ``get_10K_accessionNumbers_for_ticker`` is
    processed with ``process_singular_balance_sheet`` using a mapping built by
    ``create_dynamic_mapping``. Filings that yield no frame are skipped. The
    first filing that does yield a frame contributes all of its rows, every
    later one only its last row.

    Parameters:
    - ticker: Ticker whose filings are combined.
    - statement_type: Statement name to extract from each filing.
    - blank (bool): When truthy, ``"None"`` strings and NaN values are
      replaced with empty strings before returning.

    Returns:
    - pd.DataFrame: The row-wise concatenation (outer join on columns) of all
      obtained sheets, transposed. Empty if no filing produced a frame.
    """
    accession_number_series = get_10K_accessionNumbers_for_ticker(ticker)
    dynamic_name_mapping = create_dynamic_mapping(ticker, statement_type)

    # Collect the frames and concatenate once instead of growing a DataFrame
    # inside the loop, which also avoids concatenating with an empty frame.
    sheets = []
    for i, accession_number in enumerate(accession_number_series):
        sheet = process_singular_balance_sheet(
            ticker,
            accession_number,
            statement_type,
            name_mapping=dynamic_name_mapping,
            # Stay in "first" mode until a sheet was actually obtained, so a
            # failed leading filing does not drop the full leading frame.
            first=not sheets,
        )
        if sheet is None:
            # Nothing to add; the failure was already logged by the helper.
            continue
        sheets.append(sheet)
        print(f"concatenated {statement_type} {i} to the full Dataframe")

    if sheets:
        all_sheets = pd.concat(sheets, axis=0, join="outer")
    else:
        all_sheets = pd.DataFrame()

    if blank:
        # np.nan instead of the np.NaN alias, which NumPy 2.0 removed.
        return all_sheets.replace(["None", np.nan], "").T
    return all_sheets.T

In [14]:
# Process a single filing on its own. With first=True the whole frame is
# returned; the result is None if the statement cannot be fetched or is empty.
single_tesing_df = process_singular_balance_sheet(
    ticker, accession_number, statement_name, name_mapping=dynamic_mapping, first=True
)

In [21]:
# Combine every filing into one frame; blank=True replaces "None"/NaN with
# empty strings. The returned frame is transposed by concatenate_all_sheets.
all_sheets = concatenate_all_sheets(ticker, statement_name, blank=True)

ERROR:root:Failed to get statement soup: 'consolidated balance sheets' for accession number: 000119312510075151


concatenated consolidated balance sheets 0 to the full Dataframe
concatenated consolidated balance sheets 1 to the full Dataframe
concatenated consolidated balance sheets 2 to the full Dataframe
concatenated consolidated balance sheets 3 to the full Dataframe
concatenated consolidated balance sheets 4 to the full Dataframe
concatenated consolidated balance sheets 5 to the full Dataframe
concatenated consolidated balance sheets 6 to the full Dataframe
concatenated consolidated balance sheets 7 to the full Dataframe
concatenated consolidated balance sheets 8 to the full Dataframe
concatenated consolidated balance sheets 9 to the full Dataframe
concatenated consolidated balance sheets 10 to the full Dataframe
concatenated consolidated balance sheets 11 to the full Dataframe
concatenated consolidated balance sheets 12 to the full Dataframe


ERROR:root:Failed to get statement soup: 'consolidated balance sheets' for accession number: 000119312510075151


concatenated consolidated balance sheets 13 to the full Dataframe


In [22]:
# Display the combined statement: one row per line item, one column per period.
all_sheets

Unnamed: 0,2023-01-29,2022-01-30,2021-01-31,2020-02-02,2019-02-03,2018-01-28,2017-01-29,2016-01-31,2015-02-01,2014-02-02,2013-02-03,2012-01-29,2011-01-30
us-gaap_AssetsCurrentAbstract,,,,,,,,,,,,,
us-gaap_CashAndCashEquivalentsAtCarryingValue,367344.0,850338.0,1200337.0,432162.0,338954.0,390136.0,213713.0,193647.0,222927.0,330121.0,424555.0,502757.0,628403000.0
us-gaap_ReceivablesNetCurrent,115685.0,131683.0,143728.0,111737.0,107102.0,90119.0,88803.0,79304.0,67465.0,60330.0,62985.0,45961.0,41565000.0
us-gaap_InventoryNet,1456123.0,1246372.0,1006299.0,1100544.0,1124992.0,1061593.0,977505.0,978138.0,887701.0,813160.0,640024.0,553461.0,513381000.0
us-gaap_PrepaidExpenseCurrent,64961.0,69252.0,93822.0,90426.0,101356.0,62204.0,52882.0,44654.0,36265.0,35309.0,26339.0,24188.0,21120000.0
us-gaap_OtherAssetsCurrent,31967.0,26249.0,22894.0,20766.0,21939.0,11876.0,10652.0,11438.0,13005.0,10852.0,9819.0,9229.0,8176000.0
us-gaap_AssetsCurrent,2036080.0,2323894.0,2467080.0,1755635.0,1694343.0,1636445.0,1367180.0,1336100.0,1391923.0,1419103.0,1316772.0,1276366.0,1347594000.0
us-gaap_PropertyPlantAndEquipmentNet,1065381.0,920773.0,873894.0,929038.0,929635.0,932283.0,923283.0,886813.0,883012.0,849293.0,812037.0,734672.0,730556000.0
us-gaap_OperatingLeaseRightOfUseAsset,1286452.0,1132764.0,1086009.0,1166383.0,,,,,,,,,
Deferred income taxes net,81389.0,56585.0,61854.0,47977.0,,,,,,121486.0,99764.0,91744.0,85612000.0


In [None]:
def create_many_to_one_name_mapping_to_statement(soup: BeautifulSoup) -> dict:
    """
    Build a mapping from a displayed row label to every tag shown under it.

    Each selected table row (``tr.re``, ``tr.ro``, ``tr.reu``, ``tr.rou``) is
    expected to contain a label link whose ``onclick`` attribute embeds the tag
    after the ``defref_`` marker. The link text is stripped, passed through
    ``keep_letters_and_numbers_only_in_string`` and lower-cased to form the key.

    TODO: this mapping is not yet used by the processing pipeline.

    Parameters:
    - soup (BeautifulSoup): The HTML soup object containing the statement table.

    Returns:
    - dict: A ``defaultdict(list)`` mapping the lower-cased displayed name to
      the list of tags found for it, in row order (duplicates are kept).
    """
    column_name_mapping = defaultdict(list)
    for row in soup.select("tr.re, tr.ro, tr.reu, tr.rou"):
        # Look the label link up once; it provides both the tag and the name.
        label_link = row.select_one("td.pl a, td.pl.custom a")
        onclick_attr = label_link.get("onclick") if label_link is not None else None
        if onclick_attr is None:
            # A row without a label link (or without an onclick handler) has no
            # tag to map; skip it rather than failing the whole statement.
            logging.warning("Skipping statement row without a tagged label link")
            continue

        # The tag sits between "defref_" and the closing "'," of the onclick call
        column_title = onclick_attr.split("defref_")[-1].split("',")[0]
        displayed_name = keep_letters_and_numbers_only_in_string(
            label_link.text.strip()
        ).lower()
        column_name_mapping[displayed_name].append(column_title)
    return column_name_mapping

In [None]:
def rename_df_columns_using_mapping(
    df: pd.DataFrame, column_name_mapping: dict
) -> pd.DataFrame:
    """
    Return a copy of ``df`` whose tag columns carry their displayed names.

    Parameters:
    - df (pd.DataFrame): Frame whose column labels are internal column titles.
    - column_name_mapping (dict): Displayed name -> iterable of internal
      column titles (the many-to-one mapping).

    Returns:
    - pd.DataFrame: New frame with every column listed in the mapping renamed
      to its displayed name; other columns keep their label. If a title is
      listed under several displayed names, the last one in mapping order is
      used.
    """
    # Invert the mapping: internal column title -> displayed name.
    title_to_display_name = {}
    for display_name, titles in column_name_mapping.items():
        for title in titles:
            title_to_display_name[title] = display_name

    return df.rename(columns=title_to_display_name)