In [1]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from edgar_func_with_flow import *


headers = {"User-Agent": "russ@sunriseanalysis.com"}
ticker = "WSM"

In [2]:
cik = get_cik_matching_ticker(ticker)
ten_K_accession_numbers = get_10K_accessionNumbers_for_ticker(ticker)
ten_Q_accession_numbers = get_10Q_accessionNumbers_for_ticker(ticker)
first_ten_K_accession_number = ten_K_accession_numbers.iloc[0].replace("-", "")
base_link = (
    f"https://www.sec.gov/Archives/edgar/data/{cik}/{first_ten_K_accession_number}"
)

filing_summary_link = f"{base_link}/FilingSummary.xml"
filing_summary_response = requests.get(filing_summary_link, headers=headers)
soup = BeautifulSoup(filing_summary_response.content, "lxml-xml", from_encoding="utf-8")

statement_file_names_dict = {}

# Iterating through each Report tag and extracting the required information
for report in soup.find_all("Report"):
    short_name_tag = report.find("ShortName")
    long_name_tag = report.find("LongName")
    html_file_name_tag = report.find("HtmlFileName")  # Looking for HtmlFileName
    xml_file_name_tag = report.find("XmlFileName")  # Looking for XmlFileName

    # Determine the file name, considering both HTML and XML
    file_name = None
    if html_file_name_tag:
        file_name = html_file_name_tag.text
    elif xml_file_name_tag:
        file_name = xml_file_name_tag.text

    # Adding to the dictionary if all relevant tags are found and "Statement" is in the long name
    if (
        long_name_tag
        and short_name_tag
        and file_name
        and "Statement" in long_name_tag.text
    ):
        short_name = short_name_tag.text.lower()
        statement_file_names_dict[short_name] = file_name

# Result
statement_file_names_dict

{'consolidated statements of earnings': 'R3.htm',
 'consolidated statements of comprehensive income': 'R4.htm',
 'consolidated statements of comprehensive income (parenthetical)': 'R5.htm',
 'consolidated balance sheets': 'R6.htm',
 'consolidated balance sheets (parenthetical)': 'R7.htm',
 "consolidated statements of stockholders' equity": 'R8.htm',
 'consolidated statements of cash flows': 'R9.htm'}

In [5]:
ten_K_accession_numbers = get_10K_accessionNumbers_for_ticker(ticker)
ten_Q_accession_numbers = get_10Q_accessionNumbers_for_ticker(ticker)
first_ten_K_accession_number = ten_K_accession_numbers.iloc[0].replace("-", "")


def get_statement_file_names(ticker, accession_number):
    cik = get_cik_matching_ticker(ticker)
    base_link = f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession_number}"
    filing_summary_link = f"{base_link}/FilingSummary.xml"
    headers = {"User-Agent": "russ@sunriseanalysis.com"}
    filing_summary_response = requests.get(filing_summary_link, headers=headers)
    soup = BeautifulSoup(
        filing_summary_response.content, "lxml-xml", from_encoding="utf-8"
    )
    statement_file_names_dict = {}

    for report in soup.find_all("Report"):
        short_name_tag = report.find("ShortName")
        long_name_tag = report.find("LongName")
        html_file_name_tag = report.find("HtmlFileName")
        xml_file_name_tag = report.find("XmlFileName")
        file_name = (
            html_file_name_tag.text
            if html_file_name_tag
            else xml_file_name_tag.text
            if xml_file_name_tag
            else None
        )

        if (
            long_name_tag
            and short_name_tag
            and file_name
            and "Statement" in long_name_tag.text
        ):
            short_name = short_name_tag.text.lower()
            statement_file_names_dict[short_name] = file_name

    return statement_file_names_dict


# this is included in the edgar_func_with_flow.py file

In [6]:
get_statement_file_names(ticker, first_ten_K_accession_number)

{'consolidated statements of earnings': 'R3.htm',
 'consolidated statements of comprehensive income': 'R4.htm',
 'consolidated statements of comprehensive income (parenthetical)': 'R5.htm',
 'consolidated balance sheets': 'R6.htm',
 'consolidated balance sheets (parenthetical)': 'R7.htm',
 "consolidated statements of stockholders' equity": 'R8.htm',
 'consolidated statements of cash flows': 'R9.htm'}

In [9]:
# balance_sheet_link = (f"{base_link}/{statement_file_names_dict['consolidated balance sheets']}")
# earnings_link = (f"{base_link}/{statement_file_names_dict['consolidated statements of earnings']}")
# cash_flow_link = (f"{base_link}/{statement_file_names_dict['consolidated statements of cash flows']}")
def get_statement_soup(ticker, accession_number, statement_name):
    """
    the statement_name should be one of the following:
    'consolidated balance sheets'
    'consolidated statements of earnings'
    'consolidated statements of cash flows'
    """
    cik = get_cik_matching_ticker(ticker)
    headers = {"User-Agent": "russ@sunriseanalysis.com"}
    base_link = f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession_number}"
    statement_file_names_dict = get_statement_file_names(ticker, accession_number)
    statement_link = f"{base_link}/{statement_file_names_dict[statement_name]}"
    statement_response = requests.get(statement_link, headers=headers)
    if statement_link.endswith(".xml"):
        soup = BeautifulSoup(
            statement_response.content, "lxml-xml", from_encoding="utf-8"
        )
    else:
        soup = BeautifulSoup(statement_response.content, "lxml")
    return soup

In [10]:
soup = get_statement_soup(
    ticker, first_ten_K_accession_number, "consolidated balance sheets"
)

In [12]:
statement_response = requests.get(balance_sheet_link, headers=headers)
if balance_sheet_link.endswith(".xml"):
    soup2 = BeautifulSoup(statement_response.content, "lxml-xml", from_encoding="utf-8")
else:
    soup2 = BeautifulSoup(statement_response.content, "lxml")

NameError: name 'balance_sheet_link' is not defined

In [11]:
columns = []
values = []
index_date = soup2.select_one("th.th > div").text.strip()
for row in soup2.select("tr.re, tr.ro"):
    onclick_attr = row.select_one("td.pl a, td.pl.custom a")["onclick"]
    column_title = onclick_attr.split("defref_")[-1].split("',")[0]
    columns.append(column_title)
    value_tag = row.select_one("td.nump")
    neg_tag = row.select_one("td.num")
    if value_tag:
        value = value_tag.text.replace("$", "").replace(",", "").strip()
        values.append(value)
    elif neg_tag:
        value = (
            neg_tag.text.replace("$", "")
            .replace(",", "")
            .replace("(", "")
            .replace(")", "")
            .strip()
        )
        values.append("-" + value)
    else:
        values.append(None)

# Creating a DataFrame with the extracted data
df = pd.DataFrame([values], columns=columns, index=[index_date])

# Creating a dictionary to map column titles to displayed names
column_name_mapping = {}
for row in soup2.select("tr.re, tr.ro"):
    onclick_attr = row.select_one("td.pl a, td.pl.custom a")["onclick"]
    column_title = onclick_attr.split("defref_")[-1].split("',")[0]
    displayed_name = row.select_one("td.pl a, td.pl.custom a").text.strip()
    column_name_mapping[column_title] = displayed_name

NameError: name 'soup2' is not defined

In [13]:
def parse_balance_sheet(soup):
    # Extracting the index (first date)
    index_date = soup.select_one("th.th > div").text.strip()
    columns = []
    values = []

    # Extracting columns and values
    for row in soup.select("tr.re, tr.ro"):
        onclick_attr = row.select_one("td.pl a, td.pl.custom a")["onclick"]
        column_title = onclick_attr.split("defref_")[-1].split("',")[0]
        columns.append(column_title)
        value_tag = row.select_one("td.nump")
        neg_tag = row.select_one("td.num")
        if value_tag:
            value = value_tag.text.replace("$", "").replace(",", "").strip()
            values.append(value)
        elif neg_tag:
            value = (
                neg_tag.text.replace("$", "")
                .replace(",", "")
                .replace("(", "")
                .replace(")", "")
                .strip()
            )
            values.append("-" + value)
        else:
            values.append(None)

    # Creating a DataFrame with the extracted data
    df = pd.DataFrame([values], columns=columns, index=[index_date])
    df.dropna(axis=1, how='all', inplace=True) # drop columns with all NaN values

    # Creating a dictionary to map column titles to displayed names
    column_name_mapping = {}
    for row in soup.select("tr.re, tr.ro"):
        onclick_attr = row.select_one("td.pl a, td.pl.custom a")["onclick"]
        column_title = onclick_attr.split("defref_")[-1].split("',")[0]
        displayed_name = row.select_one("td.pl a, td.pl.custom a").text.strip()
        column_name_mapping[column_title] = displayed_name

    return df, column_name_mapping

In [14]:
df, names = parse_balance_sheet(soup)
df

Unnamed: 0,us-gaap_CashAndCashEquivalentsAtCarryingValue,us-gaap_ReceivablesNetCurrent,us-gaap_InventoryNet,us-gaap_PrepaidExpenseCurrent,us-gaap_OtherAssetsCurrent,us-gaap_PropertyPlantAndEquipmentNet,us-gaap_OperatingLeaseRightOfUseAsset,us-gaap_DeferredIncomeTaxAssetsNet,us-gaap_Goodwill,us-gaap_OtherAssetsNoncurrent,...,us-gaap_OtherLiabilitiesCurrent,wsm_DeferredLeaseIncentivesLiabilityNoncurrent,us-gaap_OperatingLeaseLiabilityNoncurrent,us-gaap_OtherLiabilitiesNoncurrent,us-gaap_PreferredStockValue,us-gaap_CommonStockValue,us-gaap_AdditionalPaidInCapital,us-gaap_RetainedEarningsAccumulatedDeficit,us-gaap_AccumulatedOtherComprehensiveIncomeLossNetOfTax,us-gaap_TreasuryStockCommonValue
"Jan. 29, 2023",367344,115685,1456123,64961,31967,1065381,1286452,81389,77307,116407,...,108138,10027,1211693,103794,0,663,573117,1141819,-13809,-739


In [15]:

def get_index_date(soup: BeautifulSoup) -> str:
    return soup.select_one("th.th > div").text.strip()

def extract_columns_and_values(soup: BeautifulSoup):
    columns = []
    values = []
    for row in soup.select("tr.re, tr.ro"):
        onclick_attr = row.select_one("td.pl a, td.pl.custom a")["onclick"]
        column_title = onclick_attr.split("defref_")[-1].split("',")[0]
        columns.append(column_title)
        
        value_tag = row.select_one("td.nump")
        neg_tag = row.select_one("td.num")
        
        if value_tag:
            value = value_tag.text.replace("$", "").replace(",", "").strip()
            values.append(value)
        elif neg_tag:
            value = neg_tag.text.replace("$", "").replace(",", "").replace("(", "").replace(")", "").strip()
            values.append("-" + value)
        else:
            values.append(None)
    return columns, values

def create_dataframe(columns: list, values: list, index_date: str) -> pd.DataFrame:
    df = pd.DataFrame([values], columns=columns, index=[index_date])
    df.dropna(axis=1, how='all', inplace=True)
    return df

def create_column_name_mapping(soup: BeautifulSoup) -> dict:
    column_name_mapping = {}
    for row in soup.select("tr.re, tr.ro"):
        onclick_attr = row.select_one("td.pl a, td.pl.custom a")["onclick"]
        column_title = onclick_attr.split("defref_")[-1].split("',")[0]
        displayed_name = row.select_one("td.pl a, td.pl.custom a").text.strip()
        column_name_mapping[column_title] = displayed_name
    return column_name_mapping

def parse_balance_sheet(soup: BeautifulSoup):
    index_date = get_index_date(soup)
    columns, values = extract_columns_and_values(soup)
    df = create_dataframe(columns, values, index_date)
    column_name_mapping = create_column_name_mapping(soup)
    
    return df, column_name_mapping


In [17]:
df, names = parse_balance_sheet(soup)

In [18]:
df

Unnamed: 0,us-gaap_CashAndCashEquivalentsAtCarryingValue,us-gaap_ReceivablesNetCurrent,us-gaap_InventoryNet,us-gaap_PrepaidExpenseCurrent,us-gaap_OtherAssetsCurrent,us-gaap_PropertyPlantAndEquipmentNet,us-gaap_OperatingLeaseRightOfUseAsset,us-gaap_DeferredIncomeTaxAssetsNet,us-gaap_Goodwill,us-gaap_OtherAssetsNoncurrent,...,us-gaap_OtherLiabilitiesCurrent,wsm_DeferredLeaseIncentivesLiabilityNoncurrent,us-gaap_OperatingLeaseLiabilityNoncurrent,us-gaap_OtherLiabilitiesNoncurrent,us-gaap_PreferredStockValue,us-gaap_CommonStockValue,us-gaap_AdditionalPaidInCapital,us-gaap_RetainedEarningsAccumulatedDeficit,us-gaap_AccumulatedOtherComprehensiveIncomeLossNetOfTax,us-gaap_TreasuryStockCommonValue
"Jan. 29, 2023",367344,115685,1456123,64961,31967,1065381,1286452,81389,77307,116407,...,108138,10027,1211693,103794,0,663,573117,1141819,-13809,-739
