In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import os
import pdfkit

In [2]:
pip install pdfkit

Note: you may need to restart the kernel to use updated packages.


In [3]:
companyData = pd.read_csv('datasets/company_data.csv')
companyData['cik_str'] = companyData['cik_str'].astype(str).str.zfill(10)

In [4]:
target_company_data = companyData[companyData['ticker'].isin(['AAPL', 'MSFT', 'NVDA', 'GOOGL', 'AMZN', 'META', 'TSLA'])]

In [5]:
target_company_data = target_company_data.set_index('ticker')
target_company_data

Unnamed: 0_level_0,cik_str,title
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1
AAPL,320193,Apple Inc.
NVDA,1045810,NVIDIA CORP
MSFT,789019,MICROSOFT CORP
GOOGL,1652044,Alphabet Inc.
AMZN,1018724,AMAZON COM INC
META,1326801,"Meta Platforms, Inc."
TSLA,1318605,"Tesla, Inc."


In [6]:
headers = {
        "User-Agent": "wuyang.gao@mail.utoronto.ca"
    }

In [7]:
def extract_recent_company_filings(cik):
    """
    Extracts recent company filings from the SEC  database using the company's CIK (Central Index Key).

    Parameters:
    cik (str): The Central Index Key (CIK) of the company.

    Returns:
    pd.DataFrame: A DataFrame containing metadata for recent filings.
    """
    headers = {
        "User-Agent": "wuyang.gao@mail.utoronto.ca"
    }

    filingMetadata = requests.get(
        f'https://data.sec.gov/submissions/CIK{cik}.json', 
        headers = headers
    )

    allForms = pd.DataFrame.from_dict(
        filingMetadata.json()['filings']['recent']
    )

    return allForms

In [8]:
cik = target_company_data.loc['AAPL','cik_str']
all_forms = extract_recent_company_filings(cik)
all_forms.head()

Unnamed: 0,accessionNumber,filingDate,reportDate,acceptanceDateTime,act,form,fileNumber,filmNumber,items,core_type,size,isXBRL,isInlineXBRL,primaryDocument,primaryDocDescription
0,0000320193-24-000116,2024-10-17,2024-10-15,2024-10-17T18:30:18.000Z,,4,,,,4,15760,0,0,xslF345X05/wk-form4_1729204211.xml,FORM 4
1,0000320193-24-000114,2024-10-08,2024-10-04,2024-10-08T18:30:13.000Z,,4,,,,4,5246,0,0,xslF345X05/wk-form4_1728426607.xml,FORM 4
2,0001958244-24-005135,2024-10-04,,2024-10-04T17:04:02.000Z,33.0,144,001-36743,241355561.0,,144,5409,0,0,xsl144X01/primary_doc.xml,
3,0000320193-24-000112,2024-10-03,2024-10-01,2024-10-03T18:31:01.000Z,,4,,,,4,15107,0,0,xslF345X05/wk-form4_1727994654.xml,FORM 4
4,0000320193-24-000111,2024-10-03,2024-10-01,2024-10-03T18:30:50.000Z,,4,,,,4,10988,0,0,xslF345X05/wk-form4_1727994644.xml,FORM 4


In [9]:
all_forms_test =all_forms[all_forms['form']=='10-K']

In [10]:
all_forms_test.head()

Unnamed: 0,accessionNumber,filingDate,reportDate,acceptanceDateTime,act,form,fileNumber,filmNumber,items,core_type,size,isXBRL,isInlineXBRL,primaryDocument,primaryDocDescription
92,0000320193-23-000106,2023-11-03,2023-09-30,2023-11-02T18:08:27.000Z,34,10-K,001-36743,231373899,,XBRL,9569569,1,1,aapl-20230930.htm,10-K
170,0000320193-22-000108,2022-10-28,2022-09-24,2022-10-27T18:01:14.000Z,34,10-K,001-36743,221338448,,XBRL,10332356,1,1,aapl-20220924.htm,10-K
245,0000320193-21-000105,2021-10-29,2021-09-25,2021-10-28T18:04:28.000Z,34,10-K,001-36743,211359752,,XBRL,10502096,1,1,aapl-20210925.htm,10-K
314,0000320193-20-000096,2020-10-30,2020-09-26,2020-10-29T18:06:25.000Z,34,10-K,001-36743,201273977,,XBRL,12502600,1,1,aapl-20200926.htm,10-K
385,0000320193-19-000119,2019-10-31,2019-09-28,2019-10-30T18:12:36.000Z,34,10-K,001-36743,191181423,,XBRL,12861616,1,1,a10-k20199282019.htm,10-K


In [None]:
https://www.sec.gov/Archives/edgar/data/0000320193/000032019322000108/aapl-20220924.htm

In [11]:
all_forms['form'].unique()

array(['4', '144', '5', '8-K', '10-Q', 'UPLOAD', 'CORRESP', 'SD', '3',
       'SC 13G/A', 'PX14A6G', '25-NSE', 'DEFA14A', 'DEF 14A', '10-K',
       '424B2', 'FWP', '4/A', 'S-8', 'S-8 POS', 'PX14A6N', 'S-3ASR',
       'IRANNOTICE', 'CERT', '8-A12B', '3/A', '25', 'SC 13G', '8-K/A',
       'CERTNYS', 'NO ACT'], dtype=object)

In [None]:
https://www.sec.gov/Archives/edgar/data/0000320193/0000320193-23-000106/aapl-20230930.htm

In [12]:
doc_url = f'https://www.sec.gov/Archives/edgar/data/0000320193/000032019322000108/aapl-20220924.htm'
response_doc = requests.get(doc_url, headers=headers)

In [13]:
html_file = "temp_aapl.html"
with open(html_file, 'wb') as file:
    file.write(response_doc.content)

In [16]:
pdf_file = "aapl_report.pdf"
pdfkit.from_file(html_file, pdf_file)

OSError: wkhtmltopdf reported an error:
Exit with code 1 due to network error: ProtocolUnknownError


In [18]:
def exrtact_form_filings(cik, time, form, save_path):

    all_forms = extract_recent_company_filings(cik)

    all_forms['filingDate'] = pd.to_datetime(all_forms['filingDate'])

    all_forms = all_forms[all_forms['filingDate']>=time]

    specific_form = all_forms[all_forms['form']==form]

    id_df = specific_form[['accessionNumber', 'primaryDocument', 'filingDate']]
    
    headers = {
        "User-Agent": "wuyang.gao@mail.utoronto.ca"
    }

    os.makedirs(save_path, exist_ok=True)


    for i in range(id_df.shape[0]):
        
        accession_num = id_df.iloc[i]['accessionNumber'].replace("-", "")
        # this step is very important
        primary_doc = id_df.iloc[i]['primaryDocument']
        filing_date = id_df.iloc[i]['filingDate'].strftime('%Y-%m-%d')

        doc_url = f'https://www.sec.gov/Archives/edgar/data/{cik}/{accession_num}/{primary_doc}'
        response_doc = requests.get(doc_url, headers=headers)
        

        

        if response_doc.status_code == 200:
            html_file = os.path.join(save_path, f'{form}_{filing_date}_{accession_num}.html')             
            with open(html_file, 'wb') as file:
                file.write(response_doc.content)
            pdf_file = os.path.join(save_path, f'{form}_{filing_date}_{accession_num}.pdf') 

            try:
                pdfkit.from_file(html_file, pdf_file)
                print(f"PDF saved as {pdf_file}")
            except OSError as e:
                print(f"An error occurred but was ignored: {e}")
            

            print(f'Successfully downloaded: {pdf_file}')
        else:
            print(f'Failed to download: {doc_url}. Status code: {response_doc.status_code}')

In [19]:
target_company_data = target_company_data.reset_index()

In [20]:
target_company_data

Unnamed: 0,ticker,cik_str,title
0,AAPL,320193,Apple Inc.
1,NVDA,1045810,NVIDIA CORP
2,MSFT,789019,MICROSOFT CORP
3,GOOGL,1652044,Alphabet Inc.
4,AMZN,1018724,AMAZON COM INC
5,META,1326801,"Meta Platforms, Inc."
6,TSLA,1318605,"Tesla, Inc."


In [21]:
form_list = ['8-K', '10-Q', '10-K']
for form in form_list:
    for i in range(target_company_data.shape[0]):
        cik = target_company_data.iloc[i, 1]
        time = pd.to_datetime('2014-01-01')
        
        save_path = f'datasets/filings/{target_company_data.iloc[i,0]}'
        
        exrtact_form_filings(cik, time, form, save_path)
    


An error occurred but was ignored: wkhtmltopdf reported an error:
Exit with code 1 due to network error: ProtocolUnknownError

Successfully downloaded: datasets/filings/AAPL/8-K_2024-09-10_000114036124040659.pdf
An error occurred but was ignored: wkhtmltopdf reported an error:
Exit with code 1 due to network error: ProtocolUnknownError

Successfully downloaded: datasets/filings/AAPL/8-K_2024-08-26_000114036124038601.pdf
An error occurred but was ignored: wkhtmltopdf reported an error:
Exit with code 1 due to network error: ProtocolUnknownError

Successfully downloaded: datasets/filings/AAPL/8-K_2024-08-23_000114036124038403.pdf
An error occurred but was ignored: wkhtmltopdf reported an error:
Exit with code 1 due to network error: ProtocolUnknownError

Successfully downloaded: datasets/filings/AAPL/8-K_2024-08-01_000032019324000080.pdf
An error occurred but was ignored: wkhtmltopdf reported an error:
Exit with code 1 due to network error: ProtocolUnknownError

Successfully downloaded: 

In [28]:
def Find_Entity_Common_Stock_Shares_Outstanding(cik):
    headers = {
        "User-Agent": "wuyang.gao@mail.utoronto.ca"
    }

    # Fetch the data from the SEC API
    response = requests.get(
        f'https://data.sec.gov/api/xbrl/companyfacts/CIK{cik}.json',
        headers=headers
    )
    
    # Check if the request was successful
    if response.status_code != 200:
        raise Exception(f"Error fetching data: {response.status_code}")

    # Load the JSON data
    companyFacts = response.json()

    # Check if 'facts' key exists and navigate to the required data
    if 'facts' in companyFacts and 'dei' in companyFacts['facts']:
        dei_facts = companyFacts['facts']['dei']
        
        # Check if 'EntityCommonStockSharesOutstanding' exists
        if 'EntityCommonStockSharesOutstanding' in dei_facts:
            shares_outstanding = dei_facts['EntityCommonStockSharesOutstanding']['units']['shares']
            
            # Extract time and value lists
            time_list = [item['end'] for item in shares_outstanding]
            value_list = [item['val'] for item in shares_outstanding]
            
            # Create DataFrame
            overall_df = pd.DataFrame({'time': time_list, 'number_of_shares': value_list})

            return overall_df
        else:
            raise KeyError("EntityCommonStockSharesOutstanding not found in DEI facts.")
    else:
        raise KeyError("DEI facts not found in company facts.")

# Example usage
# cik_number = '0000320193'  # Example CIK number for Apple Inc.
# df = Find_Entity_Common_Stock_Shares_Outstanding(cik_number)
# print(df)


In [30]:
target_company_data

Unnamed: 0,ticker,cik_str,title
0,AAPL,320193,Apple Inc.
1,MSFT,789019,MICROSOFT CORP
2,NVDA,1045810,NVIDIA CORP
3,GOOGL,1652044,Alphabet Inc.
4,AMZN,1018724,AMAZON COM INC
5,META,1326801,"Meta Platforms, Inc."
6,TSLA,1318605,"Tesla, Inc."


In [34]:
for i in range(6, target_company_data.shape[0]):
    cik = target_company_data.iloc[i, 1]
    save_path = f'datasets/common_shares_outstanding'
    os.makedirs(save_path, exist_ok=True)

    file_path = os.path.join(save_path, f'{target_company_data.iloc[i,0]}.csv')

    stock_shares_outstanding_df = Find_Entity_Common_Stock_Shares_Outstanding(cik)
    stock_shares_outstanding_df.to_csv(file_path, index=False)

# Here, we found that GOOGL, META don't have this value



In [40]:
import requests
import pandas as pd

def Find_Entity_Public_Float(cik):
    headers = {
        "User-Agent": "wuyang.gao@mail.utoronto.ca"
    }

    # Fetch the data from the SEC API
    response = requests.get(
        f'https://data.sec.gov/api/xbrl/companyfacts/CIK{cik}.json',
        headers=headers
    )
    
    # Check if the request was successful
    if response.status_code != 200:
        raise Exception(f"Error fetching data: {response.status_code}")

    # Load the JSON data
    companyFacts = response.json()

    # Check if 'facts' key exists and navigate to the required data
    if 'facts' in companyFacts and 'dei' in companyFacts['facts']:
        dei_facts = companyFacts['facts']['dei']
        
        # Check if 'EntityPublicFloat' exists
        if 'EntityPublicFloat' in dei_facts:
            public_float_data = dei_facts['EntityPublicFloat']['units']['USD']
            
            # Extract time and value lists
            time_list = [item['end'] for item in public_float_data]
            value_list = [item['val'] for item in public_float_data]
            
            # Create DataFrame
            overall_df = pd.DataFrame({'time': time_list, 'market_capitalization': value_list})

            return overall_df
        else:
            raise KeyError("EntityPublicFloat not found in DEI facts.")
    else:
        raise KeyError("DEI facts not found in company facts.")

# Example usage
# cik_number = '0000320193'  # Example CIK number for Apple Inc.
# df = Find_Entity_Public_Float(cik_number)
# print(df)


In [41]:
for i in range(target_company_data.shape[0]):
    cik = target_company_data.iloc[i, 1]
    save_path = f'datasets/entity_public_float'
    os.makedirs(save_path, exist_ok=True)

    file_path = os.path.join(save_path, f'{target_company_data.iloc[i,0]}.csv')

    stock_shares_outstanding_df = Find_Entity_Public_Float(cik)
    stock_shares_outstanding_df.to_csv(file_path, index=False)

In [42]:
# Here it shows the 
def Find_Net_Income_Loss(cik):
    companyConcept = requests.get(
        f'https://data.sec.gov/api/xbrl/companyconcept/CIK{cik}/us-gaap/NetIncomeLoss.json',
        headers=headers
    )

    net_income_loss = companyConcept.json()['units']['USD']
    start_time = [item['start'] for item in net_income_loss]
    end_time = [item['end'] for item in net_income_loss]
    value = [item['val'] for item in net_income_loss]
    net_income_loss_df = pd.DataFrame({'start':start_time, 'end':end_time, 'net_income_loss':value})
    return net_income_loss_df

In [44]:
for i in range(target_company_data.shape[0]):
    cik = target_company_data.iloc[i, 1]
    save_path = f'datasets/net_income_loss'
    os.makedirs(save_path, exist_ok=True)

    file_path = os.path.join(save_path, f'{target_company_data.iloc[i,0]}.csv')

    net_income_loss_df = Find_Net_Income_Loss(cik)
    net_income_loss_df.to_csv(file_path, index=False)


In [45]:
def Find_Assets(cik):
    companyConcept = requests.get(
        f'https://data.sec.gov/api/xbrl/companyconcept/CIK{cik}/us-gaap/Assets.json',
        headers=headers
    )

    net_income_loss = companyConcept.json()['units']['USD']
    time = [item['end'] for item in net_income_loss]
    value = [item['val'] for item in net_income_loss]
    assets_df = pd.DataFrame({'time':time, 'assets':value})
    return assets_df

In [46]:
for i in range(target_company_data.shape[0]):
    cik = target_company_data.iloc[i, 1]
    save_path = f'datasets/assets'
    os.makedirs(save_path, exist_ok=True)

    file_path = os.path.join(save_path, f'{target_company_data.iloc[i,0]}.csv')

    assets_df = Find_Assets(cik)
    assets_df.to_csv(file_path, index=False)

In [58]:
def Find_Revenues(cik):
    companyConcept = requests.get(
        f'https://data.sec.gov/api/xbrl/companyconcept/CIK{cik}/us-gaap/Revenues.json',
        headers=headers
    )

    revenues = companyConcept.json()['units']['USD']
    time = [item['end'] for item in revenues]
    value = [item['val'] for item in revenues]
    revenues_df = pd.DataFrame({'time':time, 'assets':value})
    return revenues_df

In [56]:
for i in range(5,target_company_data.shape[0]):
    cik = target_company_data.iloc[i, 1]
    save_path = f'datasets/revenues'
    os.makedirs(save_path, exist_ok=True)

    file_path = os.path.join(save_path, f'{target_company_data.iloc[i,0]}.csv')

    assets_df = Find_Revenues(cik)
    assets_df.to_csv(file_path, index=False)

# Here, we couldn't find Amason on the revenue.

In [64]:
def Find_Accounts_Payable(cik):
    companyConcept = requests.get(
        f'https://data.sec.gov/api/xbrl/companyconcept/CIK{cik}/us-gaap/AccountsPayable.json',
        headers=headers
    )

    AccountsPayable = companyConcept.json()['units']['USD']
    time = [item['end'] for item in AccountsPayable]
    value = [item['val'] for item in AccountsPayable]
    accounts_payable_df = pd.DataFrame({'time':time, 'accounts_payable':value})
    return accounts_payable_df

In [77]:
for i in range(target_company_data.shape[0]):
    cik = target_company_data.iloc[i, 1]
    save_path = f'datasets/accounts_payable'
    os.makedirs(save_path, exist_ok=True)

    file_path = os.path.join(save_path, f'{target_company_data.iloc[i,0]}.csv')

    accounts_payable_df = Find_Accounts_Payable(cik)
    accounts_payable_df.to_csv(file_path, index=False)
# Here we find most of them don't have this accounts_payable.

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [73]:
def Find_Liabilities(cik):
    companyConcept = requests.get(
        f'https://data.sec.gov/api/xbrl/companyconcept/CIK{cik}/us-gaap/Liabilities.json',
        headers=headers
    )

    Liabilities = companyConcept.json()['units']['USD']
    time = [item['end'] for item in Liabilities]
    value = [item['val'] for item in Liabilities]
    liabilities_df = pd.DataFrame({'time':time, 'liabilities':value})
    return liabilities_df

In [76]:
for i in range(5, target_company_data.shape[0]):
    cik = target_company_data.iloc[i, 1]
    save_path = f'datasets/liabilities'
    os.makedirs(save_path, exist_ok=True)

    file_path = os.path.join(save_path, f'{target_company_data.iloc[i,0]}.csv')

    liabilities_df = Find_Liabilities(cik)
    liabilities_df.to_csv(file_path, index=False)

# Here Amazon doesn't have this