In [1]:
import bs4
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random

In [4]:
# Every request sent to the SEC endpoints below carries this User-Agent header.
# NOTE(review): 'test@test.com' looks like a placeholder; confirm it should be
# replaced with a real contact address before running large batches.
USER_AGENT = "test@test.com"

headers = {"User-Agent": USER_AGENT}

In [9]:

def get_10k_details(cik, n, headers=headers, timeout=30):
    """Collect metadata for a company's 10-K filings from the EDGAR browse page.

    Requests the EDGAR "getcompany" listing filtered to form type 10-K, reads
    the company name / CIK from the page header, builds one dict per row of
    the filings table, and then enriches each dict in place through
    ``update_entry_with_attachments`` (one extra HTTP request per filing).

    Args:
        cik: Company CIK (int or str); it is left-padded with zeros to 10
            characters before being put in the URL.
        n: Value sent as the ``count`` query parameter. The number of entries
            returned is whatever the filings table on the page contains.
        headers: HTTP headers used for the listing request and forwarded to
            every per-filing request.
        timeout: Seconds passed to ``requests.get`` for the listing request,
            so a stalled connection cannot hang a long batch run.

    Returns:
        list[dict]: One dict per filing row with the keys ``form``,
        ``file_number``, ``form_link``, ``attachments_link``, ``index_link``,
        ``interactive_elements_link``, ``year``, ``quarter``, ``filing_date``,
        ``CIK`` and ``company_name``. ``interactive_elements_link`` is ``""``
        when the row has no interactive-data button.

    Raises:
        ValueError: If the page has no ``companyName`` span containing a link
            (no company was found for the CIK, or the response was not the
            expected listing page).
        requests.RequestException: Propagated from ``requests.get``, including
            timeouts.
    """
    cik = str(cik).zfill(10)
    base_url = f"https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={cik}&type=10-K&count={n}"
    response = requests.get(base_url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    # Company header: the span text holds the name, its <a> holds the CIK.
    company_name_tag = soup.find("span", class_="companyName")
    cik_anchor = company_name_tag.find("a") if company_name_tag is not None else None
    if cik_anchor is None:
        # Include the request context in the error instead of printing it.
        raise ValueError(
            f'No CIK found (requested CIK {cik}, HTTP status {response.status_code})'
        )
    company_name = company_name_tag.text.split(" CIK")[0]
    cik_number = cik_anchor.text.replace(' (see all company filings)', '')

    entries = []
    table = soup.find("table", class_="tableFile2")
    rows = table.find_all("tr")[1:] if table is not None else []  # skip header row
    for row in rows:
        cells = row.find_all("td")
        if len(cells) < 5:
            continue

        documents_anchor = cells[1].find("a", href=True)
        if documents_anchor is None:
            # No link to the filing index: nothing to follow up on, and the
            # enrichment step below needs 'index_link' to be a real URL.
            continue
        interactive_anchor = cells[1].find("a", href=True, id="interactiveDataBtn")

        # First whitespace-delimited token after "Acc-no:", or "" if absent.
        tokens_after_label = cells[2].text.partition("Acc-no:")[2].split()
        acc_no = tokens_after_label[0] if tokens_after_label else ""

        entries.append({
            "form": cells[0].text.strip(),
            # NOTE: despite the key name, this holds the value following "Acc-no:".
            "file_number": acc_no,
            "form_link": None,
            "attachments_link": None,
            "index_link": "https://www.sec.gov" + documents_anchor["href"],
            # Leave empty when there is no button, rather than a bare site-root URL.
            "interactive_elements_link": (
                "https://www.sec.gov" + interactive_anchor["href"]
                if interactive_anchor is not None else ""
            ),
            "year": None,
            "quarter": "full year",
            "filing_date": cells[3].text.strip(),
            "CIK": cik_number,
            "company_name": company_name,
        })

    # Fills form_link / attachments_link / year in place; forward the caller's
    # headers so the per-filing requests use the same identity as the listing.
    for entry in entries:
        update_entry_with_attachments(entry, headers=headers)

    return entries

def update_entry_with_attachments(entry_dict, headers=headers, timeout=30):
    """Enrich a filing entry in place using its EDGAR filing-index page.

    Downloads ``entry_dict['index_link']`` and scans every table with class
    ``tableFile``. The first linked row whose type is a 10-K variant becomes
    ``form_link`` (only if ``form_link`` is still ``None``); every other row is
    stored in ``attachments_link`` under the key ``"<seq>-<description>-<type>"``.
    The report year is taken from the "Period of Report" block, if present.

    Args:
        entry_dict: Filing dict; must contain ``index_link``, ``form_link`` and
            ``company_name``. ``attachments_link`` is reset to a new dict.
        headers: HTTP headers for the request.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block indefinitely.

    Returns:
        dict: The same ``entry_dict`` object, mutated.

    Raises:
        requests.RequestException: Propagated from ``requests.get``, including
            timeouts.
    """
    response = requests.get(entry_dict['index_link'], headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    # Start from a clean mapping on every call.
    entry_dict['attachments_link'] = {}
    main_form_types = ('10-K', '10-K/A', '10-KT', '10-KT/A')

    # All 'tableFile' tables are scanned (document files and data files alike).
    for table in soup.find_all("table", class_="tableFile"):
        for row in table.find_all("tr")[1:]:  # skip header row
            cells = row.find_all("td")
            if len(cells) <= 3:  # need seq, description, document and type columns
                continue

            seq = cells[0].text.strip()
            description = cells[1].text.strip()
            anchor = cells[2].find("a", href=True)
            doc_link = anchor["href"].strip() if anchor is not None else ""
            doc_type = cells[3].text.strip()

            # Only a row that actually has a link may claim form_link; otherwise
            # a link-less row would store the bare site root and block the real
            # document from being picked up later.
            if doc_type in main_form_types and entry_dict['form_link'] is None and doc_link:
                # Drop the 'ix?doc=/' segment (presumably the inline viewer
                # wrapper) so the link targets the document path directly.
                entry_dict['form_link'] = "https://www.sec.gov" + doc_link.replace('ix?doc=/', '')
            else:
                key_format = f"{seq}-{description}-{doc_type}" if description else f"{seq}-no_description-{doc_type}"
                # Keep the key even without a link, but store "" instead of a
                # misleading site-root URL.
                entry_dict['attachments_link'][key_format] = (
                    "https://www.sec.gov" + doc_link if doc_link else ""
                )

    # The year is the text before the first '-' of the "Period of Report" value.
    period_of_report_div = soup.find("div", class_="infoHead", string="Period of Report")
    if period_of_report_div is not None:
        period_of_report_info = period_of_report_div.find_next_sibling("div", class_="info")
        if period_of_report_info is not None:
            entry_dict['year'] = period_of_report_info.text.strip().split("-")[0]
        else:
            # Suffix ".1": the label was found but its value <div> was not.
            print(entry_dict['company_name'], 'CIK', entry_dict.get('CIK'), "Period of Report info not found.1")
    else:
        # Suffix ".2": the "Period of Report" label itself was not found.
        print(entry_dict['company_name'], 'CIK', entry_dict.get('CIK'), "Period of Report info not found.2")

    return entry_dict



In [6]:
# Load the constituents table, then turn each distinct CIK into a string
# left-padded with zeros to 10 characters (the form used in the EDGAR URLs).
sp500 = pd.read_csv('sp500_constituents.csv')
sp500_CIK = list(map(lambda raw_cik: str(raw_cik).zfill(10), sp500['CIK'].unique()))

In [7]:
# Smoke test on a randomly chosen constituent; the result is not kept, the call
# only has to complete without raising.
get_10k_details(random.choice(sp500_CIK), 5)

# Fixed CIK so the printed links below are comparable between runs.
test = get_10k_details('0000202058', 5)

# For each filing, show the main document link followed by its index page link.
for filing in test:
    print(filing['form_link'], filing['index_link'], sep="\n")


https://www.sec.gov/Archives/edgar/data/202058/000020205824000029/hrs-20231229.htm
https://www.sec.gov/Archives/edgar/data/202058/000020205824000029/0000202058-24-000029-index.htm
https://www.sec.gov/Archives/edgar/data/202058/000020205823000014/hrs-20221230.htm
https://www.sec.gov/Archives/edgar/data/202058/000020205823000014/0000202058-23-000014-index.htm
https://www.sec.gov/Archives/edgar/data/202058/000020205822000015/hrs-20211231.htm
https://www.sec.gov/Archives/edgar/data/202058/000020205822000015/0000202058-22-000015-index.htm
https://www.sec.gov/Archives/edgar/data/202058/000020205821000008/hrs-20210101.htm
https://www.sec.gov/Archives/edgar/data/202058/000020205821000008/0000202058-21-000008-index.htm
https://www.sec.gov/Archives/edgar/data/202058/000020205820000011/lhx13202010-kt.htm
https://www.sec.gov/Archives/edgar/data/202058/000020205820000011/0000202058-20-000011-index.htm
https://www.sec.gov/Archives/edgar/data/202058/000020205819000090/hrs628201910-k.htm
https://www.s

In [10]:
%%time

# documents_sp500 = [get_10k_details(cik, 10) for cik in sp500_CIK]

from tqdm.notebook import tqdm  # or from tqdm import tqdm_notebook as tqdm in older versions

# Assuming sp500_CIK is your list of CIKs
documents_sp500 = [get_10k_details(cik, 10) for cik in tqdm(sp500_CIK, desc="Retrieving 10-K Details")]


document_dictionaries = [document_dict for document_list in documents_sp500 for document_dict in document_list]


Retrieving 10-K Details:   0%|          | 0/500 [00:00<?, ?it/s]

CPU times: user 2min 10s, sys: 4.55 s, total: 2min 15s
Wall time: 27min 46s


In [13]:
# Sanity check: total number of filing dicts collected across all companies.
total_filings = len(document_dictionaries)
print(total_filings)


4843
4843


In [15]:
import json

# Output location, relative to the notebook's working directory.
file_path = "sp500_ten_years_documents.json"

# Persist the flattened list of filing dicts as a single JSON array.
with open(file_path, mode='w') as json_file:
    json.dump(document_dictionaries, fp=json_file)