In [4]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import os
import time
import json

# SEC headers – you must include a valid user-agent
HEADERS = {
    "User-Agent": "Your Name your.email@example.com"  # ⚠️ Change this to your information
}

# Company CIKs (Central Index Key)
companies = {
    "Microsoft": "0000789019",
    "Apple": "0000320193",
    "Tesla": "0001318605"
}

BASE_URL = "https://data.sec.gov"
SEARCH_URL = "https://data.sec.gov/submissions/CIK{cik}.json"

def get_10k_urls(cik, count=3):
    """Get the URLs for the most recent 10-K filings for a company.
    
    Args:
        cik (str): Company's Central Index Key
        count (int): Number of 10-K filings to retrieve
        
    Returns:
        list: List of filing information (accession number, filing date)
    """
    cik_padded = cik.zfill(10)
    url = SEARCH_URL.format(cik=cik_padded)
    
    try:
        res = requests.get(url, headers=HEADERS)
        res.raise_for_status()
        filings = res.json()['filings']['recent']
        
        filing_info = []
        for i, form in enumerate(filings['form']):
            if form == "10-K":
                acc_num = filings['accessionNumber'][i]
                filing_date = filings['filingDate'][i]
                filing_info.append((acc_num, filing_date))
                if len(filing_info) == count:
                    break
        return filing_info
    except requests.exceptions.RequestException as e:
        print(f"Error fetching filing URLs: {e}")
        return []

def download_10k_documents(company_name, cik):
    """Download 10-K filings for a company.
    
    Args:
        company_name (str): Name of the company
        cik (str): Company's Central Index Key
    """
    print(f"\n📥 Downloading 10-Ks for {company_name}")
    filing_info = get_10k_urls(cik)
    folder = f"./10K_Filings/{company_name}"
    os.makedirs(folder, exist_ok=True)
    
    if not filing_info:
        print(f"⚠️ No 10-K filings found for {company_name}")
        return

    for acc_num, filing_date in filing_info:
        try:
            # Format accession number for URL (remove dashes for directory, keep dashes for filename)
            acc_num_no_dashes = acc_num.replace('-', '')
            
            # First get the index page that lists all the documents in the filing
            index_url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{acc_num_no_dashes}/{acc_num}-index.htm"
            
            print(f"Accessing index: {index_url}")
            res = requests.get(index_url, headers=HEADERS)
            res.raise_for_status()
            
            # Parse the HTML to find the actual 10-K document
            soup = BeautifulSoup(res.text, 'html.parser')
            table = soup.find('table', summary='Document Format Files')
            
            if not table:
                print(f"⚠️ Could not find document table for {company_name}, filing date {filing_date}")
                continue
                
            # Look for the primary 10-K document
            found_file = False
            for row in table.find_all('tr'):
                cells = row.find_all('td')
                if len(cells) >= 3:
                    # Check if this is the 10-K document (either in type or description)
                    doc_type = cells[0].get_text().strip().lower()
                    description = cells[1].get_text().strip().lower()
                    
                    if '10-k' in doc_type or '10-k' in description:
                        # Get the document link
                        doc_link = cells[2].find('a')
                        if doc_link and doc_link.has_attr('href'):
                            doc_href = doc_link['href']
                            
                            # Convert relative URL to absolute URL
                            if doc_href.startswith('/'):
                                doc_url = f"https://www.sec.gov{doc_href}"
                            else:
                                doc_url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{acc_num_no_dashes}/{doc_href}"
                            
                            print(f"Downloading document: {doc_url}")
                            doc = requests.get(doc_url, headers=HEADERS)
                            doc.raise_for_status()
                            
                            # Extract filename for saving
                            filename = os.path.basename(doc_href)
                            file_path = os.path.join(folder, f"{company_name}_10K_{filing_date}_{filename}")
                            
                            with open(file_path, "w", encoding="utf-8") as f:
                                f.write(doc.text)
                                
                            print(f"✅ Saved: {file_path}")
                            found_file = True
                            break
            
            if not found_file:
                print(f"⚠️ No suitable 10-K document found for {company_name} on {filing_date}")
                
            # Be nice to the SEC server
            time.sleep(1)
                
        except requests.exceptions.RequestException as e:
            print(f"❌ Error downloading filing: {e}")
        except Exception as e:
            print(f"❌ Error processing filing: {str(e)}")

# Download for each company
for name, cik in companies.items():
    download_10k_documents(name, cik)
    time.sleep(2)  # Add delay between companies to avoid rate limiting


📥 Downloading 10-Ks for Microsoft
Accessing index: https://www.sec.gov/Archives/edgar/data/0000789019/000095017024087843/0000950170-24-087843-index.htm
Downloading document: https://www.sec.gov/ix?doc=/Archives/edgar/data/789019/000095017024087843/msft-20240630.htm
✅ Saved: ./10K_Filings/Microsoft/Microsoft_10K_2024-07-30_msft-20240630.htm
Accessing index: https://www.sec.gov/Archives/edgar/data/0000789019/000095017023035122/0000950170-23-035122-index.htm
Downloading document: https://www.sec.gov/ix?doc=/Archives/edgar/data/789019/000095017023035122/msft-20230630.htm
✅ Saved: ./10K_Filings/Microsoft/Microsoft_10K_2023-07-27_msft-20230630.htm
Accessing index: https://www.sec.gov/Archives/edgar/data/0000789019/000156459022026876/0001564590-22-026876-index.htm
Downloading document: https://www.sec.gov/ix?doc=/Archives/edgar/data/789019/000156459022026876/msft-10k_20220630.htm
✅ Saved: ./10K_Filings/Microsoft/Microsoft_10K_2022-07-28_msft-10k_20220630.htm

📥 Downloading 10-Ks for Apple
Ac

In [5]:
import pandas as pd
import re
from collections import defaultdict

def process_10k_to_csv():
    """
    Process downloaded 10-K HTML files into CSV format.
    Extracts key sections and organizes them into a structured CSV file.
    """
    print("\n📊 Converting 10-K files to CSV format...")
    
    # Create a list to store all data
    all_data = []
    
    # Process each company's filings
    for company_name in companies.keys():
        folder = f"./10K_Filings/{company_name}"
        try:
            files = [f for f in os.listdir(folder) if f.endswith('.htm')]
            
            for file in files:
                file_path = os.path.join(folder, file)
                
                # Extract filing date from filename
                match = re.search(r'10K_(\d{4}-\d{2}-\d{2})', file)
                filing_date = match.group(1) if match else "Unknown"
                
                # Read HTML content
                with open(file_path, 'r', encoding='utf-8') as f:
                    html_content = f.read()
                
                # Parse HTML
                soup = BeautifulSoup(html_content, 'html.parser')
                
                # Extract text content (removing scripts, styles)
                for script in soup(["script", "style"]):
                    script.extract()
                text = soup.get_text()
                
                # Clean text (remove excessive whitespace)
                text = re.sub(r'\n+', '\n', text)
                text = re.sub(r' +', ' ', text)
                
                # Extract key sections (simplified approach)
                sections = {
                    'Risk Factors': extract_section(text, ['Item 1A.', 'ITEM 1A.', 'Risk Factors'], 
                                                  ['Item 1B', 'ITEM 1B', 'Item 2']),
                    'Management Discussion': extract_section(text, ['Item 7.', 'ITEM 7.', "Management's Discussion"], 
                                                           ['Item 7A', 'ITEM 7A', 'Item 8']),
                    'Business Description': extract_section(text, ['Item 1.', 'ITEM 1.', 'Business'], 
                                                          ['Item 1A', 'ITEM 1A']),
                }
                
                # Truncate long sections to avoid CSV issues
                max_length = 32000
                for section, content in sections.items():
                    if content and len(content) > max_length:
                        sections[section] = content[:max_length] + "... [truncated]"
                
                # Add to data collection
                all_data.append({
                    'Company': company_name,
                    'Filing Date': filing_date,
                    'Year': filing_date.split('-')[0],
                    'Risk Factors': sections['Risk Factors'],
                    'Management Discussion': sections['Management Discussion'],
                    'Business Description': sections['Business Description'],
                    'File': file
                })
                
                print(f"✅ Processed: {file}")
                
        except Exception as e:
            print(f"❌ Error processing company {company_name}: {str(e)}")
    
    if all_data:
        # Create DataFrame and save to CSV
        df = pd.DataFrame(all_data)
        csv_path = './10K_Analysis.csv'
        df.to_csv(csv_path, index=False, encoding='utf-8')
        print(f"✅ CSV file created: {csv_path}")
        
        # Also save a summary CSV with just metadata
        summary_df = df[['Company', 'Filing Date', 'Year', 'File']]
        summary_df.to_csv('./10K_Summary.csv', index=False, encoding='utf-8')
        print(f"✅ Summary CSV file created: ./10K_Summary.csv")
    else:
        print("⚠️ No data to convert to CSV.")

def extract_section(text, start_markers, end_markers):
    """
    Extract a section from text based on start and end markers.
    
    Args:
        text (str): The full text to search
        start_markers (list): List of possible section start markers
        end_markers (list): List of possible section end markers
        
    Returns:
        str: The extracted section or empty string if not found
    """
    # Find the earliest occurrence of any start marker
    start_pos = len(text)
    for marker in start_markers:
        pos = text.find(marker)
        if pos != -1 and pos < start_pos:
            start_pos = pos
    
    if start_pos == len(text):
        return ""  # No start marker found
    
    # Find the earliest occurrence of any end marker after start_pos
    end_pos = len(text)
    for marker in end_markers:
        pos = text.find(marker, start_pos + 1)
        if pos != -1 and pos < end_pos:
            end_pos = pos
    
    # Extract the section
    section = text[start_pos:end_pos].strip()
    return section

In [6]:
# Add this at the end of your notebook

# Download for each company
for name, cik in companies.items():
    download_10k_documents(name, cik)
    time.sleep(2)  # Add delay between companies to avoid rate limiting

# After downloading, process the files to CSV
process_10k_to_csv()


📥 Downloading 10-Ks for Microsoft
Accessing index: https://www.sec.gov/Archives/edgar/data/0000789019/000095017024087843/0000950170-24-087843-index.htm
Downloading document: https://www.sec.gov/ix?doc=/Archives/edgar/data/789019/000095017024087843/msft-20240630.htm
✅ Saved: ./10K_Filings/Microsoft/Microsoft_10K_2024-07-30_msft-20240630.htm
Accessing index: https://www.sec.gov/Archives/edgar/data/0000789019/000095017023035122/0000950170-23-035122-index.htm
Downloading document: https://www.sec.gov/ix?doc=/Archives/edgar/data/789019/000095017023035122/msft-20230630.htm
✅ Saved: ./10K_Filings/Microsoft/Microsoft_10K_2023-07-27_msft-20230630.htm
Accessing index: https://www.sec.gov/Archives/edgar/data/0000789019/000156459022026876/0001564590-22-026876-index.htm
Downloading document: https://www.sec.gov/ix?doc=/Archives/edgar/data/789019/000156459022026876/msft-10k_20220630.htm
✅ Saved: ./10K_Filings/Microsoft/Microsoft_10K_2022-07-28_msft-10k_20220630.htm

📥 Downloading 10-Ks for Apple
Ac

In [7]:
import pandas as pd
import re
from bs4 import BeautifulSoup
import os

# Read the existing CSV file
df = pd.read_csv('10K_Analysis.csv')

# Display the basic info
print(f"Loaded {len(df)} rows of 10-K data")
print(f"Companies: {df['Company'].unique()}")
print(f"Years: {sorted(df['Year'].unique())}")

# Preview the DataFrame
print("\nPreview of the data:")
print(df.head())

# Check for missing data
print("\nMissing data count per column:")
print(df.isna().sum())

# You can access specific company data
microsoft_data = df[df['Company'] == 'Microsoft']
print(f"\nMicrosoft filings: {len(microsoft_data)}")

# You can filter by year
data_2023 = df[df['Year'] == 2023]
print(f"Filings from 2023: {len(data_2023)}")

Loaded 9 rows of 10-K data
Companies: ['Microsoft' 'Apple' 'Tesla']
Years: [np.int64(2022), np.int64(2023), np.int64(2024), np.int64(2025)]

Preview of the data:
     Company Filing Date  Year  Risk Factors  Management Discussion  \
0  Microsoft  2024-07-30  2024           NaN                    NaN   
1  Microsoft  2022-07-28  2022           NaN                    NaN   
2  Microsoft  2023-07-27  2023           NaN                    NaN   
3      Apple  2022-10-28  2022           NaN                    NaN   
4      Apple  2023-11-03  2023           NaN                    NaN   

   Business Description                                            File  
0                   NaN      Microsoft_10K_2024-07-30_msft-20240630.htm  
1                   NaN  Microsoft_10K_2022-07-28_msft-10k_20220630.htm  
2                   NaN      Microsoft_10K_2023-07-27_msft-20230630.htm  
3                   NaN          Apple_10K_2022-10-28_aapl-20220924.htm  
4                   NaN          Apple_10

In [8]:
def update_sections_in_dataframe():
    """Update the sections in the DataFrame by reprocessing the HTML files."""
    print("\nUpdating sections in the DataFrame...")
    
    # For each row in the DataFrame
    for index, row in df.iterrows():
        company = row['Company']
        filename = row['File']
        file_path = os.path.join('./10K_Filings', company, filename)
        
        if not os.path.exists(file_path):
            print(f"⚠️ File not found: {file_path}")
            continue
            
        try:
            # Read HTML content
            with open(file_path, 'r', encoding='utf-8') as f:
                html_content = f.read()
                
            # Parse HTML
            soup = BeautifulSoup(html_content, 'html.parser')
                
            # Extract text content (removing scripts, styles)
            for script in soup(["script", "style"]):
                script.extract()
            text = soup.get_text()
                
            # Clean text (remove excessive whitespace)
            text = re.sub(r'\n+', '\n', text)
            text = re.sub(r' +', ' ', text)
                
            # Extract key sections with improved patterns
            risk_factors = extract_section(
                text, 
                ['Item 1A', 'ITEM 1A', 'Risk Factors'], 
                ['Item 1B', 'ITEM 1B', 'Item 2', 'ITEM 2']
            )
                
            mgmt_discussion = extract_section(
                text, 
                ['Item 7', 'ITEM 7', "Management's Discussion"], 
                ['Item 7A', 'ITEM 7A', 'Item 8', 'ITEM 8']
            )
                
            business_desc = extract_section(
                text, 
                ['Item 1', 'ITEM 1', 'Business'], 
                ['Item 1A', 'ITEM 1A']
            )
                
            # Update the DataFrame with extracted content
            # Truncate if necessary
            max_length = 32000
            df.at[index, 'Risk Factors'] = (risk_factors[:max_length] + '... [truncated]' 
                                           if len(risk_factors) > max_length else risk_factors)
            df.at[index, 'Management Discussion'] = (mgmt_discussion[:max_length] + '... [truncated]' 
                                                  if len(mgmt_discussion) > max_length else mgmt_discussion)
            df.at[index, 'Business Description'] = (business_desc[:max_length] + '... [truncated]' 
                                                 if len(business_desc) > max_length else business_desc)
                
            print(f"✅ Updated sections for {filename}")
                
        except Exception as e:
            print(f"❌ Error processing {filename}: {str(e)}")
    
    # Save the updated DataFrame
    df.to_csv('10K_Analysis_updated.csv', index=False)
    print("✅ Saved updated DataFrame to '10K_Analysis_updated.csv'")
    
    return df

def extract_section(text, start_markers, end_markers):
    """
    Extract a section from text based on start and end markers.
    
    Args:
        text (str): The full text to search
        start_markers (list): List of possible section start markers
        end_markers (list): List of possible section end markers
        
    Returns:
        str: The extracted section or empty string if not found
    """
    # Find the earliest occurrence of any start marker
    start_pos = len(text)
    for marker in start_markers:
        pos = text.find(marker)
        if pos != -1 and pos < start_pos:
            start_pos = pos
    
    if start_pos == len(text):
        return ""  # No start marker found
    
    # Find the earliest occurrence of any end marker after start_pos
    end_pos = len(text)
    for marker in end_markers:
        pos = text.find(marker, start_pos + 1)
        if pos != -1 and pos < end_pos:
            end_pos = pos
    
    # Extract the section
    section = text[start_pos:end_pos].strip()
    return section

# Run the function to update the sections
updated_df = update_sections_in_dataframe()

# Check how many sections were successfully extracted
print("\nExtracted section counts:")
print(f"Risk Factors: {updated_df['Risk Factors'].notna().sum()}")
print(f"Management Discussion: {updated_df['Management Discussion'].notna().sum()}")
print(f"Business Description: {updated_df['Business Description'].notna().sum()}")


Updating sections in the DataFrame...
✅ Updated sections for Microsoft_10K_2024-07-30_msft-20240630.htm
✅ Updated sections for Microsoft_10K_2022-07-28_msft-10k_20220630.htm
✅ Updated sections for Microsoft_10K_2023-07-27_msft-20230630.htm
✅ Updated sections for Apple_10K_2022-10-28_aapl-20220924.htm
✅ Updated sections for Apple_10K_2023-11-03_aapl-20230930.htm
✅ Updated sections for Apple_10K_2024-11-01_aapl-20240928.htm
✅ Updated sections for Tesla_10K_2024-01-29_tsla-20231231.htm
✅ Updated sections for Tesla_10K_2023-01-31_tsla-20221231.htm
✅ Updated sections for Tesla_10K_2025-01-30_tsla-20241231.htm
✅ Saved updated DataFrame to '10K_Analysis_updated.csv'

Extracted section counts:
Risk Factors: 9
Management Discussion: 9
Business Description: 9


  df.at[index, 'Risk Factors'] = (risk_factors[:max_length] + '... [truncated]'
  df.at[index, 'Management Discussion'] = (mgmt_discussion[:max_length] + '... [truncated]'
  df.at[index, 'Business Description'] = (business_desc[:max_length] + '... [truncated]'


In [4]:
from bs4 import BeautifulSoup
import pandas as pd
soup = BeautifulSoup(file, "html.parser")
# Load the file
with open("10K_Filings/Apple/Apple_10K_2022-10-28_aapl-20220924.htm", "r", encoding="utf-8") as file:
    soup = BeautifulSoup(file, "lxml")

# Extract all tables
tables = soup.find_all("table")

# Convert tables to DataFrames
dfs = [pd.read_html(str(table))[0] for table in tables]

# Example: print first table
if dfs:
    print(dfs[0].head())
else:
    print("No tables found.")


ValueError: I/O operation on closed file.

In [6]:
from bs4 import BeautifulSoup
import pandas as pd
soup = BeautifulSoup(file, "html.parser")
# Load the file
with open("10K_Filings/Apple/Apple_10K_2022-10-28_aapl-20220924.htm", "r", encoding="utf-8") as file:
    soup = BeautifulSoup(file, "lxml")

# Extract all tables
tables = soup.find_all("table")

# Convert tables to DataFrames
dfs = [pd.read_html(str(table))[0] for table in tables]

# Example: print first table
if dfs:
    print(dfs[0].head())
else:
    print("No tables found.")


ValueError: I/O operation on closed file.

In [None]:
soup = BeautifulSoup(file, "html.parser")