In [96]:
import requests
import pandas as pd
import time
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By


In [104]:
# List of company names and their corresponding CIK codes
pharma_companies = {
    "Johnson & Johnson": "0000200406",
    "Pfizer Inc.": "0000078003",
    "Novartis AG": "0001114448",
    "Merck & Co., Inc.": "0000310158",
    "Sanofi": "0001121404",
    "GlaxoSmithKline plc": "0001131399",
    "AbbVie Inc.": "0001551152",
    "AstraZeneca plc": "0000901832",
    "Bristol-Myers Squibb Company": "0000014272",
    "Eli Lilly and Company": "0000059478"
}

# Create a Pandas DataFrame
companies_df = pd.DataFrame(list(pharma_companies.items()), columns=["company_name", "cik"])



# Define headers for SEC API access
headers = {'User-Agent': 'Brian Lu brian901231@gmail.com'}

# Add print statements to check the progress
def get_10k_filing_links(cik):
    """Scrape ALL 10-K filing links from SEC EDGAR"""
    base_url = f"https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={cik}&type=10-K&count=100&output=atom"
    filing_links = []

    try:
        response = requests.get(base_url, headers=headers)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")
            entries = soup.find_all("entry")
            
            for entry in entries:
                filing_date = entry.find("filing-date").text
                filing_url = entry.find("filing-href").text  # URL to filing page
                
                # Extract Accession Number from URL
                parts = filing_url.split('/')
                accession_number = parts[-1].replace('-', '')

                filing_links.append({
                    "filing_date": filing_date,
                    "accession_number": accession_number,
                    "filing_url": filing_url
                })
            
            return filing_links
        else:
            print(f"Failed to get historical filings for {cik}")
            return []
    except Exception as e:
        print(f"Error scraping SEC EDGAR for {cik}: {e}")
        return []


def extract_10k_selenium(doc_url):
    """Extracts full 10-K text using Selenium to render JavaScript"""
    try:
        # Set up Chrome options
        options = Options()
        options.add_argument("--headless")  # Run in headless mode (no browser UI)
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")

        # ✅ Spoof User-Agent to bypass SEC bot detection
        user_agent = "Brian Lu brian901231@gmail.com"
        options.add_argument(f"user-agent={user_agent}")

        # Start a Chrome WebDriver session
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=options)

        # Open the SEC filing URL
        print(f"Opening {doc_url}...")
        driver.get(doc_url)

        # Wait for the page to load completely
        time.sleep(5)  # Adjust if needed

        # Extract the main filing text
        filing_text = driver.find_element(By.TAG_NAME, "body").text

        # Close the browser
        driver.quit()

        return filing_text

    except Exception as e:
        print(f"Error extracting 10-K text from {doc_url}: {e}")
        return None
    

def extract_10k_text(filing_url):
    """Extract the main 10-K text from the filing HTML page"""
    try:
        response = requests.get(filing_url, headers=headers)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")


        # Locate the "Filing Table" where all documents are listed
        table = soup.find("table", {"summary": "Document Format Files"})
        if not table:
            print(f"Could not find filing table on page: {filing_url}")
            return None

        # Find the row that contains "10-K" (the actual document)
        for row in table.find_all("tr"):
            cells = row.find_all("td")
            if len(cells) > 3 and "10-K" in cells[1].text:
                filing_doc_link = cells[2].find("a")["href"]
                break
        else:
            print(f"No 10-K document found on page: {filing_url}")
            return None

        # Construct the full document URL
        doc_url = f"https://www.sec.gov{filing_doc_link}"
        
        # Download the 10-K document
        doc_response = requests.get(doc_url, headers=headers)
        if doc_response.status_code == 200:
            if "ix?doc" in doc_url:

                print(f"Successfully extracted 10-K document from iXBRL format: {doc_url}")
                return extract_10k_selenium(doc_url)
            else:
                print(f"Successfully extracted 10-K document from html format: {doc_url}")
                filing_text = doc_response.text
                # Optional: Clean the text output
                filing_text = "\n".join([line.strip() for line in filing_text.split("\n") if line.strip()])
                return filing_text
        else:
            print(f"Failed to download 10-K document from {doc_url}")
            return None

    except Exception as e:
        print(f"Error extracting 10-K text from {filing_url}: {e}")
        return None


def create_10k_dataset(companies_df, start_year=2010, max_companies=None):
    """Create dataset with ALL historical 10-K texts for companies"""
    
    print(f"\nStarting with {len(companies_df)} companies")

    # Limit number of companies if specified
    if max_companies:
        companies_df = companies_df.head(max_companies)
        print(f"Processing first {max_companies} companies as a test")

    dataset = []
    
    for _, company in companies_df.iterrows():
        cik = str(company['cik']).zfill(10)  # Ensure proper CIK format
        print(f"\nProcessing {company['company_name']} (CIK: {cik})...")

        # Get ALL historical 10-K filings
        filings = get_10k_filing_links(cik)

        for filing in filings:
            year = int(filing["filing_date"][:4])
            if year >= start_year:
                filing_url = filing["filing_url"]

                # Extract 10-K text from the filing page
                text = extract_10k_text(filing_url)
                
                if text:
                    dataset.append({
                        'company_name': company['company_name'],
                        'cik': cik,
                        'year': year,
                        'filing_url': filing_url,
                        '10k_text': text
                    })
                else:
                    print(f"Failed to retrieve 10-K text for {company['company_name']} ({year})")

        time.sleep(0.2)  # Prevent rate-limiting

    # Create final DataFrame
    df = pd.DataFrame(dataset)
    df = df.sort_values(['company_name', 'year'])
    
    print(f"\nDataset created with {len(df)} 10-K filings")
    return df

In [105]:
# Create the dataset
pharma_10k_df_top_10 = create_10k_dataset(companies_df, start_year=2010, max_companies=1)
display(pharma_10k_df_top_10)


Starting with 10 companies
Processing first 1 companies as a test

Processing Johnson & Johnson (CIK: 0000200406)...


  k = self.parse_starttag(i)


Successfully extracted 10-K document from iXBRL format: https://www.sec.gov/ix?doc=/Archives/edgar/data/200406/000020040625000038/jnj-20241229.htm
Opening https://www.sec.gov/ix?doc=/Archives/edgar/data/200406/000020040625000038/jnj-20241229.htm...
Failed to retrieve 10-K text for Johnson & Johnson (2025)
Successfully extracted 10-K document from iXBRL format: https://www.sec.gov/ix?doc=/Archives/edgar/data/200406/000020040624000013/jnj-20231231.htm
Opening https://www.sec.gov/ix?doc=/Archives/edgar/data/200406/000020040624000013/jnj-20231231.htm...


KeyboardInterrupt: 

In [99]:
# show distinct company names and find the length
pharma_10k_df_top_10['10k_text'].unique().shape[0]

16

In [103]:
# print 10k_text with year 2025
print(pharma_10k_df_top_10[pharma_10k_df_top_10['year'] == 2025]['10k_text'].values[0])

IndexError: index 0 is out of bounds for axis 0 with size 0