In [1]:
import io
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

import pandas as pd
import requests
from edgar import Company, set_identity

# --- CONFIGURATION ---
# Identity string handed to the `edgar` library before any request is made.
# NOTE(review): presumably forwarded to SEC EDGAR as the requester identification
# (name + contact e-mail) — confirm against the edgar package documentation.
set_identity("Ashish juttua@clarkson.edu") 
# Root output folder, relative to the notebook's working directory.
# download_html() creates one sub-folder per ticker underneath it.
RAW_DATA_DIR = "../data/data_raw_html"
# Token budget per one-second window; passed to the shared RateLimiter below.
MAX_REQ_PER_SEC = 10 

# Rate Limiter to ensure we don't get banned while moving fast
class RateLimiter:
    """Thread-safe fixed-window rate limiter.

    Hands out at most ``rate_limit`` tokens per window of roughly one second.
    Once a window's tokens are spent, callers block until the window has
    elapsed and the budget is refilled.

    Note: this is a fixed-window scheme, so up to ``2 * rate_limit`` tokens
    can be granted back-to-back across a window boundary.

    Attributes:
        rate_limit: Number of tokens restored at the start of each window.
        tokens: Tokens still available in the current window.
        last_update: ``time.monotonic()`` timestamp of the last refill.
        lock: Guards ``tokens`` and ``last_update``.
    """

    def __init__(self, rate_limit):
        """Create a limiter with a full budget.

        Args:
            rate_limit: Tokens granted per one-second window; must be > 0.

        Raises:
            ValueError: If ``rate_limit`` is not positive (such a limiter could
                never grant a token and every caller would block forever).
        """
        if rate_limit <= 0:
            raise ValueError(f"rate_limit must be positive, got {rate_limit!r}")
        self.rate_limit = rate_limit
        self.tokens = rate_limit
        # Monotonic clock: immune to wall-clock adjustments (NTP, DST, manual
        # changes) that could make the elapsed time negative and stall refills.
        self.last_update = time.monotonic()
        self.lock = Lock()

    def wait_for_token(self):
        """Block until a token is available, then consume it."""
        while True:
            with self.lock:
                now = time.monotonic()
                elapsed = now - self.last_update
                if elapsed > 1.0:
                    # A new window has started: restore the full budget.
                    self.tokens = self.rate_limit
                    self.last_update = now
                if self.tokens > 0:
                    self.tokens -= 1
                    return
                # Budget exhausted: work out how long until the window ends.
                pause = max(1.0 - elapsed, 0.001)
            # Sleep with the lock released so other threads are not held
            # inside the critical section while nothing can be granted.
            time.sleep(pause)

# Single module-level limiter used by every download_html() call, so all
# worker threads draw from the same per-second request budget.
limiter = RateLimiter(MAX_REQ_PER_SEC)

def get_sp500_tickers():
    """Fetch the S&P 500 constituent symbols from Wikipedia.

    Downloads the "List of S&P 500 companies" page, parses its first HTML
    table and reads the ``Symbol`` column. Dots in symbols are replaced with
    dashes (e.g. ``BRK.B`` -> ``BRK-B``).

    Returns:
        list[str]: The ticker symbols, or an empty list if the page could not
        be fetched or parsed (the error is printed, not raised).
    """
    print("Fetching S&P 500 list...")
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/91.0.4472.124 Safari/537.36"}
    try:
        # A timeout keeps the notebook from hanging forever on a stalled connection.
        response = requests.get(url, headers=headers, timeout=30)
        # Fail fast on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
        tables = pd.read_html(io.StringIO(response.text))
        return [t.replace('.', '-') for t in tables[0]['Symbol'].tolist()]
    except Exception as e:
        print(f"Error fetching list: {e}")
        return []

def download_html(ticker):
    """Download the 10-K / 10-Q filings of one company as raw HTML files.

    Files are stored as ``<RAW_DATA_DIR>/<ticker>/<ticker>_<form>_<date>.html``.
    Filings whose target file already exists are skipped, so the function can
    be re-run to resume an interrupted download. Each file is written to a
    temporary ``.part`` path and then renamed, so an interrupted write never
    leaves a truncated file that a later run would mistake for a finished one.

    Args:
        ticker: Ticker symbol used to look up the company and to name the
            output folder and files.

    Returns:
        str: A one-line status message. Per-filing failures are counted and
        reported in the message; any other error is reported as
        ``"<ticker>: Error <exception>"``. The function does not raise
        ``Exception`` subclasses.
    """
    try:
        limiter.wait_for_token()
        company = Company(ticker)

        # Listing of the company's annual and quarterly filings.
        filings = company.get_filings(form=["10-K", "10-Q"])
        if not filings:
            return f"{ticker}: No filings."

        save_dir = os.path.join(RAW_DATA_DIR, ticker)
        os.makedirs(save_dir, exist_ok=True)

        count = 0
        failed = 0
        last_error = None
        for filing in filings:
            date = filing.filing_date
            # "/" (e.g. in amended forms such as "10-K/A") is not valid in a filename.
            form = filing.form.replace("/", "-")

            # Filename: TICKER_FORM_DATE.html
            fname = f"{ticker}_{form}_{date}.html"
            fpath = os.path.join(save_dir, fname)

            # Already downloaded on a previous run: skip without spending a token.
            if os.path.exists(fpath):
                continue

            limiter.wait_for_token()
            tmp_path = fpath + ".part"
            try:
                html_content = filing.html()
                if html_content:
                    with open(tmp_path, "w", encoding="utf-8") as f:
                        f.write(html_content)
                    # Atomic rename: the final name only ever holds a complete file.
                    os.replace(tmp_path, fpath)
                    count += 1
            except Exception as exc:
                # Best effort: keep going with the remaining filings, but
                # remember the failure so it shows up in the summary.
                failed += 1
                last_error = exc
                try:
                    os.remove(tmp_path)
                except OSError:
                    # Nothing to clean up (temp file was never created).
                    pass

        if failed:
            return (
                f"{ticker}: Downloaded {count} raw HTML files "
                f"({failed} failed; last error: {last_error})."
            )
        return f"{ticker}: Downloaded {count} raw HTML files."
    except Exception as e:
        return f"{ticker}: Error {e}"

  from .autonotebook import tqdm as notebook_tqdm


In [None]:

def main():
    """Download raw filings for every S&P 500 company in parallel.

    Fetches the ticker list, submits one download_html() job per ticker to a
    pool of 20 worker threads, and prints each job's status message as soon
    as that job finishes. Returns without doing anything if the ticker list
    is empty.
    """
    tickers = get_sp500_tickers()
    if not tickers:
        return

    print(f"--- Starting RAW Download for {len(tickers)} companies ---")

    # 20 worker threads keep the network busy.
    with ThreadPoolExecutor(max_workers=20) as pool:
        # Map each submitted job back to the ticker it was created for.
        pending = {}
        for symbol in tickers:
            job = pool.submit(download_html, symbol)
            pending[job] = symbol

        # Report results in completion order, not submission order.
        for finished in as_completed(pending):
            print(finished.result())


if __name__ == "__main__":
    main()

Fetching S&P 500 list...


  tables = pd.read_html(requests.get(url, headers=headers).text)


--- Starting RAW Download for 503 companies ---
ABNB: Downloaded 20 raw HTML files.
GOOGL: Downloaded 44 raw HTML files.
ACN: Downloaded 67 raw HTML files.


Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'


ABBV: Downloaded 53 raw HTML files.
AMD: Downloaded 112 raw HTML files.
ALLE: Downloaded 48 raw HTML files.


Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'


AKAM: Downloaded 104 raw HTML files.
ABT: Downloaded 102 raw HTML files.


Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'


ALGN: Downloaded 103 raw HTML files.
AOS: Downloaded 103 raw HTML files.
GOOG: Downloaded 44 raw HTML files.


Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'


A: Downloaded 109 raw HTML files.


Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'


ADBE: Downloaded 109 raw HTML files.


Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'


ALB: Downloaded 111 raw HTML files.


Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']									IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']									IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']															IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']															IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']		IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']									IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']									IRS NUMBER'
Subheader 'COMPANY DATA' not found in header ']															IRS NUMBER'
Subheader 'COMPANY

MMM: Downloaded 101 raw HTML files.
