In [None]:
import os
import time
from collections import deque
from concurrent.futures import ThreadPoolExecutor, as_completed
from io import StringIO
from threading import Lock

import boto3
import pandas as pd
import requests
from botocore.exceptions import ClientError
from edgar import Company, set_identity
from IPython.display import clear_output

In [None]:
# --- CONFIGURATION ---
# Every tunable value for the notebook lives in this cell; the cells below
# read these module-level names directly.

# 1. Identity (REQUIRED by SEC)
# Registered with the edgar library before any filing data is requested.
set_identity("Ashish juttua@clarkson.edu") 

# 2. AWS Settings
# CHANGE THIS to a unique name (S3 buckets must be globally unique)
BUCKET_NAME = "sec-filings-raw-data-ashish-v1" 
# Region used for the S3 client and, outside us-east-1, as the bucket's
# LocationConstraint when the bucket is created.
AWS_REGION = "us-east-1"
# Key prefix under which every filing is stored: <S3_FOLDER><ticker>/<file>.html
S3_FOLDER = "raw_html/"

# 3. Performance Settings
MAX_REQ_PER_SEC = 10  # SEC Limit; passed to the shared RateLimiter
MAX_WORKERS = 8       # Number of parallel threads in the ThreadPoolExecutor

In [None]:
# --- RATE LIMITER ---
class RateLimiter:
    """Thread-safe sliding-window rate limiter.

    At most ``rate_limit`` tokens are granted within any ``period``-second
    window. A fixed window that is refilled wholesale would allow up to
    ``2 * rate_limit`` grants back-to-back across a window boundary; tracking
    the timestamp of each grant avoids that.

    Args:
        rate_limit: Maximum number of grants per ``period`` (must be >= 1).
        period: Window length in seconds (must be > 0). Defaults to 1 second.
        clock: Zero-argument callable returning the current time in seconds.
            Defaults to ``time.monotonic`` so wall-clock adjustments cannot
            distort the window.
        sleep: Callable taking a duration in seconds, used to wait for the
            next free slot. Defaults to ``time.sleep``.

    Attributes:
        rate_limit: The configured limit.
        period: The configured window length.
        tokens: Grants still available in the current window as of the most
            recent call (informational only).
        last_update: Clock reading at the most recent grant (informational).
        lock: Lock guarding the internal state.

    Raises:
        ValueError: If ``rate_limit`` is below 1 or ``period`` is not positive.
    """

    def __init__(self, rate_limit, period=1.0, clock=time.monotonic, sleep=time.sleep):
        # A non-positive limit could never grant a token and would block forever.
        if rate_limit < 1:
            raise ValueError("rate_limit must be at least 1")
        if period <= 0:
            raise ValueError("period must be positive")
        self.rate_limit = rate_limit
        self.period = period
        self.tokens = rate_limit
        self.lock = Lock()
        self._clock = clock
        self._sleep = sleep
        self.last_update = self._clock()
        # Timestamps of the grants that still count against the window,
        # oldest first.
        self._grants = deque()

    def wait_for_token(self):
        """Block until a request slot is free, then consume it."""
        while True:
            with self.lock:
                now = self._clock()
                # Forget grants that have aged out of the sliding window.
                while self._grants and now - self._grants[0] >= self.period:
                    self._grants.popleft()
                if len(self._grants) < self.rate_limit:
                    self._grants.append(now)
                    self.tokens = self.rate_limit - len(self._grants)
                    self.last_update = now
                    return
                self.tokens = 0
                # Time until the oldest grant leaves the window.
                delay = self._grants[0] + self.period - now
            # Sleep outside the lock so waiting threads do not serialize on it;
            # the floor guarantees progress if rounding yields a zero delay.
            self._sleep(max(delay, 0.001))

# Shared, module-level singletons used by the worker functions below.
# One limiter instance is shared by all threads so the request budget is global.
limiter = RateLimiter(MAX_REQ_PER_SEC)
# Low-level S3 client bound to the configured region.
s3_client = boto3.client('s3', region_name=AWS_REGION)

In [None]:
def create_s3_bucket():
    """Ensure the target S3 bucket exists, creating it when it is missing.

    The bucket is probed with ``head_bucket``. Only a "not found" response
    triggers creation; any other client error (for example 403 when the name
    belongs to another account, or throttling) is reported and re-raised
    instead of being mistaken for a missing bucket.

    Raises:
        ClientError: If the bucket cannot be accessed for a reason other than
            not existing.
        Exception: Whatever ``create_bucket`` raised, after it is reported.
    """
    try:
        s3_client.head_bucket(Bucket=BUCKET_NAME)
        print(f"✅ Bucket '{BUCKET_NAME}' found.")
        return
    except ClientError as err:
        error_code = str(err.response.get("Error", {}).get("Code", ""))
        if error_code not in ("404", "NoSuchBucket", "NotFound"):
            print(f"CRITICAL ERROR: Could not access bucket '{BUCKET_NAME}'. {err}")
            raise

    print(f"Creating bucket '{BUCKET_NAME}'...")
    try:
        create_kwargs = {"Bucket": BUCKET_NAME}
        # us-east-1 is created without a LocationConstraint; every other
        # region must name itself explicitly.
        if AWS_REGION != "us-east-1":
            create_kwargs["CreateBucketConfiguration"] = {
                "LocationConstraint": AWS_REGION
            }
        s3_client.create_bucket(**create_kwargs)
        print(f"Successfully created bucket '{BUCKET_NAME}'.")
    except Exception as e:
        print(f"CRITICAL ERROR: Could not create bucket. {e}")
        # Bare raise keeps the original traceback intact.
        raise

In [None]:
def get_sp500_tickers(timeout=30):
    """Fetch the S&P 500 constituent tickers from Wikipedia.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        list[str]: Ticker symbols taken from the ``Symbol`` column of the
        first table on the page, with ``.`` replaced by ``-``. An empty list
        is returned (and the error printed) if the fetch or parse fails.
    """
    print("Fetching S&P 500 list...")
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        # pandas deprecates passing literal HTML strings to read_html;
        # wrapping the text in a file-like object is the supported form.
        tables = pd.read_html(StringIO(response.text))
        tickers = [str(t).strip().replace('.', '-') for t in tables[0]['Symbol'].tolist()]
        print(f"Loaded {len(tickers)} companies.")
        return tickers
    except Exception as e:
        print(f"Error fetching tickers: {e}")
        return []

In [None]:
def check_file_exists_s3(key):
    """Report whether ``key`` is already present in the target bucket.

    Issues a ``head_object`` request against ``BUCKET_NAME``. Returns ``True``
    when the request succeeds and ``False`` when it raises ``ClientError``,
    so callers can skip filings that were uploaded on an earlier run.
    """
    try:
        s3_client.head_object(Bucket=BUCKET_NAME, Key=key)
    except ClientError:
        # Every client error is treated as "not present".
        return False
    else:
        return True

In [None]:
def process_company(ticker):
    """Download one company's 10-K / 10-Q filings and upload them to S3.

    Filings whose key already exists in the bucket are skipped. A failure on
    an individual filing (download or upload) does not abort the company;
    it is counted and reported in the returned summary instead of being
    silently discarded.

    Args:
        ticker: Ticker symbol used to look up the company and to build keys.

    Returns:
        str: A one-line, human-readable status for this ticker. The function
        never raises; any unexpected error is folded into the message.
    """
    try:
        limiter.wait_for_token()
        company = Company(ticker)

        # Get metadata for 10-K (Annual) and 10-Q (Quarterly)
        filings = company.get_filings(form=["10-K", "10-Q"])
        if not filings:
            return f"{ticker}: No filings found."

        upload_count = 0
        failed_count = 0
        first_error = None

        for filing in filings:
            date = filing.filing_date
            # "/" in a form name would otherwise add a path level to the key.
            form = filing.form.replace("/", "-")

            # S3 Key: raw_html/AAPL/AAPL_10-K_2023-01-01.html
            s3_key = f"{S3_FOLDER}{ticker}/{ticker}_{form}_{date}.html"

            # 1. Check S3 first (Save bandwidth)
            if check_file_exists_s3(s3_key):
                continue

            # 2. Download Content (Respect Rate Limit)
            limiter.wait_for_token()
            try:
                html_content = filing.html()
                if html_content:
                    # 3. Upload directly to S3
                    s3_client.put_object(
                        Bucket=BUCKET_NAME,
                        Key=s3_key,
                        Body=html_content.encode('utf-8'),
                        ContentType='text/html'
                    )
                    upload_count += 1
            except Exception as e:
                # Keep going with the remaining filings, but remember that
                # something went wrong so the summary does not look clean.
                failed_count += 1
                if first_error is None:
                    first_error = f"{type(e).__name__}: {str(e)[:50]}"

        summary = f"{ticker}: Uploaded {upload_count} new filings."
        if failed_count:
            summary += f" {failed_count} failed (first error: {first_error})"
        return summary

    except Exception as e:
        return f"{ticker}: Failed - {str(e)[:50]}"

In [None]:
# --- MAIN EXECUTION ---
# Orchestrates the whole job: make sure the bucket exists, load the ticker
# list, then fan the per-company work out across a thread pool.

# 1. Setup Storage
create_s3_bucket()

# 2. Get Targets
tickers = get_sp500_tickers()

if tickers:
    total = len(tickers)
    print(f"--- Processing {total} companies ---")
    print(f"--- Target: s3://{BUCKET_NAME}/{S3_FOLDER} ---")

    # 3. Run Parallel Jobs
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Map each submitted future back to the ticker it is processing.
        future_to_ticker = {}
        for t in tickers:
            future_to_ticker[executor.submit(process_company, t)] = t

        # Report each company as soon as its worker finishes, in completion order.
        for counter, future in enumerate(as_completed(future_to_ticker), start=1):
            result = future.result()
            print(f"[{counter}/{total}] {result}")

    print("--- JOB COMPLETE ---")

In [3]:
# Check how many files are in the bucket
# Sanity check after the upload job. Note that this counts every object in the
# bucket, not only those under S3_FOLDER.
# The resource is created without region_name (unlike s3_client above).
s3 = boto3.resource('s3')
bucket = s3.Bucket(BUCKET_NAME)
# Iterates over all objects and counts them one by one.
count = sum(1 for _ in bucket.objects.all())
print(f"Total files in S3: {count}")

Total files in S3: 38183
