In [4]:
import os
import requests
from sec_api import QueryApi, PdfGeneratorApi
from datetime import datetime
import time

In [5]:
# API Configuration
# Get an API key from https://sec-api.io/ and export it before starting the
# kernel (e.g. `export SEC_API_KEY=...`). The key is read from the environment
# so that no credential is stored in the notebook itself; when the variable is
# unset, the "YOUR_API_KEY_HERE" placeholder is used instead.
# NOTE(review): the key that used to be hard-coded on this line should be
# treated as leaked and rotated in the sec-api.io dashboard.
SEC_API_KEY = os.environ.get("SEC_API_KEY", "YOUR_API_KEY_HERE")


# Initialize the APIs (both clients share the same key)
query_api = QueryApi(api_key=SEC_API_KEY)
pdf_generator = PdfGeneratorApi(api_key=SEC_API_KEY)

In [6]:

# Ensure the output folder for the downloaded 10-K PDFs is present
raw_data_dir = "../data/raw/MSFT/10-K/PDFs"
if not os.path.isdir(raw_data_dir):
    # exist_ok keeps this safe if the folder appears between the check and the call
    os.makedirs(raw_data_dir, exist_ok=True)


In [7]:

# Company identifiers for Microsoft: exchange ticker and CIK (Central Index Key)
ticker, cik = "MSFT", "0000789019"


In [9]:

print(f"🔍 Searching for {ticker} 10-K filings from the last 2 years...")
print(f"📁 PDFs will be saved to: {os.path.abspath(raw_data_dir)}")

# Default to "no filings" so every path below leaves `filings` defined for the
# cells that follow.
filings = []

# Only call the API when a real key is configured. The check is against the
# documented placeholder rather than one specific key value, so the cell runs
# for any user-supplied key and is skipped only when none has been set.
if SEC_API_KEY and SEC_API_KEY != "YOUR_API_KEY_HERE":
    try:
        # Query for Microsoft 10-K filings, newest first.
        # NOTE(review): the date range is fixed (2022-01-01 to 2024-12-31), not
        # relative to today, and spans three calendar years even though the
        # banner above says "last 2 years" - confirm which one is intended.
        query = {
            "query": f"ticker:{ticker} AND formType:\"10-K\" AND filedAt:[2022-01-01 TO 2024-12-31]",
            "from": "0",
            "size": "10",
            "sort": [{"filedAt": {"order": "desc"}}]
        }

        print("🔍 Querying SEC database...")
        response = query_api.get_filings(query)

        filings = response["filings"]
        print(f"📋 Found {len(filings)} 10-K filings for {ticker}")

        if len(filings) == 0:
            print("❌ No filings found. Try adjusting the date range or check the ticker symbol.")
        else:
            print("\n📄 Available filings:")
            for i, filing in enumerate(filings):
                filed_date = filing["filedAt"][:10]  # Keep only the YYYY-MM-DD part
                accession_no = filing["accessionNo"]
                print(f"  {i+1}. Filed: {filed_date} | Accession: {accession_no}")

    except Exception as e:
        # Best-effort cell: report the failure and continue with an empty result
        # instead of aborting the notebook run.
        print(f"❌ Error querying filings: {e}")
        print("Make sure your API key is valid and you have internet connection.")
        filings = []
else:
    print("⏩ Skipping API calls - please configure your API key first")


🔍 Searching for MSFT 10-K filings from the last 2 years...
📁 PDFs will be saved to: /Users/smatcha/Documents/BigData/pdf-parser/data/raw/MSFT/10-K/PDFs
🔍 Querying SEC database...
📋 Found 3 10-K filings for MSFT

📄 Available filings:
  1. Filed: 2024-07-30 | Accession: 0000950170-24-087843
  2. Filed: 2023-07-27 | Accession: 0000950170-23-035122
  3. Filed: 2022-07-28 | Accession: 0001564590-22-026876


In [10]:
# Generate PDFs from the found filings using sec-api PdfGeneratorApi
#
# Inputs read from earlier cells: SEC_API_KEY, pdf_generator, raw_data_dir,
# ticker and (optionally) filings. Each filing is rendered to a PDF named
# <ticker>_<formType>_<YYYYMMDD>_<accession>.pdf inside raw_data_dir; files
# that already exist are not downloaded again.

# `filings` is created by the query cell; look it up defensively so running
# this cell first prints guidance instead of raising NameError.
available_filings = globals().get("filings") or []

# A key counts as configured when it is non-empty and not the placeholder.
api_key_configured = bool(SEC_API_KEY) and SEC_API_KEY != "YOUR_API_KEY_HERE"

if api_key_configured and available_filings:
    print("🔄 Converting SEC filings to PDF format...")
    print(f"📂 Saving PDFs to: {raw_data_dir}")

    successfully_downloaded = 0

    for i, filing in enumerate(available_filings):
        try:
            # Extract filing information
            filed_date = filing["filedAt"][:10].replace("-", "")  # Format: YYYYMMDD
            accession_no = filing["accessionNo"].replace("-", "")
            form_type = filing["formType"]

            # Create filename
            pdf_filename = f"{ticker}_{form_type}_{filed_date}_{accession_no}.pdf"
            pdf_path = os.path.join(raw_data_dir, pdf_filename)

            # Skip if file already exists (counted as a success)
            if os.path.exists(pdf_path):
                print(f"  ⏩ Skipping {pdf_filename} (already exists)")
                successfully_downloaded += 1
                continue

            print(f"  🔄 Processing filing {i+1}/{len(available_filings)}: {filed_date}")

            # Get the filing URL
            filing_url = filing["linkToFilingDetails"]

            # Generate PDF using sec-api
            print(f"    📄 Generating PDF from: {filing_url}")

            # Add a small delay to respect rate limits
            if i > 0:
                time.sleep(1)  # 1 second delay between requests

            pdf_content = pdf_generator.get_pdf(filing_url)

            # Write to a temporary sibling file and rename it into place, so an
            # interrupted write never leaves a truncated PDF that the
            # "already exists" check above would later accept as complete.
            tmp_path = pdf_path + ".part"
            try:
                with open(tmp_path, 'wb') as pdf_file:
                    pdf_file.write(pdf_content)
                os.replace(tmp_path, pdf_path)
            finally:
                # Only present if the write or rename failed part-way.
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)

            # Check file size
            file_size = os.path.getsize(pdf_path) / (1024 * 1024)  # Size in MB

            if file_size > 0.1:  # At least 100KB
                print(f"    ✅ Successfully saved: {pdf_filename} ({file_size:.1f} MB)")
            else:
                print(f"    ⚠️  Warning: Small file size for {pdf_filename} ({file_size:.1f} MB)")
            # A small file is only warned about; it still counts as downloaded.
            successfully_downloaded += 1

        except Exception as e:
            # Best-effort loop: report this filing's failure and move on.
            print(f"    ❌ Error processing filing {i+1}: {e}")
            continue

    print(f"\n📊 Download Summary:")
    print(f"  Total filings found: {len(available_filings)}")
    print(f"  PDFs successfully downloaded: {successfully_downloaded}")
    print(f"  Success rate: {(successfully_downloaded/len(available_filings)*100):.1f}%")

    # List final PDF files
    if os.path.exists(raw_data_dir):
        pdf_files = [f for f in os.listdir(raw_data_dir) if f.lower().endswith('.pdf')]

        if pdf_files:
            print(f"\n📋 PDF files ready for parsing in {raw_data_dir}:")
            total_size = 0
            for pdf_file in sorted(pdf_files):
                file_path = os.path.join(raw_data_dir, pdf_file)
                file_size = os.path.getsize(file_path) / (1024 * 1024)
                total_size += file_size
                print(f"  📄 {pdf_file} ({file_size:.1f} MB)")

            print(f"\n💾 Total size: {total_size:.1f} MB")
            print(f"🎯 Ready for PDF parsing with your PDF parser!")
        else:
            print(f"\n❌ No PDF files were created. Check the error messages above.")

elif not api_key_configured:
    print("⚠️  Please configure your SEC API key first before generating PDFs")
    print("📋 Steps:")
    print("  1. Sign up at https://sec-api.io/")
    print("  2. Get your API key from the dashboard")
    print("  3. Replace 'YOUR_API_KEY_HERE' in the first cell")
    print("  4. Re-run both cells")

else:
    # Key is configured but the query cell produced nothing to convert.
    print("❌ No filings available for PDF generation")
    print("Please run the first cell successfully to fetch filings first")


🔄 Converting SEC filings to PDF format...
📂 Saving PDFs to: ../data/raw/MSFT/10-K/PDFs
  🔄 Processing filing 1/3: 20240730
    📄 Generating PDF from: https://www.sec.gov/Archives/edgar/data/789019/000095017024087843/msft-20240630.htm
    ✅ Successfully saved: MSFT_10-K_20240730_000095017024087843.pdf (12.2 MB)
  🔄 Processing filing 2/3: 20230727
    📄 Generating PDF from: https://www.sec.gov/Archives/edgar/data/789019/000095017023035122/msft-20230630.htm
    ✅ Successfully saved: MSFT_10-K_20230727_000095017023035122.pdf (11.9 MB)
  🔄 Processing filing 3/3: 20220728
    📄 Generating PDF from: https://www.sec.gov/Archives/edgar/data/789019/000156459022026876/msft-10k_20220630.htm
    ✅ Successfully saved: MSFT_10-K_20220728_000156459022026876.pdf (11.7 MB)

📊 Download Summary:
  Total filings found: 3
  PDFs successfully downloaded: 3
  Success rate: 100.0%

📋 PDF files ready for parsing in ../data/raw/MSFT/10-K/PDFs:
  📄 MSFT_10-K_20220728_000156459022026876.pdf (11.7 MB)
  📄 MSFT_10-K