<a href="https://colab.research.google.com/github/DivyakshiTare/Data-Scraper/blob/main/Data_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
import pandas as pd
import time
import logging
from typing import List, Dict, Optional
from dataclasses import dataclass
import json
from pathlib import Path
from datetime import datetime
import urllib.parse


@dataclass
class CompanyFiling:
    """A single filing record for one company."""

    # Identifier of the company the filing belongs to.
    cik: str
    # Company name as reported alongside the filing.
    company_name: str
    # Form type, e.g. '10-K' or '10-Q'.
    filing_type: str
    # Filing date, kept as the string it was received in.
    filing_date: str
    # URL of the filing document; may be absent.
    filing_link: Optional[str] = None

    def to_dict(self):
        """Return the instance attributes as a dict, leaving out any that are None."""
        populated = {}
        for attr_name, attr_value in vars(self).items():
            if attr_value is None:
                continue
            populated[attr_name] = attr_value
        return populated

class EDGARCollector:
    """Collect company metadata and recent filings from SEC EDGAR.

    Company data comes from the ``data.sec.gov/submissions`` JSON endpoint and
    company lookup by name from the ``browse-edgar`` atom feed. All requests
    share one ``User-Agent`` header and a simple client-side rate limiter.
    """

    def __init__(self, email: str, rate_limit: float = 0.1, timeout: float = 30.0):
        """
        Initialize the SEC EDGAR collector

        Args:
            email: Contact email address sent in the SEC request header
            rate_limit: Minimum time between requests in seconds (SEC's
                fair-access policy caps clients at 10 requests per second,
                i.e. 0.1s between requests)
            timeout: Seconds to wait for each HTTP response before giving up
        """
        self.base_url = "https://data.sec.gov/submissions"
        self.headers = {
            # SEC asks for "<name> <contact email>"; the address is used as
            # given (prefixing it with "research@" produced an invalid
            # contact such as "research@user@example.com").
            'User-Agent': f'CompanyResearch {email}',
        }
        self.rate_limit = rate_limit
        self.timeout = timeout
        self.last_request = 0.0

        # No-op when the root logger has already been configured.
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

    def _respect_rate_limit(self):
        """Sleep as needed so consecutive requests are at least ``rate_limit`` seconds apart."""
        # Monotonic clock: immune to wall-clock adjustments, which could
        # otherwise produce a negative elapsed time and an over-long sleep.
        time_passed = time.monotonic() - self.last_request
        if time_passed < self.rate_limit:
            time.sleep(self.rate_limit - time_passed)
        self.last_request = time.monotonic()

    def get_company_info(self, cik: str) -> Optional[Dict]:
        """
        Get company information from SEC EDGAR

        Args:
            cik: Company's CIK number (SEC identifier)

        Returns:
            The decoded submissions JSON, or None when the request fails or
            the response body is not valid JSON (the error is logged).
        """
        self._respect_rate_limit()

        # Pad CIK to 10 digits
        cik_padded = str(cik).zfill(10)

        try:
            response = requests.get(
                f"{self.base_url}/CIK{cik_padded}.json",
                headers=self.headers,
                timeout=self.timeout,
            )
            response.raise_for_status()
            return response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers JSON decoding failures on older versions of
            # requests, where they are not RequestException subclasses.
            self.logger.error(f"Error fetching company info: {str(e)}")
            return None

    @staticmethod
    def _extract_ciks(text: str) -> List[str]:
        """Return the distinct CIKs that follow ``CIK=`` markers in *text*, in order of appearance."""
        ciks = []
        seen = set()
        for fragment in text.split('CIK=')[1:]:
            # Keep only the leading run of digits so trailing markup
            # ('&amp;', quotes, tags) never leaks into the identifier.
            digits = []
            for ch in fragment:
                if ch not in '0123456789':
                    break
                digits.append(ch)
            cik = ''.join(digits)
            if not cik:
                continue
            # Compare zero-padded so '789019' and '0000789019' count once.
            key = cik.zfill(10)
            if key not in seen:
                seen.add(key)
                ciks.append(cik)
        return ciks

    def search_companies(self, company_name: str) -> List[Dict]:
        """
        Search for companies using the SEC company search
        This is a simplified implementation that gets the first page of results

        Args:
            company_name: Company name or keyword to search for

        Returns:
            The submissions JSON of every matching company that could be
            fetched; an empty list when the search request itself fails.
        """
        self._respect_rate_limit()

        search_url = "https://www.sec.gov/cgi-bin/browse-edgar"
        params = {
            'company': company_name,
            'owner': 'exclude',
            'action': 'getcompany',
            'output': 'atom'
        }

        try:
            response = requests.get(
                search_url,
                params=params,
                headers=self.headers,
                timeout=self.timeout,
            )
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            self.logger.error(f"Error searching companies: {str(e)}")
            return []

        # The feed is scanned as plain text for CIK= query parameters.
        # Note: In a production environment, use proper XML parsing
        companies = []
        for cik in self._extract_ciks(response.text):
            company_info = self.get_company_info(cik)
            if company_info:
                companies.append(company_info)
        return companies

    def get_recent_filings(self, cik: str, filing_types: Optional[List[str]] = None) -> List["CompanyFiling"]:
        """
        Get recent filings for a company

        Args:
            cik: Company's CIK number
            filing_types: List of filing types to include (e.g., ['10-K', '10-Q']);
                None or empty means all types

        Returns:
            One CompanyFiling per matching recent filing; an empty list when
            the company data cannot be fetched or lists no recent filings.
        """
        company_info = self.get_company_info(cik)
        if not company_info:
            return []
        return self._build_filings(cik, company_info, filing_types)

    def _build_filings(self, cik: str, company_info: Dict,
                       filing_types: Optional[List[str]] = None) -> List["CompanyFiling"]:
        """Turn the 'filings.recent' columns of a submissions JSON into CompanyFiling objects."""
        recent = (company_info.get('filings') or {}).get('recent') or {}
        accession_numbers = recent.get('accessionNumber') or []
        forms = recent.get('form') or []
        filing_dates = recent.get('filingDate') or []
        primary_docs = recent.get('primaryDocument') or []
        company_name = company_info.get('name', '')

        filings = []
        # The columns are parallel lists; zip stops at the shortest so a
        # ragged payload cannot raise IndexError.
        for i, (accession, filing_type, filing_date) in enumerate(
                zip(accession_numbers, forms, filing_dates)):
            # Skip if not in requested filing types
            if filing_types and filing_type not in filing_types:
                continue

            # primaryDocument may be missing or shorter than the other columns.
            primary_doc = primary_docs[i] if i < len(primary_docs) else ''
            filings.append(CompanyFiling(
                cik=cik,
                company_name=company_name,
                filing_type=filing_type,
                filing_date=filing_date,
                filing_link=self._construct_filing_link(accession, primary_doc, cik=cik),
            ))

        return filings

    def _construct_filing_link(self, accession_number: str, primary_doc: str,
                               cik: Optional[str] = None) -> str:
        """
        Construct the SEC archive URL for a filing

        Args:
            accession_number: Accession number, with or without dashes
            primary_doc: File name of the filing's primary document
            cik: Company's CIK. The archive layout is
                ``/Archives/edgar/data/<cik>/<accession>/<document>``, so a
                link built without it will not resolve; the CIK-less form is
                kept only for callers using the old two-argument signature.
        """
        acc_no = accession_number.replace('-', '')
        if cik is None or str(cik) == '':
            return f"https://www.sec.gov/Archives/edgar/data/{acc_no}/{primary_doc}"
        # Archive paths use the CIK without zero padding.
        cik_path = str(cik).lstrip('0') or '0'
        return f"https://www.sec.gov/Archives/edgar/data/{cik_path}/{acc_no}/{primary_doc}"

    def collect_company_data(self, search_terms: List[str], filing_types: Optional[List[str]] = None) -> pd.DataFrame:
        """
        Collect company data and recent filings based on search terms

        Args:
            search_terms: List of company names or keywords to search
            filing_types: List of filing types to include

        Returns:
            DataFrame with one row per distinct filing, newest first; an
            empty DataFrame when nothing was found.
        """
        all_filings = []
        seen_ciks = set()

        for term in search_terms:
            self.logger.info(f"Searching for: {term}")

            for company in self.search_companies(term):
                cik = company.get('cik', '')
                if not cik:
                    continue
                # A company matched by several terms is processed once.
                key = str(cik).zfill(10)
                if key in seen_ciks:
                    continue
                seen_ciks.add(key)
                # search_companies already returned the full submissions
                # JSON, so build filings from it instead of re-downloading.
                all_filings.extend(self._build_filings(cik, company, filing_types))

        # Convert to DataFrame
        df = pd.DataFrame([filing.to_dict() for filing in all_filings])

        if not df.empty:
            # Convert dates to datetime
            df['filing_date'] = pd.to_datetime(df['filing_date'])

            # Remove duplicates
            df = df.drop_duplicates()

            # Sort by filing date
            df = df.sort_values('filing_date', ascending=False)

        return df

def main():
    """Example run: collect 10-K/10-Q filings for two search terms and write them to a dated CSV."""
    edgar = EDGARCollector(
        email="divyakshitare09@gmail.com",  # Replace with your email
        rate_limit=0.1  # 100ms between requests
    )

    # Gather recent annual and quarterly reports for technology-related companies
    wanted_forms = ['10-K', '10-Q']
    keywords = ["software", "technology"]
    filings_df = edgar.collect_company_data(
        search_terms=keywords,
        filing_types=wanted_forms
    )

    # Make sure the destination folder exists before writing
    target_dir = Path("output")
    target_dir.mkdir(exist_ok=True)

    # File name carries today's date, e.g. sec_filings_20241220.csv
    date_stamp = datetime.now().strftime('%Y%m%d')
    csv_path = target_dir / f"sec_filings_{date_stamp}.csv"
    filings_df.to_csv(csv_path, index=False)

    print(f"Collected {len(filings_df)} filings")
    print(f"Data saved to {csv_path}")

# Run the example collection only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Collected 284 filings
Data saved to output/sec_filings_20241220.csv
