In [2]:
import requests_random_user_agent
import requests
import pandas as pd 
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urlunparse
import re
import csv
import os

In [3]:
class EDGARDataFetcher:

    def __init__(self, cik_file_path, filing_type):
        
        """
        Initialize the EDGARDataFetcher class.

        Args:
            cik_file_path (str): Path to the file containing CIK numbers and company names.
            filing_type (str): The type of filing to fetch (e.g., '10-k').
        """
        self.cik_file_path = cik_file_path
        self.filing_type = filing_type
        self.cik_data = {} # Dictionary to store CIK numbers and company names.
        self.EDGAR_search_results = {} # Dictionary to store search results.
        self.filing_data = {} # Dictionary to store filing data.
        self.base_url = r"https://www.sec.gov"

    def read_cik_data(self):

        """
        Read CIK data from the input file and populate the cik_data dictionary.
        """

        with open(self.cik_file_path, 'r') as file:
            lines = file.readlines()
            
        for line in lines:
            line = line.strip()
            parts = line.split()

            if len(parts) == 2:
                word = parts[0]
                cik = parts[1]
                self.cik_data[word] = cik

    def fetch_filings(self):
        
        """
        Fetch company filings from the SEC's EDGAR database.
        """
        endpoint = r"https://www.sec.gov/cgi-bin/browse-edgar"
        
        for company, cik in self.cik_data.items():
            param_dict = {'action':'getcompany',
                          'CIK': cik,
                          'type': self.filing_type,
                          'owner':'exclude',
                          'output':'',
                          'count':'100'}

            response = requests.get(url=endpoint, params=param_dict)
            if response.status_code == 200:
                self.EDGAR_search_results[company] = response
            else:
                print('Request Failed')

    def get_filing_data(self):

        """
        Extract filing data from the search results and populate the filing_data dictionary.
        """
        
        for company, url in self.EDGAR_search_results.items():
            soup = BeautifulSoup(url.content, 'html.parser')
            doc_table = soup.find_all('table', class_='tableFile2')
            master_list = []
            
            for row in doc_table[0].find_all('tr'):
                cols = row.find_all('td')
                
                if len(cols) != 0:
                    filing_type = cols[0].text.strip()
                    filing_date = cols[3].text.strip()
                    filing_numb = cols[4].text.strip()
                    filing_doc_href = cols[1].find('a', {'href':True, 'id':'documentsbutton'})
                    filing_int_href = cols[1].find('a', {'href':True, 'id':'interactiveDataBtn'})
                    filing_num_href = cols[4].find('a')

                    if filing_doc_href != None:
                        filing_doc_link = self.base_url + filing_doc_href['href'] 
                    else:
                        filing_doc_link = 'no link'

                    if filing_int_href != None:
                        filing_int_link = self.base_url + filing_int_href['href'] 
                    else:
                        filing_int_link = 'no link'

                    if filing_num_href != None:
                        filing_num_link = self.base_url + filing_num_href['href'] 
                    else:
                        filing_num_link = 'no link'

                    file_dict = {}
                    file_dict['file_type'] = filing_type
                    file_dict['file_number'] = filing_numb
                    file_dict['file_date'] = filing_date
                    file_dict['links'] = {}
                    file_dict['links']['documents'] = filing_doc_link

                    document_response = requests.get(filing_doc_link)
                    document_soup = BeautifulSoup(document_response.content, 'html.parser')
                    document_links = document_soup.find_all('a', {'href': lambda href: href.endswith('.txt')})
                    document = [document['href'] for document in document_links]
                    full_txt_url = "https://www.sec.gov" + document[0]
                    pattern = r"/\d{12,20}/"
                    modified_url = re.sub(pattern, "/",full_txt_url)
                    document_url = modified_url.replace('-','').replace('.txt', '/index.json')

                    try:
                        content = requests.get(document_url).json()
                        for file in content['directory']['item']:
                            if file['name'] == 'FilingSummary.xml':
                                xml_summary = self.base_url + content['directory']['name']  + "/" + file['name']
                                file_dict['links']['documents_xml'] = xml_summary
                    except requests.exceptions.RequestException as e:
                        print("Error occurred for:", document_url)
                        continue

                    file_dict['links']['interactive_data'] = filing_int_link
                    file_dict['links']['filing_number'] = filing_num_link
                    master_list.append(file_dict)

                self.filing_data[f"filing_{company}"] = master_list

Error occurred for: https://www.sec.gov/Archives/edgar/data/320193/a2038036z10q/index.json
Error occurred for: https://www.sec.gov/Archives/edgar/data/320193/a10q/index.json
Error occurred for: https://www.sec.gov/Archives/edgar/data/789019/d10qa/index.json
Error occurred for: https://www.sec.gov/Archives/edgar/data/789019/d10qa/index.json
Error occurred for: https://www.sec.gov/Archives/edgar/data/789019/d10q/index.json
Error occurred for: https://www.sec.gov/Archives/edgar/data/789019/d10q/index.json
Error occurred for: https://www.sec.gov/Archives/edgar/data/789019/d10q/index.json
Error occurred for: https://www.sec.gov/Archives/edgar/data/789019/0001032210010002730001/index.json
Error occurred for: https://www.sec.gov/Archives/edgar/data/789019/0001032210005000380001/index.json


In [7]:
if __name__ == "__main__":
    cik_file_path = "cik/liste_cik.txt"
    filing_type = "10-q"
    
    data_fetcher = EDGARDataFetcher(cik_file_path, filing_type)
    data_fetcher.read_cik_data()
    data_fetcher.fetch_filings()
    data_fetcher.get_filing_data()
    
    filing_data = data_fetcher.filing_data
    filing_data

Error occurred for: https://www.sec.gov/Archives/edgar/data/320193/a2038036z10q/index.json
Error occurred for: https://www.sec.gov/Archives/edgar/data/320193/a10q/index.json
Error occurred for: https://www.sec.gov/Archives/edgar/data/789019/d10qa/index.json
Error occurred for: https://www.sec.gov/Archives/edgar/data/789019/d10qa/index.json
Error occurred for: https://www.sec.gov/Archives/edgar/data/789019/d10q/index.json
Error occurred for: https://www.sec.gov/Archives/edgar/data/789019/d10q/index.json
Error occurred for: https://www.sec.gov/Archives/edgar/data/789019/d10q/index.json
Error occurred for: https://www.sec.gov/Archives/edgar/data/789019/0001032210010002730001/index.json
Error occurred for: https://www.sec.gov/Archives/edgar/data/789019/0001032210005000380001/index.json


In [11]:
filing_data.keys()

dict_keys(['filing_aapl', 'filing_msft'])

In [13]:
# Create the "filing_type" directory if it doesn't exist
folder_name = "filing_documents"
if not os.path.exists(folder_name):
    os.makedirs(folder_name)

# Iterate through the filing_data dictionary
for key, data_list in filing_data.items():
    # Specify the name of the CSV file
    csv_file = os.path.join(folder_name, f'{key}.csv')

    # Create or open the CSV file in write mode
    with open(csv_file, mode='w', newline='') as file:
        # Define field names for the CSV file
        fieldnames = ['file_type', 'file_number', 'file_date', 'documents', 'documents_xml', 'interactive_data', 'filing_number']

        # Create a CSV writer and write the header row
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()

        # Iterate through each row in the data_list
        for row in data_list:
            # Extract the links dictionary from the row
            links = row.pop('links')

            # Update the row with link data
            row.update(links)

            # Write the row to the CSV file
            writer.writerow(row)

    # Print a message indicating where the data has been saved
    print(f'Data has been saved to {csv_file}')


Data has been saved to filing_documents/filing_aapl.csv
Data has been saved to filing_documents/filing_msft.csv
