In [4]:
import requests
import pandas as pd
import sqlite3
import numpy as np
from io import StringIO, BytesIO
import glob
import os
import boto3
import json

#### Instructions:
1. Provide your own AWS credentials (never commit real keys to the notebook).
2. Change the subset to your own subset.
3. Run the code.

#### Creating Company Data DF

In [5]:
# Identify this client to the SEC through the User-Agent header.
headers = {'User-Agent': "felix.agosto@bts.tech"}

# Download the SEC's ticker/CIK listing; the timeout keeps a stalled connection from hanging the cell.
companyTickers = requests.get("https://www.sec.gov/files/company_tickers.json", headers = headers, timeout = 30)
# Fail here on a rejected or failed request instead of with a confusing JSON decode error below.
companyTickers.raise_for_status()
companyData = pd.DataFrame.from_dict(companyTickers.json(), orient = 'index')
# CIKs come back with varying digit counts, but the API needs a 10-digit CIK, so left-pad with zeros.
companyData['cik_str'] = companyData['cik_str'].astype(str).str.zfill(10)

#### Creating temporary tickers

#### Function:
- Make sure your AWS credentials are supplied before calling this function.

In [53]:
def processDF(tickers, companyData, s3_bucket,
              user_agent="felix.agosto@bts.tech", timeout=30):
    """Fetch EDGAR company facts for each ticker and upload the JSON to S3.

    For every ticker, the CIK is looked up in ``companyData``, the
    ``companyfacts`` JSON is requested from ``data.sec.gov`` and the result is
    uploaded to ``s3://<s3_bucket>/edgar_data/<ticker>.json``.

    AWS credentials are NOT passed in code. boto3 resolves them through its
    default credential chain (for example the ``AWS_ACCESS_KEY_ID``,
    ``AWS_SECRET_ACCESS_KEY`` and ``AWS_SESSION_TOKEN`` environment variables,
    or the shared credentials file), so no secret ever needs to be typed into
    the notebook.

    Parameters
    ----------
    tickers : iterable of str
        Ticker symbols to ingest.
    companyData : pandas.DataFrame
        Must provide the columns ``'ticker'`` and ``'cik_str'``.
    s3_bucket : str
        Name of the destination S3 bucket.
    user_agent : str, optional
        Value sent in the ``User-Agent`` header of every SEC request.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests``; prevents a single stalled
        connection from blocking the whole run.

    Returns
    -------
    None
        Progress and per-ticker failures are reported with ``print``.

    Notes
    -----
    Unknown tickers and HTTP/request failures are reported and skipped.
    Any other exception (for example one raised by the S3 upload) is not
    caught here and propagates to the caller.
    """
    # No keys in source: boto3 picks credentials up from its default chain.
    s3 = boto3.client("s3")

    # A single session reuses the HTTPS connection across all tickers.
    with requests.Session() as session:
        session.headers.update({'User-Agent': user_agent})

        for ticker in tickers:
            try:
                # Retrieve the CIK value corresponding to the ticker
                try:
                    cik_val = str(companyData.loc[companyData['ticker'] == ticker, 'cik_str'].values[0])
                except IndexError:
                    # No row matched this ticker
                    print(f"{ticker} not found in Company Data")
                    continue

                # Fetch financial data for the company from EDGAR
                print(f"Successful CIK extraction for {ticker}")
                response = session.get(
                    f'https://data.sec.gov/api/xbrl/companyfacts/CIK{cik_val}.json',
                    timeout=timeout,
                )

                # Check if the request was successful
                response.raise_for_status()

                # Parse and re-serialise so only valid JSON is ever uploaded
                json_bytes = json.dumps(response.json()).encode("utf-8")
                print("Successfully converted to json bytes")

                # Object key inside the bucket
                s3_file_path = f"edgar_data/{ticker}.json"

                # Upload to S3 straight from an in-memory buffer
                s3.upload_fileobj(BytesIO(json_bytes), s3_bucket, s3_file_path)
                print(f"Data for {ticker} uploaded to S3: {s3_file_path}")

            except KeyError:
                print(f"{ticker} not found in Company Data")
            except requests.HTTPError as e:
                print(f"HTTP error for {ticker}: {e}")
            except requests.RequestException as e:
                # Also covers timeouts and connection errors
                print(f"Request exception for {ticker}: {e}")

In [None]:
# Split the company table into four consecutive row ranges, one per team member.
# The first three hold 2572 rows each; the last takes whatever remains.
sub_Jose, sub_Karen, sub_Felix, sub_Arturo = (
    companyData.iloc[start:stop]
    for start, stop in ((None, 2572), (2572, 5144), (5144, 7716), (7716, None))
)

In [54]:
# Swap in your own subset here, e.g. YOUR_SUBSET['ticker'].tolist()
tickers = sub_Jose['ticker'].tolist()

# Start the EDGAR -> S3 ingestion for the selected tickers
processDF(tickers, companyData, "bucket-name")

Successful CIK extraction for MSFT
Successfully converted to json bytes
Data for MSFT uploaded to S3: edgar_data/MSFT.json
Successful CIK extraction for AAPL
Successfully converted to json bytes
Data for AAPL uploaded to S3: edgar_data/AAPL.json
Successful CIK extraction for NVDA
Successfully converted to json bytes
Data for NVDA uploaded to S3: edgar_data/NVDA.json
