#### Instructions:
1. Make sure the `.env` file in your current directory contains up-to-date AWS credentials
2. Change the email in `user_agent` to your own
3. Change `sub_section` to your own subset
4. Run the code

In [1]:
import requests
import pandas as pd
import sqlite3
import numpy as np
from io import StringIO, BytesIO
import glob
import os
import boto3
import json

import os
from dotenv import load_dotenv

from datetime import date

#### change your email

In [2]:
# Contact e-mail used as the User-Agent header for the company-ticker request to sec.gov.
user_agent = "jose.trindade@bts.tech" # Change your email

### Loading the AWS credentials from environment variables

In [3]:
# ISO-formatted run date (YYYY-MM-DD); used to build the dated S3 key prefix.
today_date = str(date.today())

# Load the variables defined in the .env file into the process environment.
load_dotenv()

# Read the AWS credentials from the environment (each one is None when unset).
aws_access_key_id = os.getenv("aws_access_key_id")
aws_secret_access_key = os.getenv("aws_secret_access_key")
aws_session_token = os.getenv("aws_session_token")
s3_bucket = "financy"

# Surface a missing or outdated .env now instead of at the first S3 upload.
# Only the variable names are reported, never their values.
_missing_credentials = [
    name
    for name, value in (
        ("aws_access_key_id", aws_access_key_id),
        ("aws_secret_access_key", aws_secret_access_key),
    )
    if not value
]
if _missing_credentials:
    print(
        "WARNING: missing environment variable(s): "
        + ", ".join(_missing_credentials)
        + ". Check the .env file in the current directory."
    )

#### Getting all the tickers available in the EDGAR Database

In [4]:
# Identify ourselves to sec.gov through the User-Agent header.
headers = {'User-Agent': user_agent}

companyTickers = requests.get(
    "https://www.sec.gov/files/company_tickers.json",
    headers=headers,
    timeout=30,  # do not let a stalled connection hang the notebook forever
)
# Stop here with a clear HTTP error (e.g. a rejected or throttled request)
# rather than a confusing JSON decoding error on the next line.
companyTickers.raise_for_status()

companyData = pd.DataFrame.from_dict(companyTickers.json(), orient='index')
# CIKs differ in number of digits, but the API needs a 10-digit CIK,
# so left-pad them with zeros.
companyData['cik_str'] = companyData['cik_str'].astype(str).str.zfill(10)

#### Dividing the workload among the group
Each member will run the code on their own subset

In [5]:
# Split the company list into four consecutive row ranges, one per team member.
# The first three ranges hold 2572 rows each; the last one takes the remainder.
sub_Jose, sub_Karen, sub_Felix, sub_Arturo = (
    companyData.iloc[start:stop]
    for start, stop in ((0, 2572), (2572, 5144), (5144, 7716), (7716, None))
)

#### From EDGAR Database to private s3_bucket
Function to pull company data from the API and upload it directly to s3

In [6]:
def processDF(tickers, companyData, s3_bucket, request_user_agent=None, timeout=30):
    """Download EDGAR company facts for each ticker and upload them to S3.

    For every ticker the CIK is looked up in ``companyData``, the company
    facts JSON is fetched from the SEC XBRL API, re-serialised as UTF-8 JSON
    and uploaded to ``egdar_data/raw/<today_date>/<ticker>.json`` in
    ``s3_bucket``. Progress is printed as the loop runs.

    Relies on the notebook-level variables ``aws_access_key_id``,
    ``aws_secret_access_key``, ``aws_session_token``, ``today_date`` and,
    when ``request_user_agent`` is not given, ``user_agent``.

    Parameters
    ----------
    tickers : iterable of str
        Ticker symbols to ingest.
    companyData : pandas.DataFrame
        Table with ``ticker`` and ``cik_str`` columns; ``cik_str`` is inserted
        verbatim into the request URL, so it is expected to be zero-padded
        to 10 digits already.
    s3_bucket : str
        Name of the destination bucket.
    request_user_agent : str, optional
        Value of the ``User-Agent`` header sent to the SEC. Defaults to the
        notebook-level ``user_agent``.
    timeout : float, optional
        Timeout in seconds passed to ``requests.get``.

    Returns
    -------
    None

    Raises
    ------
    Exception
        Only ``KeyError`` and ``requests.RequestException`` are handled per
        ticker; anything else (for instance an S3 upload failure) propagates
        and stops the run.
    """
    s3 = boto3.client("s3",
                      aws_access_key_id=aws_access_key_id,
                      aws_secret_access_key=aws_secret_access_key,
                      aws_session_token=aws_session_token)

    # Use the notebook's configured contact e-mail instead of a hard-coded one.
    agent = user_agent if request_user_agent is None else request_user_agent
    headers = {'User-Agent': agent}

    counter_total_tries = 0
    counter_total_uploaded = 0

    for ticker in tickers:
        try:
            # Retrieve the CIK value corresponding to the ticker (first match wins).
            try:
                cik_val = str(companyData.loc[companyData['ticker'] == ticker, 'cik_str'].values[0])
            except IndexError:
                print(f"{ticker} not found in Company Data")
                continue

            # Fetch financial data for the company from EDGAR.
            print(f"Successful CIK extraction for {ticker}")
            response = requests.get(
                f'https://data.sec.gov/api/xbrl/companyfacts/CIK{cik_val}.json',
                headers=headers,
                timeout=timeout,
            )

            # Turn HTTP error statuses into requests.HTTPError.
            response.raise_for_status()

            # Parse and re-serialise so only valid JSON is stored.
            companyFacts = response.json()
            json_bytes = json.dumps(companyFacts).encode("utf-8")
            print("Successfully converted to json bytes")

            # In-memory file object, positioned at the start, for the upload.
            json_buffer = BytesIO(json_bytes)

            # Defining the S3 object key.
            # NOTE(review): "egdar_data" looks like a typo for "edgar_data"; kept
            # as-is because existing objects/consumers may rely on this prefix.
            s3_file_path = f"egdar_data/raw/{today_date}/{ticker}.json"

            # Upload to S3 bucket.
            s3.upload_fileobj(json_buffer, s3_bucket, s3_file_path)
            print(f"Data for {ticker} uploaded to S3: {s3_file_path}")
            counter_total_uploaded += 1
            print("total companies uploaded:", counter_total_uploaded)

        except KeyError:
            print(f"{ticker} not found in Company Data")
        except requests.HTTPError as e:
            print(f"HTTP error for {ticker}: {e}")
        except requests.RequestException as e:
            print(f"Request exception for {ticker}: {e}")
        finally:
            # Count every ticker, including those skipped via ``continue`` above.
            counter_total_tries += 1
            print("total tries:", counter_total_tries)

    print("total companies uploaded:", counter_total_uploaded)

#### change your sub_section of data!
sub_section options:
sub_Jose;
sub_Karen;
sub_Felix;
sub_Arturo

In [7]:
# Subset of companyData whose tickers this run will ingest; set it to the slice assigned to you.
sub_section = sub_Jose # HERE!

#### Run the code

In [None]:
# Ticker symbols of the selected subset, as a plain Python list.
tickers = sub_section['ticker'].tolist()

# Fetch each company's facts from EDGAR and upload them to the S3 bucket.
processDF(tickers, companyData, s3_bucket)

Successful CIK extraction for MSFT
Successfully converted to json bytes
Data for MSFT uploaded to S3: egdar_data/raw/2024-05-31/MSFT.json
Successful CIK extraction for AAPL
Successfully converted to json bytes
Data for AAPL uploaded to S3: egdar_data/raw/2024-05-31/AAPL.json
Successful CIK extraction for NVDA
Successfully converted to json bytes
Data for NVDA uploaded to S3: egdar_data/raw/2024-05-31/NVDA.json
Successful CIK extraction for GOOGL
Successfully converted to json bytes
Data for GOOGL uploaded to S3: egdar_data/raw/2024-05-31/GOOGL.json
Successful CIK extraction for AMZN
Successfully converted to json bytes
Data for AMZN uploaded to S3: egdar_data/raw/2024-05-31/AMZN.json
Successful CIK extraction for META
Successfully converted to json bytes
Data for META uploaded to S3: egdar_data/raw/2024-05-31/META.json
Successful CIK extraction for BRK-B
Successfully converted to json bytes
Data for BRK-B uploaded to S3: egdar_data/raw/2024-05-31/BRK-B.json
Successful CIK extraction fo