#### Instructions:
1. Make sure an up-to-date `.env` file with your AWS credentials is in your current directory
2. Change the email in `user_agent` to your own
3. Change the subset to your own subset
4. Run the code

In [1]:
import requests
import pandas as pd
import sqlite3
import numpy as np
from io import StringIO, BytesIO
import glob
import os
import boto3
import json

import os
from dotenv import load_dotenv

from datetime import date

#### Change your email

In [2]:
# Contact e-mail used to build the User-Agent request header (see `headers` below).
user_agent = "felix.agosto@bts.tech" # Change your email

### Loading the AWS credentials from environment variables

In [4]:
# Make the key/value pairs of the local .env file available through os.environ.
load_dotenv()

# Run date in ISO format (YYYY-MM-DD); it becomes part of the S3 object key.
today_date = date.today().isoformat()

# Look the AWS credentials up by name (values are read, never printed).
# A name that is absent from the environment yields None.
aws_access_key_id = os.environ.get("aws_access_key_id")
aws_secret_access_key = os.environ.get("aws_secret_access_key")
aws_session_token = os.environ.get("aws_session_token")

# Destination bucket for the raw EDGAR files.
s3_bucket = "financy"

#### Getting all the tickers available in the EDGAR Database

In [5]:
# Identify ourselves on every request with the configured contact e-mail.
headers = {'User-Agent': user_agent}

# Download the ticker / CIK listing. A timeout keeps a stalled connection from
# hanging the notebook, and raise_for_status() fails loudly on an HTTP error
# instead of trying to parse an error page as JSON.
companyTickers = requests.get(
    "https://www.sec.gov/files/company_tickers.json",
    headers=headers,
    timeout=30,
)
companyTickers.raise_for_status()
companyData = pd.DataFrame.from_dict(companyTickers.json(), orient='index')
# We need to add zeros because some CIKs differ in digits and the API needs 10 digit CIK.
companyData['cik_str'] = companyData['cik_str'].astype(str).str.zfill(10)

#### Dividing the computational load among the group
Each member runs the code on their own subset.

In [11]:
# Split the company table into four consecutive row ranges, one per team member.
# The first three ranges hold 2572 rows each; the last one takes whatever remains.
sub_Jose = companyData.iloc[:2572]
sub_Karen = companyData.iloc[2572:5144]
sub_Felix = companyData.iloc[5144:7716]
sub_Arturo = companyData.iloc[7716:]

#### From EDGAR Database to private s3_bucket
Function to pull company data from the API and upload it directly to s3

In [None]:
# Check that every AWS setting is present in the environment, without echoing
# any secret value into the notebook output.
# (This cell used to hold a stray keyword-argument fragment whose trailing commas
# would have rebound the credential variables to 1-tuples when executed.)
missing_credentials = [
    name
    for name in ("aws_access_key_id", "aws_secret_access_key", "aws_session_token")
    if not os.getenv(name)
]
if missing_credentials:
    print("Missing AWS settings in .env:", ", ".join(missing_credentials))

In [25]:
def processDF(tickers, companyData, s3_bucket):
    """Download EDGAR company facts for each ticker and upload them to S3 as JSON.

    For every ticker the CIK is looked up in ``companyData``, the company-facts
    document is requested from ``data.sec.gov``, re-serialised as UTF-8 JSON and
    uploaded to ``egdar_data/raw/<today_date>/<ticker>.json`` in ``s3_bucket``.
    Progress is reported with ``print``; nothing is returned.

    Parameters
    ----------
    tickers : iterable of str
        Ticker symbols to ingest.
    companyData : pandas.DataFrame
        Table with a ``ticker`` column and a ``cik_str`` column; ``cik_str`` is
        inserted into the request URL unchanged.
    s3_bucket : str
        Name of the destination S3 bucket.

    Notes
    -----
    * AWS credentials are read from the environment variables
      ``aws_access_key_id``, ``aws_secret_access_key`` and ``aws_session_token``.
      Never paste keys into the notebook; any key that was ever saved in a
      notebook must be treated as compromised and revoked.
    * Uses the notebook-level names ``user_agent`` and ``today_date``.
    * Tickers that cannot be resolved and requests that fail are reported and
      skipped. Exceptions raised by the S3 upload are not caught here and
      propagate to the caller.
    """
    # Credentials come from the environment; an unset/empty value is passed as None.
    s3 = boto3.client(
        "s3",
        aws_access_key_id=os.getenv("aws_access_key_id") or None,
        aws_secret_access_key=os.getenv("aws_secret_access_key") or None,
        aws_session_token=os.getenv("aws_session_token") or None,
    )

    # Use the contact e-mail configured at the top of the notebook rather than a
    # hard-coded address, so each team member is identified correctly.
    request_headers = {'User-Agent': user_agent}

    counter_total_tries = 0
    counter_total_uploaded = 0

    for ticker in tickers:
        # Every ticker counts as a try, including the ones that are not found.
        counter_total_tries += 1
        try:
            # Retrieve the CIK value corresponding to the ticker
            try:
                cik_val = str(companyData.loc[companyData['ticker'] == ticker, 'cik_str'].values[0])
            except IndexError:
                print(f"{ticker} not found in Company Data")
                continue

            # Fetch financial data for the company from EDGAR
            print(f"Successful CIK extraction for {ticker}")
            response = requests.get(
                f'https://data.sec.gov/api/xbrl/companyfacts/CIK{cik_val}.json',
                headers=request_headers,
                timeout=30,  # a stalled connection must not block the whole batch
            )

            # Check if the request was successful
            response.raise_for_status()

            # Parse and re-serialise so that only valid JSON is stored
            companyFacts = response.json()
            json_bytes = json.dumps(companyFacts).encode("utf-8")
            print("Successfully converted to json bytes")

            # In-memory file object positioned at the start, ready for upload
            json_buffer = BytesIO(json_bytes)

            # Defining the s3 bucket location.
            # NOTE: the "egdar_data" spelling is kept on purpose; objects have
            # already been written under this prefix.
            s3_file_path = f"egdar_data/raw/{today_date}/{ticker}.json"

            # Upload to S3 bucket
            s3.upload_fileobj(json_buffer, s3_bucket, s3_file_path)
            print(f"Data for {ticker} uploaded to S3: {s3_file_path}")
            counter_total_uploaded += 1
            print("total companies uploaded:", counter_total_uploaded)

        except KeyError:
            print(f"{ticker} not found in Company Data")
        except requests.HTTPError as e:
            print(f"HTTP error for {ticker}: {e}")
        except requests.RequestException as e:
            print(f"Request exception for {ticker}: {e}")
        finally:
            # Runs on success, on a handled error and on the `continue` above.
            print("total tries:", counter_total_tries)

    print("total companies uploaded:", counter_total_uploaded)

#### Change `sub_section` to your own subset of the data!
sub_section options:
sub_Jose;
sub_Karen;
sub_Felix;
sub_Arturo

In [15]:
# Felix's share of the company table (row positions 5144 up to, not including, 7716).
sub_Felix = companyData.iloc[5144:7716]
# HERE! Pick the batch to ingest: the first 500 rows of that share.
sub_section = sub_Felix.iloc[:500]

#### Run the code

In [16]:
# Ticker symbols of the selected batch as a plain Python list.
tickers = sub_section['ticker'].tolist()

# Ingest the batch: pull each company from EDGAR and push it to the S3 bucket.
processDF(tickers=tickers, companyData=companyData, s3_bucket=s3_bucket)

Successful CIK extraction for IDN
Successfully converted to json bytes
Data for IDN uploaded to S3: egdar_data/raw/2024-06-01/IDN.json
total companies uploaded: 1
total tries: 1
Successful CIK extraction for AEON
Successfully converted to json bytes
Data for AEON uploaded to S3: egdar_data/raw/2024-06-01/AEON.json
total companies uploaded: 2
total tries: 2
Successful CIK extraction for PET
Successfully converted to json bytes
Data for PET uploaded to S3: egdar_data/raw/2024-06-01/PET.json
total companies uploaded: 3
total tries: 3
Successful CIK extraction for LGCB
Successfully converted to json bytes
Data for LGCB uploaded to S3: egdar_data/raw/2024-06-01/LGCB.json
total companies uploaded: 4
total tries: 4
Successful CIK extraction for REDW
HTTP error for REDW: 404 Client Error: Not Found for url: https://data.sec.gov/api/xbrl/companyfacts/CIK0000942895.json
total tries: 5
Successful CIK extraction for FTCI
Successfully converted to json bytes
Data for FTCI uploaded to S3: egdar_data

In [17]:
# Felix's share of the company table (row positions 5144 up to, not including, 7716).
sub_Felix = companyData.iloc[5144:7716]
# HERE! Pick the batch to ingest: rows 500-999 of that share.
sub_section = sub_Felix.iloc[500:1000]

In [18]:
# Ticker symbols of the selected batch as a plain Python list.
tickers = sub_section['ticker'].tolist()

# Ingest the batch: pull each company from EDGAR and push it to the S3 bucket.
processDF(tickers=tickers, companyData=companyData, s3_bucket=s3_bucket)

Successful CIK extraction for SPRB
Successfully converted to json bytes
Data for SPRB uploaded to S3: egdar_data/raw/2024-06-01/SPRB.json
total companies uploaded: 1
total tries: 1
Successful CIK extraction for NTWK
Successfully converted to json bytes
Data for NTWK uploaded to S3: egdar_data/raw/2024-06-01/NTWK.json
total companies uploaded: 2
total tries: 2
Successful CIK extraction for BOLT
Successfully converted to json bytes
Data for BOLT uploaded to S3: egdar_data/raw/2024-06-01/BOLT.json
total companies uploaded: 3
total tries: 3
Successful CIK extraction for STBX
Successfully converted to json bytes
Data for STBX uploaded to S3: egdar_data/raw/2024-06-01/STBX.json
total companies uploaded: 4
total tries: 4
Successful CIK extraction for NCNC
Successfully converted to json bytes
Data for NCNC uploaded to S3: egdar_data/raw/2024-06-01/NCNC.json
total companies uploaded: 5
total tries: 5
Successful CIK extraction for RAYA
Successfully converted to json bytes
Data for RAYA uploaded 

In [19]:
# Felix's share of the company table (row positions 5144 up to, not including, 7716).
sub_Felix = companyData.iloc[5144:7716]
# HERE! Pick the batch to ingest: rows 1000-1499 of that share.
sub_section = sub_Felix.iloc[1000:1500]

In [20]:
# Ticker symbols of the selected batch as a plain Python list.
tickers = sub_section['ticker'].tolist()

# Ingest the batch: pull each company from EDGAR and push it to the S3 bucket.
processDF(tickers=tickers, companyData=companyData, s3_bucket=s3_bucket)

Successful CIK extraction for BLIN
Successfully converted to json bytes
Data for BLIN uploaded to S3: egdar_data/raw/2024-06-01/BLIN.json
total companies uploaded: 1
total tries: 1
Successful CIK extraction for TNLX
Successfully converted to json bytes
Data for TNLX uploaded to S3: egdar_data/raw/2024-06-01/TNLX.json
total companies uploaded: 2
total tries: 2
Successful CIK extraction for SBFM
Successfully converted to json bytes
Data for SBFM uploaded to S3: egdar_data/raw/2024-06-01/SBFM.json
total companies uploaded: 3
total tries: 3
Successful CIK extraction for HNRA
Successfully converted to json bytes
Data for HNRA uploaded to S3: egdar_data/raw/2024-06-01/HNRA.json
total companies uploaded: 4
total tries: 4
Successful CIK extraction for NIVF
HTTP error for NIVF: 404 Client Error: Not Found for url: https://data.sec.gov/api/xbrl/companyfacts/CIK0001981662.json
total tries: 5
Successful CIK extraction for XERI
Successfully converted to json bytes
Data for XERI uploaded to S3: egda

In [26]:
# Felix's share of the company table (row positions 5144 up to, not including, 7716).
sub_Felix = companyData.iloc[5144:7716]
# Final batch: everything from row 1500 to the end of that share.
sub_section = sub_Felix.iloc[1500:2572]

In [27]:
# Ticker symbols of the selected batch as a plain Python list.
tickers = sub_section['ticker'].tolist()

# Ingest the batch: pull each company from EDGAR and push it to the S3 bucket.
processDF(tickers=tickers, companyData=companyData, s3_bucket=s3_bucket)

Successful CIK extraction for PTPI
Successfully converted to json bytes
Data for PTPI uploaded to S3: egdar_data/raw/2024-06-01/PTPI.json
total companies uploaded: 1
total tries: 1
Successful CIK extraction for FULO
Successfully converted to json bytes
Data for FULO uploaded to S3: egdar_data/raw/2024-06-01/FULO.json
total companies uploaded: 2
total tries: 2
Successful CIK extraction for TC
Successfully converted to json bytes
Data for TC uploaded to S3: egdar_data/raw/2024-06-01/TC.json
total companies uploaded: 3
total tries: 3
Successful CIK extraction for LEJUY
Successfully converted to json bytes
Data for LEJUY uploaded to S3: egdar_data/raw/2024-06-01/LEJUY.json
total companies uploaded: 4
total tries: 4
Successful CIK extraction for LTUM
Successfully converted to json bytes
Data for LTUM uploaded to S3: egdar_data/raw/2024-06-01/LTUM.json
total companies uploaded: 5
total tries: 5
Successful CIK extraction for CING
Successfully converted to json bytes
Data for CING uploaded to 

In [28]:
# 2210 of the 2572 companies in this subset were downloaded.