# Introduction to Using SEC Edgar Data with AlgoSeek Datasets

## Prerequisite
Before you run through this notebook, go to [SEC Edgar](https://www.sec.gov/edgar/sec-api-documentation), scroll down to the bottom of the page, and click the [companyfacts.zip](https://www.sec.gov/Archives/edgar/daily-index/xbrl/companyfacts.zip) link to download the archive to your computer. Unzip it into your data folder (this notebook reads the files from `data/us_equity/companyfacts/`). Since there are limitations to the Edgar APIs, it's easier to bulk download the raw files and process them locally. The unzipped folder should contain many JSON files with names like 'CIK0000001750.json'.

## 1) Introduction

This notebook demonstrates how to fetch SEC EDGAR filing data and use it with AlgoSeek's Equity Data.

### 1.1) Example using the Edgar API

In [None]:
# import libraries
import os
import requests
import pandas as pd
import numpy as np
import seaborn as sns
import json


In order to use the SEC Edgar API, every request must include a `User-Agent` header that contains a valid email address. This is simply an anti-bot measure.

In [None]:
# Every SEC EDGAR request in this notebook sends this User-Agent, which must
# contain a valid contact email address (anti-bot measure).
# Set the SEC_EDGAR_USER_AGENT environment variable to your own address
# (e.g. "your@email.com") to avoid editing the notebook; the original address
# is kept as the default so existing runs behave exactly as before.
headers = {'User-Agent': os.environ.get("SEC_EDGAR_USER_AGENT", "julian@julianwiley.com")}


Fetch SEC Edgar index of tickers and CIK numbers for reference

In [None]:
# Download the SEC's ticker -> CIK listing. The timeout keeps the cell from
# hanging forever on a stalled connection.
tickers_cik = requests.get("https://www.sec.gov/files/company_tickers.json", headers=headers, timeout=30)
# Fail here on an HTTP error (e.g. a rejected User-Agent) instead of handing an
# error page to the JSON parsing in the next cells.
tickers_cik.raise_for_status()

In [None]:
# Inspect the decoded JSON payload of the ticker/CIK response.
tickers_cik.json()

In [None]:
# Turn the {"0": {...}, "1": {...}, ...} payload into one row per listing,
# left-pad each CIK to the 10-digit form used in EDGAR file names and URLs,
# and index the table by ticker symbol.
tickers_cik = (
    pd.DataFrame(list(tickers_cik.json().values()))
    .assign(cik_str=lambda listing: listing["cik_str"].astype(str).str.zfill(10))
    .set_index("ticker")
)

In [None]:
# Show the ticker-indexed reference table built above.
tickers_cik

In [None]:
# Persist the ticker -> CIK reference table for reuse by other notebooks.
# Create the target folder first so a fresh checkout does not fail on the write.
os.makedirs('data/us_equity/reference', exist_ok=True)
tickers_cik.to_parquet('data/us_equity/reference/cik_ref.parquet')

In [None]:
# Fetch every XBRL fact the SEC holds for CIK 0000320193 (the `aapl_*` example
# company). The URL has no placeholders, so it is a plain string, not an f-string.
aapl_facts = requests.get("https://data.sec.gov/api/xbrl/companyfacts/CIK0000320193.json", headers=headers, timeout=30)
# Surface HTTP errors here rather than as a confusing JSON decoding failure later.
aapl_facts.raise_for_status()

In [None]:
# Decode the response body into nested Python dicts/lists.
aapl_f = aapl_facts.json()

In [None]:
# Number of distinct us-gaap concepts reported for this company.
len(aapl_f['facts']['us-gaap'])

In [None]:
# List the name of every us-gaap concept, one per line.
for concept_name in aapl_f['facts']['us-gaap']:
    print(concept_name)

In [None]:
# List the name of every dei (document and entity information) concept, one per line.
for concept_name in aapl_f['facts']['dei']:
    print(concept_name)

In [None]:
# Print every `dei` fact that has a USD series, ordered by period end date.
# Facts without USD values are skipped with an explicit check instead of a bare
# `except: pass`, so genuine errors (bad data, typos) are no longer hidden.
for key, fact in aapl_f['facts']['dei'].items():
    usd_records = fact.get('units', {}).get('USD')
    if not usd_records:
        continue

    usd_table = pd.json_normalize(usd_records)
    usd_table["filed"] = pd.to_datetime(usd_table["filed"])
    usd_table = usd_table.sort_values("end")
    print(usd_table)

In [None]:
# Display the whole us-gaap section of the facts payload (the output can be long).
aapl_f['facts']['us-gaap']

In [None]:
# Flatten the nested us-gaap dict into a DataFrame to get a quick look at its structure.
pd.json_normalize(aapl_f['facts']['us-gaap'])

In [None]:
# Table of the reported `Assets` values in USD: one row per reported value,
# `filed` parsed to datetimes, rows ordered by period end date.
aapl_assets = (
    pd.json_normalize(aapl_f['facts']['us-gaap']['Assets']['units']['USD'])
    .assign(filed=lambda frame: pd.to_datetime(frame["filed"]))
    .sort_values("end")
)


In [None]:
# Show the Assets table built above.
aapl_assets

### 1.2) Using the files from our initial bulk download

In [None]:
# Load the bulk-downloaded facts file for the same CIK that was fetched through
# the API above, then preview the first rows of the resulting frame.
ex1 = pd.read_json('data/us_equity/companyfacts/CIK0000320193.json')
ex1.head()

In [None]:
# List the name of every us-gaap concept found in the local file, one per line.
for concept_name in ex1['facts']['us-gaap']:
    print(concept_name)

In [None]:
# Print every `dei` fact from the local file that has a USD series, ordered by
# period end date. Facts without USD values are skipped with an explicit check
# instead of a bare `except: pass`, so genuine errors are no longer hidden.
for key, fact in ex1['facts']['dei'].items():
    usd_records = fact.get('units', {}).get('USD')
    if not usd_records:
        continue

    usd_table = pd.json_normalize(usd_records)
    usd_table["filed"] = pd.to_datetime(usd_table["filed"])
    usd_table = usd_table.sort_values("end")
    print(usd_table)

In [None]:
# For each CIK in the reference table, report whether its bulk-download file is on disk.
for tick in tickers_cik['cik_str']:
    print(os.path.isfile(f'data/us_equity/companyfacts/CIK{tick}.json'))

In [None]:
# For each listing, report whether the company's bulk-download file exists and
# print its CIK. The file name is built from this row's own CIK: the previous
# version interpolated a leftover `tick` variable from an earlier cell (the
# trailing `.format(...)` on an f-string has no effect), so it checked the same
# file on every iteration.
for ticker, row in tickers_cik.iterrows():
    cik = row['cik_str']
    file_path = f'data/us_equity/companyfacts/CIK{cik}.json'
    print(os.path.isfile(file_path))
    print(cik)

Fetching Financial data

## 2) Download SEC filings data

In [None]:
# Switch to the project folder so the relative 'data/...' paths resolve.
# The folder can be overridden with the ALGOSEEK_NOTEBOOKS_DIR environment
# variable; the original location stays the default. If the folder does not
# exist on this machine, keep the current working directory instead of crashing.
project_dir = os.environ.get(
    'ALGOSEEK_NOTEBOOKS_DIR',
    'C:/Users/julia/Documents/Coding/JupyterSamples/AlgoSeekNotebooks',
)
if os.path.isdir(project_dir):
    os.chdir(project_dir)
else:
    print('Project folder not found (' + project_dir + '); staying in ' + os.getcwd())

# AlgoSeek reference tables and the unique tickers listed in the lookup table.
look = pd.read_parquet('data/us_equity/reference/lookup.parquet')
ref = pd.read_parquet('data/us_equity/reference/master.parquet')
tickers = look['Ticker'].unique()

In [None]:
# Peek at the first zero-padded CIK string in the reference table.
tickers_cik['cik_str'].iloc[0]

In [None]:
from tqdm import tqdm

# Export the bulk-downloaded company facts to one parquet file per concept:
#   data/us_equity/fundamentals/sec/<ticker>/<concept>.parquet
BULK_FACTS_DIR = 'data/us_equity/companyfacts/'
BULK_OUTPUT_DIR = 'data/us_equity/fundamentals/sec/'

# Unit series to export for each taxonomy, in order of preference: the first
# unit a concept actually reports is the one written (this is what the earlier
# chain of try/except blocks amounted to). Taxonomies are processed in this
# order, so a us-gaap concept overwrites a dei concept with the same name.
# NOTE(review): 'Store' here vs 'Stock' in the API-based loop further down;
# confirm which unit name is intended.
BULK_UNIT_PRIORITY = {
    'dei': ('USD', 'shares'),
    'us-gaap': ('USD', 'shares', 'USD/shares', 'Store', 'pure', 'Year'),
}

for ticker, row in tqdm(tickers_cik.iterrows(), total=len(tickers_cik)):
    file_dir = BULK_FACTS_DIR + 'CIK' + row['cik_str'] + '.json'

    # Load this company's facts. On failure move on to the next ticker;
    # otherwise the previous company's data would be written under this ticker.
    try:
        with open(file_dir, encoding='utf-8') as facts_file:
            company_facts = json.load(facts_file)
    except (OSError, ValueError) as err:
        print(ticker + ' error: ' + str(err))
        continue

    # makedirs also creates missing parent folders and tolerates re-runs.
    ticker_dir = BULK_OUTPUT_DIR + ticker + '/'
    os.makedirs(ticker_dir, exist_ok=True)

    for taxonomy, unit_priority in BULK_UNIT_PRIORITY.items():
        concepts = company_facts.get('facts', {}).get(taxonomy)
        if not concepts:
            print(ticker + ' has no ' + taxonomy + ' facts')
            continue

        for key, fact in concepts.items():
            units = fact.get('units', {})
            unit = next((name for name in unit_priority if name in units), None)
            if unit is None:
                # None of the supported units is reported for this concept.
                continue

            # Best effort per concept: report the failure and keep going.
            try:
                table = pd.json_normalize(units[unit])
                table["filed"] = pd.to_datetime(table["filed"])
                table = table.sort_values("end")
                table.to_parquet(ticker_dir + key + '.parquet')
            except Exception as err:
                print(ticker, key, 'fail - ' + taxonomy + ' ' + unit + ':', err)

In [None]:
# Replace `tickers` with the ticker symbols from the SEC reference table
# (the table's index), then display them.
tickers = tickers_cik.index.values
tickers

In [None]:
import time

from tqdm import tqdm

# Same export as the bulk-file loop, but fetching each company's facts from the
# EDGAR API. Output: data/us_equity/fundamental/sec/<ticker>/<concept>.parquet
# NOTE(review): this folder is 'fundamental' while the bulk-file loop writes to
# 'fundamentals'; confirm which one is intended.
API_OUTPUT_DIR = 'data/us_equity/fundamental/sec/'

# Unit series to export for each taxonomy, in order of preference: the first
# unit a concept actually reports is the one written. Taxonomies are processed
# in this order, so a us-gaap concept overwrites a dei concept of the same name.
# NOTE(review): 'Stock' here vs 'Store' in the bulk-file loop; confirm which
# unit name is intended.
API_UNIT_PRIORITY = {
    'dei': ('USD', 'shares'),
    'us-gaap': ('USD', 'shares', 'USD/shares', 'Stock', 'pure', 'Year'),
}

# Pause after each request to stay under the SEC's published limit of
# 10 requests per second.
API_REQUEST_PAUSE_SECONDS = 0.1

for ticker, cik in tqdm(tickers_cik['cik_str'].items(), total=len(tickers_cik)):
    # Fetch this company's facts. On failure move on to the next ticker;
    # otherwise the previous company's data would be written under this ticker.
    try:
        api_response = requests.get("https://data.sec.gov/api/xbrl/companyfacts/CIK"+cik+'.json', headers=headers, timeout=30)
        api_response.raise_for_status()
        company_facts = api_response.json()
    except (requests.RequestException, ValueError) as err:
        print(ticker + ' error: ' + str(err))
        continue
    finally:
        time.sleep(API_REQUEST_PAUSE_SECONDS)

    # makedirs also creates missing parent folders and tolerates re-runs.
    ticker_dir = API_OUTPUT_DIR + ticker + '/'
    os.makedirs(ticker_dir, exist_ok=True)

    for taxonomy, unit_priority in API_UNIT_PRIORITY.items():
        concepts = company_facts.get('facts', {}).get(taxonomy)
        if not concepts:
            print(ticker + ' has no ' + taxonomy + ' facts')
            continue

        for key, fact in concepts.items():
            units = fact.get('units', {})
            unit = next((name for name in unit_priority if name in units), None)
            if unit is None:
                # None of the supported units is reported for this concept.
                continue

            # Best effort per concept: report the failure and keep going.
            try:
                table = pd.json_normalize(units[unit])
                table["filed"] = pd.to_datetime(table["filed"])
                table = table.sort_values("end")
                table.to_parquet(ticker_dir + key + '.parquet')
            except Exception as err:
                print(ticker, key, 'fail - ' + taxonomy + ' ' + unit + ':', err)

In [None]:
def get_financials(stock_cik):
    """
    Fetch the us-gaap ``Assets`` concept for one company from the SEC EDGAR
    ``companyconcept`` API.

    :param stock_cik: company identifier. A value already in the API's
        ``CIK##########`` form (e.g. ``"CIK0000320193"``) is used as is; a bare
        CIK number (``320193`` or ``"0000320193"``) is zero-padded to ten
        digits and prefixed with ``CIK``.
    :type stock_cik: str or int
    :return: the raw HTTP response; call ``.json()`` on it to get the data.
    :rtype: requests.Response
    """
    cik_text = str(stock_cik)
    if not cik_text.upper().startswith("CIK"):
        cik_text = "CIK" + cik_text.zfill(10)

    url = f"https://data.sec.gov/api/xbrl/companyconcept/{cik_text}/us-gaap/Assets.json"

    # Sends the notebook-level `headers` (User-Agent); the timeout prevents an
    # indefinite hang on a stalled connection.
    response = requests.get(url, headers=headers, timeout=30)

    return response

In [None]:
# Request the Assets concept for CIK 0000320193 through the helper defined above.
response = get_financials("CIK0000320193")