In [1]:
import datetime
import json
import requests
import sqlite3
import time
import pandas as pd
from myfuncs import get_fred, call_fmp_api, get_submissions
# for parsing filings with bs4 (deployment)
import secedgar 
from secedgar.cik_lookup import CIKLookup
from secedgar.client import NetworkClient
# event loop fix for jupyter notebooks
import nest_asyncio
nest_asyncio.apply()

## Constituents

In [3]:
def get_wikipedia_sp500():
    """Download S&P 500 constituent tables and store them as raw CSV files.

    Two sources are fetched:

    * every HTML table on the Wikipedia "List of S&P 500 companies" page;
      the first two tables are written to ``../data/raw/companies_wiki.csv``
      and ``../data/raw/historical_companies_wiki.csv`` respectively
      (presumably the current constituents and the change history --
      confirm against the live page layout);
    * the historical components CSV hosted in the ``fja05680/sp500`` GitHub
      repository, written to
      ``../data/raw/historical_companies_TradingEvolved.csv``.

    All files are written without the DataFrame index. The function only
    produces files on disk and returns ``None``.
    """
    wiki_url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    github_csv_url = 'https://raw.githubusercontent.com/fja05680/sp500/master/S%26P%20500%20Historical%20Components%20%26%20Changes(03-14-2022).csv'

    # read_html returns one DataFrame per <table> found on the page
    wiki_tables = pd.read_html(wiki_url)
    wiki_targets = (
        (0, '../data/raw/companies_wiki.csv'),
        (1, '../data/raw/historical_companies_wiki.csv'),
    )
    for table_position, csv_path in wiki_targets:
        wiki_tables[table_position].to_csv(csv_path, index=False)

    historical_components = pd.read_csv(github_csv_url)
    historical_components.to_csv('../data/raw/historical_companies_TradingEvolved.csv', index=False)

### CIK SEC list

Fetched from the SEC's `company_tickers.json` URL.

In [6]:
def get_sec_cik():
    """Download the SEC ``company_tickers.json`` file and save it as a CSV.

    The JSON payload is loaded into a DataFrame, transposed, and written
    (index included) to ``.././data/raw/CIK.csv``.

    Returns
    -------
    None
        The function is called for its side effect of writing the CSV.

    Raises
    ------
    requests.HTTPError
        If the SEC endpoint answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within the timeout.
    """
    url = "https://www.sec.gov/files/company_tickers.json"
    # The request identifies the caller through a descriptive User-Agent.
    headers = {'User-Agent': 'Freelance data scientist calling the API for learning purposes. francisco.palab@gmail.com'}
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    r = requests.get(url, headers=headers, timeout=30)
    # Fail loudly on an error status instead of trying to parse an error page as JSON.
    r.raise_for_status()
    # NOTE(review): the transpose presumably turns one-record-per-key JSON
    # into one row per company -- confirm against the payload.
    df = pd.DataFrame(r.json()).T
    df.to_csv('.././data/raw/CIK.csv')
    return None

#get_sec_cik()

When downloading the statements below, some symbols return no results, so this cell attempts to infer their CIK from other sources.

In [25]:
# Look up a company name with secedgar's CIKLookup.
# The same user-agent string is handed to both the network client and the lookup.
sec_user_agent = "francisco pala (francisco.palab@gmail.com)"
client = NetworkClient(user_agent=sec_user_agent)
lookups = CIKLookup(
    ['Yahoo'],
    client=client,
    user_agent=sec_user_agent,
)
# last expression of the cell: display the resolved name -> CIK mapping
lookups.lookup_dict

                          Found multiple companies matching 'Yahoo':
                          ALTABA INC.
YAHOO INC
Yahoo! Japan Corp
Yahoo! Korea CORP


{}

## Yearly Statements

In [16]:
# Ticker symbols to download statements for: the `symbol` column of the cleaned companies file.
symbols = pd.read_csv('../data/clean/companies.csv')['symbol']

#### Balance

In [17]:
# Yearly balance sheets: one API call and (at most) one CSV per symbol.
for symbol in symbols:
    # periods=30 -- presumably the maximum number of yearly reports requested; confirm in call_fmp_api
    balance_df = call_fmp_api(endpoint='balance_yearly', ticker=symbol, periods=30)
    # a zero-length result is skipped so no empty file is written
    if len(balance_df) == 0:
        continue
    balance_path = '.././data/raw/balance/{}.csv'.format(symbol)
    balance_df.to_csv(balance_path, index=False)

#### Income

In [18]:
# Yearly income statements: one API call and (at most) one CSV per symbol.
for symbol in symbols:
    # income statement endpoint
    # periods=30 -- presumably the maximum number of yearly reports requested; confirm in call_fmp_api
    income_df = call_fmp_api(endpoint='income_yearly', ticker=symbol, periods=30)
    # a zero-length result is skipped so no empty file is written
    if len(income_df) == 0:
        continue
    income_path = '.././data/raw/income/{}.csv'.format(symbol)
    income_df.to_csv(income_path, index=False)

#### Cash Flow

In [19]:
# Yearly cash flow statements: one API call and (at most) one CSV per symbol.
for symbol in symbols:
    # cash flow endpoint
    # periods=30 -- presumably the maximum number of yearly reports requested; confirm in call_fmp_api
    cash_flow_df = call_fmp_api(endpoint='cflow_yearly', ticker=symbol, periods=30)
    # a zero-length result is skipped so no empty file is written
    if len(cash_flow_df) == 0:
        continue
    cash_flow_path = '.././data/raw/cash_flow/{}.csv'.format(symbol)
    cash_flow_df.to_csv(cash_flow_path, index=False)

## Market Cap

In [20]:
# Market capitalisation history: one API call and (at most) one CSV per symbol.
# 30 * 365 -- presumably about 30 years of daily observations; confirm in call_fmp_api
periods = 30 * 365
for symbol in symbols:
    # market cap endpoint
    market_cap_df = call_fmp_api(endpoint='market_cap', ticker=symbol, periods=periods)
    # a zero-length result is skipped so no empty file is written
    if len(market_cap_df) == 0:
        continue
    market_cap_path = '.././data/raw/marketCaps/{}.csv'.format(symbol)
    market_cap_df.to_csv(market_cap_path, index=False)

## FRED Series

#### corePCE

In [3]:
# Fetch FRED series 'PCEPILFE' and store it under the raw FRED folder.
# NOTE(review): the second argument ('corePCE') is presumably the label/column
# name get_fred gives the series -- confirm in myfuncs.get_fred.
core_pce_path = '../data/raw/fred/corePCE.csv'
core_pce = get_fred('PCEPILFE', 'corePCE')
core_pce.to_csv(core_pce_path, index=False)

#### GDP

In [None]:
# Fetch FRED series 'GDP' and store it under the raw FRED folder.
gdp = get_fred('GDP')
# Written under the same '../data' root as the other cells in this notebook
# (paths are relative to the notebook's own directory), so the file lands
# next to corePCE.csv.
gdp.to_csv('../data/raw/fred/GDP.csv', index=False)

## SEC Submissions

In [2]:
def bulk_download_submissions(companies_path='../data/raw/companies.csv',
                              out_dir='../data/submissions'):
    """Download SEC submissions for every company and save one CSV per symbol.

    For each distinct, non-missing ``cik`` in the companies file the result
    of ``get_submissions(cik)`` is turned into a DataFrame, tagged with
    ``cik`` and ``symbol`` columns, and written to ``<out_dir>/<symbol>.csv``
    (index included, which is what ``build_submissions_dataframe`` expects
    when it reads the files back with ``index_col=0``).

    Parameters
    ----------
    companies_path : str or path-like, default '../data/raw/companies.csv'
        CSV with at least ``cik`` and ``symbol`` columns.
        NOTE(review): other cells read '../data/clean/companies.csv';
        confirm which file is the intended input.
    out_dir : str or path-like, default '../data/submissions'
        Existing directory that receives the per-symbol CSVs. The default
        points at the same folder ``build_submissions_dataframe`` reads from.

    Returns
    -------
    None
    """
    from pathlib import Path

    # submissions from SEC
    companies = pd.read_csv(companies_path).convert_dtypes()
    # When a cik appears on several rows, the first row's symbol is used.
    symbol_dict = companies.drop_duplicates(subset='cik')
    symbol_dict = symbol_dict.set_index('cik').symbol.to_dict()

    out_dir = Path(out_dir)
    # Rows without a cik cannot be queried, so they are skipped.
    for cik in companies.cik.dropna().unique().tolist():
        # one-second pause before every request (presumably to respect SEC rate limits)
        time.sleep(1)
        data_dict = get_submissions(cik)
        df = pd.DataFrame(data_dict)
        df['cik'] = cik
        df['symbol'] = symbol_dict[cik]
        df.to_csv(out_dir / '{}.csv'.format(symbol_dict[cik]))

def build_submissions_dataframe(submissions_dir='.././data/submissions'):
    """Combine the per-company submission CSVs into one frame of 10-Q / 10-K rows.

    Every ``*.csv`` in ``submissions_dir`` is read with its first column as
    the index, the frames are concatenated in file-name order, and only rows
    whose ``form`` value contains ``'10-Q'`` or ``'10-K'`` are kept (a
    substring match, so values such as ``'10-K/A'`` are kept as well).

    Parameters
    ----------
    submissions_dir : str or path-like, default '.././data/submissions'
        Directory holding the per-company CSV files.

    Returns
    -------
    pandas.DataFrame
        The filtered rows with a fresh ``0..n-1`` index.

    Raises
    ------
    ValueError
        If ``submissions_dir`` contains no ``*.csv`` file.
    """
    from pathlib import Path

    # glob() order is filesystem-dependent; sorting makes the row order reproducible
    files = sorted(Path(submissions_dir).glob('*.csv'))
    if not files:
        raise ValueError('No submission CSV files found in {}'.format(submissions_dir))

    submissions = pd.concat([pd.read_csv(file, index_col=0) for file in files])
    # na=False: rows with a missing form are treated as non-matching instead of
    # putting NaN into the boolean mask, which .loc rejects
    is_periodic_report = submissions.form.str.contains('10-Q|10-K', na=False)
    submissions = submissions.loc[is_periodic_report, :].reset_index(drop=True)
    return submissions

# submissions.to_csv('./data/raw/submissions.csv', index=False)