In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import json
import time
from tqdm.auto import tqdm
from multiprocessing import Pool
import glob

In [None]:
# # !python -m spacy download en_core_web_sm
# import spacy

# nlp = spacy.load('en_core_web_sm')

In [None]:
def make_request(url, params=None, headers=None, max_retries = 5, timeout=30):
    """
    Send a GET request, retrying until a 200 response arrives or attempts run out.

    Parameters
    ----------
    url : str
        Address to fetch.
    params : dict, optional
        Query-string parameters forwarded to ``requests.get``.
    headers : dict, optional
        HTTP headers forwarded to ``requests.get``.
    max_retries : int
        Total number of attempts. At least one attempt is always made.
    timeout : float or None
        Seconds to wait for each attempt before it counts as a failure, so a
        stalled connection cannot block the notebook forever.

    Returns
    -------
    requests.Response
        The first response with status 200, or the response of the final
        attempt when none succeeded.

    Raises
    ------
    requests.RequestException
        Only when the final attempt fails at the transport level; earlier
        transport failures are retried like non-200 responses.
    """
    attempts = max(1, max_retries)

    for attempt in range(1, attempts + 1):
        is_last = attempt == attempts
        try:
            response = requests.get(url=url, params=params, headers=headers, timeout=timeout)
        except requests.RequestException:
            if is_last:
                raise
        else:
            if response.status_code == 200 or is_last:
                return response

        # Pause before the next attempt.
        time.sleep(1)

# Companies by SIC 

In [None]:
def SIC_search(search_params):
    """
    Fetch one page of EDGAR's company browser and return its table.

    Example of an equivalent browser URL:
    https://www.sec.gov/cgi-bin/browse-edgar?company=&match=&filenum=&State=&Country=&SIC=1000&myowner=exclude&action=getcompany

    Parameters
    ----------
    search_params : dict
        Query-string parameters for ``browse-edgar``; must contain ``'SIC'``.

    Returns
    -------
    pandas.DataFrame or None
        The first table of the page with an added ``SIC`` column, or ``None``
        when the page has no parseable table. Callers use ``None`` as the
        signal that paging is finished.
    """
    from io import StringIO

    url = "https://www.sec.gov/cgi-bin/browse-edgar"

    headers = {
        'User-Agent': 'kuleuven amin.tavakkolnia@kuleuven.be',
        'Host': 'www.sec.gov'
    }

    res = make_request(url, params=search_params, headers=headers, max_retries=10)

    search = BeautifulSoup(res.content, 'html.parser')

    if search.table is None:
        return None

    try:
        # Wrap the markup in StringIO: passing literal HTML strings to
        # read_html is deprecated in recent pandas releases.
        search_df = pd.read_html(StringIO(str(search.table)))[0]
        search_df['SIC'] = search_params['SIC']
        return search_df
    except Exception:
        # Best effort: any parsing problem means "no results". Unlike a bare
        # ``except``, this still lets KeyboardInterrupt stop a long scrape.
        return None

In [None]:
# Retrieve the list of all SIC codes from www.sec.gov

# SIC_list = pd.read_html('https://www.sec.gov/corpfin/division-of-corporation-finance-standard-industrial-classification-sic-code-list')[0]
# SIC_list.set_index('SIC Code', inplace=True)
# SIC_list.to_excel('SIC_list.xlsx')

In [None]:
# Load the cached SIC table from 'SIC_list.xlsx' (the commented-out cell above
# shows how it was produced), using the first column as the index.
SIC_list = pd.read_excel('SIC_list.xlsx', index_col=0)
SIC_list.head()

In [None]:
# Walk every SIC code and page through EDGAR's company list PAGE_SIZE rows at
# a time. Pages are collected in a list and concatenated once per checkpoint,
# instead of re-copying the growing DataFrame on every page.
PAGE_SIZE = 100
company_pages = []
all_companies = pd.DataFrame()

for i, sic in enumerate(SIC_list.index):
    start_cnt = 0

    while True:
        com = SIC_search({
            'action': 'getcompany',
            'SIC': sic,
            'start': start_cnt,
            'count': PAGE_SIZE,
            'owner': 'include'
        })
        # SIC_search returns None when a page has no table: end of this SIC.
        if com is None:
            break

        com['Industry'] = SIC_list.loc[sic, 'Office']
        company_pages.append(com)
        start_cnt += PAGE_SIZE

    if i%50 == 0:
        # Periodic checkpoint so a crash does not lose everything scraped so far.
        print(f"{i} SICs added")
        if company_pages:
            all_companies = pd.concat(company_pages)
        all_companies.to_csv('all_companies.csv', index=False)
    time.sleep(1)

if company_pages:
    all_companies = pd.concat(company_pages)
all_companies.to_excel('all_companies.xlsx', index=False)

# Get list of filings per company

In [None]:
# Reload the saved company list. The file was written with index=False, so
# index_col=0 promotes its first data column to the index.
# NOTE(review): all_companies.index is later passed to get_filings as CIKs, so
# the first column is presumably the CIK -- confirm against the saved file.
all_companies = pd.read_excel('all_companies.xlsx', index_col=0)
all_companies.sample(5)

In [None]:
# Exploratory example: download the filing index of one company and list its 10-Ks.
CIK = 320193
headers = {
    'User-Agent': 'kuleuven amin.tavakkolnia@kuleuven.be'
}
res = make_request(f"https://data.sec.gov/submissions/CIK{CIK:010d}.json", headers=headers)
filings = res.json()['filings']  # parse the payload once
recent_files = filings['recent']

recent_files_df = pd.DataFrame(recent_files)
frames = [recent_files_df]

# 'files' lists additional JSON documents holding further filings. It may be
# missing or empty, in which case only the recent filings are used.
for extra in filings.get('files') or []:
    add_files = make_request(f"https://data.sec.gov/submissions/{extra['name']}", headers=headers).json()
    frames.append(pd.DataFrame(add_files))

filings_df = pd.concat(frames)

filings_df['accessionNumber'] = filings_df['accessionNumber'].str.replace('-', '')
filings_df[filings_df['form']=='10-K']

In [None]:
def get_filings(CIK):
    """
    Download the complete filing index of one company from data.sec.gov.

    Parameters
    ----------
    CIK : int
        Central Index Key of the company; it is zero-padded to 10 digits to
        build the submissions URL.

    Returns
    -------
    pandas.DataFrame
        The rows of ``filings['recent']`` followed by the rows of every
        additional document listed in ``filings['files']``, with a ``CIK``
        column added.
    """
    headers = {
    'User-Agent': 'kuleuven amin.tavakkolnia@kuleuven.be'
    }
    base_url = "https://data.sec.gov/submissions/"

    res = make_request(f"{base_url}CIK{CIK:010d}.json", headers=headers)
    filings = res.json()['filings']
    frames = [pd.DataFrame(filings['recent'])]

    # Read every additional document, not just the first one, so companies
    # with several overflow files keep all of their filings.
    for extra in filings.get('files') or []:
        extra_res = make_request(f"{base_url}{extra['name']}", headers=headers)
        frames.append(pd.DataFrame(extra_res.json()))

    temp = pd.concat(frames) if len(frames) > 1 else frames[0]
    temp["CIK"] = CIK

    return temp

In [None]:
if __name__ == '__main__':
    # Fetch filing indexes in parallel for the first 10000 companies. The
    # progress-bar total is taken from the same slice that is mapped, so it
    # can actually reach 100%.
    target_ciks = all_companies.index[:10000]

    with Pool(processes=12) as p:
        output = list(tqdm(p.imap(get_filings, target_ciks), total=len(target_ciks)))
    p.join()

    filings_df = pd.concat(output)

In [None]:
# filings_df.drop(columns=["filmNumber", "act", "form", "items", "size", "isXBRL", 
#                         "isInlineXBRL", "primaryDocument", "primaryDocDescription"], inplace=True)
# filings_df['accessionNumber'] = filings_df['accessionNumber'].str.replace('-', '')

https://www.sec.gov/Archives/edgar/data/1632053/000107997419000277/apotheca10k_1312019.htm

In [None]:
# Download one filing document from the EDGAR archive and parse it.
# CIK comes from the exploratory cell earlier in the notebook.
accessionNumber = '000110465906084288'
doc_name = 'a06-25759_210k.htm'
url = f"https://www.sec.gov/Archives/edgar/data/{CIK}/{accessionNumber}/{doc_name}"

# Declare the same User-Agent as every other SEC request in this notebook;
# sec.gov expects automated clients to identify themselves.
report = make_request(
    url,
    headers={'User-Agent': 'kuleuven amin.tavakkolnia@kuleuven.be'},
    max_retries=10
)

html = BeautifulSoup(report.content, 'html.parser')

print(url)

In [None]:
# regex to get "Item 1A"

# item = 'Item 1A risk factors'.lower()
# pattern = "(item[\-_\s]?1a)?[\s\-_:]*(risk factor[s])?"

# re.fullmatch(pattern=pattern, string=item)

# def find_risks(tag):
#     pattern = "(item[\-_\s]?1a[\.]?)[\s\-_:]*(risk factor[s]?)"
#     if re.fullmatch(pattern=pattern, string=str(tag.string).lower()):
#         return True
#     else:
#         return False
    
# tags = html.find_all(find_risks)

# SEC API

In [None]:
import os

from sec_api import QueryApi

# The API key is read from the SEC_API_KEY environment variable so the
# credential is never stored in (or shared with) the notebook.
queryApi = QueryApi(api_key=os.environ["SEC_API_KEY"])

# 10-K filings for CIK 1318605 filed between 2019 and 2021, newest first.
query = {
  "query": { "query_string": { 
      "query": "cik:1318605 AND filedAt:{2019-01-01 TO 2021-12-31} AND formType:\"10-K\"" 
    } },
  "from": "0",
  "size": "10",
  "sort": [{ "filedAt": { "order": "desc" } }]
}

filings = queryApi.get_filings(query)

filings

In [None]:
import os

from sec_api import ExtractorApi

# The API key is read from the SEC_API_KEY environment variable so the
# credential is never stored in (or shared with) the notebook.
extractorApi = ExtractorApi(os.environ["SEC_API_KEY"])

In [None]:
# Example filing: request section "1A" of this document in "html" format.
filing_url = "https://www.sec.gov/Archives/edgar/data/1318605/000156459021004599/tsla-10k_20201231.htm"

# get the standardized and cleaned text of section 1A "Risk Factors"
section_text = extractorApi.get_section(filing_url, "1A", "html")

print(section_text)

In [None]:
section_text

In [None]:
# Second example filing: request section "1A" again, this time in "text" format.
filing_url = "https://www.sec.gov/Archives/edgar/data/1750/000104746914006243/a2220733z10-k.htm"

section_text = extractorApi.get_section(filing_url, "1A", "text")

print(section_text)

# EIKON

In [1]:
import os
import time

import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from tqdm.auto import tqdm

import eikon as ek
# import refinitiv.data.eikon as ek

# The app key is read from the EIKON_APP_KEY environment variable so the
# credential is never stored in (or shared with) the notebook.
ek.set_app_key(os.environ["EIKON_APP_KEY"])

### Submissions

In [None]:
# Get all submissions
# Forward slashes work on every OS and avoid the invalid "\s" / "\*" escape
# sequences of the backslash-separated path.
submissions = glob.glob("Data/submissions/*")

# Keys copied from each submission JSON, in the output column order.
firm_fields = [
    "cik", "entityType", "sic", "sicDescription",
    "name", "tickers", "exchanges", "category"
]

firm_info = []
for file in tqdm(submissions):
    with open(file, 'rb') as f:
        content = json.load(f)

    try:
        firm_info.append(tuple(content[field] for field in firm_fields))
    except (KeyError, TypeError):
        # Skip JSON documents that do not carry the company-level keys.
        # NOTE(review): presumably these are the overflow "-submissions-NNN"
        # files that only hold filing rows -- confirm.
        continue

firm_info_df = pd.DataFrame(
    firm_info, 
    columns=["CIK", "entityType", "SIC", "sicDescription", "name", "tickers", "exchanges", "category"]
)

firm_info_df.to_csv("firm_info.csv", index=False)

In [None]:
firm_info_df = pd.read_csv("firm_info.csv")
firm_info_df["CIK"] = firm_info_df["CIK"].astype(int)

# The 'tickers' column holds the text form of a Python list, e.g. "['A', 'B']".
# Remove brackets and quotes, drop firms without tickers, then produce one row
# per (CIK, SIC, ticker).
tickers = (
    firm_info_df.set_index(["CIK", "SIC"])['tickers']
    .str.strip('[]').str.replace("'", "")
    .replace(r'^\s*$', np.nan, regex=True)
    .str.split(",").dropna()
    .explode()
    .str.strip()  # "A, B" splits into ["A", " B"]; remove the leading space
).reset_index()

# Disabled export of the ticker list, kept for reference:
# tik_txt = '\n'.join(tickers.str.replace("\s*'*[*]*", ""))
#
# with open('tickers.txt', "w") as f:
#     f.write(tik_txt)

tickers

In [2]:
# Write the distinct CIKs found in Data/all_files.csv to CIK.txt, one per line.
all_files = pd.read_csv("Data/all_files.csv")
unique_ciks = all_files["CIK"].astype(str).unique()
CIKs = "\n".join(unique_ciks.tolist())

with open('CIK.txt', "w") as f:
    f.write(CIKs)

### Get data

In [2]:
# Forward slash: portable, and avoids the invalid "\c" escape sequence.
filings_df = pd.read_csv("Data/clean_docs_3.csv", index_col=0)
filings_df.head()

  mask |= (ar1 == a)


Unnamed: 0_level_0,CIK,report_dt,filing_dt,cleaned_txt,category,SIC,Industry
rf_seq,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,20,2006-12-30,2007-03-09,business operation subject number risk uncerta...,,3823,Office of Life Sciences
1,20,2006-12-30,2007-03-09,competition adversely affect business result o...,,3823,Office of Life Sciences
2,20,2006-12-30,2007-03-09,substantial sale abroad subject risk adverse f...,,3823,Office of Life Sciences
3,20,2006-12-30,2007-03-09,operate cyclical industry industrial capital g...,,3823,Office of Life Sciences
4,20,2006-12-30,2007-03-09,loss large customer adverse effect operating r...,,3823,Office of Life Sciences


In [3]:
# One row per CIK with its earliest ('min') and latest ('max') filing_dt.
obs_df = filings_df[["CIK", "filing_dt"]].groupby("CIK")['filing_dt'].agg(["min", 'max']).reset_index()
obs_df.head()

Unnamed: 0,CIK,min,max
0,20,2007-03-09,2010-03-15
1,1750,2006-07-17,2021-07-21
2,1800,2007-02-23,2021-02-19
3,1961,2009-10-07,2021-03-16
4,2034,2006-09-12,2018-09-28


In [4]:
# Widen each firm's filing window by six months on both sides and store the
# bounds as date strings for the Eikon requests.
# pd.to_datetime replaces astype("datetime64"): the unit-less dtype is
# rejected by pandas >= 2.0.
obs_df["SDate"] = (
    pd.to_datetime(obs_df["min"]).dt.date 
    - relativedelta(months=6)
    ).astype(str)

obs_df["EDate"] = (
    pd.to_datetime(obs_df["max"]).dt.date 
    + relativedelta(months=6)
    ).astype(str)

# Zero-pad the CIK to 10 digits. int() keeps the cell re-runnable once the
# column already holds padded strings.
obs_df["CIK"] = obs_df["CIK"].apply(lambda cik: f"{int(cik):010d}")

obs = obs_df.to_records()

### Prices

In [None]:
# Download the price, volume, market-cap and shares-outstanding fields for
# every firm over its observation window.
price_fields = [
    'TR.CLOSEPRICE(Adjusted=1)',
    'TR.CLOSEPRICE.date',
    'TR.OPENPRICE(Adjusted=1)',
    'TR.OPENPRICE.date',
    'TR.PRICECLOSE',
    'TR.PRICECLOSEDATE', 
    'TR.Volume',
    'TR.Volume.date',
    'TR.CompanyMarketCap(Scale=6)',
    'TR.CompanyMarketCap(Scale=6).date',
    'TR.TtlCmnSharesOut(Period=FQ0)',
    'TR.TtlCmnSharesOut(Period=FQ0).date'
]

# Collect per-firm frames and concatenate once at the end; concatenating
# inside the loop re-copies all accumulated rows on every iteration.
price_frames = []

for o in tqdm(obs):
    SDate = o["SDate"]
    EDate = o["EDate"]

    while True:
        try:
            df, err = ek.get_data(
                instruments=o["CIK"], 
                fields=price_fields, 
                parameters={'SDate': SDate, 'EDate': EDate},
                field_name=True
            )
        except Exception:
            # Back off and retry the same firm. Catching Exception rather than
            # using a bare except lets Ctrl-C interrupt the download.
            time.sleep(20)
            continue

        # Responses that report an error are skipped.
        if not err:
            price_frames.append(df)
        break

data = pd.concat(price_frames) if price_frames else pd.DataFrame()

In [7]:
# Forward slash: portable, and avoids the invalid "\E" escape sequence.
data.to_csv("Data/EIKON_prices.csv", index=False)

### Bid-Ask

In [None]:
# Download the high, low, bid and ask price fields for every firm over its
# observation window.
bidask_fields = [
    'TR.HIGHPRICE(Adjusted=1)',
    'TR.HIGHPRICE.date',
    'TR.LOWPRICE(Adjusted=1)',
    'TR.LOWPRICE.date',
    'TR.BIDPRICE(Adjusted=1)',
    'TR.BIDPRICE.date',
    'TR.ASKPRICE(Adjusted=1)',
    'TR.ASKPRICE.date'
]

# Collect per-firm frames and concatenate once at the end; concatenating
# inside the loop re-copies all accumulated rows on every iteration.
bidask_frames = []

for o in tqdm(obs):
    SDate = o["SDate"]
    EDate = o["EDate"]

    while True:
        try:
            df, err = ek.get_data(
                instruments=o["CIK"], 
                fields=bidask_fields, 
                parameters={'SDate': SDate, 'EDate': EDate},
                field_name=True
            )
        except Exception:
            # Back off and retry the same firm. Catching Exception rather than
            # using a bare except lets Ctrl-C interrupt the download.
            time.sleep(20)
            continue

        # Responses that report an error are skipped.
        if not err:
            bidask_frames.append(df)
        break

data = pd.concat(bidask_frames) if bidask_frames else pd.DataFrame()

In [6]:
# Forward slash: portable, and avoids the invalid "\E" escape sequence.
data.to_csv("Data/EIKON_bidask.csv", index=False)

### Beta

In [None]:
# Download the beta and WACC fields for every firm over its observation window.
beta_fields = [
    'TR.WACCBeta',
    'TR.WACCBeta.date',
    'TR.BetaDaily180D',
    'TR.BetaDaily180D.date',
    'TR.BetaDaily90D',
    'TR.BetaDaily90D.date',
    'TR.WACC',
    'TR.WACC.date',
]

# Collect per-firm frames and concatenate once at the end; concatenating
# inside the loop re-copies all accumulated rows on every iteration.
beta_frames = []

for o in tqdm(obs):
    SDate = o["SDate"]
    EDate = o["EDate"]

    while True:
        try:
            df, err = ek.get_data(
                instruments=o["CIK"], 
                fields=beta_fields, 
                parameters={'SDate': SDate, 'EDate': EDate},
                field_name=True
            )
        except Exception:
            # Back off and retry the same firm. Catching Exception rather than
            # using a bare except lets Ctrl-C interrupt the download.
            time.sleep(30)
            continue

        # Responses that report an error are skipped.
        if not err:
            beta_frames.append(df)
        break

data = pd.concat(beta_frames) if beta_frames else pd.DataFrame()

In [8]:
# Forward slash: portable, and avoids the invalid "\E" escape sequence.
data.to_csv("Data/EIKON_beta.csv", index=False)

In [None]:
# Scratch cell: print the get_data help text and try a ticker -> RIC/ISIN lookup.
# NOTE(review): "MSFC" may be a typo for "MSFT" -- confirm the intended ticker.
help(ek.get_data)
ek.get_symbology("MSFC", from_symbol_type='ticker', to_symbol_type=['RIC', 'ISIN'])

### Clean Data

In [1]:
# Import libraries and functions
import pandas as pd
import numpy as np
import pickle
import re

In [2]:
# Forward slash: portable, and avoids the invalid "\E" escape sequence.
# Exact duplicate rows are dropped right after loading.
prices = pd.read_csv("Data/EIKON_prices.csv").drop_duplicates()
prices.columns

Index(['Instrument', 'TR.CLOSEPRICE(ADJUSTED=1)', 'TR.CLOSEPRICE.DATE',
       'TR.OPENPRICE(ADJUSTED=1)', 'TR.OPENPRICE.DATE', 'TR.PRICECLOSE',
       'TR.PRICECLOSEDATE', 'TR.VOLUME', 'TR.VOLUME.DATE',
       'TR.COMPANYMARKETCAP(SCALE=6)', 'TR.COMPANYMARKETCAP(SCALE=6).DATE',
       'TR.TTLCMNSHARESOUT(PERIOD=FQ0)',
       'TR.TTLCMNSHARESOUT(PERIOD=FQ0).DATE'],
      dtype='object')

In [3]:
# Column 0 is 'Instrument'; after it, value columns (odd positions) and their
# date columns (even positions) alternate.
values = prices.columns[1::2]
dates = prices.columns[2::2]

# Build one frame per field, indexed by (Instrument, <that field's date column>).
dfs = []
for date_col, value_col in zip(dates, values):
    field_frame = prices[['Instrument', date_col, value_col]]
    field_frame = field_frame.dropna(subset=[date_col]).drop_duplicates()
    dfs.append(field_frame.set_index(['Instrument', date_col]))

# Handling duplicates for shares outstanding: keep only rows whose index key
# occurs exactly once and whose value is present.
shares_out = dfs[5]
key_is_unique = ~(shares_out.index.duplicated(keep=False))
value_present = shares_out['TR.TTLCMNSHARESOUT(PERIOD=FQ0)'].notna()
dfs[5] = shares_out[key_is_unique & value_present]

In [4]:
# Align the per-field frames side by side on their shared index, then turn the
# index levels back into ordinary columns.
prices_df = pd.concat(dfs, axis=1).reset_index()

# The raw frame is no longer needed; release it.
del prices

In [5]:
# Parse the date level ("level_1") into timezone-naive timestamps, move it to a
# trailing 'Date' column, and give the remaining columns short names.
naive_dates = pd.to_datetime(prices_df["level_1"]).dt.tz_localize(None)
prices_df = prices_df.assign(Date=naive_dates).drop(columns=['level_1'])

# Names are assigned by position, so they must follow the current column order.
short_names = ['Instrument', 'CLOSEPRICE', 'OPENPRICE', 'PRICECLOSE', 'VOLUME']
short_names += ['COMPANYMARKETCAP', 'TTLCMNSHARESOUT', 'Date']
prices_df.columns = short_names

In [6]:
# Replace missing close prices with the PRICECLOSE value of the same row, and
# then with the open price of the same row.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# selected column does not update the frame under pandas Copy-on-Write.
prices_df["CLOSEPRICE"] = (
    prices_df["CLOSEPRICE"]
    .fillna(prices_df["PRICECLOSE"])
    .fillna(prices_df["OPENPRICE"])
)

prices_df = prices_df.drop(columns=['OPENPRICE', 'PRICECLOSE'])

In [7]:
# Replace missing TTLCMNSHARESOUT with the previous value of the same
# instrument, then fill any remaining leading gaps with the next value.
# ffill()/bfill() replace the deprecated fillna(method=...).
# NOTE(review): both fills follow the current row order -- confirm that rows
# are sorted by Date within each instrument.
prices_df["TTLCMNSHARESOUT"] = prices_df.groupby(["Instrument"])["TTLCMNSHARESOUT"].ffill()
prices_df["TTLCMNSHARESOUT"] = prices_df.groupby(["Instrument"])["TTLCMNSHARESOUT"].bfill()

# Missing volume is set to zero; assigned back instead of a chained inplace fill.
prices_df["VOLUME"] = prices_df["VOLUME"].fillna(0)

In [8]:
prices_df.isna().sum()

Instrument                0
CLOSEPRICE          1750775
VOLUME                    0
COMPANYMARKETCAP     670776
TTLCMNSHARESOUT      133828
Date                      0
dtype: int64

In [20]:
# Forward slash: portable, and avoids the invalid "\P" escape sequence.
prices_df.to_csv("Data/Prices.csv", index=False)

In [10]:
# Forward slash: portable, and avoids the invalid "\E" escape sequence.
# Duplicates are dropped once; the second in-place call was redundant.
BidAsk = pd.read_csv("Data/EIKON_bidask.csv").drop_duplicates()
BidAsk.columns

Index(['Instrument', 'TR.HIGHPRICE(ADJUSTED=1)', 'TR.HIGHPRICE.DATE',
       'TR.LOWPRICE(ADJUSTED=1)', 'TR.LOWPRICE.DATE',
       'TR.BIDPRICE(ADJUSTED=1)', 'TR.BIDPRICE.DATE',
       'TR.ASKPRICE(ADJUSTED=1)', 'TR.ASKPRICE.DATE'],
      dtype='object')

In [11]:
# Column 0 is 'Instrument'; after it, value columns (odd positions) and their
# date columns (even positions) alternate.
values = BidAsk.columns[1::2]
dates = BidAsk.columns[2::2]

# Build one frame per field, indexed by (Instrument, <that field's date column>).
dfs = []
for date_col, value_col in zip(dates, values):
    field_frame = BidAsk[['Instrument', date_col, value_col]]
    field_frame = field_frame.dropna(subset=[date_col]).drop_duplicates()
    dfs.append(field_frame.set_index(['Instrument', date_col]))

In [12]:
# Align the per-field frames side by side on their shared index, then turn the
# index levels back into ordinary columns.
BidAsk_df = pd.concat(dfs, axis=1).reset_index()

# The raw frame is no longer needed; release it.
del BidAsk

In [15]:
# Parse the date level ("level_1") into timezone-naive timestamps, move it to a
# trailing 'Date' column, and give the remaining columns short names.
naive_dates = pd.to_datetime(BidAsk_df["level_1"]).dt.tz_localize(None)
BidAsk_df = BidAsk_df.assign(Date=naive_dates).drop(columns=['level_1'])

# Names are assigned by position, so they must follow the current column order.
short_names = ['Instrument', 'HIGHPRICE', 'LOWPRICE']
short_names += ['BIDPRICE', 'ASKPRICE', 'Date']
BidAsk_df.columns = short_names

In [18]:
# Replace missing values: a missing bid falls back to the low price of the same
# row, a missing ask to the high price of the same row.
# The results are assigned back: calling fillna(inplace=True) on a selected
# column does not update the frame under pandas Copy-on-Write.
BidAsk_df["BIDPRICE"] = BidAsk_df["BIDPRICE"].fillna(BidAsk_df["LOWPRICE"])
BidAsk_df["ASKPRICE"] = BidAsk_df["ASKPRICE"].fillna(BidAsk_df["HIGHPRICE"])

In [21]:
# Forward slash: portable, and avoids the invalid "\B" escape sequence.
BidAsk_df.to_csv("Data/BidAsk.csv", index=False)