In [2]:
import pandas as pd
import numpy as np
import datetime
from pathlib import Path
from myfuncs import (
    init_companies_table,
    
)
from myvars import to_billions_features

In [3]:
pd.options.display.max_columns = 500
pd.options.display.max_rows = 200 

# Create data tables

## Companies

- A table for all unique companies
    - cik, symbol, name, sector, subsector, founded, etc
- A table for all the periods a company has been on the index.
    - cik, symbol, start date, end date, flag_current

#### Table for unique companies

In [4]:
# import companies csv
current_companies = pd.read_csv('../data/raw/companies_wiki.csv').drop(columns='SEC filings')
current_companies.columns = ['symbol', 'name', 'sector', 'subSector', 'hQ', 'dateFirstAdded', 'cik', 'founded']
# import wikipedia historical companies csv
spts = pd.read_csv('../data/raw/historical_companies_TradingEvolved.csv') 

In [None]:
# get all unique occurences of anything that is in tickers
# get the items on the list if there is no '-', if there is, get the first item (the ticker)
spts['tickers_filtered'] = spts.tickers.str.split(',')
spts.head()

Unnamed: 0,date,tickers,tickers_filtered
0,1996-01-02,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...","[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A..."
1,1996-01-03,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...","[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A..."
2,1996-01-04,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...","[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A..."
3,1996-01-10,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...","[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A..."
4,1996-01-11,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...","[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A..."


Get all unique constituents

In [None]:
results = set()
spts.tickers_filtered.apply(results.update)
companies = pd.DataFrame(data = results, columns=['symbol'])
companies.shape

(1125, 1)

Add values from current companies

In [None]:
current_companies['currentConstituent'] = True
companies = companies.merge(current_companies, how='left')
companies = companies.drop(columns=['dateFirstAdded'])
companies.currentConstituent = companies.currentConstituent.fillna(False)

In [None]:
ciks = pd.read_csv('../data/raw/CIK.csv', index_col = 0)
ciks.columns = ['cik_sec_list', 'symbol', 'title']

In [None]:
companies = companies.merge(ciks, how='left')
companies.name = companies.name.fillna(companies.title)
companies.cik = companies.cik.fillna(companies.cik_sec_list)

#### Comment 1
There are cases when a company is introduced or removed without a counterpart joining or leaving the index. This is fine as we'll be melting this table by generating a variable indicating whether the action was to add or remove the ticker

In [5]:
# import historical companies csv
wiki_changes = pd.read_csv('../data/raw/historical_companies_wiki.csv')
mask = wiki_changes.isna().any(axis=1)
wiki_changes[mask]

Unnamed: 0,Date,Added,Added.1,Removed,Removed.1,Reason
3,"February 3, 2022",,,GPS,Gap,Market capitalization change.[8]
4,"February 2, 2022",CEG,Constellation Energy,,,S&P 500 and 100 constituent Exelon Corp. spun ...
14,"June 4, 2021",,,HFC,HollyFrontier,Market capitalization change.[14]
15,"June 3, 2021",OGN,Organon & Co.,,,S&P 500/100 constituent Merck & Co. spun off O...
26,"October 12, 2020",,,NBL,Noble Energy,Chevron acquired Noble Energy.[22]
27,"October 9, 2020",VNT,Vontier,,,S&P 500 constituent Fortive spun off Vontier.[22]
38,"April 6, 2020",,,M,Macy's,Market capitalization change.[28]
39,"April 6, 2020",,,RTN,Raytheon Company,United Technologies spun off Otis and Carrier ...
40,"April 3, 2020",OTIS,Otis Worldwide,,,United Technologies spun off Otis and Carrier ...
41,"April 3, 2020",CARR,Carrier,,,United Technologies spun off Otis and Carrier ...


In [11]:
wiki_changes = wiki_changes.drop(0, axis=0)

In [12]:
wiki_changes.columns = [
    'date', 
    'addedTicker', 
    'addedName', 
    'removedTicker', 
    'removedName', 
    'reason',
    ]

In [13]:
# added DataFrame
# Select relevant columns, add a variable indicating its added and rename for latter concat
added_colmask = ['date', 'addedTicker', 'addedName', 'reason']
wiki_changes_added = wiki_changes.loc[:,added_colmask]
wiki_changes_added.columns = ['date', 'symbol', 'name',' reason']
wiki_changes_added['action'] = 'add'
wiki_changes_added

Unnamed: 0,date,symbol,name,reason,action
1,"March 2, 2022",MOH,Molina Healthcare,S&P 500 constituent S&P Global Inc. acquired I...,add
2,"February 15, 2022",NDSN,Nordson,S&P 500 constituent Advanced Micro Devices acq...,add
3,"February 3, 2022",,,Market capitalization change.[8],add
4,"February 2, 2022",CEG,Constellation Energy,S&P 500 and 100 constituent Exelon Corp. spun ...,add
5,"December 20, 2021",SBNY,Signature Bank,Market capitalization change.[9],add
...,...,...,...,...,...
288,"December 5, 2000",AYE,Allegheny Energy,Market Cap changes.,add
289,"December 5, 2000",ABK,Ambac Financial,Market Cap changes.,add
290,"July 27, 2000",JDSU,JDS Uniphase,Market Cap change.[225],add
291,"June 7, 2000",SBUX,Starbucks,Siemens AG acquired Shared Medical Systems.[226],add


In [14]:
# removed DataFrame
# Select relevant columns, add a variable indicating its added and rename for latter concat
removed_colmask = ['date', 'removedTicker', 'removedName', 'reason']
wiki_changes_removed = wiki_changes.loc[:,removed_colmask]
wiki_changes_removed.columns = ['date', 'symbol', 'name',' reason']
wiki_changes_removed['action'] = 'remove'
wiki_changes_removed

Unnamed: 0,date,symbol,name,reason,action
1,"March 2, 2022",INFO,IHS Markit,S&P 500 constituent S&P Global Inc. acquired I...,remove
2,"February 15, 2022",XLNX,Xilinx,S&P 500 constituent Advanced Micro Devices acq...,remove
3,"February 3, 2022",GPS,Gap,Market capitalization change.[8],remove
4,"February 2, 2022",,,S&P 500 and 100 constituent Exelon Corp. spun ...,remove
5,"December 20, 2021",LEG,Leggett & Platt,Market capitalization change.[9],remove
...,...,...,...,...,...
288,"December 5, 2000",GRA,WR Grace,Market Cap changes.,remove
289,"December 5, 2000",CCK,Crown Holdings,Market Cap changes.,remove
290,"July 27, 2000",RAD,RiteAid,Market Cap change.[225],remove
291,"June 7, 2000",SMS,Shared Medical Systems,Siemens AG acquired Shared Medical Systems.[226],remove


In [15]:
wiki_changes_melted = pd.concat([wiki_changes_added, wiki_changes_removed])
wiki_changes_melted

Unnamed: 0,date,symbol,name,reason,action
1,"March 2, 2022",MOH,Molina Healthcare,S&P 500 constituent S&P Global Inc. acquired I...,add
2,"February 15, 2022",NDSN,Nordson,S&P 500 constituent Advanced Micro Devices acq...,add
3,"February 3, 2022",,,Market capitalization change.[8],add
4,"February 2, 2022",CEG,Constellation Energy,S&P 500 and 100 constituent Exelon Corp. spun ...,add
5,"December 20, 2021",SBNY,Signature Bank,Market capitalization change.[9],add
...,...,...,...,...,...
288,"December 5, 2000",GRA,WR Grace,Market Cap changes.,remove
289,"December 5, 2000",CCK,Crown Holdings,Market Cap changes.,remove
290,"July 27, 2000",RAD,RiteAid,Market Cap change.[225],remove
291,"June 7, 2000",SMS,Shared Medical Systems,Siemens AG acquired Shared Medical Systems.[226],remove


In [16]:
duplicated_mask = wiki_changes_melted.duplicated(keep=False)
wiki_changes_melted[duplicated_mask]

Unnamed: 0,date,symbol,name,reason,action
40,"April 3, 2020",,,United Technologies spun off Otis and Carrier ...,remove
41,"April 3, 2020",,,United Technologies spun off Otis and Carrier ...,remove
154,"September 18, 2015",,,Share class methodology change,remove
155,"September 18, 2015",,,Share class methodology change,remove


In [17]:
a = wiki_changes_melted.symbol.to_list()
a = set(a)
len(a)

488

In [18]:
not_in = [x for x in list(a) if x not in companies.symbol.to_list()] 
len(not_in)

27

In [22]:
companies.shape

(1125, 10)

#### Comment 2
There are cases of companies added or removed to the sp500 where we only have an inclusion or exclusion date but not both and determining their stays will therefore be tricky. Our objective is to get to a Minimum Viable Product ASAP so we'll be ignoring this issue for no. 

Included in future developements

In [21]:
mask = wiki_changes_melted.symbol.isin(not_in)
wiki_changes_melted[mask].dropna(subset='symbol')

Unnamed: 0,date,symbol,name,reason,action
210,"December 21, 2012",DLPH,Delphi Automotive,TIE acquired by Precision Cast Parts (PCP)[164],add
239,"June 1, 2011",ANR,Alpha Natural Resources,Alpha Natural Resources acquired Massey Energy...,add
250,"August 26, 2010",TYC,Tyco International,Acquired by Schlumberger (SLB)[198],add
261,"November 3, 2009",PCLN,Priceline.com,Acquired by Merck & Co. (MRK)[205],add
277,"September 27, 2007",TSO,Tesoro Corporation,MXIM delisted from NASDAQ exchange.[216],add
279,"August 24, 2007",LUK,Leucadia National,Acquired by National Grid plc[217],add
280,"March 30, 2007",KFT,Kraft Foods,Taken Private[218],add
289,"December 5, 2000",ABK,Ambac Financial,Market Cap changes.,add
290,"July 27, 2000",JDSU,JDS Uniphase,Market Cap change.[225],add
292,"December 7, 1999",YHOO,Yahoo!,Market capitalization change.[227],add


In [19]:
not_in

[nan,
 'DLPH',
 'LEH',
 'JDSU',
 'FNM',
 'ABK',
 'WFR',
 'TYC',
 'BS',
 'SLE',
 'WYN',
 'LUK',
 'ANR',
 'YHOO',
 'BTU',
 'LDW',
 'AA',
 'TSO',
 'CSC',
 'KFT',
 'FRE',
 'DV',
 'EK',
 'PCLN',
 'ACE',
 'DPS',
 'RSH']

In [53]:
companies

Unnamed: 0,symbol,name,sector,subSector,hQ,cik,founded,currentConstituent,cik_sec_list,title
0,S,"SentinelOne, Inc.",,,,1583708.0,,False,1583708.0,"SentinelOne, Inc."
1,AHM,,,,,,,False,,
2,ANDW,,,,,,,False,,
3,FSLR,"FIRST SOLAR, INC.",,,,1274494.0,,False,1274494.0,"FIRST SOLAR, INC."
4,INTU,Intuit,Information Technology,Application Software,"Mountain View, California",896878.0,1983,True,896878.0,INTUIT INC
...,...,...,...,...,...,...,...,...,...,...
1120,MEE,,,,,,,False,,
1121,RJF,Raymond James,Financials,Investment Banking & Brokerage,"St. Petersburg, Florida",720005.0,1962,True,720005.0,RAYMOND JAMES FINANCIAL INC
1122,TMUS,T-Mobile,Communication Services,Wireless Telecommunication Services,"Bellevue, Washington",1283699.0,1994,True,1283699.0,"T-Mobile US, Inc."
1123,SDS,,,,,,,False,,


In [None]:
companies.to_csv('../data/preSQL/companies.csv', index=False)

#### Table for historical constituents

In [34]:
sp500_stays = pd.DataFrame(columns=['symbol', 'added', 'removed'])
# iterate over the list of tickers for every day and
previous_set=set()
for date, list_tickers in spts.set_index('date').tickers_filtered.items():
    # for the first iteration there is no previous set
    new_set = set(list_tickers)
    # check which values from the list of values was not in the previous date
    diff_new = new_set-previous_set
    # for each value in diff_new
    for diff_ticker in diff_new:
        if diff_ticker in new_set:
            # the ticker has been added
            new_row_index = 0 if len(sp500_stays) == 0 else sp500_stays.index.max()+1
            new_row_data={
                'symbol': diff_ticker,
                'added': date,
                'removed': 'not_yet_removed',
                }
            new_row = pd.DataFrame(data = new_row_data, index=[new_row_index])
            sp500_stays = pd.concat([sp500_stays, new_row], axis=0)
    diff_old = previous_set-new_set
    for diff_ticker in diff_old:
        if diff_ticker in previous_set:
            # the ticker has been removed
            # get the index of the last occurence of the ticker in the dataframe
            mask = sp500_stays.symbol == diff_ticker
            idx = sp500_stays[mask].index.max()
            # update that index with the date removed
            sp500_stays.loc[idx, 'removed'] = date
    # this iteration ends, the new set becomes obsolete
    previous_set = new_set.copy()

to .csv

In [45]:
sp500_stays.to_csv('../data/clean/sp500_movements.csv', index=False)

#### Aditional Cleaning Steps

## SEC submissions
Submissions from SEC
- 10-KA/405A and 10QA are text amendments which contain no financial information. <a href="https://www.sec.gov/Archives/edgar/data/320193/0001047469-98-001822.txt">example</a>
- NT 10-Q and NT 10-K are notifications about delay in statements
- 10KT and 10QT dennote transition in companies which alter fiscal years. Usually after merger of acquisitions

In [None]:
submissions=pd.read_csv('.././data/raw/submissions.csv')
sub_cols = [
    'filingDate',
    'reportDate',
    'symbol',
    'cik',
    'form',
    ]
sec = submissions.loc[:,sub_cols]
mask = sec.form.isin(['10-K', '10-Q', '10-K405', '10-KT', '10-QT'])
sec = sec[mask]

## FRED Series
- Inflation
- GDP

### Inflation (CorePCE)

In [162]:
# read inflation csv
inflation = pd.read_csv('../data/raw/fred/corePCE.csv').convert_dtypes()
# get last date
last_date = inflation.date.idxmax()
# get the coefficient of that date
today_inflation = inflation.loc[last_date, 'corePCE']
# calculate multiplier
inflation['inflationMultiplier'] = today_inflation/inflation.corePCE
# export csv
inflation.to_csv('../data/preSQL/inflation.csv', index=False)

## 10-K Statements
- Balance Sheet, Cash Flow and Income Statements

#### Walk the path in a directory and generate the dataframe.
- We're only interested in dates when the symbol belonged to the sp500
- Risk: get the past value of a current ticker instead of getting the values of a former ticker at that time.  

In [105]:
def merge_csv(path):
    df_list = []
    path_dir = Path(path)
    sp500_dates = pd.read_csv('../data/raw/sp500_movements.csv')
    sp500_dates.removed = sp500_dates.removed.replace('not_yet_removed', '2022-12-31')
    sp500_dates.added = pd.to_datetime(sp500_dates.added)
    sp500_dates.removed = pd.to_datetime(sp500_dates.removed)
    for file in path_dir.glob('*.csv'):
        csv_path = os.path.join(file.parent, file.name)
        df = pd.read_csv(csv_path)
        # convert fillingDates to datetime
        df.fillingDate = pd.to_datetime(df.fillingDate)
        # keep only the the combinations of symbol and date which belonged to the sp500
        mask = sp500_dates.symbol == file.name.split('.')[0]
        added = sp500_dates[mask].added
        removed = sp500_dates[mask].removed
        for stay in tuple(zip(added, removed)):
            df_list.append(df[df.fillingDate.between(stay[0], stay[1])])
    return pd.concat(df_list)

In [107]:
def do_merge():
    balance = merge_csv('../data/raw/balance')
    balance.to_csv('../data/raw/balance.csv', index=False)
    print(balance.shape)

    cash_flow = merge_csv('../data/raw/cash_flow')
    cash_flow.to_csv('../data/raw/cash_flow.csv', index=False)
    print(cash_flow.shape)

    income = merge_csv('../data/raw/income')
    income.to_csv('../data/raw/income.csv', index=False)
    print(income.shape)

In [108]:
# do_merge()

(10179, 54)
(10309, 40)
(10328, 38)


### Table for the join of all the historical financial statements in the SP500
- Generate the primary keys
- Clean the primary keys: symbol + year

#### Generate the primary keys of symbol, year

In [16]:
sp500_dates = pd.read_csv('../data/raw/sp500_movements.csv')
sp500_dates.removed = sp500_dates.removed.replace('not_yet_removed', '2022-12-31')
sp500_dates.added = pd.to_datetime(sp500_dates.added).dt.year
sp500_dates.removed = pd.to_datetime(sp500_dates.removed).dt.year

In [17]:
indices = []
for ticker in sp500_dates.symbol.unique():
    mask = sp500_dates.symbol == ticker 
    added = sp500_dates[mask].added
    removed = sp500_dates[mask].removed
    for stay in tuple(zip(added, removed)):
        # convert to years and generate their sequence
        yearlist = list(range(stay[0], stay[1])) # get two years prior for increases
        for y in yearlist:
            indices.append((ticker, y)) 

In [18]:
statements = pd.DataFrame(index = set(indices)).reset_index()
statements.columns = ['symbol', 'calendarYear']

#### Left join with balance, cash flow and income statements

In [19]:
balance = pd.read_csv('../data/raw/balance.csv')
cash_flow = pd.read_csv('../data/raw/cash_flow.csv')
income = pd.read_csv('../data/raw/income.csv')

In [20]:
statements = statements.merge(right = balance, on = ['symbol', 'calendarYear'], how='left')

In [21]:
statements = statements.merge(right = income, on = ['symbol', 'calendarYear'], how='left')
statements = statements.merge(right = cash_flow, on = ['symbol', 'calendarYear'], how='left')

In [22]:
statements.to_csv('../data/raw/statements.csv', index= False)

#### Drop duplicated columns

In [23]:
statements = pd.read_csv('../data/raw/statements.csv')
statements.shape

(12985, 128)

In [24]:
repeated = [
    ['date','date_x','date_y'],
    ['acceptedDate','acceptedDate_x','acceptedDate_y'],
    ['cik','cik_x','cik_y'],
    ['depreciationAndAmortization_x','depreciationAndAmortization_y'],
    ['fillingDate', 'fillingDate_x','fillingDate_y'],
    ['finalLink','finalLink_x','finalLink_y'],
    ['inventory_x','inventory_y'],
    ['link','link_x','link_y'],
    ['netIncome_x','netIncome_y'],
    ['period','period_x','period_y'],
    ['reportedCurrency','reportedCurrency_x','reportedCurrency_y',]
]
for group in repeated:
    base_feature = group[0]
    for dup_feature in group[1:]:
        statements[base_feature] = statements[base_feature].fillna(statements[dup_feature])
        statements = statements.drop(columns=dup_feature)    
statements.shape

(12985, 109)

In [25]:
col_rename = {
    'depreciationAndAmortization_x': 'depreciationAndAmortization',
    'inventory_x': 'inventory',
    'netIncome_x': 'netIncome',
}
statements = statements.rename(columns=col_rename)

#### Convert to billions and adjust for inflation

In [26]:
# convert to billions
for feat in to_billions_features:
    statements[feat] = statements[feat].astype(float)/1e9
# sort values
statements = statements.sort_values(by=['symbol', 'calendarYear'])

Adjust for inflation

In [27]:
statements['date'] = pd.to_datetime(statements['date'])
statements['month'] = statements['date'].dt.month
inflation = pd.read_csv('../data/clean/inflation.csv')
inflation = inflation.rename(columns={'year':'calendarYear'})
statements = statements.merge(
    right = inflation.loc[:,['calendarYear','month','inflationMultiplier']],
    how='left', 
    left_on=['calendarYear', 'month'], 
    right_on = ['calendarYear', 'month'])

  statements['month'] = statements['date'].dt.month


In [28]:
for feat in to_billions_features:
    statements[feat] = statements[feat] * statements.inflationMultiplier

In [29]:
statements.to_csv('../data/clean/statements.csv', index=False)

## Market Capitalization

#### Target variable. Join with the statements dataframe on filingDate + 10 days

In [4]:
statements = pd.read_csv('../data/clean/statements.csv')
filldate_mask = statements.fillingDate.notnull()
symbols = statements[filldate_mask].symbol.unique()
# add 10 days to let the price stabilize
statements['mcapDate'] = pd.to_datetime(statements.fillingDate) + datetime.timedelta(10)
# extract year and week from that date
statements['mcapYear'] = statements.mcapDate.dt.year
statements['mcapWeek'] = statements.mcapDate.dt.isocalendar().week
# createa target column
statements['target'] = np.nan 

In [5]:
for ticker in symbols:
    # open the market caps dataframe
    try:
        df = pd.read_csv('../data/raw/marketCaps/{}.csv'.format(ticker)).sort_values(by='date').reset_index(drop='True')
    except:
        continue
    # convert market cap to billions
    df.marketCap = df.marketCap/1e9
    # exctract year and week
    df.date = pd.to_datetime(df.date)
    df['mcapYear'] = df.date.dt.year
    df['mcapWeek'] = df.date.dt.isocalendar().week
    df['marketCapSMA10'] = df.marketCap.rolling(10).mean()
    df = df.rename(columns={'date':'mcapDate'})
    df.to_csv('../data/raw/marketCaps/{}_clean.csv'.format(ticker), index=False)
    # attempt o left join on statements on mcapDdate 
    statements = statements.merge(
        right=df.loc[:,['symbol','mcapDate','marketCapSMA10']],
        on = ['symbol', 'mcapDate'],
        how='left'
    )
    # update values and drop column
    statements.target = statements.target.fillna(statements.marketCapSMA10)
    statements = statements.drop(columns='marketCapSMA10')
    # join again on mcapYear and mcapYeek in case the date does not exist
    statements = statements.merge(
            right=df.groupby(['symbol','mcapYear', 'mcapWeek']).marketCapSMA10.mean(),
            on = ['symbol', 'mcapYear', 'mcapWeek'],
            how='left'
    )
    # update values and drop column
    statements.target = statements.target.fillna(statements.marketCapSMA10)
    statements = statements.drop(columns='marketCapSMA10')

Adjust inflation

In [34]:
statements.target = statements.inflationMultiplier * statements.target

In [35]:
statements.to_csv('../data/clean/statements_mcap.csv', index= False)

#### Aditional Cleaning

# Null Cleaning

In [57]:
data = pd.read_csv('../data/clean/statements_mcap.csv')
data.shape

(12985, 115)

#### Drop all rows with null target

In [58]:
cols = ['target']
data = data.dropna(subset=cols)
(data.isna().sum() / len(data)).sort_values(ascending=False).head(5)

link                                     0.197290
finalLink                                0.197290
cik                                      0.032517
totalLiabilitiesAndStockholdersEquity    0.010943
retainedEarnings                         0.010943
dtype: float64

In [59]:
data.shape

(9595, 115)

#### Duplicates clean

In [60]:
data = data.drop_duplicates(subset=['symbol', 'calendarYear'], keep='first')

In [61]:
mask =data.duplicated(subset=['symbol', 'calendarYear'], keep=False)
data[mask].sort_values(by='date').loc[:, ['symbol', 'calendarYear', 'date', 'fillingDate']]

Unnamed: 0,symbol,calendarYear,date,fillingDate


# Incorrect observations clean

In [62]:
def drop_rows(data, symbol, dates_inclusive):
    drop_mask = (data.symbol == symbol) & data.calendarYear.between(dates_inclusive[0], dates_inclusive[1])
    idx_drop = data[drop_mask].index
    return data.drop(index=idx_drop)

In [63]:
mask = data.symbol == 'NBR'
data.loc[mask, 'target'] = data.loc[mask, 'target']/100

In [64]:
data = drop_rows(data, 'MBI', [2002,2008])
data = drop_rows(data, 'SLM', [2004,2013])
data = drop_rows(data, 'STI', [2002,2009])
data = drop_rows(data, 'CHK', [2006,2017])
data = drop_rows(data, 'SIG', [2016,2016])
data = drop_rows(data, 'SYMC', [2010,2010])
data = drop_rows(data, 'THC', [2002,2002])
data = drop_rows(data, 'CBS', [2005,2012])
data = drop_rows(data, 'SUN', [2010,2011])
data = drop_rows(data, 'TT', [2002,2007])
data = drop_rows(data, 'KMG', [1999,2005])
data = drop_rows(data, 'MO', [1996,2007])
data = drop_rows(data, 'GILD', [2013,2014])

In [65]:
data.shape

(9492, 115)

In [66]:
data.to_csv('../data/processed/data.csv', index=False)

## Economic Situation Table

In [48]:
path_dir = Path('../data/raw/fred')
for i, file in enumerate(path_dir.glob('*.csv')):
    csv_path = os.path.join(file.parent, file.name)
    df = pd.read_csv(csv_path)
    if i == 0:
        fred = df.copy()
    else:
        fred = fred.merge(df, on=['date', 'year', 'month', 'day'], how = 'outer')

In [49]:
fred.date = pd.to_datetime(fred.date)
fred = fred.set_index('date').sort_index()

In [54]:
fred.index.max()

Timestamp('2022-04-19 00:00:00')

In [56]:
idx = pd.date_range('01-01-1991', '2022-04-19')
fred = fred.reindex(idx)

In [61]:
fred = fred.interpolate()
fred.to_csv('../data/clean/fred.csv')

## join with market caps

In [74]:
data = pd.read_csv('../data/processed/data.csv')

In [75]:
data.fillingDate = pd.to_datetime(data.fillingDate)
data = data.merge(
    fred.drop(columns = ['year', 'month', 'day']),
    how = 'left',
    left_on='fillingDate',
    right_index=True
)

In [76]:
data.to_csv('../data/processed/data.csv', index=False)

In [79]:
data.columns.tolist()

['symbol',
 'calendarYear',
 'cashAndCashEquivalents',
 'shortTermInvestments',
 'cashAndShortTermInvestments',
 'netReceivables',
 'inventory',
 'otherCurrentAssets',
 'totalCurrentAssets',
 'propertyPlantEquipmentNet',
 'goodwill',
 'intangibleAssets',
 'goodwillAndIntangibleAssets',
 'longTermInvestments',
 'taxAssets',
 'otherNonCurrentAssets',
 'totalNonCurrentAssets',
 'otherAssets',
 'totalAssets',
 'accountPayables',
 'shortTermDebt',
 'taxPayables',
 'deferredRevenue',
 'otherCurrentLiabilities',
 'totalCurrentLiabilities',
 'longTermDebt',
 'deferredRevenueNonCurrent',
 'deferredTaxLiabilitiesNonCurrent',
 'otherNonCurrentLiabilities',
 'totalNonCurrentLiabilities',
 'otherLiabilities',
 'capitalLeaseObligations',
 'totalLiabilities',
 'preferredStock',
 'commonStock',
 'retainedEarnings',
 'accumulatedOtherComprehensiveIncomeLoss',
 'othertotalStockholdersEquity',
 'totalStockholdersEquity',
 'totalLiabilitiesAndStockholdersEquity',
 'minorityInterest',
 'totalEquity',
 'tot