In [18]:
import pandas as pd
import numpy as np
import datetime
from pathlib import Path
import os
from vars import currency_features

In [3]:
# Widen pandas' display limits so wide/long frames are shown in full in cell outputs
pd.options.display.max_columns = 500
pd.options.display.max_rows = 200 

# Create data tables

## Companies

- A table for all unique companies
    - cik, symbol, name, sector, subsector, founded, etc
- A table for all the periods a company has been on the index.
    - cik, symbol, start date, end date, flag_current

#### Table for unique companies

In [3]:
# Current constituents (companies_wiki.csv); the 'SEC filings' column is dropped
# and the remaining eight columns are renamed positionally
current_companies = pd.read_csv('../data/raw/companies_wiki.csv').drop(columns='SEC filings')
current_companies.columns = ['symbol', 'name', 'sector', 'subSector', 'hQ', 'dateFirstAdded', 'cik', 'founded']
# Historical companies from the wiki export (not referenced again in the visible cells)
historical_v1 = pd.read_csv('../data/raw/historical_companies_wiki.csv')
# Historical constituents from the TradingEvolved export: one row per date
# with a comma-separated string of tickers
spts = pd.read_csv('../data/raw/historical_companies_TradingEvolved.csv') 

In [4]:
# Split every comma-separated ticker string into a list of tickers.
# No further filtering is applied here (tickers are kept exactly as they appear).
spts['tickers_filtered'] = spts.tickers.str.split(',')
spts.head()

Unnamed: 0,date,tickers,tickers_filtered
0,1996-01-02,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...","[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A..."
1,1996-01-03,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...","[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A..."
2,1996-01-04,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...","[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A..."
3,1996-01-10,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...","[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A..."
4,1996-01-11,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...","[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A..."


Get all unique constituents

In [5]:
# Union of every ticker that appears on any date in the constituent history.
results = set().union(*spts.tickers_filtered)
# Sort before building the frame: set iteration order for strings differs between
# interpreter runs, which would otherwise shuffle the rows of the exported CSV.
companies = pd.DataFrame({'symbol': sorted(results)})
companies.shape

(1125, 1)

Add values from current companies

In [24]:
# Flag every row of the current-constituent table so the flag survives the join
current_companies['currentConstituent'] = True
# Left join keeps all historical symbols; the date a company was first added is
# not part of the companies table
companies = (
    companies
    .merge(current_companies, how='left')
    .drop(columns=['dateFirstAdded'])
)
# Symbols without a match in the current list got no flag from the join
companies['currentConstituent'] = companies['currentConstituent'].fillna(False)

In [25]:
# CIK lookup table; the first CSV column is used as the index and the three
# remaining columns are renamed so the table can be joined on 'symbol'
ciks = pd.read_csv('../data/raw/CIK.csv', index_col = 0)
ciks.columns = ['cik_sec_list', 'symbol', 'title']

In [26]:
# Bring in the lookup table's title and CIK for every symbol
companies = companies.merge(ciks, how='left')
# Values already present take precedence; the lookup table only fills the gaps
companies['name'] = companies['name'].fillna(companies['title'])
companies['cik'] = companies['cik'].fillna(companies['cik_sec_list'])

In [None]:
# Persist the companies table (row index is not written)
companies.to_csv('../data/preSQL/companies.csv', index=False)

#### Table for historical constituents

In [34]:
# Build one row per continuous stay of a ticker on the index:
#   symbol  - the ticker
#   added   - first date on which the ticker appears after being absent
#   removed - first date on which it is absent again, or 'not_yet_removed'
# Every ticker present on the first date gets that date as 'added', because
# there is no earlier snapshot to compare against.
stay_records = []   # one dict per stay, in order of appearance
open_stay = {}      # symbol -> position in stay_records of its most recent stay
previous_set = set()
for date, list_tickers in spts.set_index('date').tickers_filtered.items():
    new_set = set(list_tickers)
    # tickers present now but not on the previous date: a new stay starts
    # (sorted so the row order is reproducible between runs)
    for diff_ticker in sorted(new_set - previous_set):
        open_stay[diff_ticker] = len(stay_records)
        stay_records.append({
            'symbol': diff_ticker,
            'added': date,
            'removed': 'not_yet_removed',
        })
    # tickers present on the previous date but not now: close their latest stay
    for diff_ticker in previous_set - new_set:
        stay_records[open_stay[diff_ticker]]['removed'] = date
    previous_set = new_set
# Build the frame once instead of concatenating one row at a time
sp500_stays = pd.DataFrame(stay_records, columns=['symbol', 'added', 'removed'])

Export to CSV

In [45]:
# NOTE(review): this writes to '../data/clean/', while merge_csv and the
# primary-key cell further down read '../data/raw/sp500_movements.csv'.
# Confirm which location is intended.
sp500_stays.to_csv('../data/clean/sp500_movements.csv', index=False)

#### Additional Cleaning Steps

## SEC submissions
Submissions from SEC
- 10-KA/405A and 10QA are text amendments which contain no financial information. <a href="https://www.sec.gov/Archives/edgar/data/320193/0001047469-98-001822.txt">example</a>
- NT 10-Q and NT 10-K are notifications of a delay in filing the statements
- 10KT and 10QT denote transition periods for companies that change their fiscal year, usually after mergers or acquisitions

In [None]:
submissions = pd.read_csv('.././data/raw/submissions.csv')
# Columns kept for the filings table
sub_cols = ['filingDate', 'reportDate', 'symbol', 'cik', 'form']
# Keep only the listed annual/quarterly form types; amendments and
# late-filing notices are not in the list
mask = submissions['form'].isin(['10-K', '10-Q', '10-K405', '10-KT', '10-QT'])
sec = submissions.loc[mask, sub_cols]

## FRED Series
- Inflation
- GDP

### Inflation (CorePCE)

In [162]:
# Load the corePCE series
inflation = pd.read_csv('../data/raw/fred/corePCE.csv').convert_dtypes()
# Row label of the greatest 'date' value (despite its name, last_date holds an
# index label, not a date).
# NOTE(review): presumably 'date' is an ISO-formatted string, so the greatest
# value is the most recent observation. Confirm.
last_date = inflation['date'].idxmax()
# Index level at that row
today_inflation = inflation.loc[last_date, 'corePCE']
# Factor that rescales each period's amounts to the price level of that row
inflation = inflation.assign(inflationMultiplier=today_inflation / inflation['corePCE'])
# Export
inflation.to_csv('../data/preSQL/inflation.csv', index=False)

## 10-K Statements
- Balance Sheet, Cash Flow and Income Statements

#### Walk the path in a directory and generate the dataframe.
- We're only interested in dates when the symbol belonged to the sp500
- Risk: get the past value of a current ticker instead of getting the values of a former ticker at that time.  

In [105]:
def merge_csv(path, movements_path='../data/raw/sp500_movements.csv', open_end_date='2022-12-31'):
    """Concatenate per-symbol statement CSVs, keeping only index-membership periods.

    Every ``*.csv`` file directly inside ``path`` is read; the file name without
    its ``.csv`` extension is taken as the symbol. Rows are kept when their
    ``fillingDate`` falls (inclusively) between the ``added`` and ``removed``
    dates of one of that symbol's stays in the movements table.

    Parameters
    ----------
    path : str or Path
        Directory holding one statement CSV per symbol; each file must contain
        a ``fillingDate`` column.
    movements_path : str or Path, optional
        CSV with ``symbol``, ``added`` and ``removed`` columns.
    open_end_date : str, optional
        Date substituted for the ``'not_yet_removed'`` placeholder in ``removed``.

    Returns
    -------
    pandas.DataFrame
        The matching rows of all files; an empty frame when nothing matches.
    """
    sp500_dates = pd.read_csv(movements_path)
    sp500_dates['removed'] = sp500_dates['removed'].replace('not_yet_removed', open_end_date)
    sp500_dates['added'] = pd.to_datetime(sp500_dates['added'])
    sp500_dates['removed'] = pd.to_datetime(sp500_dates['removed'])

    df_list = []
    # sorted() makes the row order of the result independent of directory listing order
    for file in sorted(Path(path).glob('*.csv')):
        df = pd.read_csv(file)
        df['fillingDate'] = pd.to_datetime(df['fillingDate'])
        # file.stem strips only the final '.csv', so tickers that themselves
        # contain a dot (e.g. 'BRK.B') are not truncated
        stays = sp500_dates[sp500_dates['symbol'] == file.stem]
        for added, removed in zip(stays['added'], stays['removed']):
            df_list.append(df[df['fillingDate'].between(added, removed)])
    if not df_list:
        # pd.concat raises on an empty list; return an empty frame instead
        return pd.DataFrame()
    return pd.concat(df_list)

In [107]:
def do_merge():
    """Merge each statement directory into a single CSV and print its shape.

    Processes balance, cash flow and income statements, in that order, using
    merge_csv for the filtering and concatenation.
    """
    jobs = (
        ('../data/raw/balance', '../data/raw/balance.csv'),
        ('../data/raw/cash_flow', '../data/raw/cash_flow.csv'),
        ('../data/raw/income', '../data/raw/income.csv'),
    )
    for source_dir, target_csv in jobs:
        merged = merge_csv(source_dir)
        merged.to_csv(target_csv, index=False)
        print(merged.shape)

In [108]:
# do_merge()

(10179, 54)
(10309, 40)
(10328, 38)


### Table for the join of all the historical financial statements in the SP500
- Generate the primary keys
- Clean the primary keys: symbol + year

#### Generate the primary keys of symbol, year

In [32]:
sp500_dates = pd.read_csv('../data/raw/sp500_movements.csv')
# Stays that are still open are given a fixed end date
sp500_dates['removed'] = sp500_dates['removed'].replace('not_yet_removed', '2022-12-31')
# Only the year of each boundary is needed for the (symbol, year) keys
for date_col in ('added', 'removed'):
    sp500_dates[date_col] = pd.to_datetime(sp500_dates[date_col]).dt.year

In [33]:
# One (symbol, year) tuple for every year of every stay.
# range() stops before its end value, so the year of removal itself is not included.
indices = []
for ticker in sp500_dates['symbol'].unique():
    stays = sp500_dates[sp500_dates['symbol'] == ticker]
    for first_year, removal_year in zip(stays['added'], stays['removed']):
        indices.extend((ticker, year) for year in range(first_year, removal_year))

In [34]:
# De-duplicate the (symbol, year) keys and turn them into the two key columns.
# Sorting makes the row order reproducible (set iteration order is not), and
# building the columns directly also yields a valid empty frame when there are no keys.
statements = pd.DataFrame(sorted(set(indices)), columns=['symbol', 'calendarYear'])

#### Left join with balance, cash flow and income statements

In [35]:
# Load the three merged statement files (the outputs written by do_merge)
balance = pd.read_csv('../data/raw/balance.csv')
cash_flow = pd.read_csv('../data/raw/cash_flow.csv')
income = pd.read_csv('../data/raw/income.csv')

In [36]:
# Left join: every (symbol, calendarYear) key is kept even without a balance sheet
statements = statements.merge(right = balance, on = ['symbol', 'calendarYear'], how='left')

In [37]:
# Same left join for the income and cash-flow statements. Column names shared
# between the frames receive pandas' default '_x' / '_y' suffixes.
statements = (
    statements
    .merge(right=income, on=['symbol', 'calendarYear'], how='left')
    .merge(right=cash_flow, on=['symbol', 'calendarYear'], how='left')
)

In [38]:
# Checkpoint the joined statements before cleaning
statements.to_csv('../data/raw/statements.csv', index= False)

#### Drop duplicated columns

In [88]:
# Reload the checkpoint and show its dimensions
statements = pd.read_csv('../data/raw/statements.csv')
statements.shape

(12985, 128)

In [89]:
# Each group lists columns that hold the same information after the joins:
# the first name is kept, the others only fill its missing values and are then removed.
repeated = [
    ['date','date_x','date_y'],
    ['acceptedDate','acceptedDate_x','acceptedDate_y'],
    ['cik','cik_x','cik_y'],
    ['depreciationAndAmortization_x','depreciationAndAmortization_y'],
    ['fillingDate', 'fillingDate_x','fillingDate_y'],
    ['finalLink','finalLink_x','finalLink_y'],
    ['inventory_x','inventory_y'],
    ['link','link_x','link_y'],
    ['netIncome_x','netIncome_y'],
    ['period','period_x','period_y'],
    ['reportedCurrency','reportedCurrency_x','reportedCurrency_y',]
]
redundant = []
for base_feature, *dup_features in repeated:
    for dup_feature in dup_features:
        statements[base_feature] = statements[base_feature].fillna(statements[dup_feature])
    redundant.extend(dup_features)
# Remove all duplicates in one go once every base column has been filled
statements = statements.drop(columns=redundant)
statements.shape

(12985, 109)

In [90]:
# The kept columns of these three groups still carry the '_x' merge suffix; strip it
col_rename = {
    'depreciationAndAmortization_x': 'depreciationAndAmortization',
    'inventory_x': 'inventory',
    'netIncome_x': 'netIncome',
}
statements = statements.rename(columns=col_rename)

#### Convert to billions and adjust for inflation

In [91]:
# Extra columns that are divided by 1e9 together with currency_features,
# but are not adjusted for inflation afterwards.
# NOTE(review): 'eps' and 'epsdiluted' look like per-share figures; dividing
# them by 1e9 gives values around 1e-9 in the output. Confirm this is intended.
also_convert = [
    'weightedAverageShsOut',
    'weightedAverageShsOutDil',
    'preferredStock',
    'commonStock',
    'eps',
    'epsdiluted'
]

In [92]:
# Express every listed column in billions (one column at a time, in list order)
for feat in [*currency_features, *also_convert]:
    statements[feat] = statements[feat].astype(float).div(1e9)
# Order the rows by company and year
statements = statements.sort_values(by=['symbol', 'calendarYear'])

Adjust for inflation

In [93]:
# Month of the statement date, used together with calendarYear as the join key
statements['date'] = pd.to_datetime(statements['date'])
statements['month'] = statements['date'].dt.month
# The inflation file is expected to provide 'year', 'month' and 'inflationMultiplier'
inflation = (
    pd.read_csv('../data/clean/inflation.csv')
    .rename(columns={'year': 'calendarYear'})
)
join_keys = ['calendarYear', 'month']
statements = statements.merge(
    right=inflation.loc[:, join_keys + ['inflationMultiplier']],
    how='left',
    left_on=join_keys,
    right_on=join_keys)

In [94]:
# Rescale every currency column with the multiplier joined onto the same row
for feat in currency_features:
    statements[feat] = statements[feat].mul(statements['inflationMultiplier'])

In [95]:
# Persist the converted, inflation-adjusted statements
statements.to_csv('../data/clean/statements.csv', index=False)

## Market Capitalization

#### Target variable. Join with the statements dataframe on filingDate + 10 days

In [96]:
statements = pd.read_csv('../data/clean/statements.csv')
# Symbols that have at least one row with a filing date
filldate_mask = statements['fillingDate'].notnull()
symbols = statements.loc[filldate_mask, 'symbol'].unique()
# Market cap is looked up 10 days after the filing to let the price stabilize
statements['mcapDate'] = pd.to_datetime(statements['fillingDate']) + pd.Timedelta(days=10)
# Calendar year and ISO week number of that date
statements['mcapYear'] = statements['mcapDate'].dt.year
statements['mcapWeek'] = statements['mcapDate'].dt.isocalendar().week
# Target column, filled in by the market-cap join
statements['target'] = np.nan

In [97]:
# Fill statements.target with the 10-row rolling mean of the market cap
# (in billions) at mcapDate, falling back to the mean of the same ISO week
# when the exact date has no market-cap row.
cap_frames = []
for ticker in symbols:
    try:
        caps = pd.read_csv('../data/raw/marketCaps/{}.csv'.format(ticker))
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # no usable market-cap file for this ticker: its target stays empty
        continue
    caps = caps.sort_values(by='date').reset_index(drop=True)
    caps['mcapDate'] = pd.to_datetime(caps['date'])
    # convert to billions, then smooth over the 10 most recent rows
    caps['marketCapSMA10'] = (caps['marketCap'] / 1e9).rolling(10).mean()
    cap_frames.append(caps.loc[:, ['symbol', 'mcapDate', 'marketCapSMA10']])

if cap_frames:
    # Stack all tickers and join once instead of re-merging the whole
    # statements frame for every ticker
    market_caps = pd.concat(cap_frames, ignore_index=True)

    # 1) exact match on symbol + mcapDate
    statements = statements.merge(market_caps, on=['symbol', 'mcapDate'], how='left')
    statements['target'] = statements['target'].fillna(statements['marketCapSMA10'])
    statements = statements.drop(columns='marketCapSMA10')

    # 2) fallback: weekly mean. The week is keyed by ISO year + ISO week;
    #    pairing the calendar year with the ISO week would put the first days
    #    of January and the last days of December of one year in the same group.
    caps_iso = market_caps['mcapDate'].dt.isocalendar()
    weekly_caps = (
        market_caps
        .assign(isoYear=caps_iso['year'], isoWeek=caps_iso['week'])
        .groupby(['symbol', 'isoYear', 'isoWeek'], as_index=False)['marketCapSMA10']
        .mean()
    )
    stmt_iso = statements['mcapDate'].dt.isocalendar()
    statements = (
        statements
        .assign(isoYear=stmt_iso['year'], isoWeek=stmt_iso['week'])
        .merge(weekly_caps, on=['symbol', 'isoYear', 'isoWeek'], how='left')
    )
    statements['target'] = statements['target'].fillna(statements['marketCapSMA10'])
    # the ISO keys are helper columns only; mcapYear / mcapWeek are left untouched
    statements = statements.drop(columns=['marketCapSMA10', 'isoYear', 'isoWeek'])

Adjust inflation

In [98]:
# Rescale the target with the row's inflationMultiplier. That multiplier was
# joined on the statement's calendarYear and month, not on mcapDate.
statements.target = statements.inflationMultiplier * statements.target

In [99]:
# Inspect 10 random rows (no random_state is set, so the selection changes on every run)
statements.sample(10)

Unnamed: 0,symbol,calendarYear,cashAndCashEquivalents,shortTermInvestments,cashAndShortTermInvestments,netReceivables,inventory,otherCurrentAssets,totalCurrentAssets,propertyPlantEquipmentNet,goodwill,intangibleAssets,goodwillAndIntangibleAssets,longTermInvestments,taxAssets,otherNonCurrentAssets,totalNonCurrentAssets,otherAssets,totalAssets,accountPayables,shortTermDebt,taxPayables,deferredRevenue,otherCurrentLiabilities,totalCurrentLiabilities,longTermDebt,deferredRevenueNonCurrent,deferredTaxLiabilitiesNonCurrent,otherNonCurrentLiabilities,totalNonCurrentLiabilities,otherLiabilities,capitalLeaseObligations,totalLiabilities,preferredStock,commonStock,retainedEarnings,accumulatedOtherComprehensiveIncomeLoss,othertotalStockholdersEquity,totalStockholdersEquity,totalLiabilitiesAndStockholdersEquity,minorityInterest,totalEquity,totalLiabilitiesAndTotalEquity,totalInvestments,totalDebt,netDebt,revenue,costOfRevenue,grossProfit,grossProfitRatio,researchAndDevelopmentExpenses,generalAndAdministrativeExpenses,sellingAndMarketingExpenses,sellingGeneralAndAdministrativeExpenses,otherExpenses,operatingExpenses,costAndExpenses,interestIncome,interestExpense,depreciationAndAmortization,ebitda,ebitdaratio,operatingIncome,operatingIncomeRatio,totalOtherIncomeExpensesNet,incomeBeforeTax,incomeBeforeTaxRatio,incomeTaxExpense,netIncome,netIncomeRatio,eps,epsdiluted,weightedAverageShsOut,weightedAverageShsOutDil,date,reportedCurrency,cik,fillingDate,acceptedDate,period,deferredIncomeTax,stockBasedCompensation,changeInWorkingCapital,accountsReceivables,accountsPayables,otherWorkingCapital,otherNonCashItems,netCashProvidedByOperatingActivities,investmentsInPropertyPlantAndEquipment,acquisitionsNet,purchasesOfInvestments,salesMaturitiesOfInvestments,otherInvestingActivites,netCashUsedForInvestingActivites,debtRepayment,commonStockIssued,commonStockRepurchased,dividendsPaid,otherFinancingActivites,netCashUsedProvidedByFinancingActivities,effectOfForexChangesOnCash,netChangeInCash,cashAt
EndOfPeriod,cashAtBeginningOfPeriod,operatingCashFlow,capitalExpenditure,freeCashFlow,link,finalLink,month,inflationMultiplier,mcapDate,mcapYear,mcapWeek,target
12454,WFC,2018,188.455818,5.502931,193.958749,65.318488,0.0,1149.119699,1408.396936,19.527793,28.730521,18.093333,46.823854,587.092068,0.0,0.0,653.443716,2715.284368,2061.840652,0.0,14.526216,0.0,0.0,0.0,14.526216,249.093552,0.0,0.0,0.0,249.093552,2082.091656,0.039151,1847.52432,23.214,9.136,172.007926,-6.890627,13.038467,213.337549,2060.86187,0.978782,214.316332,2061.840652,592.595,263.619768,75.16395,93.971794,0.0,0.0,0.0,0.0,37.121947,0.0,37.121947,0.0,0.0,0.0,70.305927,15.934575,6.082588,52.52798,0.558976,0.0,0.0,0.0,31.036097,3.591808e-10,6.157628,24.353189,0.259154,4.31e-09,4.28e-09,4.7997,4.8384,2018-12-31,USD,72971.0,2019-02-27,2019-02-27 15:23:51,FY,0.0,2.452393,48.413832,0.0,0.0,39.767922,-42.071323,39.230679,0.0,-0.010875,-72.321131,66.58003,-2.680776,-8.432753,-44.11589,0.687323,-24.777328,-10.129309,1.143,-77.192204,0.0,-46.394278,188.455818,234.850095,39.230679,0.0,39.230679,https://www.sec.gov/Archives/edgar/data/72971/...,https://www.sec.gov/Archives/edgar/data/72971/...,12.0,1.087536,2019-03-09,2019.0,10.0,248.050791
4753,FL,1996,,,,,0.460799,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,13.204229,9.207959,3.99627,0.302651,0.0,0.0,0.0,0.0,3.861402,3.861402,13.069361,0.0,0.0,0.383732,0.009633,0.00073,0.134868,0.010214,-0.508966,-0.374098,-4.548862e-11,-0.110785,-0.263314,-0.019942,-1.23e-09,-1.23e-09,0.133333,0.133333,1996-01-31,USD,850209.0,1996-01-31,1996-01-31,FY,-0.160557,0.0,1.552589,0.0,0.0,1.552589,-0.677552,0.834898,-0.268131,0.0,-0.102757,0.0,0.213541,-0.157346,0.0,0.011239,0.0,-0.032111,-0.769069,-0.789942,0.017661,-0.789942,0.0,0.115601,0.834898,-0.268131,0.566767,,,1.0,1.605573,1996-02-10,1996.0,6.0,2.470194
12883,XRX,2016,2.50977,0.0,2.50977,2.604606,0.94949,1.830111,7.893978,1.281417,4.275529,0.32741,4.602939,1.567054,0.0,5.140344,12.591754,0.0,20.485732,1.271256,1.141421,0.0,0.211123,2.630573,5.254373,5.989353,0.0,0.0,3.534904,9.524257,0.0,0.0,14.778629,0.214,1.014,5.689038,-4.908898,3.497647,5.6642,20.44283,0.042902,5.707103,20.485732,0.0,7.130773,4.621003,12.160475,7.3498,4.810675,0.395599,0.537405,3.042659,0.0,3.042659,0.065482,3.645546,10.995345,0.005645,0.204349,0.635628,0.371441,0.030545,1.16513,0.095813,-0.523857,0.641273,5.953698e-11,0.069998,-0.538534,-0.044286,2.32e-09,2.32e-09,0.253391,0.255995,2016-12-31,USD,1770450.0,2016-12-31,2016-12-31,FY,-0.010161,0.05645,1.024004,-0.028225,0.0,2.639605,0.068869,1.236257,-0.104997,-0.03387,0.0,0.0,-0.309346,-0.448214,-1.115453,0.010161,-0.001129,-0.373699,2.139458,0.659337,-0.03387,1.123356,2.50977,1.386414,1.236257,-0.155802,1.080454,,,12.0,1.129001,2017-01-10,2017.0,2.0,7.478455
12302,WAMUQ,1997,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,,,
3112,CSX,2003,0.519336,0.519336,1.038672,1.641271,0.239911,-0.234266,2.685587,19.376307,0.0,0.0,0.0,6.601774,5.294967,-3.250083,28.022966,0.0,30.708553,1.167094,0.60401,0.173582,0.191928,1.155804,3.118837,9.717789,0.0,0.0,8.765203,18.482993,0.0,0.0,21.60183,0.0,0.215,6.99551,-0.420549,2.228346,9.106723,30.708553,0.0,9.106723,30.708553,7.12111,10.3218,9.802464,10.997783,4.050255,6.947528,0.631721,0.0,4.665555,0.0,1.40136,4.662733,6.064093,10.114347,0.0,-0.589898,0.907426,0.771948,0.070191,0.883435,0.080328,-0.509457,0.373978,4.7989e-11,0.107254,0.347165,0.031567,6.362626e-11,6.362626e-11,3.866328,3.866328,2003-12-26,USD,277948.0,2004-03-10,2004-03-10 16:05:16,FY,0.167937,0.0,0.505223,-0.509457,0.069151,-0.43325,-0.793116,1.134636,-1.796507,0.0,0.0,0.095964,0.561673,-1.13887,-0.705619,0.0,0.0,-0.121367,1.069719,0.242733,0.0,0.340109,0.519336,0.179227,1.134636,-1.796507,-0.661871,https://www.sec.gov/Archives/edgar/data/277948...,https://www.sec.gov/Archives/edgar/data/277948...,12.0,1.411239,2004-03-20,2004.0,12.0,9.678588
10705,SNV,2000,0.834423,0.0,0.834423,0.0,0.0,0.0,0.0,0.782634,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.140137,22.140137,0.0,0.0,0.0,0.0,19.624226,19.624226,1.248767,0.0,0.0,-0.837392,0.411375,0.0,0.0,19.915354,0.0,0.284818,1.515451,0.0,0.166216,2.104653,22.020006,0.12013,2.224783,22.140137,0.0,1.248767,0.414344,2.072982,0.0,0.0,0.0,0.0,0.735837,0.0,0.735837,0.0,0.0,0.0,1.63036,0.795236,0.119554,1.526261,0.736264,0.0,0.0,0.0,0.611471,4.380653e-10,0.221546,0.389926,0.188099,6.510007e-09,6.440006e-09,0.040507,0.040983,2000-12-31,USD,18349.0,2001-03-16,2001-03-16 00:00:00,FY,-0.006622,0.0,-0.045336,0.0,0.0,0.0,0.2172,0.674722,0.0,0.0,-0.859841,0.399824,-3.143004,-3.603021,-0.005611,0.0,0.0,-0.176746,3.246559,3.064202,0.0,0.135904,0.828771,0.692867,0.674722,-0.253299,0.421423,https://www.sec.gov/Archives/edgar/data/18349/...,https://www.sec.gov/Archives/edgar/data/18349/...,12.0,1.485109,2001-03-26,2001.0,13.0,4.98188
383,AEE,2017,0.011099,0.0,0.011099,0.930098,0.579369,0.268596,1.789163,23.82517,0.45617,0.0,0.45617,0.781371,0.0,1.944549,27.007261,0.0,28.796424,1.001132,1.470621,0.057715,0.119869,0.671491,3.263114,7.873649,0.0,2.835801,6.692713,17.402163,0.0,0.0,20.665277,0.0,0.002,1.842438,-0.019978,6.148861,7.973541,28.638818,0.157606,8.131147,28.796424,0.0,9.34427,9.333171,6.855869,3.713734,3.142134,0.458313,0.0,0.0,0.0,0.0,1.523896,1.523896,5.237631,0.037737,0.433972,1.056627,2.710382,0.395338,1.618238,0.236037,-0.391796,1.226442,1.985498e-10,0.639304,0.580479,0.084669,2.16e-09,2.14e-09,0.2426,0.2442,2017-12-31,USD,1002910.0,2018-02-28,2018-02-28 16:42:56,FY,0.598238,0.018868,-0.274146,0.0,0.035517,-1.473951,0.355169,2.335235,-0.069924,0.0,-0.45839,0.439521,-2.358543,-2.447335,-0.755844,0.0,0.0,-0.478368,1.347422,0.11321,0.0,0.00111,0.011099,0.009989,2.335235,-2.436236,-0.101001,https://www.sec.gov/Archives/edgar/data/100291...,https://www.sec.gov/Archives/edgar/data/100291...,12.0,1.109903,2018-03-10,2018.0,10.0,14.728048
6644,KBH,1998,0.097466,0.0,0.097466,0.772967,1.743941,0.0,2.614374,0.0,0.0,0.069948,0.069948,0.0,0.0,0.175409,0.245357,0.0,2.859731,0.338673,0.0,0.0,0.0,0.228293,0.566965,1.18251,0.0,0.0,0.07579,1.2583,0.0,0.0,1.825265,0.0,0.0,0.374185,0.0,0.660281,1.034465,2.859731,0.0,1.034465,2.859731,0.0,1.18251,1.085043,3.69419,2.969496,0.724695,0.196171,0.0,0.483488,0.0,0.483488,0.027826,0.511314,3.48081,0.0,0.0,0.027826,0.253197,0.068539,0.213381,0.057761,0.011991,0.225372,9.378766e-11,0.078865,0.146507,0.039659,1.205e-09,1.16e-09,0.079087,0.082155,1998-11-30,USD,795266.0,1999-02-26,1999-02-26 00:00:00,FY,0.000769,0.0,0.460275,0.0,0.0,2.047408,-0.655054,-0.019678,-0.27472,0.0,0.0,0.019831,0.006149,-0.248739,0.0,0.0,0.0,-0.018294,0.279332,0.261038,0.0,0.0,0.0,0.104846,-0.019678,-0.27472,-0.294398,https://www.sec.gov/Archives/edgar/data/795266...,https://www.sec.gov/Archives/edgar/data/795266...,11.0,1.537324,1999-03-08,1999.0,10.0,1.621095
5183,GLW,2009,3.193391,1.309529,4.50292,0.94633,0.727656,0.761588,6.938493,10.047683,0.639684,0.209877,0.84956,5.016929,3.747616,0.16212,19.823908,0.0,26.762401,0.69121,0.092999,0.0,0.0,1.149922,1.934132,2.425519,0.0,0.0,2.8038,5.229319,0.0,0.0,7.163451,0.0,0.808,4.569528,-0.503955,14.452577,19.5336,26.697051,0.065351,19.59895,26.762401,6.326458,2.518519,-0.674872,6.780144,4.149775,2.630369,0.387952,0.707548,1.107193,0.0,1.107193,0.012567,1.827308,5.977083,0.023878,0.103053,0.995343,3.528942,0.520482,0.803061,0.118443,1.627486,2.430546,4.505183e-10,-0.092999,2.523546,0.372196,1.3e-09,1.28e-09,1.544615,1.56875,2009-12-31,USD,24741.0,2010-02-10,2010-02-10 16:04:52,FY,-0.273971,0.159607,1.778295,-0.252606,0.0,5.004362,-2.572559,2.610261,0.0,-0.515266,-1.724255,1.609891,-1.092112,-1.721742,-0.120648,0.025135,0.0,-0.392105,0.468766,-0.018851,-0.030162,0.839506,3.193391,2.353885,2.610261,-1.118504,1.491757,https://www.sec.gov/Archives/edgar/data/24741/...,https://www.sec.gov/Archives/edgar/data/24741/...,12.0,1.256746,2010-02-20,2010.0,7.0,35.68327
4318,ESV,2007,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,,,


In [100]:
# Persist the statements together with the market-cap target
statements.to_csv('../data/clean/statements_mcap.csv', index= False)

#### Additional Cleaning

# Null Cleaning

In [101]:
# Reload the statements with target and drop the link and identifier columns
data = pd.read_csv('../data/clean/statements_mcap.csv')
data = data.drop(columns = ['link', 'finalLink', 'cik'])
data.shape

(12985, 112)

In [102]:
# Share of missing values per column; show the five columns with the most nulls
(data.isna().sum() / len(data)).sort_values(ascending=False).head(5)

target                              0.261070
commonStock                         0.225722
longTermDebt                        0.225722
deferredRevenueNonCurrent           0.225722
deferredTaxLiabilitiesNonCurrent    0.225722
dtype: float64

#### Drop all rows with null target

In [103]:
# Rows without a target are removed
cols = ['target']
data = data.dropna(subset=cols)
# Share of missing values per column after the drop (five worst columns)
data.isna().mean().sort_values(ascending=False).head(5)

totalStockholdersEquity             0.010943
otherCurrentLiabilities             0.010943
longTermDebt                        0.010943
deferredRevenueNonCurrent           0.010943
deferredTaxLiabilitiesNonCurrent    0.010943
dtype: float64

# Incorrect observations clean

In [104]:
# NOTE(review): re-reading the file replaces the frame produced by the null
# cleaning above, so the dropped columns and the rows with a null target are
# present again in what gets exported below. Confirm this is intended.
data = pd.read_csv('../data/clean/statements_mcap.csv')

In [105]:
def drop_rows(data, symbol, dates_inclusive):
    """Return a copy of ``data`` without one symbol's rows in a range of years.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``symbol`` and ``calendarYear`` columns.
    symbol : str
        Symbol whose rows are removed.
    dates_inclusive : sequence
        ``[first_year, last_year]``; both ends are part of the removed range.

    Returns
    -------
    pandas.DataFrame
        ``data`` minus the matching rows; the input frame is not modified.
    """
    first_year, last_year = dates_inclusive[0], dates_inclusive[1]
    same_symbol = data['symbol'] == symbol
    in_window = data['calendarYear'].between(first_year, last_year)
    # rows are removed by index label
    labels_to_drop = data.loc[same_symbol & in_window].index
    return data.drop(index=labels_to_drop)

In [106]:
# Symbol / year ranges (both ends inclusive) removed as incorrect observations
for bad_symbol, bad_years in (
    ('MBI', [2002,2008]),
    ('SLM', [2004,2013]),
    ('STI', [2002,2009]),
    ('CHK', [2006,2017]),
):
    data = drop_rows(data, bad_symbol, bad_years)

In [107]:
# Persist the final dataset
data.to_csv('../data/processed/data.csv', index=False)