# Cleaning
Take all the raw data and upload it to the database

In [215]:
import pandas as pd
import numpy as np
import datetime
from pathlib import Path
import os
from myfuncs import explore_numerical

In [37]:
pd.options.display.max_columns = 500
pd.options.display.max_rows = 100 

## Companies

- A table for all unique companies
    - cik, symbol, name, sector, subsector, founded, etc
- A table for all the periods a company has been on the index.
    - cik, symbol, start date, end date, flag_current

#### Table for unique companies

In [4]:
# import companies csv
current_companies = pd.read_csv('../data/raw/companies_wiki.csv').drop(columns='SEC filings')
current_companies.columns = ['symbol', 'name', 'sector', 'subSector', 'hQ', 'dateFirstAdded', 'cik', 'founded']
# import historical companies csv
historical_v1 = pd.read_csv('../data/raw/historical_companies_wiki.csv')
# import wikipedia historical companies csv
spts = pd.read_csv('../data/raw/historical_companies_TradingEvolved.csv') 

In [5]:
# get all unique occurences of anything that is in tickers
# get the items on the list if there is no '-', if there is, get the first item (the ticker)
spts['tickers_filtered'] = spts.tickers.str.split(',')
spts.head()

Unnamed: 0,date,tickers,tickers_filtered
0,1996-01-02,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...","[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A..."
1,1996-01-03,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...","[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A..."
2,1996-01-04,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...","[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A..."
3,1996-01-10,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...","[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A..."
4,1996-01-11,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...","[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A..."


Get all unique constituents

In [7]:
results = set()
spts.tickers_filtered.apply(results.update)
companies = pd.DataFrame(data = results, columns=['symbol'])
companies.shape

(1125, 1)

Add values from current companies

In [24]:
current_companies['currentConstituent'] = True
companies = companies.merge(current_companies, how='left')
companies = companies.drop(columns=['dateFirstAdded'])
companies.currentConstituent = companies.currentConstituent.fillna(False)

In [25]:
ciks = pd.read_csv('../data/raw/CIK.csv', index_col = 0)
ciks.columns = ['cik_sec_list', 'symbol', 'title']

In [26]:
companies = companies.merge(ciks, how='left')
companies.name = companies.name.fillna(companies.title)
companies.cik = companies.cik.fillna(companies.cik_sec_list)

In [None]:
companies.to_csv('../data/preSQL/companies.csv', index=False)

#### Table for historical constituents

In [34]:
sp500_stays = pd.DataFrame(columns=['symbol', 'added', 'removed'])
# iterate over the list of tickers for every day and
previous_set=set()
for date, list_tickers in spts.set_index('date').tickers_filtered.items():
    # for the first iteration there is no previous set
    new_set = set(list_tickers)
    # check which values from the list of values was not in the previous date
    diff_new = new_set-previous_set
    # for each value in diff_new
    for diff_ticker in diff_new:
        if diff_ticker in new_set:
            # the ticker has been added
            new_row_index = 0 if len(sp500_stays) == 0 else sp500_stays.index.max()+1
            new_row_data={
                'symbol': diff_ticker,
                'added': date,
                'removed': 'not_yet_removed',
                }
            new_row = pd.DataFrame(data = new_row_data, index=[new_row_index])
            sp500_stays = pd.concat([sp500_stays, new_row], axis=0)
    diff_old = previous_set-new_set
    for diff_ticker in diff_old:
        if diff_ticker in previous_set:
            # the ticker has been removed
            # get the index of the last occurence of the ticker in the dataframe
            mask = sp500_stays.symbol == diff_ticker
            idx = sp500_stays[mask].index.max()
            # update that index with the date removed
            sp500_stays.loc[idx, 'removed'] = date
    # this iteration ends, the new set becomes obsolete
    previous_set = new_set.copy()

to .csv

In [45]:
sp500_stays.to_csv('../data/preSQL/sp500_movements.csv', index=False)

## SEC submissions

Submissions from SEC
- 10-KA/405A and 10QA are text amendments which contain no financial information. <a href="https://www.sec.gov/Archives/edgar/data/320193/0001047469-98-001822.txt">example</a>
- NT 10-Q and NT 10-K are notifications about delay in statements
- 10KT and 10QT dennote transition in companies which alter fiscal years. Usually after merger of acquisitions

In [None]:
submissions=pd.read_csv('.././data/raw/submissions.csv')
sub_cols = [
    'filingDate',
    'reportDate',
    'symbol',
    'cik',
    'form',
    ]
sec = submissions.loc[:,sub_cols]
mask = sec.form.isin(['10-K', '10-Q', '10-K405', '10-KT', '10-QT'])
sec = sec[mask]

## FRED Series

### Inflation (CorePCE)

In [162]:
# read inflation csv
inflation = pd.read_csv('../data/raw/fred/corePCE.csv').convert_dtypes()
# get last date
last_date = inflation.date.idxmax()
# get the coefficient of that date
today_inflation = inflation.loc[last_date, 'corePCE']
# calculate multiplier
inflation['inflationMultiplier'] = today_inflation/inflation.corePCE
# export csv
inflation.to_csv('../data/preSQL/inflation.csv', index=False)

## 10-K Statements

#### Walk the path in a directory and generate the dataframe.
- We're only interested in dates when the symbol belonged to the sp500
- Risk: get the past value of a current ticker instead of getting the values of a former ticker at that time.  

In [105]:
def merge_csv(path):
    df_list = []
    path_dir = Path(path)
    sp500_dates = pd.read_csv('../data/raw/sp500_movements.csv')
    sp500_dates.removed = sp500_dates.removed.replace('not_yet_removed', '2022-12-31')
    sp500_dates.added = pd.to_datetime(sp500_dates.added)
    sp500_dates.removed = pd.to_datetime(sp500_dates.removed)
    for file in path_dir.glob('*.csv'):
        csv_path = os.path.join(file.parent, file.name)
        df = pd.read_csv(csv_path)
        # convert fillingDates to datetime
        df.fillingDate = pd.to_datetime(df.fillingDate)
        # keep only the the combinations of symbol and date which belonged to the sp500
        mask = sp500_dates.symbol == file.name.split('.')[0]
        added = sp500_dates[mask].added
        removed = sp500_dates[mask].removed
        for stay in tuple(zip(added, removed)):
            df_list.append(df[df.fillingDate.between(stay[0], stay[1])])
    return pd.concat(df_list)

In [107]:
def do_merge():
    balance = merge_csv('../data/raw/balance')
    balance.to_csv('../data/raw/balance.csv', index=False)
    print(balance.shape)

    cash_flow = merge_csv('../data/raw/cash_flow')
    cash_flow.to_csv('../data/raw/cash_flow.csv', index=False)
    print(cash_flow.shape)

    income = merge_csv('../data/raw/income')
    income.to_csv('../data/raw/income.csv', index=False)
    print(income.shape)

In [108]:
# do_merge()

(10179, 54)
(10309, 40)
(10328, 38)


### Table for the join of all the historical financial statements in the SP500
- Generate the primary keys
- Clean the primary keys: symbol + year

#### Generate the primary keys of symbol, year

In [135]:
sp500_dates = pd.read_csv('../data/raw/sp500_movements.csv')
sp500_dates.removed = sp500_dates.removed.replace('not_yet_removed', '2022-12-31')
sp500_dates.added = pd.to_datetime(sp500_dates.added).dt.year
sp500_dates.removed = pd.to_datetime(sp500_dates.removed).dt.year

In [136]:
indices = []
for ticker in sp500_dates.symbol.unique():
    mask = sp500_dates.symbol == ticker 
    added = sp500_dates[mask].added
    removed = sp500_dates[mask].removed
    for stay in tuple(zip(added, removed)):
        # convert to years and generate their sequence
        yearlist = list(range(stay[0], stay[1]))
        for y in yearlist:
            indices.append((ticker, y)) 

In [137]:
statements = pd.DataFrame(index = set(indices)).reset_index()
statements.columns = ['symbol', 'calendarYear']

#### Left join with balance, cash flow and income statements

In [138]:
balance = pd.read_csv('../data/raw/balance.csv')
cash_flow = pd.read_csv('../data/raw/cash_flow.csv')
income = pd.read_csv('../data/raw/income.csv')

In [139]:
statements = statements.merge(right = balance, on = ['symbol', 'calendarYear'], how='left')

In [140]:
statements = statements.merge(right = income, on = ['symbol', 'calendarYear'], how='left')
statements = statements.merge(right = cash_flow, on = ['symbol', 'calendarYear'], how='left')

In [141]:
# statements.to_csv('../data/preSQL/statements.csv', index= False)

#### Drop duplicated columns

In [174]:
statements = pd.read_csv('../data/preSQL/statements.csv')
statements.shape

(12985, 128)

In [175]:
repeated = [
    ['date','date_x','date_y'],
    ['acceptedDate','acceptedDate_x','acceptedDate_y'],
    ['cik','cik_x','cik_y'],
    ['depreciationAndAmortization_x','depreciationAndAmortization_y'],
    ['fillingDate', 'fillingDate_x','fillingDate_y'],
    ['finalLink','finalLink_x','finalLink_y'],
    ['inventory_x','inventory_y'],
    ['link','link_x','link_y'],
    ['netIncome_x','netIncome_y'],
    ['period','period_x','period_y'],
    ['reportedCurrency','reportedCurrency_x','reportedCurrency_y',]
]
for group in repeated:
    base_feature = group[0]
    for dup_feature in group[1:]:
        statements[base_feature] = statements[base_feature].fillna(statements[dup_feature])
        statements = statements.drop(columns=dup_feature)    
statements.shape

(12985, 109)

In [176]:
col_rename = {
    'depreciationAndAmortization_x': 'depreciationAndAmortization',
    'inventory_x': 'inventory',
    'netIncome_x': 'netIncome',
}
statements = statements.rename(columns=col_rename)

#### Convert to billions and adjust for inflation

In [177]:
currency_features = [
    'cashAndCashEquivalents',
    'shortTermInvestments',
    'cashAndShortTermInvestments',
    'netReceivables',
    'inventory',
    'otherCurrentAssets',
    'totalCurrentAssets',
    'propertyPlantEquipmentNet',
    'goodwill',
    'intangibleAssets',
    'goodwillAndIntangibleAssets',
    'longTermInvestments',
    'taxAssets',
    'otherNonCurrentAssets',
    'totalNonCurrentAssets',
    'otherAssets',
    'totalAssets',
    'accountPayables',
    'shortTermDebt',
    'taxPayables',
    'deferredRevenue',
    'otherCurrentLiabilities',
    'totalCurrentLiabilities',
    'longTermDebt',
    'deferredRevenueNonCurrent',
    'deferredTaxLiabilitiesNonCurrent',
    'otherNonCurrentLiabilities',
    'totalNonCurrentLiabilities',
    'otherLiabilities',
    'capitalLeaseObligations',
    'totalLiabilities',
    'retainedEarnings',
    'accumulatedOtherComprehensiveIncomeLoss',
    'othertotalStockholdersEquity',
    'totalStockholdersEquity',
    'totalLiabilitiesAndStockholdersEquity', # drop candidate
    'minorityInterest',
    'totalEquity',
    'totalLiabilitiesAndTotalEquity', # drop candidate
    'totalInvestments',
    'totalDebt',
    'netDebt',
    'revenue',
    'costOfRevenue',
    'grossProfit',
    'researchAndDevelopmentExpenses',
    'generalAndAdministrativeExpenses',
    'sellingAndMarketingExpenses',
    'sellingGeneralAndAdministrativeExpenses',
    'otherExpenses',
    'operatingExpenses',
    'costAndExpenses',
    'interestIncome',
    'interestExpense',
    'depreciationAndAmortization',
    'ebitda',
    'operatingIncome',
    'totalOtherIncomeExpensesNet',
    'incomeBeforeTax',
    'incomeBeforeTaxRatio',
    'incomeTaxExpense',
    'netIncome',
    'deferredIncomeTax',
    'stockBasedCompensation',
    'changeInWorkingCapital',
    'accountsReceivables',
    'accountsPayables',
    'otherWorkingCapital',
    'otherNonCashItems',
    'netCashProvidedByOperatingActivities',
    'investmentsInPropertyPlantAndEquipment',
    'acquisitionsNet',
    'purchasesOfInvestments',
    'salesMaturitiesOfInvestments',
    'otherInvestingActivites',
    'netCashUsedForInvestingActivites',
    'debtRepayment',
    'commonStockIssued',
    'commonStockRepurchased',
    'dividendsPaid',
    'otherFinancingActivites',
    'netCashUsedProvidedByFinancingActivities',
    'effectOfForexChangesOnCash',
    'netChangeInCash',
    'cashAtEndOfPeriod',
    'cashAtBeginningOfPeriod',
    'operatingCashFlow',
    'capitalExpenditure',
    'freeCashFlow',
]

convert to billions

In [178]:
# convert to billions
for feat in currency_features:
    statements[feat] = statements[feat].astype(float)/1e9
# sort values
statements = statements.sort_values(by=['symbol', 'calendarYear'])

In [179]:
statements.head()

Unnamed: 0,symbol,calendarYear,cashAndCashEquivalents,shortTermInvestments,cashAndShortTermInvestments,netReceivables,inventory,otherCurrentAssets,totalCurrentAssets,propertyPlantEquipmentNet,goodwill,intangibleAssets,goodwillAndIntangibleAssets,longTermInvestments,taxAssets,otherNonCurrentAssets,totalNonCurrentAssets,otherAssets,totalAssets,accountPayables,shortTermDebt,taxPayables,deferredRevenue,otherCurrentLiabilities,totalCurrentLiabilities,longTermDebt,deferredRevenueNonCurrent,deferredTaxLiabilitiesNonCurrent,otherNonCurrentLiabilities,totalNonCurrentLiabilities,otherLiabilities,capitalLeaseObligations,totalLiabilities,preferredStock,commonStock,retainedEarnings,accumulatedOtherComprehensiveIncomeLoss,othertotalStockholdersEquity,totalStockholdersEquity,totalLiabilitiesAndStockholdersEquity,minorityInterest,totalEquity,totalLiabilitiesAndTotalEquity,totalInvestments,totalDebt,netDebt,revenue,costOfRevenue,grossProfit,grossProfitRatio,researchAndDevelopmentExpenses,generalAndAdministrativeExpenses,sellingAndMarketingExpenses,sellingGeneralAndAdministrativeExpenses,otherExpenses,operatingExpenses,costAndExpenses,interestIncome,interestExpense,depreciationAndAmortization,ebitda,ebitdaratio,operatingIncome,operatingIncomeRatio,totalOtherIncomeExpensesNet,incomeBeforeTax,incomeBeforeTaxRatio,incomeTaxExpense,netIncome,netIncomeRatio,eps,epsdiluted,weightedAverageShsOut,weightedAverageShsOutDil,date,reportedCurrency,cik,fillingDate,acceptedDate,period,deferredIncomeTax,stockBasedCompensation,changeInWorkingCapital,accountsReceivables,accountsPayables,otherWorkingCapital,otherNonCashItems,netCashProvidedByOperatingActivities,investmentsInPropertyPlantAndEquipment,acquisitionsNet,purchasesOfInvestments,salesMaturitiesOfInvestments,otherInvestingActivites,netCashUsedForInvestingActivites,debtRepayment,commonStockIssued,commonStockRepurchased,dividendsPaid,otherFinancingActivites,netCashUsedProvidedByFinancingActivities,effectOfForexChangesOnCash,netChangeInCash,cashAtEndOfPeriod,cashAtBeginningOfPeriod,operatingCashFlow,capitalExpenditure,freeCashFlow,link,finalLink
3968,A,2000,0.996,0.0,0.996,2.201,1.853,0.605,5.655,1.741,0.0,0.557,0.557,0.0,0.0,0.472,2.77,0.0,8.425,0.857,0.11,0.0,0.372,1.419,2.758,0.0,0.0,0.0,0.402,0.402,0.0,0.0,3.16,0.0,5000000.0,0.757,-2.229,6.732,5.265,8.425,0.0,5.265,8.425,0.0,0.11,-0.886,10.773,5.522,5.251,0.487422,1.258,2.94,0.0,2.94,0.0,4.198,9.72,0.0,0.0,0.495,1.659,0.153996,1.053,0.097744,0.111,1.164,1.080479e-10,0.407,0.757,0.070268,1.68,1.66,449000000.0,455000000.0,2000-10-31,USD,1090872.0,2001-01-17,2001-01-17 00:00:00,FY,-0.059,0.0,1.04,0.0,0.0,2.897,-1.395,0.838,-0.824,-0.691,-0.032,0.06,0.37,-1.117,0.0,2.152,-2.068,0.0,1.191,1.275,0.0,0.0,0.996,0.0,0.838,-0.824,0.014,https://www.sec.gov/Archives/edgar/data/109087...,https://www.sec.gov/Archives/edgar/data/109087...
1420,A,2001,1.17,0.0,1.17,0.977,1.491,1.161,4.799,1.848,0.0,1.07,1.07,0.0,0.0,0.269,3.187,0.0,7.986,0.386,0.006,0.0,0.279,1.331,2.002,0.0,0.0,0.0,0.325,0.325,0.0,0.0,2.327,0.0,5000000.0,0.931,-2.277,7.0,5.659,7.986,0.0,5.659,7.986,0.0,0.006,-1.164,8.396,5.166,3.23,0.384707,1.349,2.659,0.0,2.659,0.0,4.008,9.174,0.0,0.0,0.734,0.831,0.098976,-0.778,-0.092663,0.301,-0.477,-5.681277e-11,-0.071,0.168,0.02001,0.38,0.38,458000000.0,458000000.0,2001-10-31,USD,1090872.0,2002-01-22,2002-01-22 00:00:00,FY,-0.094,0.0,-0.1,0.0,0.0,2.797,0.794,1.502,-0.881,-0.904,-0.027,0.074,0.373,-1.365,0.0,0.15,0.0,0.0,-0.113,0.037,0.0,0.174,1.17,0.996,1.502,-0.881,0.621,https://www.sec.gov/Archives/edgar/data/109087...,https://www.sec.gov/Archives/edgar/data/109087...
11784,A,2002,1.844,0.0,1.844,1.119,1.184,0.733,4.88,1.579,0.0,0.685,0.685,0.0,0.635,0.424,3.323,0.0,8.203,0.305,0.0,0.0,0.244,1.632,2.181,1.15,0.0,0.0,0.245,1.395,0.0,0.0,3.576,0.0,5000000.0,-0.101,-0.149,4.872,4.627,8.203,0.0,4.627,8.203,0.0,1.15,-0.694,6.01,3.694,2.316,0.385358,1.169,2.754,0.0,2.754,0.0,3.923,7.617,0.0,0.0,0.735,-0.822,-0.136772,-1.607,-0.267388,0.06,-1.547,-2.574043e-10,-0.525,-1.032,-0.171714,-2.22,-2.22,465000000.0,465000000.0,2002-10-31,USD,1090872.0,2002-12-20,2002-12-20 17:27:53,FY,-0.664,0.0,-0.098,0.0,0.0,2.699,0.561,-0.498,-0.301,-0.015,-0.023,0.0,0.268,-0.071,0.0,0.121,0.0,0.0,1.122,1.243,0.0,0.674,1.844,1.17,-0.498,-0.301,-0.799,https://www.sec.gov/Archives/edgar/data/109087...,https://www.sec.gov/Archives/edgar/data/109087...
1852,A,2003,1.607,0.0,1.607,1.086,0.995,0.201,3.889,1.447,0.0,0.402,0.402,0.0,0.027,0.532,2.408,0.0,6.297,0.441,0.0,0.0,0.262,1.203,1.906,1.15,0.0,0.0,0.417,1.567,0.0,0.0,3.473,0.0,5000000.0,-2.159,-0.006,4.984,2.824,6.297,0.0,2.824,6.297,0.0,1.15,-0.457,6.056,3.762,2.294,0.378798,1.051,1.968,0.0,1.968,0.0,3.019,6.781,0.0,0.0,0.362,-0.596,-0.098415,-0.725,-0.119716,0.035,-0.69,-1.139366e-10,1.1,-2.058,-0.339828,-4.35,-4.35,473000000.0,473000000.0,2003-10-31,USD,1090872.0,2003-12-22,2003-12-22 17:25:07,FY,1.071,0.0,-0.716,0.0,0.0,1.983,1.197,-0.144,-0.205,0.0,-0.004,0.0,0.006,-0.203,0.0,0.112,0.0,0.0,-0.002,0.11,0.0,-0.237,1.607,1.844,-0.144,-0.205,-0.349,https://www.sec.gov/Archives/edgar/data/109087...,https://www.sec.gov/Archives/edgar/data/109087...
12231,A,2004,2.315,0.0,2.315,1.044,1.026,0.192,4.577,1.258,0.0,0.443,0.443,0.0,0.0,0.778,2.479,0.0,7.056,0.441,0.0,0.0,0.284,1.146,1.871,1.15,0.0,0.0,0.466,1.616,0.0,0.0,3.487,0.0,5000000.0,-1.81,0.179,5.195,3.569,7.056,0.0,3.569,7.056,0.0,1.15,-1.165,7.181,4.058,3.123,0.434898,0.933,1.804,0.0,1.804,0.0,2.737,6.795,0.0,0.0,0.292,0.732,0.101936,0.386,0.053753,0.054,0.44,6.12728e-11,0.091,0.349,0.0486,0.72,0.71,483000000.0,490000000.0,2004-10-31,USD,1090872.0,2004-12-21,2004-12-21 06:14:39,FY,-0.033,0.0,0.723,0.0,0.0,2.706,-0.668,0.663,-0.118,-0.018,0.0,0.0,0.022,-0.114,0.0,0.144,0.0,0.0,0.001,0.145,0.014,0.708,2.315,1.607,0.663,-0.118,0.545,https://www.sec.gov/Archives/edgar/data/109087...,https://www.sec.gov/Archives/edgar/data/109087...


Adjust for inflation

In [181]:
statements['date'] = pd.to_datetime(statements['date'])
statements['month'] = statements['date'].dt.month
inflation = pd.read_csv('../data/preSQL/inflation.csv')
inflation = inflation.rename(columns={'year':'calendarYear'})
statements = statements.merge(
    right = inflation.loc[:,['calendarYear','month','inflationMultiplier']],
    how='left', 
    left_on=['calendarYear', 'month'], 
    right_on = ['calendarYear', 'month'])

In [186]:
for feat in currency_features:
    statements[feat] = statements[feat] * statements.inflationMultiplier

In [188]:
statements.to_csv('../data/preSQL/statements.csv', index=False)

#### Aditional Cleaning steps

## Market Capitalization

In [190]:
statements = pd.read_csv('../data/preSQL/statements.csv')

In [199]:
filldate_mask = statements.fillingDate.notnull()
symbols = statements[filldate_mask].symbol.unique()

In [220]:
# add 10 days to let the price stabilize
statements['mcapDate'] = pd.to_datetime(statements.fillingDate) + datetime.timedelta(10)
# extract year and week from that date
statements['mcapYear'] = statements.mcapDate.dt.year
statements['mcapWeek'] = statements.mcapDate.dt.isocalendar().week

In [None]:
statements.query()

In [230]:
for ticker in symbols:
    # open the market caps dataframe
    df = pd.read_csv('../data/raw/marketCaps/{}.csv'.format(ticker)).sort_values(by='date').reset_index(drop='True')
    # convert market cap to billions
    df.marketCap = df.marketCap/1e9
    # exctract year and week
    df.date = pd.to_datetime(df.date)
    df['mcapYear'] = df.date.dt.year
    df['mcapWeek'] = df.date.dt.isocalendar().week
    df['marketCapSMA10'] = df.marketCap.rolling(10).mean()
    df = df.rename(columns={'date':'mcapDate'})
    # attempt o left join on statements on mcapDdate 
    statemenst = statements.merge(
        right=df.loc[:,['symbol','mcapDate','marketCapSMA10']],
        on = ['symbol', 'mcapDate'],
        how='left'
    )
# join again on mcapYear and mcapYeek in case the date does not exist
    statemenst = statements.merge(
            right=df.groupby(['symbol','mcapYear', 'mcapWeek']).marketCapSMA10.mean(),
            on = ['symbol', 'mcapYear', 'mcapWeek'],
            how='left'
    )

FileNotFoundError: [Errno 2] No such file or directory: '../data/raw/marketCaps/AABA.csv'

In [None]:
mcaps = pd.read_csv('.././data/raw/mcaps.csv').convert_dtypes()
mcaps['date'] = pd.to_datetime(mcaps['date'])
# convert to billions
mcaps['marketCap'] = mcaps['marketCap']/1e9
# get the absolute value of negatives
mcaps['marketCap'] = abs(mcaps['marketCap'])
# get the companies valued at more than 1 Trillion which should be
giants = [ 'GOOGL', 'GOOG', 'AMZN', 'FB', 'MSFT', 'AAPL', 'TSLA']
cond1 = mcaps.marketCap > 1e3
cond2 = ~mcaps.symbol.isin(giants)
aux = mcaps[cond1&cond2].sort_values(by=['marketCap', 'symbol', 'date'])
# SHW: /1e9
idx = aux[aux.symbol=='SHW'].index
mcaps.loc[idx, 'marketCap'] = mcaps.loc[idx, 'marketCap']/ 1e6
# REST: / 1e6
idx = aux[~(aux.symbol=='SHW')].index
mcaps.loc[idx, 'marketCap'] = mcaps.loc[idx, 'marketCap']/ 1e3
# GOOG wrong values
mcaps.loc[161468:161502, 'marketCap'] = mcaps.loc[161468:161502, 'marketCap'] + 1000
mcaps.loc[161656:161659, 'marketCap'] = mcaps.loc[161656:161659, 'marketCap'] * 2
mcaps.loc[161621:161649, 'marketCap'] = mcaps.loc[161621:161649, 'marketCap'] * 2


In [None]:
# send to feature engineering
mcaps.to_csv('.././data/feng/mcaps.csv', index=False)