# Cleaning
Take all the raw data and upload it to the database

In [89]:
import pandas as pd
import numpy as np
import datetime
from pathlib import Path
import os
from myfuncs import explore_numerical

In [53]:
# Widen the notebook display limits: up to 500 columns and 100 rows per frame.
pd.options.display.max_columns, pd.options.display.max_rows = 500, 100

## Companies

- A table for all unique companies
    - cik, symbol, name, sector, subsector, founded, etc
- A table for all the periods a company has been on the index.
    - cik, symbol, start date, end date, flag_current

#### Table for unique companies

In [4]:
# Company listing from companies_wiki.csv: the 'SEC filings' column is discarded
# and the remaining columns are renamed positionally.
company_columns = ['symbol', 'name', 'sector', 'subSector', 'hQ', 'dateFirstAdded', 'cik', 'founded']
current_companies = pd.read_csv('../data/raw/companies_wiki.csv')
current_companies = current_companies.drop(columns='SEC filings')
current_companies.columns = company_columns

# Historical constituents file historical_companies_wiki.csv.
historical_v1 = pd.read_csv('../data/raw/historical_companies_wiki.csv')

# Historical constituents file historical_companies_TradingEvolved.csv
# (one row per date with a comma-separated `tickers` string, see the preview below).
spts = pd.read_csv('../data/raw/historical_companies_TradingEvolved.csv')

In [5]:
# Turn each row's comma-separated `tickers` string into a list of symbols.
# NOTE(review): entries are only split on ','; no further per-entry parsing is done here.
spts['tickers_filtered'] = spts['tickers'].str.split(pat=',')
spts.head()

Unnamed: 0,date,tickers,tickers_filtered
0,1996-01-02,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...","[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A..."
1,1996-01-03,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...","[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A..."
2,1996-01-04,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...","[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A..."
3,1996-01-10,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...","[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A..."
4,1996-01-11,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...","[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A..."


Get all unique constituents

In [7]:
# Collect every symbol that appears in any of the per-date constituent lists.
results = set()
for daily_tickers in spts['tickers_filtered']:
    results.update(daily_tickers)
# A set of strings iterates in hash order, which differs between interpreter runs;
# sorting keeps the row order of this table (and of the CSV written from it) reproducible.
companies = pd.DataFrame(data=sorted(results), columns=['symbol'])
companies.shape

(1125, 1)

Add values from current companies

In [24]:
# Flag every row of the current-company listing, left-join it onto the full symbol
# list, and mark the symbols that found no match as non-current.
current_companies['currentConstituent'] = True
companies = (
    companies
    .merge(current_companies, how='left')
    .drop(columns=['dateFirstAdded'])
)
companies['currentConstituent'] = companies['currentConstituent'].fillna(False)

In [25]:
# CIK lookup list; the first CSV column is used as the index and the three
# remaining columns are renamed positionally.
cik_columns = ['cik_sec_list', 'symbol', 'title']
ciks = pd.read_csv('../data/raw/CIK.csv', index_col=0)
ciks.columns = cik_columns

In [26]:
# Left-join the CIK list and use it only to fill gaps: `title` backs up a missing
# `name`, `cik_sec_list` backs up a missing `cik`.
companies = companies.merge(ciks, how='left')
companies['name'] = companies['name'].fillna(companies['title'])
companies['cik'] = companies['cik'].fillna(companies['cik_sec_list'])

In [None]:
# Persist the unique-companies table without the DataFrame index.
companies.to_csv(path_or_buf='../data/preSQL/companies.csv', index=False)

#### Table for historical constituents

In [34]:
# Build one row per continuous stay of a symbol on the index: the date it first
# appears in the per-date lists and the first date on which it is missing again.
# Rows are collected in a plain list and turned into a DataFrame once at the end,
# instead of growing a DataFrame with pd.concat on every addition.
# NOTE(review): assumes the rows of `spts` are in chronological order - confirm.
stay_rows = []   # one dict per stay, in order of creation
open_stay = {}   # symbol -> position in stay_rows of its stay that has no removal date yet
previous_set = set()
for date, list_tickers in spts.set_index('date').tickers_filtered.items():
    new_set = set(list_tickers)
    # Symbols present now but not on the previous date start a new stay.
    # sorted() keeps the row order reproducible (set order is hash-dependent).
    for diff_ticker in sorted(new_set - previous_set):
        open_stay[diff_ticker] = len(stay_rows)
        stay_rows.append({
            'symbol': diff_ticker,
            'added': date,
            'removed': 'not_yet_removed',
        })
    # Symbols present on the previous date but not now close their latest stay.
    # Each of them was added on an earlier date, so it always has an open stay.
    for diff_ticker in previous_set - new_set:
        stay_rows[open_stay.pop(diff_ticker)]['removed'] = date
    # The current constituents become the reference for the next date.
    previous_set = new_set
sp500_stays = pd.DataFrame(stay_rows, columns=['symbol', 'added', 'removed'])

to .csv

In [45]:
# Persist the stays table without the DataFrame index.
# NOTE(review): merge_csv reads '../data/raw/sp500_movements.csv' by default, not this
# preSQL path - confirm which location is the intended one.
sp500_stays.to_csv(path_or_buf='../data/preSQL/sp500_movements.csv', index=False)

## SEC submissions

Submissions from SEC
- 10-KA/405A and 10QA are text amendments which contain no financial information. <a href="https://www.sec.gov/Archives/edgar/data/320193/0001047469-98-001822.txt">example</a>
- NT 10-Q and NT 10-K are notifications about delay in statements
- 10-KT and 10-QT denote transition reports filed by companies that change their fiscal year, usually after a merger or acquisition

In [None]:
# Load the raw SEC submissions and keep five descriptive columns for the plain
# annual/quarterly forms and their 405/transition variants; every other form type
# (amendments, NT notices, ...) is filtered out.
submissions = pd.read_csv('.././data/raw/submissions.csv')
sub_cols = ['filingDate', 'reportDate', 'symbol', 'cik', 'form']
kept_forms = ['10-K', '10-Q', '10-K405', '10-KT', '10-QT']
mask = submissions['form'].isin(kept_forms)
sec = submissions.loc[mask, sub_cols]

## 10-K Statements

#### Walk the path in a directory and generate the dataframe.
- We're only interested in dates when the symbol belonged to the sp500

In [105]:
def merge_csv(path, movements_path='../data/raw/sp500_movements.csv', open_end='2022-12-31'):
    """Concatenate the per-symbol CSVs in `path`, keeping only rows filed during an index stay.

    Every ``*.csv`` file directly inside `path` is read; the file name without its
    ``.csv`` suffix is taken as the symbol. A row is kept once for every stay of that
    symbol whose ``[added, removed]`` interval (both ends inclusive) contains the
    row's ``fillingDate``.

    Args:
        path: Directory holding one CSV per symbol, each with a ``fillingDate`` column.
        movements_path: CSV with ``symbol``, ``added`` and ``removed`` columns.
        open_end: Date substituted for the ``'not_yet_removed'`` marker of stays
            that have no removal date.

    Returns:
        A DataFrame with the kept rows of all files (original row labels preserved),
        files processed in sorted name order.

    Raises:
        ValueError: from ``pd.concat`` when nothing was collected, i.e. the directory
            has no CSV files or no file name matches a symbol in the movements file.
    """
    sp500_dates = pd.read_csv(movements_path)
    sp500_dates['removed'] = pd.to_datetime(
        sp500_dates['removed'].replace('not_yet_removed', open_end))
    sp500_dates['added'] = pd.to_datetime(sp500_dates['added'])

    frames = []
    # sorted(): glob order depends on the filesystem; sorting makes the output reproducible.
    for file in sorted(Path(path).glob('*.csv')):
        df = pd.read_csv(file)
        df['fillingDate'] = pd.to_datetime(df['fillingDate'])
        # `stem` strips only the final '.csv', so dotted tickers such as 'BRK.B' stay intact.
        stays = sp500_dates[sp500_dates['symbol'] == file.stem]
        for added, removed in zip(stays['added'], stays['removed']):
            frames.append(df[df['fillingDate'].between(added, removed)])
    return pd.concat(frames)

In [107]:
def do_merge():
    """Merge the per-symbol CSVs of each statement type and save one CSV per type.

    For balance, cash_flow and income in turn: run merge_csv on the statement's
    directory, write the result as a CSV without the index, and print the shape
    of the merged frame.
    """
    jobs = (
        ('../data/raw/balance', '../data/raw/balance.csv'),
        ('../data/raw/cash_flow', '../data/raw/cash_flow.csv'),
        ('../data/raw/income', '../data/raw/income.csv'),
    )
    for source_dir, target_csv in jobs:
        merged = merge_csv(source_dir)
        merged.to_csv(target_csv, index=False)
        print(merged.shape)

In [108]:
# Write ../data/raw/balance.csv, cash_flow.csv and income.csv; prints each merged shape.
do_merge()

(10179, 54)
(10309, 40)
(10328, 38)


#### Table for the join of all the historical financial statements in the SP500
- Clean the primary keys: symbol + year

In [71]:
# Load the merged balance-sheet file written by do_merge.
balance = pd.read_csv(filepath_or_buffer='../data/raw/balance.csv')

In [72]:
# Count the balance rows that have no fillingDate.
balance['fillingDate'].isna().sum()

0

In [74]:
# do_merge writes these files with index=False, so they contain no index column.
# Read them the same way as balance.csv: with index_col=0 the first data column
# would be consumed as the index and would no longer be available as a regular
# column for the drops/joins that follow.
cash_flow = pd.read_csv('../data/raw/cash_flow.csv')
income = pd.read_csv('../data/raw/income.csv')


Join cash flow and income

In [None]:
# Inner-join the cash-flow and income statements on (symbol, calendarYear).
joincols = ['symbol', 'calendarYear']
# Removed from the cash-flow side before the join
# (presumably because the income statement carries the same fields - TODO confirm).
left_drop = ['netIncome', 'depreciationAndAmortization']
# Descriptive columns removed from the income side before the join.
right_drop = [
    'date',
    'reportedCurrency',
    'cik',
    'fillingDate',
    'acceptedDate',
    'period',
    'link',
    'finalLink',
]
# The cash-flow frame is loaded as `cash_flow`; use that name here.
statements = (
    cash_flow.drop(columns=left_drop)
    .merge(income.drop(columns=right_drop), on=joincols, how='inner'))

Join cflow+income and balance

In [None]:
# Inner-join the balance sheet onto the cash-flow+income table on (symbol, calendarYear),
# after removing the listed columns from the balance side.
joincols = ['symbol', 'calendarYear']
right_drop = [
    'date', 'reportedCurrency', 'cik', 'fillingDate', 'acceptedDate',
    'period', 'inventory', 'link', 'finalLink',
]
balance_trimmed = balance.drop(columns=right_drop)
statements = statements.merge(balance_trimmed, on=joincols, how='inner')

In [None]:
# Persist the joined statements table without the DataFrame index.
statements.to_csv(path_or_buf='.././data/feng/statements_y.csv', index=False)

## market cap
- Read raw data from API
- Convert to billions
- Fix market cap severe mistakes (order of magnitude)

In [None]:
# Load the raw market caps, rescale them to billions and patch values that look
# wrong by orders of magnitude. Statement order matters: `aux` is computed once,
# before any correction, and its row labels drive the two rescaling steps.
mcaps = pd.read_csv('.././data/raw/mcaps.csv').convert_dtypes()
mcaps['date'] = pd.to_datetime(mcaps['date'])
# convert to billions
mcaps['marketCap'] = mcaps['marketCap']/1e9
# negative values are replaced by their absolute value
mcaps['marketCap'] = abs(mcaps['marketCap'])
# rows above 1e3 (more than 1 trillion, since values are now in billions) whose
# symbol is not in `giants` are the ones rescaled below
giants = [ 'GOOGL', 'GOOG', 'AMZN', 'FB', 'MSFT', 'AAPL', 'TSLA']
cond1 = mcaps.marketCap > 1e3
cond2 = ~mcaps.symbol.isin(giants)
aux = mcaps[cond1&cond2].sort_values(by=['marketCap', 'symbol', 'date'])
# flagged SHW rows: divide by a further 1e6
idx = aux[aux.symbol=='SHW'].index
mcaps.loc[idx, 'marketCap'] = mcaps.loc[idx, 'marketCap']/ 1e6
# every other flagged row: divide by a further 1e3
idx = aux[~(aux.symbol=='SHW')].index
mcaps.loc[idx, 'marketCap'] = mcaps.loc[idx, 'marketCap']/ 1e3
# GOOG wrong values: manual corrections on hard-coded row-label ranges (.loc slices
# include both end labels).
# NOTE(review): the labels depend on the row order of mcaps.csv - confirm they still
# point at the intended GOOG rows whenever the raw file is regenerated.
mcaps.loc[161468:161502, 'marketCap'] = mcaps.loc[161468:161502, 'marketCap'] + 1000
mcaps.loc[161656:161659, 'marketCap'] = mcaps.loc[161656:161659, 'marketCap'] * 2
mcaps.loc[161621:161649, 'marketCap'] = mcaps.loc[161621:161649, 'marketCap'] * 2


In [None]:
# Hand the cleaned market caps over to the feature-engineering stage (data/feng),
# written without the DataFrame index.
mcaps.to_csv(path_or_buf='.././data/feng/mcaps.csv', index=False)