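0. ffill: forward-fill missing values
1. medianfill: fill remaining missing values with the cross-section median
2. ebit: compute quarterly EBIT (ebitq = revtq - xoprq - dpq)
3. ttm: convert quarterly data to trailing twelve months (TTM)
4. lags: add lagged features
5. dropna: drop rows with NaN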

In [1]:
# Patching sklearn for improved performance.
# `patch_sklearn` comes from `sklearnex` (Intel(R) Extension for Scikit-learn,
# as reported by the message printed when this cell runs).
# NOTE(review): presumably this must run before any `sklearn` import so that the
# patched implementations are picked up — confirm against the sklearnex docs.
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# Importing necessary packages.
import pandas as pd
import numpy as np
from IPython.display import display
from datetime import datetime
from sklearn.model_selection import train_test_split
from preprocessing import *

In [3]:
# Load the raw dataset and parse `datadate` (stored as day/month/year text)
# into proper datetimes, then preview the first rows.
DATA_PATH = r'/Users/Colin/code/hugo_research/final_data_v3.csv'
df = pd.read_csv(DATA_PATH)
df['datadate'] = pd.to_datetime(df['datadate'], format='%d/%m/%Y')
df.head()

Unnamed: 0,gvkey,datadate,fyearq,fqtr,indfmt,consol,popsrc,datafmt,tic,cusip,...,niq,ppegtq,rectq,revtq,txpq,uaptq,xoprq,xsgaq,costat,prccq
0,1000,1970-03-31,1970,1.0,INDL,C,D,STD,AE.2,32102,...,0.605,,,9.478,,,,,I,
1,1000,1970-06-30,1970,2.0,INDL,C,D,STD,AE.2,32102,...,0.648,,,9.916,,,,,I,
2,1000,1970-09-30,1970,3.0,INDL,C,D,STD,AE.2,32102,...,0.369,,,10.698,,,,,I,11.75
3,1000,1970-12-31,1970,4.0,INDL,C,D,STD,AE.2,32102,...,-1.064,,,10.919,,,35.749,8.249,I,10.0
4,1000,1971-03-31,1971,1.0,INDL,C,D,STD,AE.2,32102,...,0.346,,,7.983,,,,,I,9.75


In [4]:
# Selecting the interesting columns: the last 20 columns of the raw frame,
# minus 'costat', plus 'ajexq' appended at the end.
columns_list = list(df.columns[-20:])
columns_list.remove('costat')   # raises ValueError if 'costat' is not among those columns
columns_list.append('ajexq')
print(columns_list)

['acoq', 'aoq', 'cheq', 'cogsq', 'cshoq', 'dlcq', 'dpq', 'invtq', 'lcoq', 'ltq', 'niq', 'ppegtq', 'rectq', 'revtq', 'txpq', 'uaptq', 'xoprq', 'xsgaq', 'prccq', 'ajexq']


In [5]:
# End-to-end preprocessing: fill missing values, derive EBIT / market cap /
# adjusted price, convert to TTM, build momentum and lagged features, drop NaN.
# NOTE(review): ffill, mfill, ttm and lagged are not defined in this notebook;
# presumably they come from the `from preprocessing import *` star import —
# their exact behaviour cannot be verified from here.

# Filling NaN forwardly.
df_ffill = ffill(df, columns_list)

# Dropping rows where prccq is NaN.                 #FIXME
df_ffill = df_ffill[df_ffill['prccq'].notna()]

# Filling NaN with cross-section median.
df_mfill = mfill(df_ffill, columns_list)

# Computing quarterly EBIT as: ebitq = revtq - xoprq - dpq.
df_mfill['ebitq'] = df_mfill['revtq'] - df_mfill['xoprq'] - df_mfill['dpq']
# Market cap = shares outstanding (cshoq) * price (prccq).
df_mfill['mrkcapq'] = df_mfill['cshoq'] * df_mfill['prccq']                    #FIXME: creates fake market caps by filling median for data before IPO.
# Adjusted price = price divided by the adjustment factor column ajexq.
df_mfill['prccq_adj'] = df_mfill['prccq'] / df_mfill['ajexq']

# Converting quarterly data to trailing twelve months (TTM) data.
ttm_list = ['revtq', 'cogsq', 'xsgaq', 'niq', 'ebitq']
df_ttm = ttm(df_mfill, ttm_list)                                               #FIXME: this "undoes" the sorting by date, dangerous for lags functions

# Creating price momentum features.
# lagged(...) is unpacked into three values here: a dataframe, a list of column
# names, and a third value that is never used in this call (`empty`).
# NOTE(review): presumably negative lags refer to past quarters and positive
# lags to future quarters — confirm in `preprocessing.lagged`.
df_momentum, price_lags, empty = lagged(df_ttm, ['prccq_adj'], lags_list=[-4, -3, -2, -1])
# Keep only the lagged price columns; the unlagged 'prccq_adj' is the numerator below.
price_lags.remove('prccq_adj')

# Replace each lagged price by the ratio current adjusted price / lagged price.
for col in price_lags:
    df_momentum.loc[:,col] = df_momentum.loc[:,'prccq_adj']/df_momentum.loc[:,col]

# Adding lags to dataframe.
fundamentals_list = ['revtq', 'cogsq', 'xsgaq', 'niq', 'ebitq', 'cheq', 'rectq', 'invtq', 'acoq', 'ppegtq', 'aoq', 'dlcq', 'uaptq', 'txpq', 'lcoq', 'ltq']
# features_20 is assigned here but not read by any cell visible in this notebook.
features_20 = price_lags + fundamentals_list
# Fundamentals: the second and third return values are kept as the X (input) and
# y (target) column-name lists; the lag list includes one positive lag (4).
df_lagged_1, X_col_list_fundamentals, y_col_list = lagged(df_momentum, fundamentals_list, lags_list=[-20, -16, -12, -8, -4, 4])
# Price momentum ratios: only negative lags are requested, the third return value is discarded.
df_lagged_2, X_col_list_price, empty = lagged(df_lagged_1, price_lags, lags_list=[-20, -16, -12, -8, -4])
X_col_list = X_col_list_fundamentals + X_col_list_price

# Dropping rows with NaN.
# dropna() with default arguments removes every row that has a NaN in any column.
df_ready = df_lagged_2.dropna()

In [6]:
# Printing TSLA stock to check if data is correct.
# Rows are picked by CUSIP and only the date, CUSIP and adjusted price are shown.
tsla_rows = df_mfill['cusip'] == '88160R101'
df_mfill.loc[tsla_rows, ['datadate', 'cusip', 'prccq_adj']]

Unnamed: 0,datadate,cusip,prccq_adj
615490,2010-06-30,88160R101,4.766
615491,2010-09-30,88160R101,4.081
615492,2010-12-31,88160R101,5.326
615493,2011-03-31,88160R101,5.55
615494,2011-06-30,88160R101,5.826
615495,2011-09-30,88160R101,4.878
615496,2011-12-31,88160R101,5.712
615497,2012-03-31,88160R101,7.448
615498,2012-06-30,88160R101,6.258
615499,2012-09-30,88160R101,5.856


In [7]:
# Scaling all input time-steps by market cap of last input time step.
# Rows with zero market cap are removed first so the division below never
# divides by zero.
# NOTE: this cell rebinds `df_ready` to its own scaled result, so running it a
# second time without re-running the preceding cells divides by market cap again.
# The explicit .copy() makes the filtered frame independent, so the assignment
# below does not write into a slice of the unfiltered frame.
df_ready = df_ready[df_ready.mrkcapq != 0].copy()

# Fundamental inputs and targets share the same denominator, so both groups are
# scaled in a single row-wise division (axis=0 aligns the market cap on the index).
cols_to_scale = [*X_col_list_fundamentals, *y_col_list]
df_ready[cols_to_scale] = df_ready[cols_to_scale].div(df_ready['mrkcapq'], axis=0)

In [8]:
# Dropping inf, -inf and NaN values.
# Infinite entries are mapped to NaN first, so one dropna() removes both kinds of rows.
df_ready = df_ready.replace([np.inf, -np.inf], np.nan).dropna()
df_ready.shape

(250671, 158)

In [9]:
# Splitting dataframe into train (+val) and test sets.
# Both date windows include their end points: 1970-1999 for train + val, 2000-2016 for test.
in_train_val_window = df_ready['datadate'].between(datetime(1970, 1, 1), datetime(1999, 12, 31))
in_test_window = df_ready['datadate'].between(datetime(2000, 1, 1), datetime(2016, 12, 31))
df_train_val = df_ready[in_train_val_window]
df_test = df_ready[in_test_window]

In [10]:
# Splitting train + val set into train set and val set.
# Rows are grouped by 'cusip', so the split is made over stocks rather than over rows.
from sklearn.model_selection import GroupShuffleSplit
splitter = GroupShuffleSplit(n_splits=2, test_size=.30, random_state=7)
split = splitter.split(df_train_val, groups=df_train_val['cusip'])
# Only the first generated split is consumed; its second index array is used as the validation set.
train_inds, test_inds = next(split)

df_train, df_val = df_train_val.iloc[train_inds], df_train_val.iloc[test_inds]

In [11]:
# Checking distribution of unique stocks.
# If no stock appears in both train and val, the two per-set counts add up to the overall count.
stocks_total = df_train_val['cusip'].nunique()
stocks_in_splits = sum(part['cusip'].nunique() for part in (df_train, df_val))
print(stocks_total)
print(stocks_in_splits)

3404
3404


In [12]:
# Splitting dataframes into X and y dataframes.
# Each split is reduced to the input columns (X_col_list) and the target columns (y_col_list).
X_train, y_train = df_train.filter(X_col_list), df_train.filter(y_col_list)
X_val, y_val = df_val.filter(X_col_list), df_val.filter(y_col_list)
X_test, y_test = df_test.filter(X_col_list), df_test.filter(y_col_list)

In [13]:
# # Splitting dataframes into X and y dataframes.
# X_train_val = df_train_val.filter(X_col_list)
# y_train_val = df_train_val.filter(y_col_list)
# X_test = df_test.filter(X_col_list)
# y_test = df_test.filter(y_col_list)

In [14]:
# # Splitting train + val dataframe into training and validation sets.
# X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, train_size=0.7, random_state=69) #FIXME: not 30% of rows, but 30% of stocks!

In [15]:
# Scaling all features to have zero mean and unit standard deviation.           #FIXME: still need to do scaling on last time step!
from sklearn.preprocessing import StandardScaler

# Inputs and targets get their own scaler: re-fitting a single scaler on the
# targets would overwrite the statistics fitted on X_train, making it impossible
# to apply or invert the input scaling later. Each scaler is fitted on the
# training split only and then applied unchanged to the val and test splits.
X_scaler = StandardScaler()
X_train = X_scaler.fit_transform(X_train)
X_val = X_scaler.transform(X_val)
X_test = X_scaler.transform(X_test)

y_scaler = StandardScaler()
y_train = y_scaler.fit_transform(y_train)
y_val = y_scaler.transform(y_val)
y_test = y_scaler.transform(y_test)

# `scaler` used to end this cell fitted on y_train; keep that binding for compatibility.
scaler = y_scaler

In [16]:
# Saving all dataframes needed for modeling as pickle files.
# A dedicated mapping (file stem -> data) is used instead of rebinding `df`, so
# the raw dataframe loaded at the top of the notebook is still available after
# this cell and earlier cells can be re-run.
datasets_to_save = {
    'X_train': X_train, 'X_val': X_val, 'X_test': X_test,
    'y_train': y_train, 'y_val': y_val, 'y_test': y_test,
}
for dataset_name, dataset in datasets_to_save.items():
    # Each split is wrapped in a DataFrame before pickling
    # (presumably because the scaler output is a plain array — TODO confirm).
    pd.DataFrame(dataset).to_pickle("./" + dataset_name + ".pkl")

In [17]:
# Checking size of final dataframes.
# Prints the (rows, columns) shape of the training targets and of the val / test inputs.
for final_set in (y_train, X_val, X_test):
    print(pd.DataFrame(final_set).shape)

(76662, 16)
(34441, 120)
(139564, 120)
