In [1]:
# Importing necessary packages.
import pandas as pd
import numpy as np
from IPython.display import display

In [2]:
# Reading the data.
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
df = pd.read_csv(r'/Users/Colin/Desktop/Research project/Data/Data - Final/Final Data.csv')
# The 'datadate' strings are parsed as day/month/year.
df['datadate'] = pd.to_datetime(df['datadate'], format='%d/%m/%Y')
df.head()

Unnamed: 0,gvkey,datadate,fyearq,fqtr,indfmt,consol,popsrc,datafmt,tic,cusip,...,txpq,uaptq,chechy,cogsy,dpy,niy,revty,xopry,xsgay,costat
0,1000,1970-03-31,1970,1.0,INDL,C,D,STD,AE.2,32102,...,,,,,,0.605,9.478,,,I
1,1000,1970-06-30,1970,2.0,INDL,C,D,STD,AE.2,32102,...,,,,,,,,,,I
2,1000,1970-09-30,1970,3.0,INDL,C,D,STD,AE.2,32102,...,,,,,,,,,,I
3,1000,1970-12-31,1970,4.0,INDL,C,D,STD,AE.2,32102,...,,,,,,,,,,I
4,1000,1971-03-31,1971,1.0,INDL,C,D,STD,AE.2,32102,...,,,,,,0.346,7.983,,,I


In [3]:
# Creating data wrangling functions.

# Function to fill in missing values in a forward manner.
def fillForwardDataframe(df, id, columns_list):
    """Forward-fill missing values separately within each group.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied and never modified.
    id : str
        Name of the column whose values define the groups. (The name shadows
        the built-in ``id`` but is kept so existing callers keep working.)
    columns_list : list
        Names of the columns to forward-fill.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which each listed column has its missing values
        replaced by the last preceding non-missing value of the same group,
        following the row order of the frame. Leading gaps in a group stay
        missing.
    """
    result = df.copy()
    for name in columns_list:
        per_group = result.groupby(id)[name]
        result[name] = per_group.ffill()
    return result

# Function to fill in missing values by taking median of cross-section.
def fillMedianDataframe(df, date, columns_list):
    """Fill missing values with the median of the rows sharing the same date.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied and never modified.
    date : str
        Name of the column whose values define the cross-sections.
    columns_list : list
        Names of the columns to fill.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which each missing value of a listed column is
        replaced by the median of that column over all rows with the same
        ``date`` value. A value stays missing when its whole cross-section
        is missing.
    """
    result = df.copy()
    for name in columns_list:
        cross_section_median = result.groupby(date)[name].transform('median')
        result[name] = result[name].fillna(cross_section_median)
    return result

# Function to add lags in the dataframe.
def createLaggedDataframe(df, id, lags_list, columns_list):
    """Append group-wise shifted copies of the selected columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied and never modified.
    id : str
        Name of the column whose values define the groups. (The name shadows
        the built-in ``id`` but is kept so existing callers keep working.)
    lags_list : list
        Integer shifts. A positive value takes the value found that many rows
        earlier in the same group; a negative value takes it from later rows.
        Shifts count rows in frame order, not calendar periods.
    columns_list : list
        Names of the columns to shift.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with one extra column per (lag, column) pair, named
        ``<column>_l<lag>`` (e.g. ``x_l2``, ``x_l-1``) and added lag by lag
        in the order of ``lags_list``.
    """
    result = df.copy()
    for shift_by in lags_list:
        suffix = str('_l') + str(shift_by)
        for name in columns_list:
            # FIXME: adapt so that shifts with pos int have neg int in name and vice versa
            shifted_name = name + suffix
            shifted_values = result.groupby(id)[name].shift(shift_by)
            result[shifted_name] = shifted_values
    return result

In [4]:
# Small hand-made panel with gaps, used to sanity-check the data wrangling functions.
data = {
    'datacqtr': ['1983Q3', '1983Q4', '1983Q3', '1983Q3', '1983Q4', '1984Q1', '1984Q1'],
    'cusip': ['AAPL', 'AAPL', 'TSLA', 'GOOG', 'GOOG', 'MFST', 'AMZN'],
    'stock_price': [1, None, 3, None, 100, None, None],
}
df_example = pd.DataFrame(data=data)
display(df_example)

Unnamed: 0,datacqtr,cusip,stock_price
0,1983Q3,AAPL,1.0
1,1983Q4,AAPL,
2,1983Q3,TSLA,3.0
3,1983Q3,GOOG,
4,1983Q4,GOOG,100.0
5,1984Q1,MFST,
6,1984Q1,AMZN,


In [5]:
# Step 1: forward-fill the missing values within each stock.
id = 'cusip'
columns_list = ['stock_price']
df_ffill = fillForwardDataframe(df=df_example, id=id, columns_list=columns_list)
print("After forward fill:")
display(df_ffill)

# Step 2: fill what is still missing with the median of the same quarter.
date = 'datacqtr'
df_fill_median = fillMedianDataframe(df=df_ffill, date=date, columns_list=columns_list)
print("After median fill:")
display(df_fill_median)

# Step 3: add the shifted columns (5 backward and 1 forward).
lags_list = [5, 4, 3, 2, 1, -1]
df_lagged = createLaggedDataframe(
    df=df_fill_median,
    id=id,
    lags_list=lags_list,
    columns_list=columns_list,
)
print("After lags:")
display(df_lagged)

After forward fill:


Unnamed: 0,datacqtr,cusip,stock_price
0,1983Q3,AAPL,1.0
1,1983Q4,AAPL,1.0
2,1983Q3,TSLA,3.0
3,1983Q3,GOOG,
4,1983Q4,GOOG,100.0
5,1984Q1,MFST,
6,1984Q1,AMZN,


After median fill:


Unnamed: 0,datacqtr,cusip,stock_price
0,1983Q3,AAPL,1.0
1,1983Q4,AAPL,1.0
2,1983Q3,TSLA,3.0
3,1983Q3,GOOG,2.0
4,1983Q4,GOOG,100.0
5,1984Q1,MFST,
6,1984Q1,AMZN,


After lags:


Unnamed: 0,datacqtr,cusip,stock_price,stock_price_l5,stock_price_l4,stock_price_l3,stock_price_l2,stock_price_l1,stock_price_l-1
0,1983Q3,AAPL,1.0,,,,,,1.0
1,1983Q4,AAPL,1.0,,,,,1.0,
2,1983Q3,TSLA,3.0,,,,,,
3,1983Q3,GOOG,2.0,,,,,,100.0
4,1983Q4,GOOG,100.0,,,,,2.0,
5,1984Q1,MFST,,,,,,,
6,1984Q1,AMZN,,,,,,,


In [6]:
# filter_col = [col for col in df_lagged if col.startswith('revty')]

In [7]:
# Take every column name, drop 'costat', and keep the last 17 of what remains.
feature_cols = list(df.columns)
del feature_cols[feature_cols.index('costat')]
feature_cols = feature_cols[-17:]
print(feature_cols)

['acoq', 'aoq', 'dlcq', 'invtq', 'lcoq', 'ltq', 'ppegtq', 'rectq', 'txpq', 'uaptq', 'chechy', 'cogsy', 'dpy', 'niy', 'revty', 'xopry', 'xsgay']


In [8]:
# Step 1: forward-fill the missing feature values within each stock.
id = 'cusip'
columns_list = feature_cols
df_ffill = fillForwardDataframe(df=df, id=id, columns_list=columns_list)

# Step 2: fill what is still missing with the median of the same quarter.
date = 'datacqtr'
df_fill_median = fillMedianDataframe(df=df_ffill, date=date, columns_list=columns_list)

# Step 3: add the shifted columns (5 backward and 1 forward).
lags_list = [5, 4, 3, 2, 1, -1]
df_lagged = createLaggedDataframe(
    df=df_fill_median,
    id=id,
    lags_list=lags_list,
    columns_list=columns_list,
)

In [9]:
# Dropping rows with NaN and comparing cleaned-up dataframe with original.
# dropna() with no arguments removes every row that has a missing value in any column.
df_lagged = df_lagged.dropna()
print(f"Number of rows in new dataframe: {len(df_lagged)}.")
print(f"Number of rows in originial dataframe: {len(df)}.")
print(f"Ratio of above two numbers: {len(df_lagged) / len(df)}.")

Number of rows in new dataframe: 516499.
Number of rows in originial dataframe: 627386.
Ratio of above two numbers: 0.8232555396518252.


In [10]:
# FIXME: feature scaling has not been implemented yet.

In [11]:
# Splitting dataframe into train (+val) and test sets.
from datetime import datetime

# Both date windows are inclusive at both ends.
in_train_val_window = df_lagged['datadate'].between(datetime(1970, 1, 1), datetime(1999, 12, 31))
in_test_window = df_lagged['datadate'].between(datetime(2000, 1, 1), datetime(2016, 12, 31))
df_train_val = df_lagged[in_train_val_window]
df_test = df_lagged[in_test_window]

In [12]:
# Defining and using functions to generate lists with all column names for X and y dataframes.
def createColsX(feature_cols, lags_list):
    """Return the input (X) column names: the features plus their shifted versions.

    Parameters
    ----------
    feature_cols : list
        Base feature column names (strings). The list is not modified.
    lags_list : list
        Shifts to generate names for.

    Returns
    -------
    list
        ``feature_cols`` followed by ``<column>_l<lag>`` for every lag (outer
        order) and every feature (inner order).
    """
    names = feature_cols.copy()
    # FIXME: adapt so that shifts with pos int have neg int in name and vice versa
    names.extend(
        column + str('_l') + str(lag)
        for lag in lags_list
        for column in feature_cols
    )
    return names

def createColsy(feature_cols, lags_list):
    """Return the target (y) column names: only the shifted versions of the features.

    Parameters
    ----------
    feature_cols : list
        Base feature column names (strings).
    lags_list : list
        Shifts to generate names for.

    Returns
    -------
    list
        ``<column>_l<lag>`` for every lag (outer order) and every feature
        (inner order); the unshifted feature names are not included.
    """
    # FIXME: adapt so that shifts with pos int have neg int in name and vice versa
    return [
        column + str('_l') + str(lag)
        for lag in lags_list
        for column in feature_cols
    ]

# X uses the current features plus shifts 1 to 5; y uses only the -1 shift of each feature.
list_cols_X = createColsX(feature_cols, [5, 4, 3, 2, 1])
list_cols_y = createColsy(feature_cols, [-1])

In [13]:
# Splitting dataframes into X and y dataframes.
# filter(items=...) keeps the listed columns that exist and silently ignores the others.
X_train_val = df_train_val.filter(items=list_cols_X)
# FIXME: might have to inverse the sign if I change the above function.
y_train_val = df_train_val.filter(items=list_cols_y)
X_test = df_test.filter(items=list_cols_X)
y_test = df_test.filter(items=list_cols_y)

In [14]:
# Splitting train (+val) dataframe into training and validation sets.
from sklearn.model_selection import GroupShuffleSplit, train_test_split

# Split by stock instead of by row: 70% of the distinct cusips go to the
# training set and the remaining cusips to the validation set. A row-level
# random split would put quarters of the same stock (whose lag columns overlap)
# in both sets. This resolves the former FIXME "not 30% of rows, but 30% of stocks".
stock_splitter = GroupShuffleSplit(n_splits=1, train_size=0.7, random_state=42)
# X_train_val / y_train_val are column subsets of df_train_val, so its 'cusip'
# column lines up with them row for row.
train_rows, val_rows = next(
    stock_splitter.split(X_train_val, y_train_val, groups=df_train_val['cusip'])
)
# NOTE(review): unlike train_test_split, the rows of each subset are presumably
# returned in their original order rather than shuffled; shuffle downstream if needed.
X_train, X_val = X_train_val.iloc[train_rows], X_train_val.iloc[val_rows]
y_train, y_val = y_train_val.iloc[train_rows], y_train_val.iloc[val_rows]

In [29]:
# Saving all dataframes needed for modeling as pickle files ("./<name>.pkl").
# The mapping has its own name so that `df` (the raw data loaded at the top of
# the notebook) is not overwritten and earlier cells can still be re-run.
frames_to_save = {
    'X_train': X_train,
    'X_val': X_val,
    'X_test': X_test,
    'y_train': y_train,
    'y_val': y_val,
    'y_test': y_test,
}
for frame_name, frame in frames_to_save.items():
    frame.to_pickle("./" + frame_name + ".pkl")

In [None]:
# import os
# os.remove("./dummy.pkl")

FileNotFoundError: [Errno 2] No such file or directory: './dummy.pkl'