In [10]:
# Importing necessary packages.
import pandas as pd
import numpy as np
from IPython.display import display

In [None]:
# Load the final research dataset from disk and preview its first rows.
# NOTE(review): this is an absolute, user-specific path; the cell only runs on
# a machine where this exact file location exists.
df = pd.read_csv(
    r'/Users/Colin/Desktop/Research project/Sigma/Data - Final/Final Data.csv'
)
df.head()

In [33]:
# Creating data wrangling functions.

# Function to fill in missing values in a forward manner.
def fillForwardDataframe(df, id, columns_list):
    """Forward-fill missing values of selected columns within each ``id`` group.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied first, so the caller's frame is not modified.
    id : str
        Name of the column whose values define the groups.
    columns_list : list
        Names of the columns to forward-fill.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which each listed column has been replaced by its
        group-wise forward fill.

    Notes
    -----
    The fill follows the existing row order inside each group; no sorting is
    done here. Presumably rows are expected to be in chronological order
    already -- verify against the caller.
    """
    filled = df.copy()
    for name in columns_list:
        # Select the column per group, then carry the last seen value forward.
        per_group = filled.groupby(id)[name]
        filled[name] = per_group.ffill()
    return filled

# Function to fill in missing values by taking median of cross-section.
def fillMedianDataframe(df, date, columns_list):
    """Fill missing values of selected columns with the median of their ``date`` group.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied first, so the caller's frame is not modified.
    date : str
        Name of the column whose values define the groups (the cross-sections).
    columns_list : list
        Names of the columns whose missing values should be filled.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` where each missing entry in a listed column has been
        replaced by the median of that column over the rows sharing the same
        ``date`` value. Entries stay missing when the group median is itself
        missing.
    """
    filled = df.copy()
    for name in columns_list:
        # One median per group, broadcast back to the original row positions.
        group_median = filled.groupby(date)[name].transform('median')
        filled[name] = filled[name].fillna(group_median)
    return filled

# Function to add lags in the dataframe.
def createLaggedDataframe(df, id, lags_list, columns_list):
    """Return a copy of ``df`` with per-group shifted copies of selected columns.

    For every ``lag`` in ``lags_list`` and every ``column`` in
    ``columns_list`` a column named ``f"{column}_l{lag}"`` is produced that
    holds ``df.groupby(id)[column].shift(lag)``. New columns are appended in
    lag-major order (all columns for the first lag, then the second lag, ...).
    If a generated name already exists in ``df``, that column is overwritten
    in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    id : str
        Name of the column whose values define the groups.
    lags_list : list
        Shift periods passed to ``shift``. Positive values shift values down
        the rows of a group, negative values shift them up.
    columns_list : list
        Labels of the columns to shift.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` extended with the shifted columns.

    Notes
    -----
    Shifting follows the existing row order inside each group; no sorting is
    done here. Presumably rows are expected to be in chronological order
    already -- verify against the caller.
    """
    # Group once: the grouping key is the same for every (lag, column) pair.
    grouped = df.groupby(id)

    # FIXME: adapt so that shifts with a positive int have a negative int in
    # the name and vice versa.
    # The f-string also accepts non-string column labels.
    shifted = {
        f"{column}_l{lag}": grouped[column].shift(lag)
        for lag in lags_list
        for column in columns_list
    }

    df_lagged = df.copy()

    # Names that already exist are overwritten in place, exactly as a plain
    # column assignment would do.
    for name in [name for name in shifted if name in df_lagged.columns]:
        df_lagged[name] = shifted.pop(name)

    # Append all remaining columns with a single concat instead of one insert
    # per column, which would fragment the frame and trigger pandas'
    # PerformanceWarning when many lag columns are created.
    if shifted:
        new_columns = pd.concat(
            list(shifted.values()), axis=1, keys=list(shifted.keys())
        )
        df_lagged = pd.concat([df_lagged, new_columns], axis=1)
    return df_lagged

In [34]:
# Small hand-made dataset with missing prices, used to sanity-check the data
# wrangling functions.
data = {
    'datacqtr': ['1983Q3', '1983Q4', '1983Q3', '1983Q3', '1983Q4', '1984Q1', '1984Q1'],
    'cusip': ['AAPL', 'AAPL', 'TSLA', 'GOOG', 'GOOG', 'MFST', 'AMZN'],
    'stock_price': [1, None, 3, None, 100, None, None],
}
df_example = pd.DataFrame(data)
display(df_example)

Unnamed: 0,datacqtr,cusip,stock_price
0,1983Q3,AAPL,1.0
1,1983Q4,AAPL,
2,1983Q3,TSLA,3.0
3,1983Q3,GOOG,
4,1983Q4,GOOG,100.0
5,1984Q1,MFST,
6,1984Q1,AMZN,


In [35]:
# Run the three wrangling functions in sequence on the example dataframe and
# show the result after each step.

# Settings shared by the steps below.
# NOTE(review): the global name `id` shadows Python's builtin `id` for the rest
# of the notebook.
id = 'cusip'
date = 'datacqtr'
columns_list = ['stock_price']
lags_list = [5, 4, 3, 2, 1, -1]  # 5 backward lags and 1 forward lead

# Step 1: fill missing values in a forward manner only, per `id` group.
df_ffill = fillForwardDataframe(df_example, id, columns_list)
print("After forward fill:")
display(df_ffill)

# Step 2: fill what is still missing with the median of the cross-section.
df_fill_median = fillMedianDataframe(df_ffill, date, columns_list)
print("After median fill:")
display(df_fill_median)

# Step 3: add the lagged / lead columns.
df_lagged = createLaggedDataframe(df_fill_median, id, lags_list, columns_list)
print("After lags:")
display(df_lagged)

After forward fill:


Unnamed: 0,datacqtr,cusip,stock_price
0,1983Q3,AAPL,1.0
1,1983Q4,AAPL,1.0
2,1983Q3,TSLA,3.0
3,1983Q3,GOOG,
4,1983Q4,GOOG,100.0
5,1984Q1,MFST,
6,1984Q1,AMZN,


After median fill:


Unnamed: 0,datacqtr,cusip,stock_price
0,1983Q3,AAPL,1.0
1,1983Q4,AAPL,1.0
2,1983Q3,TSLA,3.0
3,1983Q3,GOOG,2.0
4,1983Q4,GOOG,100.0
5,1984Q1,MFST,
6,1984Q1,AMZN,


After lags:


Unnamed: 0,datacqtr,cusip,stock_price,stock_price_l5,stock_price_l4,stock_price_l3,stock_price_l2,stock_price_l1,stock_price_l-1
0,1983Q3,AAPL,1.0,,,,,,1.0
1,1983Q4,AAPL,1.0,,,,,1.0,
2,1983Q3,TSLA,3.0,,,,,,
3,1983Q3,GOOG,2.0,,,,,,100.0
4,1983Q4,GOOG,100.0,,,,,2.0,
5,1984Q1,MFST,,,,,,,
6,1984Q1,AMZN,,,,,,,


In [None]:
# filter_col = [col for col in df_lagged if col.startswith('revty')]

In [None]:
#FIXME: next step, delete all rows with NaN? then, split into train, val and test sets?