In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import datetime as dt
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the master table and its column labels, both stored as .npy files.
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, so these
# files must come from a trusted source.
cols = np.load(r'../data/columns_initial.npy', allow_pickle=True)
df = pd.DataFrame(
    np.load(r'../data/master_initial.npy', allow_pickle=True),
    columns=cols,
)

# Discard the two industry classification columns.
df = df.drop(columns=['dsindustry6', 'icbindustry2'])

# Every column from the third one onwards is cast to float; the first two
# columns are left with whatever dtype they were loaded with.
numeric_cols = df.columns[2:]
df[numeric_cols] = df[numeric_cols].astype('float')

# Placeholder flag column at position 2, initialised to 0 for every row.
df.insert(2, 'eligible', 0)

In [4]:
# Global variables
# `nonfields` are columns that are not treated as data fields; `fields` is
# every other column of df, in df's column order.
nonfields = ['date', 'companyid', 'eligible', 'gilt3m']
fields = df.columns.tolist()
for column_name in nonfields:
    # list.remove raises ValueError if an expected column is absent.
    fields.remove(column_name)

In [6]:
# Forward-looking columns: new column -> (source column, rows to shift back).
# Each new column holds, per company, the source value found `horizon` rows
# later, so the last `horizon` rows of every company become NaN.
# NOTE(review): this assumes each company's rows are already in chronological
# order with one row per period - confirm against the input data.
forward_specs = {
    'ret_1f1': ('ret', 1),
    'ret_1f3': ('ret_3m', 3),
    'ret_1f6': ('ret_6m', 6),
    'ret_1f12': ('ret_12m', 12),
    'retxs_1f1': ('retxs', 1),
    'retxs_1f3': ('retxs_3m', 3),
    'retxs_1f6': ('retxs_6m', 6),
    'retxs_1f12': ('retxs_12m', 12),
}

frames = [
    company_rows.assign(**{
        target: company_rows[source].shift(-horizon)
        for target, (source, horizon) in forward_specs.items()
    })
    for _, company_rows in df.groupby('companyid')
]

# Stitch the per-company pieces back together and restore the index order.
df = pd.concat(frames, axis=0).sort_index()

In [7]:
# Eligible training has:
#     1 - All data available in period t
#     2 - Return available in period t+1
#
# NOTE(review): only criterion 1 is evaluated below. `fields` is built before
# the forward-return columns (ret_1f1, ...) are added, so the t+1 return is not
# part of this check - confirm whether that is intended.

# eligible = 1 when every column in `fields` is non-missing for the row, else 0.
# Computed in one vectorised assignment rather than through chained indexing
# (df['eligible'][mask] = ...), which is not guaranteed to write back to df and
# is a no-op under pandas Copy-on-Write.
df['eligible'] = df[fields].notna().all(axis=1).astype('int64')

# Summary of data eligibility
print('Total companies eligible at each date:')
eligible_per_date = df.loc[df['eligible'] == 1].groupby('date').size()
for date_key, n_companies in eligible_per_date.items():
    print(date_key, ':', n_companies)

Total companies eligible at each date:
1998-12-31 00:00:00 : 738
1999-01-29 00:00:00 : 707
1999-02-26 00:00:00 : 709
1999-03-31 00:00:00 : 708
1999-04-30 00:00:00 : 700
1999-05-31 00:00:00 : 707
1999-06-30 00:00:00 : 712
1999-07-30 00:00:00 : 714
1999-08-31 00:00:00 : 715
1999-09-30 00:00:00 : 715
1999-10-29 00:00:00 : 721
1999-11-30 00:00:00 : 726
1999-12-31 00:00:00 : 731
2000-01-31 00:00:00 : 701
2000-02-29 00:00:00 : 704
2000-03-31 00:00:00 : 709
2000-04-28 00:00:00 : 700
2000-05-31 00:00:00 : 702
2000-06-30 00:00:00 : 706
2000-07-31 00:00:00 : 708
2000-08-31 00:00:00 : 709
2000-09-29 00:00:00 : 711
2000-10-31 00:00:00 : 716
2000-11-30 00:00:00 : 718
2000-12-29 00:00:00 : 723
2001-01-31 00:00:00 : 711
2001-02-28 00:00:00 : 711
2001-03-30 00:00:00 : 712
2001-04-30 00:00:00 : 712
2001-05-31 00:00:00 : 717
2001-06-29 00:00:00 : 722
2001-07-31 00:00:00 : 718
2001-08-31 00:00:00 : 718
2001-09-28 00:00:00 : 716
2001-10-31 00:00:00 : 712
2001-11-30 00:00:00 : 714
2001-12-31 00:00:00 : 714

In [8]:
# Drop dates with very little data: the last 10 distinct dates are removed.
# "Last" means last in order of first appearance in the index (the values are
# not sorted here).
# NOTE(review): presumably the rows are ordered so that these are the most
# recent dates - confirm.
df = df.set_index('date')
df = df.drop(index=df.index.unique()[-10:]).reset_index()

In [9]:
# Report the last distinct date (in order of appearance) left in the frame.
final_date = df['date'].unique()[-1]
print("Final date in the dataset is:  ", final_date)

Final date in the dataset is:   2018-12-31T00:00:00.000000000


# Adding Fields

In [10]:


# Columns that each receive a squared ('<col>_^2') and cubed ('<col>_^3') term.
# The list order fixes the order in which the new columns are appended to df.
poly_base_columns = [
    'mv',                                               # Market value
    'bvtmv',                                            # Book to market value
    'opmarg',                                           # Operating margin
    'fcf',                                              # Free cash flow
    'roic',                                             # Return on invested capital
    'std_3m', 'std_6m', 'std_12m', 'std_24m',           # Volatility
    'ret_3m', 'ret_6m', 'ret_12m', 'ret_24m',           # Return
    'retxs_3m', 'retxs_6m', 'retxs_12m', 'retxs_24m',   # Excess return
    'dy',                                               # Dividend yield
    'beta',                                             # Beta
    'so',                                               # Shares outstanding
]

for base_col in poly_base_columns:
    for power in (2, 3):
        df['{}_^{}'.format(base_col, power)] = df[base_col] ** power

# Interaction terms
# NOTE(review): 'mv_bvtmv' is a ratio (bvtmv / mv), not a product, and can
# produce inf/NaN where mv is 0. Left unchanged - confirm this is intended.
df['mv_bvtmv'] = df['bvtmv'] / df['mv']
# Fixed: this column was previously retxs_12m * mv, which contradicts its name;
# it now multiplies by bvtmv.
df['retxs12m_bvtmv'] = df['retxs_12m'] * df['bvtmv']


In [12]:
# Treat +/-infinity as missing, then discard every row that has a missing
# value in any column.
df = df.replace([np.inf, -np.inf], np.nan).dropna(how='any')


# Portfolio Sorts

In [13]:
def _leg_mean(frame, sort_col, largest, ret_col='ret_1f1', frac=0.2):
    """Mean of `ret_col` over one extreme slice of `frame` ranked by `sort_col`.

    Parameters
    ----------
    frame : DataFrame holding the rows of a single date.
    sort_col : column used to rank the rows.
    largest : True selects the highest-ranked rows, False the lowest-ranked.
    ret_col : column whose mean is returned.
    frac : fraction of rows in the slice; the count is int(frac * len(frame)).

    Returns
    -------
    float : the mean, or NaN when the slice is empty (e.g. fewer than 5 rows
        at frac=0.2). Ties at the cut-off are all kept (keep='all'), so the
        slice can contain more rows than the nominal count.
    """
    n_rows = int(frac * len(frame.index))
    pick = frame.nlargest if largest else frame.nsmallest
    return pick(n_rows, sort_col, keep='all')[ret_col].mean()


def _date_factor_spreads(frame):
    """Return {factor name: long-leg mean minus short-leg mean} for one date."""
    return {
        factor: (_leg_mean(frame, sort_col, long_is_largest)
                 - _leg_mean(frame, sort_col, not long_is_largest))
        for factor, sort_col, long_is_largest in FACTOR_SORTS
    }


# (factor, ranking column, True if the long leg is the largest-valued slice)
# NOTE(review): the spreads are built from ret_1f1, which is the next row's
# return per company, so the value stored at a date is not observable at that
# date - confirm downstream use accounts for this.
FACTOR_SORTS = [
    ('SMB', 'mv', False),       # Size: small minus big
    ('HML', 'bvtmv', True),     # Value: high minus low
    ('UMD', 'ret_12m', True),   # Momentum: up minus down
    ('RMW', 'opmarg', True),    # Profitability: robust minus weak
    ('CMA', 'roic', True),      # Low investment: highest-roic minus lowest-roic
    ('VOL', 'std_24m', False),  # Volatility: stable minus volatile
]

# Reserve the factor columns at the end of df in a fixed order. Plain
# assignment (rather than df.insert) keeps the cell re-runnable.
for factor_name in ['MKT', 'SMB', 'HML', 'UMD', 'RMW', 'CMA', 'VOL']:
    df[factor_name] = np.nan

# One spread per factor per date, broadcast to every row of that date.
frames = [
    date_rows.assign(**_date_factor_spreads(date_rows))
    for _, date_rows in df.groupby('date')
]

df = pd.concat(frames)

# MKT is just excess return of ftse allshare
df['MKT'] = df['allshare'] - df['gilt3m']

# Create polynomials: (1 + x)^k - 1 for k = 2, 3
for factor_name in ['MKT', 'SMB', 'HML', 'CMA', 'RMW', 'UMD', 'VOL']:
    gross = 1 + df[factor_name]
    df[factor_name + '_^2'] = gross ** 2 - 1
    df[factor_name + '_^3'] = gross ** 3 - 1

# Create interaction terms: (1 + a) * (1 + b) - 1
factor_pairs = [
    ('HML', 'SMB'), ('HML', 'UMD'),
    ('SMB', 'RMW'), ('SMB', 'VOL'),
    ('CMA', 'SMB'), ('CMA', 'RMW'), ('CMA', 'UMD'), ('CMA', 'VOL'),
    ('RMW', 'VOL'),
]
for left, right in factor_pairs:
    df[left + 'x' + right] = (1 + df[left]) * (1 + df[right]) - 1


# Save

In [14]:
# Save the prepared frame and its column labels as two new .npy files
# (master_prepared / columns_prepared); the *_initial input files are not
# overwritten.
# `dataFolder` is not defined in any visible cell of this notebook, so a fresh
# Restart & Run All would fail with NameError. Keep an existing value if one
# is defined, otherwise fall back to the folder the inputs were loaded from.
dataFolder = globals().get('dataFolder', r'../data')
np.save(dataFolder+'/master_prepared.npy', df)
np.save(dataFolder+'/columns_prepared.npy', list(df.columns))