In [72]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Preprocessing

In [73]:
df = pd.read_csv("data.csv")

In [74]:
df = df.drop(columns = ['Unnamed: 0'])

In [75]:
df['r(t+1)'] = df.groupby('permno')['return'].shift(-1)

In [76]:
df['Date'] = pd.to_datetime(df['Date'])

In [77]:
df_filled = df.copy()
for feature in ['mom1m', 'mom12m', 'chmom', 'mom36m', 'turn', 'dolvol', 'idiovol', 'beta', 'betasq', 'ep', 'sp', 'agr', 'nincr']:
    df_filled[feature] = df_filled.groupby('Date')[feature].transform(lambda x: x.fillna(x.median()))

In [78]:
df_filled.isna().sum()

Date             0
permno           0
return           0
mom1m            0
mom12m           0
chmom            0
indmom           0
mom36m           0
turn             0
mvel1            0
dolvol           0
ill              0
zerotrade        0
baspread         0
retvol           0
idiovol          0
beta             0
betasq           0
ep               0
sp               0
agr              0
nincr            0
return(t-1)    500
r(t+1)         500
dtype: int64

In [79]:
df.loc[:, ['mom1m', 'mom12m', 'chmom', 'mom36m', 'turn', 'dolvol', 'idiovol', 'beta', 'betasq', 'ep', 'sp', 'agr', 'nincr']] = df_filled.loc[:,['mom1m', 'mom12m', 'chmom', 'mom36m', 'turn', 'dolvol', 'idiovol', 'beta', 'betasq', 'ep', 'sp', 'agr', 'nincr']]

In [80]:
mvel_sorted = df.sort_values('mvel1',ascending=False)
top_100 = mvel_sorted.groupby('Date').head(100).reset_index(drop=True)
bottom_100 = mvel_sorted.groupby('Date').tail(100).reset_index(drop=True)
top_100.set_index('Date', inplace=True)
bottom_100.set_index('Date', inplace=True)
df.set_index('Date', inplace=True)

In [81]:
df.isna().sum()

permno           0
return           0
mom1m            0
mom12m           0
chmom            0
indmom           0
mom36m           0
turn             0
mvel1            0
dolvol           0
ill              0
zerotrade        0
baspread         0
retvol           0
idiovol          0
beta             0
betasq           0
ep               0
sp               0
agr              0
nincr            0
return(t-1)    500
r(t+1)         500
dtype: int64

# Splitting Data

In [138]:
def generate_rolling_windows(data, total_years, initial_train_size, val_size):
    total_years = total_years
    windows = []
    initial_train_size = initial_train_size
    val_size = val_size

    # Loop through the years to create rolling windows
    for i in range(initial_train_size + 1, total_years-val_size):
        idx_1 = '20' + str(i).zfill(2) + '-01-01'
        idx_2 = '20' + str(i + val_size).zfill(2) + '-01-01'
        
        training = data[:idx_1].dropna()
        validation = data[idx_1:idx_2]
        testing = data[idx_2:].dropna()
        
        train_com = data[:idx_2].dropna()

        windows.append((training, validation, testing, train_com))
        
    return windows

In [177]:
windows = generate_rolling_windows(df, 19, 6, 4)

# to get training sample of the first split:
windows[0][0]

Unnamed: 0_level_0,permno,return,mom1m,mom12m,chmom,indmom,mom36m,turn,mvel1,dolvol,...,retvol,idiovol,beta,betasq,ep,sp,agr,nincr,return(t-1),r(t+1)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001-02-28,13610,0.161318,-0.190508,0.327877,0.249243,0.018187,-0.285854,1.023598,8.012218e+05,13.974887,...,0.028112,0.058059,0.385289,0.148447,0.019041,1.472909,0.325935,5.0,-0.202242,-0.009186
2001-02-28,13856,0.046940,-0.110820,0.471192,-0.384316,0.115779,-0.027868,0.691164,6.359623e+07,17.810579,...,0.020727,0.043186,0.118710,0.014092,0.039970,0.397105,0.225463,1.0,-0.112756,-0.041659
2001-02-28,13901,0.094426,0.000000,1.261056,0.535897,0.682048,-0.446731,0.829092,9.783550e+07,17.863872,...,0.032157,0.050186,0.021928,0.000481,0.142695,1.148088,-0.024383,2.0,0.010566,0.002179
2001-02-28,13928,-0.084292,0.030857,0.431206,-0.111444,0.682048,-0.021776,0.842627,1.496689e+07,16.501171,...,0.016428,0.038102,0.142332,0.020258,0.051091,1.138525,-0.069288,1.0,0.033114,0.034813
2001-02-28,13936,0.149662,-0.006923,0.957347,0.098914,-0.032018,-0.421654,0.820512,3.525850e+05,12.565110,...,0.045878,0.068790,1.022686,1.045888,0.091598,6.902488,0.000838,1.0,0.014335,0.031713
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2006-12-31,89279,0.048871,0.024940,-0.031547,0.072748,0.098944,-0.027805,0.065080,9.697706e+04,9.063220,...,0.011695,0.018700,0.115212,0.013274,0.060214,0.254025,-0.042325,0.0,0.026024,-0.004154
2006-12-31,89317,-0.054903,0.098958,-0.275862,0.327413,0.163590,0.304287,0.025187,1.166978e+05,7.426549,...,0.016572,0.040691,0.271733,0.073839,-0.130443,0.836804,-0.183381,0.0,0.097319,-0.128977
2006-12-31,89456,0.160222,-0.045905,0.019759,0.342970,0.085635,0.545880,1.244986,3.647852e+05,13.235699,...,0.023065,0.054782,1.819325,3.309943,0.052840,1.019684,0.011990,0.0,-0.041839,-0.024223
2006-12-31,89790,-0.018619,0.080000,-0.213837,-0.241931,0.068689,0.458716,0.814450,5.602500e+04,10.551278,...,0.027389,0.110512,2.490410,6.202142,0.001406,0.353447,-0.066737,1.0,0.084549,-0.058698


In [178]:
training = windows[0][0].dropna()
validation = windows[0][1].dropna()
testing = windows[0][2].dropna()
training_combined = windows[0][3].dropna()

In [179]:
X_train = training.drop(columns = ['permno', 'return', 'r(t+1)'])
y_train = training['r(t+1)']

In [180]:
X_val = validation.drop(columns = ['permno', 'return', 'r(t+1)'])
y_val = validation['r(t+1)']

In [181]:
X_test = testing.drop(columns = ['permno', 'return', 'r(t+1)'])
y_test = testing['r(t+1)']

In [182]:
X_train_combined = training_combined.drop(columns = ['permno', 'return', 'r(t+1)'])
y_train_combined = training_combined['r(t+1)']

# Implementing the model

In [183]:
#Hyperparameters based on the paper
depths = range(1, 7)  
n_trees = 300         
feature_splits = [3, 5, 10, 20] 

In [184]:
def R_calc(actual, predicted):

    actual = np.array(actual)
    predicted = np.array(predicted).flatten()
    
    predicted = np.clip(predicted, 0, None)
    
    ss_res = np.sum((actual - predicted) ** 2)
    
    ss_tot = np.sum(actual ** 2)
    
    r_squared = 1 - (ss_res / ss_tot)
    
    return r_squared

In [185]:
roos_vals = {}

for max_depth in depths:

    for max_features in feature_splits:

        rf = RandomForestRegressor(n_estimators = n_trees,
                                   max_depth = max_depth,
                                   max_features = max_features,
                                   random_state = 42)

        rf.fit(X_train, y_train)

        predictions = rf.predict(X_val)

        r_val = R_calc(y_val, predictions)

        roos_vals[((max_depth, max_features))] = r_val
        print(max_depth, max_features, r_val)

best_hyperparameters = max(roos_vals, key=roos_vals.get)
best_r = roos_vals[best_hyperparameters]

print("Best hyperparameters (max_depth, max_features):", best_hyperparameters)
print("Best R^2 OOS:", best_r)

1 3 0.011232871152665247
1 5 0.012425960639786182
1 10 0.013882363673785036
1 20 0.014357976541994999
2 3 0.016195901271945146
2 5 0.017414537733786983
2 10 0.019512067119641707
2 20 0.019917211521390032
3 3 0.0194652238358084
3 5 0.021611248892600132
3 10 0.02253278765665978
3 20 0.022198044375804193
4 3 0.02189419987239749
4 5 0.021506322456899873
4 10 0.022259438461185943
4 20 0.023540961133823757
5 3 0.022350696414358495
5 5 0.023350831444073572
5 10 0.022903638673507754
5 20 0.023843663108134017
6 3 0.024380405193671795
6 5 0.022500495844439583
6 10 0.02316385450230385
6 20 0.022183029686349442
Best hyperparameters (max_depth, max_features): (6, 3)
Best R^2 OOS: 0.024380405193671795


In [186]:
'''
roos_vals = {}
depth = best_hyperparameters[0]
feature = best_hyperparameters[1]

for i in range(len(windows)):

    training = windows[i][0].dropna()
    validation = windows[i][1].dropna()
    testing = windows[i][2].dropna()
    training_combined = windows[i][3].dropna()
    
    X_train = training.drop(columns = ['permno', 'return', 'r(t+1)'])
    y_train = training['r(t+1)']
    
    X_val = validation.drop(columns = ['permno', 'return', 'r(t+1)'])
    y_val = validation['r(t+1)']
    
    X_test = testing.drop(columns = ['permno', 'return', 'r(t+1)'])
    y_test = testing['r(t+1)']
    
    X_train_combined = training_combined.drop(columns = ['permno', 'return', 'r(t+1)'])
    y_train_combined = training_combined['r(t+1)']

    rf = RandomForestRegressor(n_estimators = n_trees,
                               max_depth = depth,
                               max_features = feature,
                               random_state = 42)

    rf.fit(X_train_combined, y_train_combined)

    predictions = rf.predict(X_val)

    r_val = R_calc(y_val, predictions)

    roos_vals[(i)] = r_val
    print(i, r_val)

    #best_hyperparameters = max(roos_vals, key=roos_vals.get)
    #best_r = roos_vals[best_hyperparameters]

    #print("Best hyperparameters (max_depth, max_features):", best_hyperparameters)
    #print("Best R^2 OOS:", best_r)
'''

'\nroos_vals = {}\ndepth = best_hyperparameters[0]\nfeature = best_hyperparameters[1]\n\nfor i in range(len(windows)):\n\n    training = windows[i][0].dropna()\n    validation = windows[i][1].dropna()\n    testing = windows[i][2].dropna()\n    training_combined = windows[i][3].dropna()\n    \n    X_train = training.drop(columns = [\'permno\', \'return\', \'r(t+1)\'])\n    y_train = training[\'r(t+1)\']\n    \n    X_val = validation.drop(columns = [\'permno\', \'return\', \'r(t+1)\'])\n    y_val = validation[\'r(t+1)\']\n    \n    X_test = testing.drop(columns = [\'permno\', \'return\', \'r(t+1)\'])\n    y_test = testing[\'r(t+1)\']\n    \n    X_train_combined = training_combined.drop(columns = [\'permno\', \'return\', \'r(t+1)\'])\n    y_train_combined = training_combined[\'r(t+1)\']\n\n    rf = RandomForestRegressor(n_estimators = n_trees,\n                               max_depth = depth,\n                               max_features = feature,\n                               random_s

In [187]:
depth = best_hyperparameters[0]
feature = best_hyperparameters[1]
rf = RandomForestRegressor(n_estimators = n_trees,
                                   max_depth = depth,
                                   max_features = feature,
                                   random_state = 42)
        
rf.fit(X_train_combined, y_train_combined)
        
predictions = rf.predict(X_val)
        
r_val = R_calc(y_val, predictions)
        
print(r_val)

0.08992928116823606


In [188]:
test_pred = rf.predict(X_test)
val = R_calc(y_test, test_pred)
print(val)

0.007340982707352506


# Top 100

In [189]:
top_100.isna().sum()

permno           0
return           0
mom1m            0
mom12m           0
chmom            0
indmom           0
mom36m           0
turn             0
mvel1            0
dolvol           0
ill              0
zerotrade        0
baspread         0
retvol           0
idiovol          0
beta             0
betasq           0
ep               0
sp               0
agr              0
nincr            0
return(t-1)    100
r(t+1)         100
dtype: int64

In [190]:
windows = generate_rolling_windows(top_100, 19, 6, 4)

training = windows[0][0].dropna()
validation = windows[0][1].dropna()
testing = windows[0][2].dropna()
training_combined = windows[0][3].dropna()

  validation = data[idx_1:idx_2]
  testing = data[idx_2:].dropna()


In [191]:
X_train = training.drop(columns = ['permno', 'return', 'r(t+1)'])
y_train = training['r(t+1)']

In [192]:
X_val = validation.drop(columns = ['permno', 'return', 'r(t+1)'])
y_val = validation['r(t+1)']

In [193]:
X_test = testing.drop(columns = ['permno', 'return', 'r(t+1)'])
y_test = testing['r(t+1)']

In [194]:
X_train_combined = training_combined.drop(columns = ['permno', 'return', 'r(t+1)'])
y_train_combined = training_combined['r(t+1)']

In [195]:
roos_vals = {}

for max_depth in depths:

    for max_features in feature_splits:

        rf = RandomForestRegressor(n_estimators = n_trees,
                                   max_depth = max_depth,
                                   max_features = max_features,
                                   random_state = 42)

        rf.fit(X_train, y_train)

        predictions = rf.predict(X_val)

        r_val = R_calc(y_val, predictions)

        roos_vals[((max_depth, max_features))] = r_val
        print(max_depth, max_features, r_val)

best_hyperparameters = max(roos_vals, key=roos_vals.get)
best_r = roos_vals[best_hyperparameters]

print("Best hyperparameters (max_depth, max_features):", best_hyperparameters)
print("Best R^2 OOS:", best_r)

1 3 -0.000861724093917271
1 5 -0.0013528519368128933
1 10 -0.0012850165960602844
1 20 -0.0014453063287205303
2 3 -0.0006605045583853997
2 5 -0.0009320465022950586
2 10 -0.0017541836844991998
2 20 -0.001883846597476424
3 3 -0.00022194811165832107
3 5 -0.0007598854200716598
3 10 -0.0015595299295763088
3 20 -0.0021621480531714976
4 3 0.00012513923023471651
4 5 0.00016935057953681998
4 10 -0.0007160422335297234
4 20 -0.0016134036326875822
5 3 0.00010828116034022894
5 5 -0.000916385533916042
5 10 -0.0006904614146558963
5 20 -0.0007909864521535859
6 3 -5.273112472248265e-05
6 5 0.0006098480663929706
6 10 0.0003689406098265069
6 20 0.0012477258270086056
Best hyperparameters (max_depth, max_features): (6, 20)
Best R^2 OOS: 0.0012477258270086056


In [None]:
'''
roos_vals = {}


for i in range(len(windows)):

    training = windows[i][0].dropna()
    validation = windows[i][1].dropna()
    testing = windows[i][2].dropna()
    training_combined = windows[i][3].dropna()
    
    X_train = training.drop(columns = ['permno', 'return', 'r(t+1)'])
    y_train = training['r(t+1)']
    
    X_val = validation.drop(columns = ['permno', 'return', 'r(t+1)'])
    y_val = validation['r(t+1)']
    
    X_test = testing.drop(columns = ['permno', 'return', 'r(t+1)'])
    y_test = testing['r(t+1)']
    
    X_train_combined = training_combined.drop(columns = ['permno', 'return', 'r(t+1)'])
    y_train_combined = training_combined['r(t+1)']


    rf = RandomForestRegressor(n_estimators = n_trees,
                               max_depth = depth,
                               max_features = feature,
                               random_state = 42)

    rf.fit(X_train_combined, y_train_combined)

    predictions = rf.predict(X_val)

    r_val = R_calc(y_val, predictions)

    roos_vals[(i)] = r_val
    print(i, r_val)

    #best_hyperparameters = max(roos_vals, key=roos_vals.get)
    #best_r = roos_vals[best_hyperparameters]

    #print("Best hyperparameters (max_depth, max_features):", best_hyperparameters)
    #print("Best R^2 OOS:", best_r)
'''

In [196]:
depth = best_hyperparameters[0]
feature = best_hyperparameters[1]
rf = RandomForestRegressor(n_estimators = n_trees,
                                   max_depth = depth,
                                   max_features = feature,
                                   random_state = 42)
        
rf.fit(X_train_combined, y_train_combined)
        
predictions = rf.predict(X_val)
        
r_val = R_calc(y_val, predictions)
        
print(r_val)

0.0559743943218215


In [197]:
test_pred = rf.predict(X_test)
val = R_calc(y_test, test_pred)
print(val)

0.019976562152977162


# Bottom 100

In [198]:
windows = generate_rolling_windows(bottom_100, 19, 6, 4)

training = windows[0][0].dropna()
validation = windows[0][1].dropna()
testing = windows[0][2].dropna()
training_combined = windows[0][3].dropna()

  validation = data[idx_1:idx_2]
  testing = data[idx_2:].dropna()


In [199]:
X_train = training.drop(columns = ['permno', 'return', 'r(t+1)'])
y_train = training['r(t+1)']

In [200]:
X_val = validation.drop(columns = ['permno', 'return', 'r(t+1)'])
y_val = validation['r(t+1)']

In [201]:
X_test = testing.drop(columns = ['permno', 'return', 'r(t+1)'])
y_test = testing['r(t+1)']

In [202]:
X_train_combined = training_combined.drop(columns = ['permno', 'return', 'r(t+1)'])
y_train_combined = training_combined['r(t+1)']

In [203]:
roos_vals = {}

for max_depth in depths:

    for max_features in feature_splits:

        rf = RandomForestRegressor(n_estimators = n_trees,
                                   max_depth = max_depth,
                                   max_features = max_features,
                                   random_state = 42)

        rf.fit(X_train, y_train)

        predictions = rf.predict(X_val)

        r_val = R_calc(y_val, predictions)

        roos_vals[((max_depth, max_features))] = r_val
        print(max_depth, max_features, r_val)

best_hyperparameters = max(roos_vals, key=roos_vals.get)
best_r = roos_vals[best_hyperparameters]

print("Best hyperparameters (max_depth, max_features):", best_hyperparameters)
print("Best R^2 OOS:", best_r)

1 3 0.03147756117830669
1 5 0.03367965731601996
1 10 0.03455543955720375
1 20 0.033253943280103226
2 3 0.04066655896404536
2 5 0.042781972770092436
2 10 0.04148429125725195
2 20 0.04212885411245071
3 3 0.04193022671334845
3 5 0.04808717512952043
3 10 0.044589292359935695
3 20 0.0442671988198271
4 3 0.0483481224954222
4 5 0.048920698311961
4 10 0.044844204454248304
4 20 0.0428427263824831
5 3 0.0487136147250099
5 5 0.04708616571249491
5 10 0.044029259264825704
5 20 0.04376178545793452
6 3 0.047867112207840457
6 5 0.04922239826930841
6 10 0.04255178593828335
6 20 0.0435411424000679
Best hyperparameters (max_depth, max_features): (6, 5)
Best R^2 OOS: 0.04922239826930841


In [None]:
'''
roos_vals = {}
depth = best_hyperparameters[0]
feature = best_hyperparameters[1]

for i in range(len(windows)):

    training = windows[i][0].dropna()
    validation = windows[i][1].dropna()
    testing = windows[i][2].dropna()
    training_combined = windows[i][3].dropna()
    
    X_train = training.drop(columns = ['permno', 'return', 'r(t+1)'])
    y_train = training['r(t+1)']
    
    X_val = validation.drop(columns = ['permno', 'return', 'r(t+1)'])
    y_val = validation['r(t+1)']
    
    X_test = testing.drop(columns = ['permno', 'return', 'r(t+1)'])
    y_test = testing['r(t+1)']
    
    X_train_combined = training_combined.drop(columns = ['permno', 'return', 'r(t+1)'])
    y_train_combined = training_combined['r(t+1)']


    rf = RandomForestRegressor(n_estimators = n_trees,
                               max_depth = depth,
                               max_features = feature,
                               random_state = 42)

    rf.fit(X_train_combined, y_train_combined)

    predictions = rf.predict(X_val)

    r_val = R_calc(y_val, predictions)

    roos_vals[(i)] = r_val
    print(i, r_val)

    #best_hyperparameters = max(roos_vals, key=roos_vals.get)
    #best_r = roos_vals[best_hyperparameters]

    #print("Best hyperparameters (max_depth, max_features):", best_hyperparameters)
    #print("Best R^2 OOS:", best_r)
'''

In [None]:
depth = best_hyperparameters[0]
feature = best_hyperparameters[1]

rf = RandomForestRegressor(n_estimators = n_trees,
                                   max_depth = depth,
                                   max_features = feature,
                                   random_state = 42)
        
rf.fit(X_train_combined, y_train_combined)
        
predictions = rf.predict(X_val)
        
r_val = R_calc(y_val, predictions)
        
print(r_val)


In [None]:
test_pred = rf.predict(X_test)
val = R_calc(y_test, test_pred)
print(val)