In [72]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Preprocessing

In [73]:
df = pd.read_csv("data.csv")

In [74]:
df = df.drop(columns = ['Unnamed: 0'])

In [75]:
df['r(t+1)'] = df.groupby('permno')['return'].shift(-1)

In [76]:
df['Date'] = pd.to_datetime(df['Date'])

In [77]:
df_filled = df.copy()
for feature in ['mom1m', 'mom12m', 'chmom', 'mom36m', 'turn', 'dolvol', 'idiovol', 'beta', 'betasq', 'ep', 'sp', 'agr', 'nincr']:
    df_filled[feature] = df_filled.groupby('Date')[feature].transform(lambda x: x.fillna(x.median()))

In [78]:
df_filled.isna().sum()

Date             0
permno           0
return           0
mom1m            0
mom12m           0
chmom            0
indmom           0
mom36m           0
turn             0
mvel1            0
dolvol           0
ill              0
zerotrade        0
baspread         0
retvol           0
idiovol          0
beta             0
betasq           0
ep               0
sp               0
agr              0
nincr            0
return(t-1)    500
r(t+1)         500
dtype: int64

In [79]:
df.loc[:, ['mom1m', 'mom12m', 'chmom', 'mom36m', 'turn', 'dolvol', 'idiovol', 'beta', 'betasq', 'ep', 'sp', 'agr', 'nincr']] = df_filled.loc[:,['mom1m', 'mom12m', 'chmom', 'mom36m', 'turn', 'dolvol', 'idiovol', 'beta', 'betasq', 'ep', 'sp', 'agr', 'nincr']]

In [80]:
mvel_sorted = df.sort_values('mvel1',ascending=False)
top_100 = mvel_sorted.groupby('Date').head(100).reset_index(drop=True)
bottom_100 = mvel_sorted.groupby('Date').tail(100).reset_index(drop=True)
top_100.set_index('Date', inplace=True)
bottom_100.set_index('Date', inplace=True)
df.set_index('Date', inplace=True)

In [81]:
df.isna().sum()

permno           0
return           0
mom1m            0
mom12m           0
chmom            0
indmom           0
mom36m           0
turn             0
mvel1            0
dolvol           0
ill              0
zerotrade        0
baspread         0
retvol           0
idiovol          0
beta             0
betasq           0
ep               0
sp               0
agr              0
nincr            0
return(t-1)    500
r(t+1)         500
dtype: int64

# Splitting Data

In [138]:
def generate_rolling_windows(data, total_years, initial_train_size, val_size):
    total_years = total_years
    windows = []
    initial_train_size = initial_train_size
    val_size = val_size

    # Loop through the years to create rolling windows
    for i in range(initial_train_size + 1, total_years-val_size):
        idx_1 = '20' + str(i).zfill(2) + '-01-01'
        idx_2 = '20' + str(i + val_size).zfill(2) + '-01-01'
        
        training = data[:idx_1].dropna()
        validation = data[idx_1:idx_2]
        testing = data[idx_2:].dropna()
        
        train_com = data[:idx_2].dropna()

        windows.append((training, validation, testing, train_com))
        
    return windows

In [143]:
windows = generate_rolling_windows(df, 19, 6, 4)

# to get training sample of the first split:
windows[0][0]

Unnamed: 0_level_0,permno,return,mom1m,mom12m,chmom,indmom,mom36m,turn,mvel1,dolvol,...,retvol,idiovol,beta,betasq,ep,sp,agr,nincr,return(t-1),r(t+1)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001-02-28,13610,0.161318,-0.190508,0.327877,0.249243,0.018187,-0.285854,1.023598,8.012218e+05,13.974887,...,0.028112,0.058059,0.385289,0.148447,0.019041,1.472909,0.325935,5.0,-0.202242,-0.009186
2001-02-28,13856,0.046940,-0.110820,0.471192,-0.384316,0.115779,-0.027868,0.691164,6.359623e+07,17.810579,...,0.020727,0.043186,0.118710,0.014092,0.039970,0.397105,0.225463,1.0,-0.112756,-0.041659
2001-02-28,13901,0.094426,0.000000,1.261056,0.535897,0.682048,-0.446731,0.829092,9.783550e+07,17.863872,...,0.032157,0.050186,0.021928,0.000481,0.142695,1.148088,-0.024383,2.0,0.010566,0.002179
2001-02-28,13928,-0.084292,0.030857,0.431206,-0.111444,0.682048,-0.021776,0.842627,1.496689e+07,16.501171,...,0.016428,0.038102,0.142332,0.020258,0.051091,1.138525,-0.069288,1.0,0.033114,0.034813
2001-02-28,13936,0.149662,-0.006923,0.957347,0.098914,-0.032018,-0.421654,0.820512,3.525850e+05,12.565110,...,0.045878,0.068790,1.022686,1.045888,0.091598,6.902488,0.000838,1.0,0.014335,0.031713
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2006-12-31,89279,0.048871,0.024940,-0.031547,0.072748,0.098944,-0.027805,0.065080,9.697706e+04,9.063220,...,0.011695,0.018700,0.115212,0.013274,0.060214,0.254025,-0.042325,0.0,0.026024,-0.004154
2006-12-31,89317,-0.054903,0.098958,-0.275862,0.327413,0.163590,0.304287,0.025187,1.166978e+05,7.426549,...,0.016572,0.040691,0.271733,0.073839,-0.130443,0.836804,-0.183381,0.0,0.097319,-0.128977
2006-12-31,89456,0.160222,-0.045905,0.019759,0.342970,0.085635,0.545880,1.244986,3.647852e+05,13.235699,...,0.023065,0.054782,1.819325,3.309943,0.052840,1.019684,0.011990,0.0,-0.041839,-0.024223
2006-12-31,89790,-0.018619,0.080000,-0.213837,-0.241931,0.068689,0.458716,0.814450,5.602500e+04,10.551278,...,0.027389,0.110512,2.490410,6.202142,0.001406,0.353447,-0.066737,1.0,0.084549,-0.058698


In [144]:
training = windows[1][0].dropna()
validation = windows[1][1].dropna()
testing = windows[1][2].dropna()
training_combined = windows[1][3].dropna()

In [145]:
X_train = training.drop(columns = ['permno', 'return', 'r(t+1)'])
y_train = training['r(t+1)']

In [146]:
X_val = validation.drop(columns = ['permno', 'return', 'r(t+1)'])
y_val = validation['r(t+1)']

In [147]:
X_test = testing.drop(columns = ['permno', 'return', 'r(t+1)'])
y_test = testing['r(t+1)']

In [148]:
X_train_combined = training_combined.drop(columns = ['permno', 'return', 'r(t+1)'])
y_train_combined = training_combined['r(t+1)']

# Implementing the model

In [149]:
#Hyperparameters based on the paper
depths = range(1, 7)  
n_trees = 300         
feature_splits = [3, 5, 10, 20] 

In [150]:
def R_calc(actual, predicted):

    actual = np.array(actual)
    predicted = np.array(predicted).flatten()
    
    predicted = np.clip(predicted, 0, None)
    
    ss_res = np.sum((actual - predicted) ** 2)
    
    ss_tot = np.sum(actual ** 2)
    
    r_squared = 1 - (ss_res / ss_tot)
    
    return r_squared

In [151]:
roos_vals = {}

for max_depth in depths:

    for max_features in feature_splits:

        rf = RandomForestRegressor(n_estimators = n_trees,
                                   max_depth = max_depth,
                                   max_features = max_features,
                                   random_state = 42)

        rf.fit(X_train, y_train)

        predictions = rf.predict(X_val)

        r_val = R_calc(y_val, predictions)

        roos_vals[((max_depth, max_features))] = r_val
        print(max_depth, max_features, r_val)

best_hyperparameters = max(roos_vals, key=roos_vals.get)
best_r = roos_vals[best_hyperparameters]

print("Best hyperparameters (max_depth, max_features):", best_hyperparameters)
print("Best R^2 OOS:", best_r)

1 3 0.015934424121926805
1 5 0.016998555995249243
1 10 0.017938935024490066
1 20 0.017515150064221663
2 3 0.019944332264875353
2 5 0.021476930998315602
2 10 0.023209275749752067
2 20 0.022942203113467063
3 3 0.022955123735833416
3 5 0.024723381649466636
3 10 0.02487746867723062
3 20 0.02450776759632134
4 3 0.025348727783720548
4 5 0.023971992320902524
4 10 0.025626734217333635
4 20 0.02373654697005978
5 3 0.025907638696189617
5 5 0.02500752108948845
5 10 0.024469495004495467
5 20 0.022572568982237362
6 3 0.027727769654823597
6 5 0.025563536872103665
6 10 0.023269100755110794
6 20 0.020412093508111795
Best hyperparameters (max_depth, max_features): (6, 3)
Best R^2 OOS: 0.027727769654823597


In [153]:
roos_vals = {}
depth = best_hyperparameters[0]
feature = best_hyperparameters[1]

for i in range(len(windows)):

    training = windows[i][0].dropna()
    validation = windows[i][1].dropna()
    testing = windows[i][2].dropna()
    training_combined = windows[i][3].dropna()
    
    X_train = training.drop(columns = ['permno', 'return', 'r(t+1)'])
    y_train = training['r(t+1)']
    
    X_val = validation.drop(columns = ['permno', 'return', 'r(t+1)'])
    y_val = validation['r(t+1)']
    
    X_test = testing.drop(columns = ['permno', 'return', 'r(t+1)'])
    y_test = testing['r(t+1)']
    
    X_train_combined = training_combined.drop(columns = ['permno', 'return', 'r(t+1)'])
    y_train_combined = training_combined['r(t+1)']


    rf = RandomForestRegressor(n_estimators = n_trees,
                               max_depth = depth,
                               max_features = feature,
                               random_state = 42)

    rf.fit(X_train_combined, y_train_combined)

    predictions = rf.predict(X_val)

    r_val = R_calc(y_val, predictions)

    roos_vals[(i)] = r_val
    print(i, r_val)

    #best_hyperparameters = max(roos_vals, key=roos_vals.get)
    #best_r = roos_vals[best_hyperparameters]

    #print("Best hyperparameters (max_depth, max_features):", best_hyperparameters)
    #print("Best R^2 OOS:", best_r)

0 0.022183029686349442
1 0.020412093508111795
2 0.03386507635323155
3 0.014127609918471729
4 0.008924418941873014
5 0.0029172300575580623
6 0.012955616291498884


KeyboardInterrupt: 

In [None]:
'''
rf = RandomForestRegressor(n_estimators = n_trees,
                                   max_depth = depth,
                                   max_features = feature,
                                   random_state = 42)
        
rf.fit(X_train_combined, y_train_combined)
        
predictions = rf.predict(X_val)
        
r_val = R_calc(y_val, predictions)
        
print(r_val)
'''

In [None]:
test_pred = rf.predict(X_test)
val = R_calc(y_test, test_pred)
print(val)

# Top 100

In [None]:
top_100.isna().sum()

In [None]:
windows = generate_rolling_windows(top_100, 19, 6, 4)

training = windows[1][0].dropna()
validation = windows[1][1].dropna()
testing = windows[1][2].dropna()
training_combined = windows[1][3].dropna()

In [None]:
X_train = training.drop(columns = ['permno', 'return', 'r(t+1)'])
y_train = training['r(t+1)']

In [None]:
X_val = validation.drop(columns = ['permno', 'return', 'r(t+1)'])
y_val = validation['r(t+1)']

In [None]:
X_test = testing.drop(columns = ['permno', 'return', 'r(t+1)'])
y_test = testing['r(t+1)']

In [None]:
X_train_combined = training_combined.drop(columns = ['permno', 'return', 'r(t+1)'])
y_train_combined = training_combined['r(t+1)']

In [None]:
roos_vals = {}

for max_depth in depths:

    for max_features in feature_splits:

        rf = RandomForestRegressor(n_estimators = n_trees,
                                   max_depth = max_depth,
                                   max_features = max_features,
                                   random_state = 42)

        rf.fit(X_train, y_train)

        predictions = rf.predict(X_val)

        r_val = R_calc(y_val, predictions)

        roos_vals[((max_depth, max_features))] = r_val
        print(max_depth, max_features, r_val)

best_hyperparameters = max(roos_vals, key=roos_vals.get)
best_r = roos_vals[best_hyperparameters]

print("Best hyperparameters (max_depth, max_features):", best_hyperparameters)
print("Best R^2 OOS:", best_r)

In [None]:
roos_vals = {}
depth = best_hyperparameters[0]
feature = best_hyperparameters[1]

for i in range(len(windows)):

    training = windows[i][0].dropna()
    validation = windows[i][1].dropna()
    testing = windows[i][2].dropna()
    training_combined = windows[i][3].dropna()
    
    X_train = training.drop(columns = ['permno', 'return', 'r(t+1)'])
    y_train = training['r(t+1)']
    
    X_val = validation.drop(columns = ['permno', 'return', 'r(t+1)'])
    y_val = validation['r(t+1)']
    
    X_test = testing.drop(columns = ['permno', 'return', 'r(t+1)'])
    y_test = testing['r(t+1)']
    
    X_train_combined = training_combined.drop(columns = ['permno', 'return', 'r(t+1)'])
    y_train_combined = training_combined['r(t+1)']


    rf = RandomForestRegressor(n_estimators = n_trees,
                               max_depth = depth,
                               max_features = feature,
                               random_state = 42)

    rf.fit(X_train_combined, y_train_combined)

    predictions = rf.predict(X_val)

    r_val = R_calc(y_val, predictions)

    roos_vals[(i)] = r_val
    print(i, r_val)

    #best_hyperparameters = max(roos_vals, key=roos_vals.get)
    #best_r = roos_vals[best_hyperparameters]

    #print("Best hyperparameters (max_depth, max_features):", best_hyperparameters)
    #print("Best R^2 OOS:", best_r)

In [None]:
'''
rf = RandomForestRegressor(n_estimators = n_trees,
                                   max_depth = depth,
                                   max_features = feature,
                                   random_state = 42)
        
rf.fit(X_train_combined, y_train_combined)
        
predictions = rf.predict(X_val)
        
r_val = R_calc(y_val, predictions)
        
print(r_val)
'''

In [None]:
test_pred = rf.predict(X_test)
val = R_calc(y_test, test_pred)
print(val)

# Bottom 100

In [None]:
windows = generate_rolling_windows(bottom_100, 19, 6, 4)

training = windows[0][0].dropna()
validation = windows[0][1].dropna()
testing = windows[0][2].dropna()
training_combined = windows[0][3].dropna()

In [None]:
X_train = training.drop(columns = ['permno', 'return', 'r(t+1)'])
y_train = training['r(t+1)']

In [None]:
X_val = validation.drop(columns = ['permno', 'return', 'r(t+1)'])
y_val = validation['r(t+1)']

In [None]:
X_test = testing.drop(columns = ['permno', 'return', 'r(t+1)'])
y_test = testing['r(t+1)']

In [None]:
X_train_combined = training_combined.drop(columns = ['permno', 'return', 'r(t+1)'])
y_train_combined = training_combined['r(t+1)']

In [None]:
roos_vals = {}

for max_depth in depths:

    for max_features in feature_splits:

        rf = RandomForestRegressor(n_estimators = n_trees,
                                   max_depth = max_depth,
                                   max_features = max_features,
                                   random_state = 42)

        rf.fit(X_train, y_train)

        predictions = rf.predict(X_val)

        r_val = R_calc(y_val, predictions)

        roos_vals[((max_depth, max_features))] = r_val
        print(max_depth, max_features, r_val)

best_hyperparameters = max(roos_vals, key=roos_vals.get)
best_r = roos_vals[best_hyperparameters]

print("Best hyperparameters (max_depth, max_features):", best_hyperparameters)
print("Best R^2 OOS:", best_r)

In [None]:
roos_vals = {}
depth = best_hyperparameters[0]
feature = best_hyperparameters[1]

for i in range(len(windows)):

    training = windows[i][0].dropna()
    validation = windows[i][1].dropna()
    testing = windows[i][2].dropna()
    training_combined = windows[i][3].dropna()
    
    X_train = training.drop(columns = ['permno', 'return', 'r(t+1)'])
    y_train = training['r(t+1)']
    
    X_val = validation.drop(columns = ['permno', 'return', 'r(t+1)'])
    y_val = validation['r(t+1)']
    
    X_test = testing.drop(columns = ['permno', 'return', 'r(t+1)'])
    y_test = testing['r(t+1)']
    
    X_train_combined = training_combined.drop(columns = ['permno', 'return', 'r(t+1)'])
    y_train_combined = training_combined['r(t+1)']


    rf = RandomForestRegressor(n_estimators = n_trees,
                               max_depth = depth,
                               max_features = feature,
                               random_state = 42)

    rf.fit(X_train_combined, y_train_co)

    predictions = rf.predict(X_val)

    r_val = R_calc(y_val, predictions)

    roos_vals[(i)] = r_val
    print(i, r_val)

    #best_hyperparameters = max(roos_vals, key=roos_vals.get)
    #best_r = roos_vals[best_hyperparameters]

    #print("Best hyperparameters (max_depth, max_features):", best_hyperparameters)
    #print("Best R^2 OOS:", best_r)

In [None]:

rf = RandomForestRegressor(n_estimators = n_trees,
                                   max_depth = depth,
                                   max_features = feature,
                                   random_state = 42)
        
rf.fit(X_train_combined, y_train_combined)
        
predictions = rf.predict(X_val)
        
r_val = R_calc(y_val, predictions)
        
print(r_val)

In [None]:
test_pred = rf.predict(X_test)
val = R_calc(y_test, test_pred)
print(val)