In [57]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Preprocessing

In [58]:
df = pd.read_csv("data.csv")

In [59]:
df = df.drop(columns = ['Unnamed: 0'])

In [60]:
#df['r(t+1)'] = df.groupby('permno')['return'].shift(-1)

In [61]:
df['Date'] = pd.to_datetime(df['Date'])



In [62]:
df_filled = df.copy()
for feature in ['mom1m', 'mom12m', 'chmom', 'mom36m', 'turn', 'dolvol', 'idiovol', 'beta', 'betasq', 'ep', 'sp', 'agr', 'nincr']:
    df_filled[feature] = df_filled.groupby('Date')[feature].transform(lambda x: x.fillna(x.median()))

In [63]:
df_filled.isna().sum()

Date             0
permno           0
return           0
mom1m            0
mom12m           0
chmom            0
indmom           0
mom36m           0
turn             0
mvel1            0
dolvol           0
ill              0
zerotrade        0
baspread         0
retvol           0
idiovol          0
beta             0
betasq           0
ep               0
sp               0
agr              0
nincr            0
return(t-1)    500
dtype: int64

In [64]:
df.loc[:, ['mom1m', 'mom12m', 'chmom', 'mom36m', 'turn', 'dolvol', 'idiovol', 'beta', 'betasq', 'ep', 'sp', 'agr', 'nincr']] = df_filled.loc[:,['mom1m', 'mom12m', 'chmom', 'mom36m', 'turn', 'dolvol', 'idiovol', 'beta', 'betasq', 'ep', 'sp', 'agr', 'nincr']]

In [65]:
mvel_sorted = df.sort_values('mvel1',ascending=False)
top_100 = mvel_sorted.groupby('Date').head(100).reset_index(drop=True)
bottom_100 = mvel_sorted.groupby('Date').tail(100).reset_index(drop=True)
top_100.set_index('Date', inplace=True)
bottom_100.set_index('Date', inplace=True)
df.set_index('Date', inplace=True)

In [66]:
df.isna().sum()

permno           0
return           0
mom1m            0
mom12m           0
chmom            0
indmom           0
mom36m           0
turn             0
mvel1            0
dolvol           0
ill              0
zerotrade        0
baspread         0
retvol           0
idiovol          0
beta             0
betasq           0
ep               0
sp               0
agr              0
nincr            0
return(t-1)    500
dtype: int64

# Splitting Data

In [67]:
training = df[:'2007-01-01'].dropna()
validation = df['2007-01-02':'2011-01-01']
testing = df[:'2020-01-01'].dropna()

df.isna().sum()

permno           0
return           0
mom1m            0
mom12m           0
chmom            0
indmom           0
mom36m           0
turn             0
mvel1            0
dolvol           0
ill              0
zerotrade        0
baspread         0
retvol           0
idiovol          0
beta             0
betasq           0
ep               0
sp               0
agr              0
nincr            0
return(t-1)    500
dtype: int64

In [74]:
X_train = training.drop(columns = ['return'])
y_train = training['return']

In [75]:
X_val = validation.drop(columns = ['return'])
y_val = validation['return']

In [76]:
X_test = testing.drop(columns = ['return'])
y_test = testing['return']

# Implementing the model

In [77]:
#Hyperparameters based on the paper
depths = range(1, 7)  
n_trees = 300         
feature_splits = [3, 5, 10, 20] 

In [78]:
def R_calc(actual, predicted):

    actual = np.array(actual)
    predicted = np.array(predicted).flatten()
    
    predicted = np.clip(predicted, 0, None)
    
    ss_res = np.sum((actual - predicted) ** 2)
    
    ss_tot = np.sum(actual ** 2)
    
    r_squared = 1 - (ss_res / ss_tot)
    
    return r_squared

In [79]:
roos_vals = {}

for max_depth in depths:
    
    for max_features in feature_splits:
        
        rf = RandomForestRegressor(n_estimators = n_trees,
                                   max_depth = max_depth,
                                   max_features = max_features,
                                   random_state = 42)
        
        rf.fit(X_train, y_train)
        
        predictions = rf.predict(X_val)
        
        r_val = R_calc(y_val, predictions)
        
        roos_vals[((max_depth, max_features))] = r_val
        print(max_depth, max_features, r_val)
        
best_hyperparameters = max(roos_vals, key=roos_vals.get)
best_r = roos_vals[best_hyperparameters]

print("Best hyperparameters (max_depth, max_features):", best_hyperparameters)
print("Best R^2 OOS:", best_r)

1 3 0.014958752783622686
1 5 0.016186036584500796
1 10 0.016938802642380013
1 20 0.01652972516238882
2 3 0.021405944336138738
2 5 0.02271292684296433
2 10 0.0236204271641699
2 20 0.022651444900517737
3 3 0.026736488831452188
3 5 0.027322423940248752
3 10 0.02729860050697308
3 20 0.026206202963935832
4 3 0.029550834314363672
4 5 0.030060777825804852
4 10 0.03031041742821916
4 20 0.027963975036472855
5 3 0.03175812070773154
5 5 0.03150346898260903
5 10 0.03116902477173067
5 20 0.027023873450172764
6 3 0.03243898102900289
6 5 0.03195467298294885
6 10 0.03149347690888582
6 20 0.027370297712267844
Best hyperparameters (max_depth, max_features): (6, 3)
Best R^2 OOS: 0.03243898102900289


In [80]:
depth = best_hyperparameters[0]
feature = best_hyperparameters[1]

rf = RandomForestRegressor(n_estimators = n_trees,
                                   max_depth = depth,
                                   max_features = feature,
                                   random_state = 42)
        
rf.fit(X_train, y_train)
        
predictions = rf.predict(X_val)
        
r_val = R_calc(y_val, predictions)
        
print(r_val)

0.03243898102900289


In [81]:
test_pred = rf.predict(X_test)
val = R_calc(y_test, test_pred)
print(val)

0.0456277935717978


# Top 100

In [None]:
df.isna().sum()

In [None]:
training = top_100[:'2007-01-01'].dropna()
validation = top_100['2007-01-02':'2011-01-01']
testing = top_100[:'2020-01-01'].dropna()

top_100.isna().sum()

In [None]:
X_train = training.drop(columns = ['permno', 'return', 'r(t+1)'])
y_train = training['r(t+1)']

In [None]:
X_val = validation.drop(columns = ['permno', 'return', 'r(t+1)'])
y_val = validation['r(t+1)']

In [None]:
X_test = testing.drop(columns = ['permno', 'return', 'r(t+1)'])
y_test = testing['r(t+1)']

In [None]:
roos_vals = {}

for max_depth in depths:
    
    for max_features in feature_splits:
        
        rf = RandomForestRegressor(n_estimators = n_trees,
                                   max_depth = max_depth,
                                   max_features = max_features,
                                   random_state = 42)
        
        rf.fit(X_train, y_train)
        
        predictions = rf.predict(X_val)
        
        r_val = R_calc(y_val, predictions)
        
        roos_vals[((max_depth, max_features))] = r_val
        print(max_depth, max_features, r_val)
        
best_hyperparameters = max(roos_vals, key=roos_vals.get)
best_r = roos_vals[best_hyperparameters]

print("Best hyperparameters (max_depth, max_features):", best_hyperparameters)
print("Best R^2 OOS:", best_r)

In [None]:
depth = best_hyperparameters[0]
feature = best_hyperparameters[1]

rf = RandomForestRegressor(n_estimators = n_trees,
                                   max_depth = depth,
                                   max_features = feature,
                                   random_state = 42)
        
rf.fit(X_train, y_train)
        
predictions = rf.predict(X_val)
        
r_val = R_calc(y_val, predictions)
        
print(r_val)

In [None]:
test_pred = rf.predict(X_test)
val = R_calc(y_test, test_pred)
print(val)

# Bottom 100

In [None]:
training = bottom_100[:'2007-01-01'].dropna()
validation = bottom_100['2007-01-02':'2011-01-01']
testing = bottom_100[:'2020-01-01'].dropna()

In [None]:
X_train = training.drop(columns = ['permno', 'return', 'r(t+1)'])
y_train = training['r(t+1)']

In [None]:
X_val = validation.drop(columns = ['permno', 'return', 'r(t+1)'])
y_val = validation['r(t+1)']

In [None]:
X_test = testing.drop(columns = ['permno', 'return', 'r(t+1)'])
y_test = testing['r(t+1)']

In [None]:
roos_vals = {}

for max_depth in depths:
    
    for max_features in feature_splits:
        
        rf = RandomForestRegressor(n_estimators = n_trees,
                                   max_depth = max_depth,
                                   max_features = max_features,
                                   random_state = 42)
        
        rf.fit(X_train, y_train)
        
        predictions = rf.predict(X_val)
        
        r_val = R_calc(y_val, predictions)
        
        roos_vals[((max_depth, max_features))] = r_val
        print(max_depth, max_features, r_val)
        
best_hyperparameters = max(roos_vals, key=roos_vals.get)
best_r = roos_vals[best_hyperparameters]

print("Best hyperparameters (max_depth, max_features):", best_hyperparameters)
print("Best R^2 OOS:", best_r)

In [None]:
depth = best_hyperparameters[0]
feature = best_hyperparameters[1]

rf = RandomForestRegressor(n_estimators = n_trees,
                                   max_depth = depth,
                                   max_features = feature,
                                   random_state = 42)
        
rf.fit(X_train, y_train)
        
predictions = rf.predict(X_val)
        
r_val = R_calc(y_val, predictions)
        
print(r_val)

In [None]:
test_pred = rf.predict(X_test)
val = R_calc(y_test, test_pred)
print(val)