In [6]:
# from pathlib import Path
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import time

In [3]:
# linear model
from sklearn.linear_model import ElasticNet, Lasso, Ridge, BayesianRidge, HuberRegressor, RANSACRegressor

# ensemble learning model
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor , ExtraTreesRegressor , BaggingRegressor

# tree model
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor

# boosting model
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# other models
# from sklearn.svm import SVR, LinearSVR, NuSVR            
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.neural_network import MLPRegressor
# from sklearn.gaussian_process import GaussianProcessRegressor

In [4]:
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV ,cross_val_score    
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

from scipy.stats import spearmanr
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
# from yellowbrick.model_selection import ValidationCurve, LearningCurve

# Load Data

In [16]:
df_all = pd.read_pickle('data/merged data7.pkl.zip')
df_all.reset_index(drop=True, inplace=True)

In [None]:
df_all['Ticker_code'] = df_all['Ticker'].astype('category').cat.codes
df_all['date_period'] = (df_all['date'] - min(df_all['date'])).dt.days + 1
df_all.sort_values('date', inplace=True)

In [20]:
df_data = df_all[list(df_all.columns[3:])]
df_target = df_all['alpha']

# Time series split

In [10]:
timecv = TimeSeriesSplit(n_splits=5)

In [11]:
for train_index, test_index in TimeSeriesSplit(n_splits=5).split(df_data, df_target):
    print("TRAIN:", train_index, "TEST:", test_index)

TRAIN: [    0     1     2 ... 49480 49481 49482] TEST: [49483 49484 49485 ... 98961 98962 98963]
TRAIN: [    0     1     2 ... 98961 98962 98963] TEST: [ 98964  98965  98966 ... 148442 148443 148444]
TRAIN: [     0      1      2 ... 148442 148443 148444] TEST: [148445 148446 148447 ... 197923 197924 197925]
TRAIN: [     0      1      2 ... 197923 197924 197925] TEST: [197926 197927 197928 ... 247404 247405 247406]
TRAIN: [     0      1      2 ... 247404 247405 247406] TEST: [247407 247408 247409 ... 296885 296886 296887]


# Linear Model

In [52]:
print("Elastic Net : ", cross_val_score(ElasticNet(), df_data, df_target, cv=timecv, scoring='neg_mean_squared_error').mean())

Elastic Net :  -1.9837686356055784


In [53]:
print("Lasso : ", cross_val_score(Lasso(), df_data, df_target, cv=timecv, scoring='neg_mean_squared_error').mean())

Lasso :  -2.018928053162672


In [21]:
print("Ridge : ", cross_val_score(Ridge(), df_data, df_target, cv=timecv, scoring='neg_mean_squared_error').mean())

Ridge :  -1.675863382916118


In [55]:
print("Bayesian Ridge : ", cross_val_score(BayesianRidge(), df_data, df_target, cv=timecv, scoring='neg_mean_squared_error').mean())

Bayesian Ridge :  -1.6765307044329785


In [56]:
print("Huber Regressor : ", cross_val_score(HuberRegressor(), df_data, df_target, cv=timecv, scoring='neg_mean_squared_error').mean())

Huber Regressor :  -2.1682435886467446


In [57]:
print("RANSAC Regressor : ", cross_val_score(RANSACRegressor(), df_data, df_target, cv=timecv, scoring='neg_mean_squared_error').mean())

RANSAC Regressor :  -12.732002939734054


# Boosting

In [50]:
MSE_raw_xgb = cross_val_score(XGBRegressor(), df_data, df_target, cv=timecv, scoring='neg_mean_squared_error').mean()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030840 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21363
[LightGBM] [Info] Number of data points in the train set: 49483, number of used features: 89
[LightGBM] [Info] Start training from score -0.903412
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.070485 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21548
[LightGBM] [Info] Number of data points in the train set: 98964, number of used features: 89
[LightGBM] [Info] Start training from score -0.540251
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.093886 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21613
[LightGBM] [Info] Number of data points in the train set: 148445, number of used features: 89
[LightGBM] [Info] Sta

In [58]:
MSE_raw_lgbm = cross_val_score(LGBMRegressor(), df_data, df_target, cv=timecv, scoring='neg_mean_squared_error').mean()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040923 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21363
[LightGBM] [Info] Number of data points in the train set: 49483, number of used features: 89
[LightGBM] [Info] Start training from score -0.903412
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.061093 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21548
[LightGBM] [Info] Number of data points in the train set: 98964, number of used features: 89
[LightGBM] [Info] Start training from score -0.540251
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.098931 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21613
[LightGBM] [Info] Number of data points in the train set: 148445, number of used features: 89
[LightGBM] [Info] Sta

In [59]:
MSE_raw_cat = cross_val_score(CatBoostRegressor(), df_data, df_target, cv=timecv, scoring='neg_mean_squared_error').mean()

Learning rate set to 0.075841
0:	learn: 0.8872789	total: 24.6ms	remaining: 24.5s
1:	learn: 0.8608732	total: 51.1ms	remaining: 25.5s
2:	learn: 0.8373245	total: 73.5ms	remaining: 24.4s
3:	learn: 0.8127959	total: 95.8ms	remaining: 23.9s
4:	learn: 0.7922366	total: 117ms	remaining: 23.2s
5:	learn: 0.7728081	total: 143ms	remaining: 23.8s
6:	learn: 0.7556891	total: 257ms	remaining: 36.4s
7:	learn: 0.7389447	total: 308ms	remaining: 38.2s
8:	learn: 0.7220232	total: 344ms	remaining: 37.9s
9:	learn: 0.7083627	total: 364ms	remaining: 36.1s
10:	learn: 0.6949522	total: 386ms	remaining: 34.7s
11:	learn: 0.6824405	total: 407ms	remaining: 33.5s
12:	learn: 0.6700355	total: 426ms	remaining: 32.3s
13:	learn: 0.6575878	total: 449ms	remaining: 31.6s
14:	learn: 0.6478142	total: 475ms	remaining: 31.2s
15:	learn: 0.6387189	total: 499ms	remaining: 30.7s
16:	learn: 0.6284324	total: 527ms	remaining: 30.5s
17:	learn: 0.6175164	total: 553ms	remaining: 30.2s
18:	learn: 0.6092152	total: 574ms	remaining: 29.6s
19:	lea

In [51]:
print("XGB Regressor : ", MSE_raw_xgb)
print("LGBM Regressor : ", MSE_raw_lgbm)
print("Cat Boost Regressor : ", MSE_raw_cat)

XGB Regressor :  -1.111824116155646
LGBM Regressor :  -0.9487831526996976
Cat Boost Regressor :  -0.9628833404334716


# Ensemble learning

In [None]:
cross_val_score(RandomForestRegressor(), df_data, df_target, cv=timecv, scoring='neg_mean_squared_error').mean()

KeyboardInterrupt: 

In [None]:
print("Gradient Boosting Regressor : ", cross_val_score(GradientBoostingRegressor(), df_data, df_target, cv=timecv, scoring='neg_mean_squared_error').mean())

In [None]:
print("Extra Trees Regressor : ", cross_val_score(ExtraTreesRegressor(), df_data, df_target, cv=timecv, scoring='neg_mean_squared_error').mean())

In [None]:
print("Bagging Regressor : ", cross_val_score(BaggingRegressor(), df_data, df_target, cv=timecv, scoring='neg_mean_squared_error').mean())

# Tree Model

In [2]:
print("Decision Tree Regressor : ", cross_val_score(DecisionTreeRegressor(), df_data, df_target, cv=timecv, scoring='neg_mean_squared_error').mean())

NameError: name 'cross_val_score' is not defined

In [None]:
print("Extra Tree Regressor : ", cross_val_score(ExtraTreeRegressor(), df_data, df_target, cv=timecv, scoring='neg_mean_squared_error').mean())

# Other ML model

In [60]:
print("KNeighbors Regressor : ", cross_val_score(KNeighborsRegressor(), df_data, df_target, cv=timecv, scoring='neg_mean_squared_error').mean())

KNeighbors Regressor :  -2.622598891537411


In [62]:
print("MLP Regressor : ", cross_val_score(MLPRegressor(), df_data, df_target, cv=timecv, scoring='neg_mean_squared_error').mean())

MLP Regressor :  -1341566718400.6562
