# Imports

In [3]:
# Importing required libraries

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV, PredefinedSplit
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix, classification_report, make_scorer, balanced_accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [45]:
# Load the feature matrices and the baseline labels. Every file is indexed by
# its 'Date' column, which is parsed into datetimes on load.
reg_signals = pd.read_csv(
    'data/indicators.csv',
    index_col='Date',
    parse_dates=True,
)
poly_features = pd.read_csv(
    'data/indicators_w_polyterms.csv',
    index_col='Date',
    parse_dates=True,
)
labels = pd.read_csv(
    'data/Y_Matrix.csv',
    index_col='Date',
    parse_dates=True,
)

# One alternative label file exists per constant; they are stored in a dict keyed by that constant.
consts = [0.7, 0.8, 0.9, 1.1, 1.2, 1.3]
labels_const = dict()
for const in consts:
    labels_const[const] = pd.read_csv(
        f'data/Y_Matrix_{const}.csv',
        index_col='Date',
        parse_dates=True,
    )


# Model Preparation

In [46]:
# Rolling-split configuration, both expressed in years relative to the date a model starts trading:
#   window   - length of the span right before the start date that is held out for tuning
#   lookback - how far before the start date the training span begins
window, lookback = 1, 3

# Creating a function to generate the training and testing data for the models: for every year from 2000 to 2024 we create one train/test split.
# Relative to 1 January of that year, the training data runs from `lookback` years before to `window` years before, and the testing data covers the final `window` years.
# Eg. the model that will trade between 2000 and 2001 will be trained on data from 1997 to 1999 and tuned on 1999 to 2000 data (for the purposes of hyperparameter tuning)

def generate_data(reg_signals, labels, window, lookback, start_year=2000, end_year=2025):
    '''
    Function to generate rolling training and testing data for the models

    One split is produced for every year in range(start_year, end_year). With
    `date` being 1 January of that year:
      - training data covers [date - lookback years, date - window years)
      - testing data covers  [date - window years, date)
    Both spans are half-open, so a row dated exactly on a boundary belongs to
    the later span only and never appears in two spans at once.

    Parameters:
    reg_signals: DataFrame containing the signals, indexed by datetime
    labels: DataFrame containing the labels, indexed by datetime
    window: int, number of years to use for the testing data
    lookback: int, number of years before `date` at which the training data starts
              (the training span is therefore lookback - window years long)
    start_year: int, first year to build a split for, default=2000
    end_year: int, year at which to stop (exclusive), default=2025

    Returns:
    X_train, X_test, y_train, y_test: four lists with one DataFrame per year,
    in chronological order. A span with no rows in the data yields an empty DataFrame.
    '''
    def _span(frame, start, end):
        # Boolean mask instead of frame.loc[start:end]: label slices include
        # BOTH endpoints, which would duplicate the boundary row across spans.
        in_span = (frame.index >= start) & (frame.index < end)
        return frame.loc[in_span]

    X_train, X_test, y_train, y_test = [], [], [], []
    for year in range(start_year, end_year):
        date = pd.Timestamp(year=year, month=1, day=1)
        train_start = date - pd.DateOffset(years=lookback)
        test_start = date - pd.DateOffset(years=window)

        X_train.append(_span(reg_signals, train_start, test_start))
        X_test.append(_span(reg_signals, test_start, date))
        y_train.append(_span(labels, train_start, test_start))
        y_test.append(_span(labels, test_start, date))
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = generate_data(reg_signals, labels, window, lookback)

# Both calls receive the same `labels`, so the label splits of the polynomial
# run would duplicate y_train / y_test; only its feature splits are kept.
X_poly_train, X_poly_test = generate_data(poly_features, labels, window, lookback)[:2]

# For every constant, select the alternative labels on exactly the dates of
# each baseline split. Iterating over y_train / y_test directly (rather than a
# hard-coded count) keeps this in step with however many splits were generated.
y_const_train, y_const_test = {}, {}
for const in consts:
    y_const_train[const] = [labels_const[const].loc[split.index] for split in y_train]
    y_const_test[const] = [labels_const[const].loc[split.index] for split in y_test]

In [49]:
# Tuning function for one single pipeline for one single period. We use the same training and testing data for all models to ensure that the results are comparable.
# We use the predefined X_train, X_test split instead of cross-validation to avoid look-ahead biases.

def tune_model(X_train, X_test, y_train, y_test, params, pipeline, hyperparameter_tuner='grid', n_iter=None, verbose=1, sample_weights=None, probability=True, random_state=None):
    '''
    Function to tune a model using a predefined split

    Every candidate is fitted on the training rows and scored on the testing
    rows only (a single predefined fold). The candidate with the best balanced
    accuracy is then refitted by the search on the concatenated data.

    Parameters:
    X_train: DataFrame, training data
    X_test: DataFrame, testing data over which we perform the hyperparameter tuning
    y_train: DataFrame, training labels
    y_test: DataFrame, testing labels
    params: dict, hyperparameters to tune
    pipeline: Pipeline, model to tune
    hyperparameter_tuner: str, 'grid' or 'random', default='grid'
    n_iter: int, number of iterations for random search (required when
            hyperparameter_tuner='random'), default=None
    verbose: int, default=1
    sample_weights: array-like, default=None. Forwarded to the last pipeline step
                    as `sample_weight`. NOTE(review): it is passed to fit together
                    with the concatenated train+test data, so it presumably has to
                    hold one weight per row of that concatenation - confirm before use.
    probability: bool, whether the model has a predict_proba method, default=True
    random_state: int or None, seed for the random search candidates, default=None
                  (ignored by the grid search)

    Returns:
    result: the fitted search object
    best_model: result.best_estimator_

    Raises:
    ValueError: if hyperparameter_tuner is unknown, or if it is 'random' and n_iter is None
    '''
    # Validate the search configuration first so that mistakes surface
    # immediately instead of after the data has been assembled.
    if hyperparameter_tuner not in ('grid', 'random'):
        raise ValueError(f"Unknown hyperparameter tuner: {hyperparameter_tuner}\n Choose from 'grid', 'random'")
    if hyperparameter_tuner == 'random' and n_iter is None:
        raise ValueError("n_iter must be set when hyperparameter_tuner='random'")

    # we first merge the training and testing data
    X = pd.concat([X_train, X_test])
    y = pd.concat([y_train, y_test])
    # -1 marks rows that are always used for training, 0 marks the rows of the single validation fold,
    # so the original training/testing separation is maintained while tuning hyperparameters
    cv = PredefinedSplit(test_fold=[-1]*len(X_train) + [0]*len(X_test))

    scoring = {'f1_macro': 'f1_macro', 'precision': 'precision_macro', 'recall': 'recall_macro', 'balanced_accuracy': 'balanced_accuracy'}
    if probability:
        # 'roc_auc_ovr' is the built-in scorer for roc_auc_score on predicted probabilities with
        # multi_class="ovr". Using the name avoids make_scorer(needs_proba=True), whose `needs_proba`
        # argument is no longer accepted by recent scikit-learn releases.
        scoring = {'auc': 'roc_auc_ovr', **scoring}
    principal_metric = 'balanced_accuracy'

    if hyperparameter_tuner == 'grid':
        search = GridSearchCV(pipeline, params, cv=cv, scoring=scoring, refit=principal_metric, n_jobs=-1, verbose=verbose, error_score='raise')
    else:
        # NOTE(review): unlike the grid branch, error_score is left at its default here,
        # so a failing candidate is not re-raised.
        search = RandomizedSearchCV(pipeline, params, cv=cv, scoring=scoring, refit=principal_metric, n_iter=n_iter, n_jobs=-1, verbose=verbose, random_state=random_state)

    fit_kwargs = {}
    if sample_weights is not None:
        # Route the weights to the final step of the pipeline: '<step name>__sample_weight'
        fit_kwargs[pipeline.steps[-1][0] + '__sample_weight'] = sample_weights
    result = search.fit(X, y, **fit_kwargs)
    best_model = result.best_estimator_

    return result, best_model

In [50]:
def model_validation_performance(result, probability=True):
    '''
    Print the validation metrics of the candidate selected by a fitted search

    Parameters:
    result: fitted search object exposing best_score_, best_index_, best_params_ and cv_results_
    probability: bool, whether the ROC AUC line is printed, default=True

    '''
    def at_best(column):
        # Value of one cv_results_ column at the row of the selected candidate
        return result.cv_results_[column][result.best_index_]

    print('')
    print(f"Cross-validation balanced accuracy scores: {result.best_score_}\n")

    # (label, cv_results_ column) pairs in the order they are reported
    metric_columns = [('F1 Macro', 'mean_test_f1_macro')]
    if probability:
        metric_columns.append(('ROC AUC', 'mean_test_auc'))
    metric_columns.extend([('precision', 'mean_test_precision'), ('recall', 'mean_test_recall')])

    for label, column in metric_columns:
        print(f"Cross-validation {label} scores: {at_best(column)}\n")
    print(f"Parameters for the best model: \n{result.best_params_}")

In [51]:
def rolling_model_tuning(X_train, X_test, y_train, y_test, params, pipeline, hyperparameter_tuner='grid', n_iter=None, verbose=1, sample_weights=None, probability=True):
    '''
    Tune one model per target column and per rolling split

    For every column of the label frames, tune_model is run once on each
    (train, test) pair, in list order, and a short report is printed per split.

    Parameters:
    X_train: list, training data (one DataFrame per split)
    X_test: list, testing data over which we perform the hyperparameter tuning
    y_train: list, training labels; the columns of its first element define the targets
    y_test: list, testing labels
    params: dict, hyperparameters to tune
    pipeline: Pipeline, model to tune
    hyperparameter_tuner: str, 'grid' or 'random', default='grid'
    n_iter: int, number of iterations for random search, default=None
    verbose: int, default=1; when falsy only the best score of each split is printed
    sample_weights: array-like, default=None (the same object is forwarded to every split)
    probability: bool, whether the model has a predict_proba method, default=True

    Returns:
    results, best_models: two dicts keyed by target name, each holding one entry per split

    '''
    # Options that are identical for every single tuning run
    search_options = dict(
        hyperparameter_tuner=hyperparameter_tuner,
        n_iter=n_iter,
        verbose=verbose,
        sample_weights=sample_weights,
        probability=probability,
    )

    results, best_models = {}, {}
    for target in y_train[0].columns:
        print(f"Training model for target {target}\n")
        target_results = results[target] = []
        target_models = best_models[target] = []

        for split_no, X_fit in enumerate(X_train):
            result, best_model = tune_model(
                X_fit,
                X_test[split_no],
                y_train[split_no][target],
                y_test[split_no][target],
                params,
                pipeline,
                **search_options,
            )
            target_results.append(result)
            target_models.append(best_model)

            # Splits are labelled by year, the first one being 2000
            print(f"Results for model {split_no+2000}:")
            if not verbose:
                print(f"Best score: {result.best_score_}")
            else:
                model_validation_performance(result, probability=probability)
        print('\n\n\n')
    return results, best_models

# Model Training

### Logistic Regression

In [52]:
# Single-step pipeline; the step name 'log_reg' is the prefix of every grid key below
log_reg_pipeline = Pipeline(steps=[
    ('log_reg', LogisticRegression(max_iter=1000)),
])

# Two sub-grids, because the available penalties depend on the solver:
# liblinear is searched with l1 / l2, saga with elastic-net at several l1 ratios
log_reg_params = [
    dict(
        log_reg__C=[0.01, 0.1, 1, 10, 100],
        log_reg__solver=['liblinear'],
        log_reg__penalty=['l1', 'l2'],
        log_reg__class_weight=['balanced', None],
    ),
    dict(
        log_reg__C=[0.01, 0.1, 1, 10, 100],
        log_reg__solver=['saga'],
        log_reg__penalty=['elasticnet'],
        log_reg__l1_ratio=[0.1, 0.5, 0.9],
        log_reg__class_weight=['balanced', None],
    ),
]


In [53]:
import warnings

# Ignore every warning raised from here on, to keep the tuning output readable.
# NOTE(review): this blanket filter also hides solver convergence and deprecation
# warnings from the searches below - consider restricting it to specific categories.
warnings.filterwarnings('ignore')

In [10]:
# Grid search for the logistic regression on the polynomial features, baseline labels
log_reg_results, log_reg_best_models = rolling_model_tuning(
    X_train=X_poly_train,
    X_test=X_poly_test,
    y_train=y_train,
    y_test=y_test,
    params=log_reg_params,
    pipeline=log_reg_pipeline,
    hyperparameter_tuner='grid',
    verbose=0,
    probability=True,
)

Training model for target long 3 days

Results for model 2000:
Best score: 0.625
Results for model 2001:
Best score: 0.6455026455026455
Results for model 2002:
Best score: 0.6316696914700545
Results for model 2003:
Best score: 0.572336265884653
Results for model 2004:
Best score: 0.563820950622981
Results for model 2005:
Best score: 0.5887384176764077
Results for model 2006:
Best score: 0.56
Results for model 2007:
Best score: 0.6028787878787878
Results for model 2008:
Best score: 0.5991849751948972
Results for model 2009:
Best score: 0.6325093632958801
Results for model 2010:
Best score: 0.6197552447552448
Results for model 2011:
Best score: 0.6629840817296271
Results for model 2012:
Best score: 0.5936046511627907
Results for model 2013:
Best score: 0.615
Results for model 2014:
Best score: 0.6096126255380201
Results for model 2015:
Best score: 0.6458257115680105
Results for model 2016:
Best score: 0.6674982181040627
Results for model 2017:
Best score: 0.5875420875420875
Results for m

In [12]:
# Persist the logistic-regression search results in the models folder

with open('models/log_reg_results.pkl', mode='wb') as results_file:
    pickle.dump(log_reg_results, results_file)


In [13]:
# Reload the logistic-regression search results saved above.
# pickle.load can run arbitrary code, so only open files written by this notebook.

with open('models/log_reg_results.pkl', mode='rb') as results_file:
    log_reg_results = pickle.load(results_file)
    

In [54]:
# Repeat the logistic-regression grid search once per alternative label set and
# save each run's results to its own file (the variables are overwritten on every pass)
for const in consts:
    print(f"Training models for constant {const}")
    log_reg_results_const, log_reg_best_models_const = rolling_model_tuning(
        X_train=X_poly_train,
        X_test=X_poly_test,
        y_train=y_const_train[const],
        y_test=y_const_test[const],
        params=log_reg_params,
        pipeline=log_reg_pipeline,
        hyperparameter_tuner='grid',
        verbose=0,
        probability=True,
    )
    with open(f'models/log_reg_results_{const}.pkl', mode='wb') as results_file:
        pickle.dump(log_reg_results_const, results_file)

Training models for constant 0.7
Training model for target long 3 days

Results for model 2000:
Best score: 0.6102571172537395
Results for model 2001:
Best score: 0.6465237166991553
Results for model 2002:
Best score: 0.587253193960511
Results for model 2003:
Best score: 0.6016746411483254
Results for model 2004:
Best score: 0.5456958393113343
Results for model 2005:
Best score: 0.5532934131736527
Results for model 2006:
Best score: 0.588266845457593
Results for model 2007:
Best score: 0.5759144237405107
Results for model 2008:
Best score: 0.5948264698264698
Results for model 2009:
Best score: 0.6094069529652352
Results for model 2010:
Best score: 0.5732614637109019
Results for model 2011:
Best score: 0.5600744468187772
Results for model 2012:
Best score: 0.6143790849673203
Results for model 2013:
Best score: 0.62687265917603
Results for model 2014:
Best score: 0.5906964656964657
Results for model 2015:
Best score: 0.6464166284178086
Results for model 2016:
Best score: 0.64038248337028

### Random Forest

In [58]:
# Define the pipeline
# The forest is seeded so that a given hyper-parameter setting always fits the
# same model; without a seed, repeated runs of this notebook give different scores.
rf_pipeline = Pipeline(
    [('rf', RandomForestClassifier(random_state=42))]
)

# Define the parameters to tune (keys are prefixed with the pipeline step name 'rf')
rf_params = {
    'rf__n_estimators': [100, 200, 300, 400, 500],
    'rf__max_depth': [10, 20, 30, 40, 50],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__bootstrap': [True, False],
    'rf__class_weight': ['balanced', None]
}


In [31]:
# Random search (100 candidates per split) for the random forest, baseline labels
rf_results, rf_best_models = rolling_model_tuning(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    params=rf_params,
    pipeline=rf_pipeline,
    hyperparameter_tuner='random',
    n_iter=100,
    verbose=0,
    probability=True,
)

Training model for target long 3 days

Results for model 2000:
Best score: 0.6277777777777778
Results for model 2001:
Best score: 0.5978835978835979
Results for model 2002:
Best score: 0.618421052631579
Results for model 2003:
Best score: 0.5711143695014662
Results for model 2004:
Best score: 0.529164743885556
Results for model 2005:
Best score: 0.5674744594915657
Results for model 2006:
Best score: 0.5755844155844156
Results for model 2007:
Best score: 0.5635227272727272
Results for model 2008:
Best score: 0.5982990786676117
Results for model 2009:
Best score: 0.6016479400749064
Results for model 2010:
Best score: 0.5393356643356644
Results for model 2011:
Best score: 0.5792349726775956
Results for model 2012:
Best score: 0.6077034883720931
Results for model 2013:
Best score: 0.5700000000000001
Results for model 2014:
Best score: 0.553012912482066
Results for model 2015:
Best score: 0.6458257115680105
Results for model 2016:
Best score: 0.6703492516037064
Results for model 2017:
Best 

In [32]:
# Persist the random-forest search results in the models folder

with open('models/rf_results.pkl', mode='wb') as results_file:
    pickle.dump(rf_results, results_file)

In [35]:
# Reload the random-forest search results saved above.
# pickle.load can run arbitrary code, so only open files written by this notebook.
with open('models/rf_results.pkl', mode='rb') as results_file:
    rf_results = pickle.load(results_file)

In [59]:
# Repeat the random-forest random search once per alternative label set and save
# each run's results to its own file (the variables are overwritten on every pass).
for const in consts:
    # Message spelling fixed ("Trainig" -> "Training") to match the other model loops
    print(f'Training models for constant {const}')
    rf_results_const, rf_best_models_const = rolling_model_tuning(X_train, X_test, y_const_train[const], y_const_test[const], rf_params, rf_pipeline, hyperparameter_tuner='random', n_iter=100, verbose=0, probability=True)
    with open(f'models/rf_results_{const}.pkl', 'wb') as f:
        pickle.dump(rf_results_const, f)

Trainig models for constant 0.7
Training model for target long 3 days

Results for model 2000:
Best score: 0.629592610463914
Results for model 2001:
Best score: 0.5701754385964912
Results for model 2002:
Best score: 0.5918989547038327
Results for model 2003:
Best score: 0.5533791866028708
Results for model 2004:
Best score: 0.5315638450502151
Results for model 2005:
Best score: 0.5884818598097922
Results for model 2006:
Best score: 0.5810593362386859
Results for model 2007:
Best score: 0.59095928226363
Results for model 2008:
Best score: 0.5698976948976949
Results for model 2009:
Best score: 0.6155419222903885
Results for model 2010:
Best score: 0.567643486182812
Results for model 2011:
Best score: 0.5626249396842904
Results for model 2012:
Best score: 0.5938799762329174
Results for model 2013:
Best score: 0.526685393258427
Results for model 2014:
Best score: 0.5363825363825364
Results for model 2015:
Best score: 0.6366795619959347
Results for model 2016:
Best score: 0.6176552106430155

### XGBoost

In [55]:
# Single-step pipeline; the step name 'xgb' is the prefix of every search-space key below
xgb_pipeline = Pipeline(steps=[
    ('xgb', XGBClassifier()),
])

# Search space sampled by the random search
xgb_params = dict(
    xgb__n_estimators=[50, 100, 200, 500],
    xgb__learning_rate=[0.1, 0.3],
    xgb__max_depth=[None, 5, 10],
    xgb__subsample=[0.5, 0.75, 1],
    xgb__colsample_bytree=[0.5, 0.75, 1],
)

In [41]:
# Random search (100 candidates per split) for XGBoost, baseline labels
xgb_results, xgb_best_models = rolling_model_tuning(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    params=xgb_params,
    pipeline=xgb_pipeline,
    hyperparameter_tuner='random',
    n_iter=100,
    verbose=0,
    probability=True,
)

Training model for target long 3 days

Results for model 2000:
Best score: 0.6458333333333334
Results for model 2001:
Best score: 0.5767195767195767
Results for model 2002:
Best score: 0.6035390199637023
Results for model 2003:
Best score: 0.5782013685239491
Results for model 2004:
Best score: 0.5530226119058606
Results for model 2005:
Best score: 0.6166547873604181
Results for model 2006:
Best score: 0.6155844155844156
Results for model 2007:
Best score: 0.5547727272727273
Results for model 2008:
Best score: 0.581218993621545
Results for model 2009:
Best score: 0.6072284644194756
Results for model 2010:
Best score: 0.5703671328671328
Results for model 2011:
Best score: 0.5738892848657638
Results for model 2012:
Best score: 0.6225290697674418
Results for model 2013:
Best score: 0.63
Results for model 2014:
Best score: 0.5595408895265424
Results for model 2015:
Best score: 0.6191190458769298
Results for model 2016:
Best score: 0.6638156331670231
Results for model 2017:
Best score: 0.567

In [43]:
# Persist the XGBoost search results in the models folder

with open('models/xgb_results.pkl', mode='wb') as results_file:
    pickle.dump(xgb_results, results_file)

In [56]:
# Repeat the XGBoost random search once per alternative label set and save each
# run's results to its own file (the variables are overwritten on every pass)
for const in consts:
    print(f"Training models for constant {const}")
    xgb_results_const, xgb_best_models_const = rolling_model_tuning(
        X_train=X_train,
        X_test=X_test,
        y_train=y_const_train[const],
        y_test=y_const_test[const],
        params=xgb_params,
        pipeline=xgb_pipeline,
        hyperparameter_tuner='random',
        n_iter=100,
        verbose=0,
        probability=True,
    )
    with open(f'models/xgb_results_{const}.pkl', mode='wb') as results_file:
        pickle.dump(xgb_results_const, results_file)

Training models for constant 0.7
Training model for target long 3 days

Results for model 2000:
Best score: 0.6178396636106707
Results for model 2001:
Best score: 0.5685510071474984
Results for model 2002:
Best score: 0.5889953542392566
Results for model 2003:
Best score: 0.5529306220095694
Results for model 2004:
Best score: 0.5637733142037302
Results for model 2005:
Best score: 0.5971468827051779
Results for model 2006:
Best score: 0.5879986590680523
Results for model 2007:
Best score: 0.5850931677018634
Results for model 2008:
Best score: 0.5732970732970732
Results for model 2009:
Best score: 0.6055896387184732
Results for model 2010:
Best score: 0.5682508351047677
Results for model 2011:
Best score: 0.5647273729923485
Results for model 2012:
Best score: 0.6301247771836007
Results for model 2013:
Best score: 0.5819288389513109
Results for model 2014:
Best score: 0.5617203742203742
Results for model 2015:
Best score: 0.6333027342469346
Results for model 2016:
Best score: 0.5906319290

### SVM

In [7]:
# Single-step pipeline; probability=True gives the SVC a predict_proba method,
# which the AUC scorer used during tuning relies on
svm_pipeline = Pipeline(steps=[
    ('svm', SVC(probability=True)),
])

# Search space sampled by the random search (keys are prefixed with the step name 'svm')
svm_params = dict(
    svm__C=[0.1, 1, 10, 100],
    svm__kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    svm__degree=[2, 3, 4, 5],
    svm__gamma=['scale', 'auto'],
    svm__class_weight=['balanced', None],
)

In [10]:
# Random search (100 candidates per split) for the SVM, baseline labels
svm_results, svm_best_models = rolling_model_tuning(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    params=svm_params,
    pipeline=svm_pipeline,
    hyperparameter_tuner='random',
    n_iter=100,
    verbose=0,
    probability=True,
)

Training model for target long 3 days

Results for model 2000:
Best score: 0.6305555555555555
Results for model 2001:
Best score: 0.6296296296296295
Results for model 2002:
Best score: 0.6302177858439202
Results for model 2003:
Best score: 0.6505376344086021
Results for model 2004:
Best score: 0.5756345177664974
Results for model 2005:
Best score: 0.5780470420527442
Results for model 2006:
Best score: 0.622077922077922
Results for model 2007:
Best score: 0.5862878787878788
Results for model 2008:
Best score: 0.6233167965981573
Results for model 2009:
Best score: 0.6044194756554306
Results for model 2010:
Best score: 0.6730769230769231
Results for model 2011:
Best score: 0.6362556426704681
Results for model 2012:
Best score: 0.6068313953488372
Results for model 2013:
Best score: 0.6175
Results for model 2014:
Best score: 0.5739598278335725
Results for model 2015:
Best score: 0.6623253091388015
Results for model 2016:
Best score: 0.6824661439771917
Results for model 2017:
Best score: 0.5

In [11]:
# Persist the SVM search results in the models folder

with open('models/svm_results.pkl', mode='wb') as results_file:
    pickle.dump(svm_results, results_file)