# Imports

In [8]:
# Importing required libraries

from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold, cross_val_score, RepeatedKFold, RandomizedSearchCV, PredefinedSplit
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix, classification_report, make_scorer, balanced_accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [9]:
reg_signals = pd.read_csv('data/indicators.csv', parse_dates=True, index_col='Date')
poly_features = pd.read_csv('data/indicators_w_polyterms.csv', parse_dates=True, index_col='Date')
labels = pd.read_csv('data/Y_Matrix.csv', parse_dates=True, index_col='Date')

# Model Preparation

In [10]:
# Defining the period rolling window over which we train new models and how far back the training data goes

window = 1
lookback = 3

# Creating a function to generate the training and testing data for the models, from 2000 onwards we create for every single year a train test split
# by using last 3 years as training data and the next year as testing data
# Eg. the model that will trade between 2000 and 2001 will be trained on data from 1997 to 1999 and tuned on 1999 to 2000 data (for the purposes of hyperparameter tuning)

def generate_data(reg_signals, labels, window, lookback):
    '''
    Function to generate training and testing data for the models
    
    Parameters:
    reg_signals: DataFrame containing the  signals
    labels: DataFrame containing the labels
    window: int, number of years to use for the testing data
    lookback: int, number of years to use for the training data
    
    '''
    X = reg_signals
    y = labels
    X_train = []
    X_test = []
    y_train = []
    y_test = []
    for i in range(2000, 2025):
        date = datetime.strptime(str(i) + '-01-01', '%Y-%m-%d')
        X_train.append(X.loc[str(date - pd.DateOffset(years=lookback)):str(date - pd.DateOffset(years=window))])
        X_test.append(X.loc[str(date - pd.DateOffset(years=window)):str(date)])
        y_train.append(y.loc[str(date - pd.DateOffset(years=lookback)):str(date - pd.DateOffset(years=window))])
        y_test.append(y.loc[str(date - pd.DateOffset(years=window)):str(date)])
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = generate_data(reg_signals, labels, window, lookback)
X_poly_train, X_poly_test, y_poly_train, y_poly_test = generate_data(poly_features, labels, window, lookback)

# NB: y_train and y_poly_train are the same for all models, as the y_backtest and y_poly_backtest are the same for all models, so we just keep one set
del y_poly_train, y_poly_test

In [11]:
# Tuning function for one single pipeline for one single period. We use the same training and testing data for all models to ensure that the results are comparable.
# We use the predefined X_train, X_test split instead of cross-validation to avoid look-ahead biases.

def tune_model(X_train, X_test, y_train, y_test, params, pipeline, hyperparameter_tuner='grid', n_iter=None, verbose=1, sample_weights=None, probability=True):
    '''
    Function to tune a model using a predefined split
    
    Parameters:
    X_train: DataFrame, training data
    X_test: DataFrame, testing data over which we perform the hyperparameter tuning
    y_train: DataFrame, training labels
    y_test: DataFrame, testing labels
    params: dict, hyperparameters to tune
    pipeline: Pipeline, model to tune
    hyperparameter_tuner: str, 'grid' or 'random', default='grid'
    n_iter: int, number of iterations for random search, default=None
    verbose: int, default=1
    sample_weights: array-like, default=None
    probability: bool, whether the model has a predict_proba method, default=True
    
    '''
    # we first merge the training and testing data
    X = pd.concat([X_train, X_test])
    y = pd.concat([y_train, y_test])
    # our crossvalidation cv uses predefined split to ensure that the training and testing data are actually maintained when tuning hyperparameters:
    cv = PredefinedSplit(test_fold=[-1]*len(X_train) + [0]*len(X_test))
     
    if probability:
        scoring = {'auc': make_scorer(roc_auc_score, needs_proba=True, multi_class="ovr"), 'f1_macro': 'f1_macro', 'precision': 'precision_macro', 'recall': 'recall_macro', 'balanced_accuracy': 'balanced_accuracy'}
    else: 
        scoring = {'f1_macro': 'f1_macro', 'precision': 'precision_macro', 'recall': 'recall_macro', 'balanced_accuracy': 'balanced_accuracy'}
    principal_metric = 'balanced_accuracy'

    if hyperparameter_tuner == 'grid':
        search = GridSearchCV(pipeline, params, cv=cv, scoring=scoring, refit=principal_metric, n_jobs=-1, verbose=verbose, error_score='raise')
    elif hyperparameter_tuner == 'random':
        search = RandomizedSearchCV(pipeline, params, cv=cv, scoring=scoring, refit=principal_metric, n_iter=n_iter, n_jobs=-1, verbose=verbose)
    else:
        raise ValueError(f"Unknown hyperparameter tuner: {hyperparameter_tuner}\n Choose from 'grid', 'random'")
    if sample_weights is not None:
        kwargs = {pipeline.steps[-1][0] + '__sample_weight': sample_weights}
        result = search.fit(X, y, **kwargs)
    else:
        result = search.fit(X, y)
    best_model = result.best_estimator_

    return result, best_model

In [18]:
def model_validation_performance(result, probability=True):
    print('')
    print(f"Cross-validation balanced accuracy scores: {result.best_score_}\n")
    print(f"Cross-validation F1 Macro scores: {result.cv_results_['mean_test_f1_macro'][result.best_index_]}\n")
    if probability:
        print(f"Cross-validation ROC AUC scores: {result.cv_results_['mean_test_auc'][result.best_index_]}\n")
    print(f"Cross-validation precision scores: {result.cv_results_['mean_test_precision'][result.best_index_]}\n")
    print(f"Cross-validation recall scores: {result.cv_results_['mean_test_recall'][result.best_index_]}\n")
    print(f"Parameters for the best model: \n{result.best_params_}")

In [22]:
def rolling_model_tuning(X_train, X_test, y_train, y_test, params, pipeline, hyperparameter_tuner='grid', n_iter=None, verbose=1, sample_weights=None, probability=True):
    '''
    Function to tune a model using a rolling window
    
    Parameters:
    X_train: list, training data
    X_test: list, testing data over which we perform the hyperparameter tuning
    y_train: list, training labels
    y_test: list, testing labels
    params: dict, hyperparameters to tune
    pipeline: Pipeline, model to tune
    hyperparameter_tuner: str, 'grid' or 'random', default='grid'
    n_iter: int, number of iterations for random search, default=None
    verbose: int, default=1
    sample_weights: array-like, default=None
    probability: bool, whether the model has a predict_proba method, default=True
    
    '''
    results = []
    best_models = []
    for i in range(len(X_train)):
        result, best_model = tune_model(X_train[i], X_test[i], y_train[i], y_test[i], params, pipeline, hyperparameter_tuner=hyperparameter_tuner, n_iter=n_iter, verbose=verbose, sample_weights=sample_weights, probability=probability)
        results.append(result)
        best_models.append(best_model)
        # we print the results for each model
        print(f"Results for model {i+2000}:")
        if verbose:
            model_validation_performance(result, probability=probability)
        else:
            print(f"Best score: {result.best_score_}")
    return results, best_models

# Model Training

### Logistic Regression

In [23]:
# Define the pipeline
log_reg_pipeline = Pipeline(
    [('log_reg', LogisticRegression(max_iter=1000))]
)

# Define the parameters to tune
log_reg_params = {
    'log_reg__C': [0.01, 0.1, 1, 10, 100],
    'log_reg__penalty': ['l1', 'l2', 'elasticnet'],
    'log_reg__class_weight': ['balanced', None]
}


In [24]:
log_reg_results, log_reg_best_models = rolling_model_tuning(X_train, X_test, y_train, y_test, log_reg_params, log_reg_pipeline, hyperparameter_tuner='grid', verbose=0)



ValueError: Solver lbfgs supports only 'l2' or None penalties, got l1 penalty.

### Random Forest

In [14]:
# Define the pipeline
rf_pipeline = Pipeline(
    [('rf', RandomForestClassifier())]
)

# Define the parameters to tune
rf_params = {
    'rf__n_estimators': [100, 200, 300, 400, 500],
    'rf__max_depth': [10, 20, 30, 40, 50],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__bootstrap': [True, False],
    'rf__class_weight': ['balanced', None]
}


### XGBoost

In [15]:
# Define the pipeline
xgb_pipeline = Pipeline(
    [('xgb', XGBClassifier())]
)

# Define the parameters to tune
xgb_params = {
    'xgb__n_estimators': [100, 200, 300, 400, 500],
    'xgb__max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
    'xgb__learning_rate': [0.01, 0.1, 0.2, 0.3],
    'xgb__subsample': [0.5, 0.7, 0.9, 1],
    'xgb__colsample_bytree': [0.5, 0.7, 0.9, 1],
    'xgb__gamma': [0, 0.1, 0.2, 0.3, 0.4, 0.5],
    'xgb__reg_alpha': [0, 0.1, 0.5, 1],
    'xgb__reg_lambda': [0, 0.1, 0.5, 1],
    'xgb__scale_pos_weight': [1, 2, 3, 4, 5],
    'xgb__eval_metric': ['mlogloss', 'merror'],
    'xgb__objective': ['multi:softmax', 'multi:softprob']
}

### SVM

In [16]:
# Define the pipeline
svm_pipeline = Pipeline(
    [('svm', SVC(probability=True))]
)

# Define the parameters to tune
svm_params = {
    'svm__C': [0.1, 1, 10, 100],
    'svm__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'svm__degree': [2, 3, 4, 5],
    'svm__gamma': ['scale', 'auto'],
    'svm__class_weight': ['balanced', None]
}


# Model Training