# Imports

In [28]:
# Importing required libraries

from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold, cross_val_score, RepeatedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix, classification_report, make_scorer, balanced_accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [7]:
# Load the indicator frames and the labels; every file is indexed by its 'Date' column
csv_options = {'index_col': 'Date', 'parse_dates': True}

reg_signals = pd.read_csv('data/indicators.csv', **csv_options)
poly_features = pd.read_csv('data/indicators_w_polyterms.csv', **csv_options)
labels = pd.read_csv('data/labels.csv', **csv_options)

In [12]:
# Preview the last rows of the indicator frame
reg_signals.tail()

Unnamed: 0_level_0,momentum_21d_zscore,drawdown_21d_zscore,momentum_42d_zscore,drawdown_42d_zscore,momentum_63d_zscore,drawdown_63d_zscore,momentum_126d_zscore,drawdown_126d_zscore,momentum_252d_zscore,drawdown_252d_zscore
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2024-12-23,-0.982926,-0.963551,-0.807795,-0.9684,-0.873208,-1.29851,-0.846531,-0.262267,-0.183703,-0.480692
2024-12-24,-0.57853,-0.004139,-0.32832,0.101233,-0.394884,-0.077231,-0.522828,0.341703,0.039373,0.208263
2024-12-26,-0.685421,-0.013106,-0.344943,0.059074,-0.566419,-0.107557,-0.556426,0.323455,-0.074626,0.185081
2024-12-27,-1.32709,-0.891718,-1.07239,-0.984458,-0.930216,-1.297679,-0.779085,-0.279994,-0.37834,-0.506939
2024-12-30,-1.459168,-1.594145,-1.643523,-1.869904,-1.450497,-2.321768,-1.20321,-0.850992,-0.642789,-1.16476


In [19]:
# First date present in the labels index
labels.index[0]

Timestamp('1996-01-02 00:00:00')

In [23]:
# Label dates that fall on or after 1 January 2000
from_2000 = labels.index >= '2000-01-01'
labels.index[from_2000]

DatetimeIndex(['2000-01-03', '2000-01-04', '2000-01-05', '2000-01-06',
               '2000-01-07', '2000-01-10', '2000-01-11', '2000-01-12',
               '2000-01-13', '2000-01-14',
               ...
               '2024-12-02', '2024-12-03', '2024-12-04', '2024-12-05',
               '2024-12-06', '2024-12-09', '2024-12-10', '2024-12-11',
               '2024-12-12', '2024-12-13'],
              dtype='datetime64[ns]', name='Date', length=6278, freq=None)

# Model Preparation

In [32]:
# Rolling-window configuration: `window` is the period over which new models are trained,
# `lookback` is how far back the training data goes (three windows).
# NOTE(review): 252 looks like a trading-day count, but generate_data subtracts these values
# as calendar-day offsets -- confirm which unit is intended.

window = 252
lookback = 3 * window

# Creating a function to generate the training and testing data for the models: for every year
#  anchor (1 January) the test slice is the `window` calendar days leading up to the anchor and the
#  training slice is the stretch between `lookback` and `window` calendar days before the anchor

def generate_data(reg_signals, labels, window, lookback, start_year=2000, end_year=2025):
    """Build one train/test split per year from the signal and label frames.

    For each year ``Y`` in ``range(start_year, end_year)`` the anchor date is
    ``Y-01-01`` and the slices are selected on the frames' date index:

    * train: ``anchor - lookback days <= date < anchor - window days``
    * test:  ``anchor - window days <= date <= anchor``

    The split boundary is half-open, so a row dated exactly ``anchor - window``
    belongs to the test slice only. (Slicing both sides inclusively put that row
    in both slices, leaking one test row into the training data.)

    NOTE(review): ``window`` and ``lookback`` are applied as *calendar* days. With
    the notebook's values (252 and 756) the test slice covers about 8 months and
    the train slice about 17 months, and roughly January-April of each year is in
    no test slice. If trading-day counts were intended, this needs positional
    slicing instead -- confirm before relying on the fold sizes.

    Parameters
    ----------
    reg_signals : pandas.DataFrame
        Feature frame indexed by date.
    labels : pandas.DataFrame or pandas.Series
        Target values indexed by date. Sliced independently of ``reg_signals``,
        so the two indexes do not have to match.
    window : int
        Length of the test slice, in calendar days before the anchor.
    lookback : int
        Distance from the anchor to the start of the train slice, in calendar days.
    start_year, end_year : int, optional
        Anchor years are ``range(start_year, end_year)``; the defaults reproduce
        the previously hard-coded 2000..2024.

    Returns
    -------
    tuple of four lists
        ``(X_train, X_test, y_train, y_test)``, one entry per anchor year. Entries
        are empty when the data does not cover the requested dates.
    """
    def _between(frame, start, stop, closed_right):
        # Boolean-mask selection of rows with start <= date < stop (or <= stop).
        dates = frame.index
        upper = (dates <= stop) if closed_right else (dates < stop)
        return frame.loc[(dates >= start) & upper]

    X_train, X_test, y_train, y_test = [], [], [], []
    for year in range(start_year, end_year):
        anchor = pd.Timestamp(year=year, month=1, day=1)
        train_start = anchor - pd.Timedelta(days=lookback)
        split = anchor - pd.Timedelta(days=window)

        # Train stops just before `split`; test starts at `split` -> no shared rows.
        X_train.append(_between(reg_signals, train_start, split, closed_right=False))
        X_test.append(_between(reg_signals, split, anchor, closed_right=True))
        y_train.append(_between(labels, train_start, split, closed_right=False))
        y_test.append(_between(labels, split, anchor, closed_right=True))
    return X_train, X_test, y_train, y_test


In [None]:
# Tuning function for one single pipeline for one single period

def tune_model(X, y, params, pipeline, n_repeats=3, hyperparameter_tuner='grid', n_iter=None, verbose=1, sample_weights=None, probability=True):
    """Tune `pipeline` on (X, y) with repeated stratified 5-fold cross-validation.

    Parameters
    ----------
    X, y :
        Training features and targets, passed straight to the search's ``fit``.
    params :
        Parameter grid (grid search) or parameter distributions (random search).
    pipeline :
        Estimator to tune. When `sample_weights` is given it must expose ``steps``
        (as a scikit-learn Pipeline does), because the weights are routed to its
        final step.
    n_repeats : int
        Number of repetitions of the 5-fold split.
    hyperparameter_tuner : {'grid', 'random'}
        Selects GridSearchCV or RandomizedSearchCV.
    n_iter : int or None
        Number of sampled candidates for the random search. ``None`` falls back
        to 10 (scikit-learn's own default) instead of being forwarded, which
        RandomizedSearchCV rejects. Ignored for the grid search.
    verbose : int
        Verbosity forwarded to the search object.
    sample_weights : array-like or None
        Optional per-sample weights, passed to ``fit`` as
        ``<last step name>__sample_weight``.
    probability : bool
        When True, a one-vs-rest ROC AUC scorer (needs ``predict_proba``) is added
        to the tracked metrics.

    Returns
    -------
    (result, best_model)
        ``result`` is the fitted search object; ``best_model`` is its
        ``best_estimator_``, refit on the configuration with the best
        balanced accuracy.

    Raises
    ------
    ValueError
        If `hyperparameter_tuner` is neither 'grid' nor 'random'.
    """
    # Validate first so a typo fails immediately, before any CV objects are built.
    if hyperparameter_tuner not in ('grid', 'random'):
        raise ValueError(f"Unknown hyperparameter tuner: {hyperparameter_tuner}\n Choose from 'grid', 'random'")

    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=n_repeats, random_state=42)

    scoring = {'f1_macro': 'f1_macro', 'precision': 'precision_macro', 'recall': 'recall_macro', 'balanced_accuracy': 'balanced_accuracy'}
    if probability:
        # 'roc_auc_ovr' is scikit-learn's built-in scorer for
        # roc_auc_score(..., multi_class="ovr") on predicted probabilities. It replaces
        # make_scorer(..., needs_proba=True), which is deprecated and removed in newer releases.
        scoring = {'auc': 'roc_auc_ovr', **scoring}
    principal_metric = 'balanced_accuracy'

    if hyperparameter_tuner == 'grid':
        search = GridSearchCV(pipeline, params, cv=cv, scoring=scoring, refit=principal_metric, n_jobs=-1, verbose=verbose, error_score='raise')
    else:
        # NOTE(review): unlike the grid search, failed fits are not raised here
        # (error_score is left at its default) -- confirm this leniency is intended.
        n_candidates = 10 if n_iter is None else n_iter
        # Fixed seed so the sampled candidates are reproducible, matching the CV splitter.
        search = RandomizedSearchCV(pipeline, params, cv=cv, scoring=scoring, refit=principal_metric, n_iter=n_candidates, n_jobs=-1, verbose=verbose, random_state=42)

    fit_kwargs = {}
    if sample_weights is not None:
        # Route the weights to the final pipeline step via the step-name prefix.
        fit_kwargs[pipeline.steps[-1][0] + '__sample_weight'] = sample_weights
    result = search.fit(X, y, **fit_kwargs)
    best_model = result.best_estimator_

    return result, best_model