In [None]:
# Models and metrics
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neural_network import MLPClassifier, MLPRegressor

# Classification metrics
from sklearn.metrics import classification_report, precision_recall_curve, auc, r2_score
from sklearn.calibration import calibration_curve, CalibrationDisplay, CalibratedClassifierCV
from sklearn.metrics import f1_score, precision_score, recall_score, mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder

# Suppress all warnings
import warnings

warnings.filterwarnings('ignore')

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Other imports
import pandas as pd
import numpy as np
import joblib
import os
from utils.db_utils import QUERY


In [6]:
# Only the column index of the stored feature table is kept; the cells below
# use it to select column names by substring.
ml_features = pd.read_parquet(
    '/Users/louisspencer/Desktop/Trading-Bot/data/ml_features.parquet'
).columns

In [7]:
def _attach_symbol_id(frame):
    """Add a categorical ``symbol_id`` column (upper-cased base_quote_exchange) to ``frame`` and return it."""
    frame['symbol_id'] = (
        frame['asset_id_base'].str.upper()
        + '_'
        + frame['asset_id_quote'].str.upper()
        + '_'
        + frame['exchange_id'].str.upper()
    ).astype('category')
    return frame


def _encode_features(frame, encoder, drop_cols, fit):
    """Drop ``drop_cols`` from ``frame`` and replace ``symbol_id`` with its one-hot encoding.

    When ``fit`` is true the encoder is fitted on ``frame`` first; otherwise the
    already-fitted encoder is only applied. Remaining NaNs are filled with 0.
    """
    symbols = frame[['symbol_id']]
    encoded = encoder.fit_transform(symbols) if fit else encoder.transform(symbols)
    encoded_df = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(['symbol_id']),
        index=frame.index,
    )
    features = frame.drop(columns=drop_cols, errors='ignore')
    return pd.concat([features, encoded_df], axis=1).drop('symbol_id', axis=1).fillna(0)


def _evaluate_regressor(model, X_eval, y_train, y_test):
    """Print MAE / R^2 of ``model`` against two constant baselines and plot predicted vs. actual."""
    # Baseline 1: always predict the training mean of the target.
    mae_const = mean_absolute_error(y_test, [y_train.mean()] * len(y_test))
    # Baseline 2: always predict zero.
    mae_zero = mean_absolute_error(y_test, np.zeros(len(y_test)))

    y_pred = model.predict(X_eval)
    mae = mean_absolute_error(y_test, y_pred)
    r2_model = r2_score(y_test, y_pred)

    # NOTE(review): the "Lin. Reg." labels are kept verbatim although the
    # fitted estimator is an MLPRegressor -- consider renaming them.
    print(f'MAE (Lin. Reg.): {mae}')
    print(f'MAE (Naive mean): {mae_const}')
    print(f'MAE (Naive zero): {mae_zero}')
    print(f'R^2 (Lin. Reg): {r2_model}')
    print()

    # Two panels are created; only the left one is drawn (no feature-importance
    # plot is produced for the regressor).
    fig, ax = plt.subplots(1, 2, figsize=(15, 7))
    ax[0].scatter(y_test, y_pred, alpha=0.5)
    ax[0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
    ax[0].set_xlabel('True Values')
    ax[0].set_ylabel('Predicted Values')
    ax[0].set_title('Predicted vs Actual')
    plt.tight_layout()
    plt.show()
    # Release the figure explicitly: this runs once per (year, month) window.
    plt.close(fig)


def _evaluate_classifier(model, X_eval, forward_returns, y_test, proba_threshold):
    """Print trade statistics and a classification report, then plot calibration, PR curve and coefficients.

    A "trade" is taken whenever the predicted probability of class 1 is at
    least ``proba_threshold``; its PnL is the corresponding ``forward_returns`` value.
    """
    y_pred_proba = model.predict_proba(X_eval)[:, 1]
    y_pred = (y_pred_proba >= proba_threshold).astype(int)
    # Renamed so the printed distribution carries the same label as before.
    y_true = y_test.rename('y_true')

    print('Class Distribution:')
    print(y_true.value_counts(normalize = True))
    print()

    # y_pred is already 0/1, so it doubles as the trade side (1 = trade, 0 = flat).
    trade_side = y_pred
    trade_pnl = trade_side * forward_returns.values
    taken_pnl = trade_pnl[trade_side == 1]
    expectancy = taken_pnl.mean()
    hit_rate = (taken_pnl > 0).mean()
    payoff_ratio = trade_pnl[trade_pnl > 0].mean() / abs(trade_pnl[trade_pnl < 0].mean())

    print(f'Expectancy: {expectancy}')
    print(f'Hit Rate: {hit_rate}')
    print(f'Payoff Ratio: {payoff_ratio}')
    print()

    print('Classification Report:')
    print(classification_report(y_true, y_pred))
    print()

    # Calibration curve
    disp = CalibrationDisplay.from_predictions(y_test, y_pred_proba)
    plt.show()
    plt.close(disp.figure_)

    precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
    pr_auc = auc(recall, precision)

    fig, ax = plt.subplots(1, 2, figsize=(15, 7))

    # Left: precision-recall curve
    ax[0].plot(recall, precision, marker='.')
    ax[0].set_title(f'Precision Recall Curve: AUC={pr_auc:.2f}')
    ax[0].set_xlabel('Recall')
    ax[0].set_ylabel('Precision')

    # Right: the top_n largest (signed) coefficients of the fitted model
    top_n = 50
    feature_importances = pd.Series(model.coef_[0], index=X_eval.columns)
    feature_importances = feature_importances.sort_values().tail(top_n)
    feature_importances.plot(kind='barh', ax = ax[1])
    ax[1].set_title(f'Top {top_n} Most Important Features')
    plt.tight_layout()
    plt.show()
    plt.close(fig)


def train_model(
    min_year,
    max_year,
    is_reg,
    proba_threshold=0.7,
    model_dir='/Users/louisspencer/Desktop/Trading-Bot/data/pretrained_models',
):
    """Walk forward month by month, fitting, evaluating and saving one model per window.

    For every (year, month) in ``min_year..max_year`` the function:

    1. queries all rows of ``market_data.ml_features`` up to and including that
       month and keeps the trailing two years (730 days) as training data;
    2. queries the rows after the last training timestamp, up to the end of the
       following month, as test data;
    3. fits an ``MLPRegressor`` on ``|clip(forward_returns_7, -1, 1)|`` when
       ``is_reg`` is true, otherwise a ``LogisticRegression`` on
       ``forward_returns_7 > 0``;
    4. prints metrics, shows diagnostic plots and pickles the model with joblib.

    Windows with no usable train or test rows are skipped silently.

    Parameters
    ----------
    min_year, max_year : int
        Inclusive range of calendar years to iterate over.
    is_reg : bool
        True for the regression target, False for the classification target.
    proba_threshold : float, default 0.7
        Probability cut-off above which the classifier's prediction counts as a
        trade (classification only).
    model_dir : str
        Root directory for saved models; files go to
        ``<model_dir>/regression`` or ``<model_dir>/classification`` as
        ``lr_model_<year>_<month>.pkl``. Created if missing.

    Raises
    ------
    AssertionError
        If the latest training timestamp is not strictly before the earliest
        test timestamp.

    Notes
    -----
    Relies on the notebook-level names ``QUERY`` and ``cols_to_drop``.
    NOTE(review): ``forward_returns_7`` presumably looks several periods ahead,
    so the last training labels may overlap the test window even though the
    timestamps do not -- confirm whether a purge/embargo gap is needed.
    """
    for year in range(min_year, max_year + 1):
        for month in range(1, 13):
            data_train = QUERY(
                f"""
                SELECT *
                FROM market_data.ml_features
                WHERE
                    date_part('year', time_period_end) < {year} OR
                    (date_part('year', time_period_end) = {year} AND
                     date_part('month', time_period_end) <= {month})
                """
            )
            # Skip before touching the timestamps: on an empty frame the date
            # maximum is undefined and would poison the test query below.
            if data_train.empty:
                continue
            data_train = _attach_symbol_id(data_train)

            # The two-year training window is anchored on the last training day.
            max_train_date = pd.to_datetime(data_train['time_period_end'].dt.date.max())
            min_train_date = max_train_date - pd.Timedelta(days = 365 * 2)
            # The test window starts after the exact last training timestamp
            # (not midnight of that day), so same-day rows cannot be in both sets.
            train_end = data_train['time_period_end'].max()

            # Test data is everything after the training data up to the end of next month
            next_month = month + 1
            if next_month == 13:
                next_year = year + 1
                next_month = 1
            else:
                next_year = year

            data_test = QUERY(
                f"""
                SELECT *
                FROM market_data.ml_features
                WHERE
                    time_period_end > '{train_end}' AND
                    date_part('year', time_period_end) = {next_year} AND
                    date_part('month', time_period_end) <= {next_month}
                """
            )
            if data_test.empty:
                continue
            data_test = _attach_symbol_id(data_test)

            # Treat +/-inf as missing, then drop rows without a forward_returns_7 target
            data_train = data_train.replace([np.inf, -np.inf], np.nan)
            data_test = data_test.replace([np.inf, -np.inf], np.nan)
            data_train = data_train.dropna(subset = ['forward_returns_7'])
            data_test = data_test.dropna(subset = ['forward_returns_7'])

            # Keep only the trailing two years of training data
            data_train = data_train[data_train['time_period_end'] >= min_train_date]

            if data_train.empty or data_test.empty:
                continue

            # Targets are derived from forward_returns_7 before it is dropped from the features
            if is_reg:
                y_train = data_train['forward_returns_7'].clip(-1,1).abs()
                y_test = data_test['forward_returns_7'].clip(-1,1).abs()
            else:
                y_train = (data_train['forward_returns_7'] > 0).astype(int)
                y_test = (data_test['forward_returns_7'] > 0).astype(int)

            print()
            print(f'Train Date Range: {data_train["time_period_end"].min()} - {data_train["time_period_end"].max()}')
            print(f'Number of observations (Train): {data_train.shape[0]}')
            print()

            print(f'Test Date Range: {data_test["time_period_end"].min()} - {data_test["time_period_end"].max()}')
            print(f'Number of observations (Test): {data_test.shape[0]}')
            print()

            # Ensure no data leakage
            data_leakage_indicator = (
                data_train['time_period_end'].max() >= data_test['time_period_end'].min()
            )
            assert not data_leakage_indicator, 'Data leakage detected'

            # The encoder is fitted on the training symbols only; symbols unseen
            # in training encode to all-zero rows (handle_unknown='ignore').
            ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
            X_train = _encode_features(data_train, ohe, cols_to_drop, fit=True)

            if is_reg:
                model = MLPRegressor(hidden_layer_sizes=(100,), verbose=True)
            else:
                model = LogisticRegression(n_jobs=-1, max_iter = 2000)
            model.fit(X_train, y_train)

            X_test_ = _encode_features(data_test, ohe, cols_to_drop, fit=False)

            if is_reg:
                _evaluate_regressor(model, X_test_, y_train, y_test)
            else:
                _evaluate_classifier(
                    model, X_test_, data_test['forward_returns_7'], y_test, proba_threshold
                )

            # Free the large frames before the next window is loaded
            del X_train
            del X_test_
            del data_train
            del data_test

            # Persist the fitted model for this window, creating the folder if needed
            out_dir = os.path.join(model_dir, 'regression' if is_reg else 'classification')
            os.makedirs(out_dir, exist_ok=True)
            joblib.dump(model, os.path.join(out_dir, f'lr_model_{year}_{month}.pkl'))

In [8]:
# Column groups that are removed from the feature frame before a model is fit.
# The first three groups are selected from `ml_features` by substring match.
triple_barrier_label_cols = [name for name in ml_features if 'triple_barrier_label_h' in name]

trade_returns_cols = [name for name in ml_features if 'trade_returns' in name]

forward_returns_cols = [name for name in ml_features if 'forward_returns' in name]

# Fixed list of bookkeeping / identifier / prediction columns.
non_numeric_cols = [
    'Unnamed: 0',
    'sample_weight',
    'y_true',
    'y_pred',
    'y_pred_proba',
    'asset_id_base',
    'asset_id_quote',
    'exchange_id',
]

# Fixed list of further columns to exclude, grouped by family.
other_cols = [
    # bar fields
    'open', 'high', 'low', 'close', 'volume', 'trades',
    # triple-barrier start/end dates
    'start_date_triple_barrier_label_h7',
    'start_date_triple_barrier_label_h1',
    'end_date_triple_barrier_label_h1',
    'end_date_triple_barrier_label_h7',
    'avg_uniqueness',
    'time_period_end',
    # buy dollar volume
    'total_buy_dollar_volume_1d',
    'total_buy_dollar_volume_7d',
    'total_buy_dollar_volume_30d',
    'total_buy_dollar_volume_90d',
    'total_buy_dollar_volume_180d',
    'total_buy_dollar_volume_365d',
    # sell dollar volume
    'total_sell_dollar_volume_1d',
    'total_sell_dollar_volume_7d',
    'total_sell_dollar_volume_30d',
    'total_sell_dollar_volume_90d',
    'total_sell_dollar_volume_180d',
    'total_sell_dollar_volume_365d',
    # buy counts
    'num_buys_1d',
    'num_buys_7d',
    'num_buys_30d',
    'num_buys_90d',
    'num_buys_180d',
    'num_buys_365d',
    # sell counts
    'num_sells_1d',
    'num_sells_7d',
    'num_sells_30d',
    'num_sells_90d',
    'num_sells_180d',
    'num_sells_365d',
]

# '_rz' columns (excluding forward-returns ones) plus 'symbol_id'.
# This list is not part of cols_to_drop.
rz_cols = [
    *(name for name in ml_features if '_rz' in name and 'forward_returns' not in name),
    'symbol_id',
]

# Final drop list; the concatenation order matches the individual groups above.
cols_to_drop = [
    *triple_barrier_label_cols,
    *trade_returns_cols,
    *non_numeric_cols,
    *forward_returns_cols,
    *other_cols,
]

In [9]:
# Run the monthly walk-forward for 2019 through 2024 with the classification target.
train_model(min_year=2019, max_year=2024, is_reg=False)

IOException: IO Error: Could not set lock on file "/Users/louisspencer/Desktop/Trading-Bot-Data-Pipelines/data/database.db": Conflicting lock is held in /Library/Frameworks/Python.framework/Versions/3.11/Resources/Python.app/Contents/MacOS/Python (PID 10878) by user louisspencer. See also https://duckdb.org/docs/connect/concurrency