In [None]:
# Models and metrics
from sklearn.metrics import classification_report, precision_recall_curve, auc
from sklearn.calibration import calibration_curve, CalibrationDisplay
from sklearn.metrics import f1_score, precision_score, recall_score, mean_squared_error, mean_absolute_error

from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
import lightgbm as lgb

# Suppress all warnings
import warnings
warnings.filterwarnings('ignore')

# suppress convergence warnings from sklearn
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Other imports
import pandas as pd
import numpy as np
import joblib
import sklearn
import keras
import os
# from utils.db_utils import QUERY

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from scipy.stats import t

def binned_scatter_plotly(
    df: pd.DataFrame,
    x: str,
    y: str,
    bins: int = 10,
    interval: str = "sd",   # "sd", "sem", or a float (e.g. 0.95 for 95 % CI)
    show: bool = True,
    **scatter_kwargs          # colour, marker_symbol, etc. for go.Scatter
):
    """Plot the per-bin mean of ``y`` against fixed-width bins of ``x``.

    Parameters
    ----------
    df : DataFrame holding both columns. It is not modified.
    x : column that is cut into ``bins`` fixed-width intervals (``pd.cut``).
    y : column whose mean, std and count are computed per bin.
    bins : passed straight to ``pd.cut``.
    interval : error-bar size. ``"sd"`` = one standard deviation, ``"sem"`` =
        one standard error of the mean, anything else is converted with
        ``float()`` and treated as a two-sided Student-t confidence level.
    show : when True the figure is rendered with ``fig.show()``.
    **scatter_kwargs : forwarded to ``go.Scatter``.

    Returns
    -------
    The figure when ``show`` is False (so it is not thrown away); ``None`` when
    ``show`` is True, so a call at the end of a cell does not render twice.

    Raises
    ------
    KeyError if ``x`` or ``y`` is not a column; ValueError if ``interval`` is
    neither ``"sd"``/``"sem"`` nor convertible to float.
    """
    # Step 1: Bin using fixed-width intervals. The bin labels live in their own
    # Series (same index as df) so the caller's frame gets no helper column.
    bin_cuts = pd.cut(df[x], bins=bins, include_lowest=True)
    grouped = df[y].groupby(bin_cuts, observed=True)

    # Step 2: Summary stats
    summary = grouped.agg(mean="mean", std="std", count="count")
    summary["sem"] = summary["std"] / np.sqrt(summary["count"])

    # Step 3: Dispersion calculation
    if interval == "sd":
        disp = summary["std"]
        disp_label = "±1 SD"
    elif interval == "sem":
        disp = summary["sem"]
        disp_label = "±1 SEM"
    else:
        ci = float(interval)
        dfree = summary["count"] - 1
        tcrit = t.ppf(0.5 + ci / 2, dfree)
        disp = summary["sem"] * tcrit
        # ':g' avoids the truncation int() applies to values like 57.999...
        disp_label = f"±{ci * 100:g}% CI"

    # Step 4: X-axis = midpoints of the bins that actually hold data.
    # observed=True drops empty bins from `summary`, so the midpoints must be
    # taken from its index; using every category would misalign x and y.
    bin_mid = np.array([iv.mid for iv in summary.index], dtype=float)

    # Step 5: Plot with Plotly
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=bin_mid,
            y=summary["mean"],
            mode="lines+markers",
            error_y=dict(type="data", array=disp, visible=True),
            customdata=np.stack(
                (summary["std"], summary["sem"], summary["count"]), axis=-1
            ),
            hovertemplate=(
                "<b>Bin mid:</b> %{x:.4f}<br>"
                "<b>Mean:</b> %{y:.4f}<br>"
                "<b>Std:</b> %{customdata[0]:.4f}<br>"
                "<b>SEM:</b> %{customdata[1]:.4f}<br>"
                "<b>N:</b> %{customdata[2]:.0f}<extra></extra>"
            ),
            **scatter_kwargs,
        )
    )

    fig.update_layout(
        template="plotly_white",
        xaxis_title=f"{x} (fixed-width bins)",
        yaxis_title=f"{y} (mean {disp_label})",
        title=f"{y} vs. {x} – {len(summary)} fixed-width bins",
    )

    if show:
        fig.show()
        return None
    return fig

In [None]:
# Read the feature table from disk and keep its column names as a plain list
ML_FEATURES_PATH = '/Users/louisspencer/Desktop/Trading-Bot/data/ml_features.parquet'
ml_features = pd.read_parquet(ML_FEATURES_PATH)
ml_features_cols = list(ml_features.columns)

In [None]:
# ---------------------------------------------------------------------------
# Column bookkeeping: every list below is derived from `ml_features_cols`
# by substring matching and keeps the original column order.
# ---------------------------------------------------------------------------

# Substrings that mark a column as a normalised variant of a raw feature
_NORMALISED_TAGS = ('rz', 'zscore', 'percentile')


def _raw_columns_with(token):
    """Columns containing `token` that carry none of the normalised tags."""
    return [
        name for name in ml_features_cols
        if token in name and not any(tag in name for tag in _NORMALISED_TAGS)
    ]


def _plain_and_cross_sectional(*tokens):
    """Split columns matching any token into (no 'cs_', 'cs_' without 'kurtosis')."""
    matching = [name for name in ml_features_cols if any(tok in name for tok in tokens)]
    plain = [name for name in matching if 'cs_' not in name]
    cross_sectional = [name for name in matching if 'cs_' in name and 'kurtosis' not in name]
    return plain, cross_sectional


# --- Columns collected for dropping before training -------------------------
forward_returns_cols = [name for name in ml_features_cols if 'forward_returns' in name]

non_numeric_cols = [
    'asset_id_base', 'asset_id_base_x', 'asset_id_base_y',
    'asset_id_quote', 'asset_id_quote_x', 'asset_id_quote_y',
    'exchange_id', 'exchange_id_x', 'exchange_id_y',
]

other_cols = [
    'open_spot', 'high_spot', 'low_spot', 'close_spot',
    'open_futures', 'high_futures', 'low_futures', 'close_futures',
    'time_period_end',
]

num_cols = _raw_columns_with('num')
dollar_cols = _raw_columns_with('dollar')
delta_cols = _raw_columns_with('delta')

other = [
    name for name in ml_features_cols
    if any(tag in name for tag in ('10th_percentile', '90th_percentile'))
]

cols_to_drop = [
    *forward_returns_cols,
    *non_numeric_cols,
    *other_cols,
    *num_cols,
    *dollar_cols,
    *delta_cols,
    *other,
]

# --- Columns explicitly allowed into the model ------------------------------
returns_cols, returns_cs_cols = _plain_and_cross_sectional('spot_returns', 'futures_returns')
alpha_beta_cols, alpha_beta_cs_cols = _plain_and_cross_sectional('alpha', 'beta')
basis_pct_cols, basis_pct_cs_cols = _plain_and_cross_sectional('basis_pct')
trade_imbalance_cols, trade_imbalance_cs_cols = _plain_and_cross_sectional('trade_imbalance')

ema_cols = [
    name for name in ml_features_cols
    if 'ema' in name
    and not name.endswith('_basis')
    and 'volume' not in name
    and 'num' not in name
]

valid_cols = [
    *returns_cols,
    *returns_cs_cols,
    *alpha_beta_cols,
    *alpha_beta_cs_cols,
    *basis_pct_cols,
    *basis_pct_cs_cols,
    *trade_imbalance_cols,
    *trade_imbalance_cs_cols,
    *ema_cols,
]

# Final feature list: any normalised column or explicitly valid column,
# as long as it is not a forward-return column.
_valid_lookup = set(valid_cols)
rz_cols = [
    name for name in ml_features_cols
    if 'forward_returns' not in name
    and (any(tag in name for tag in _NORMALISED_TAGS) or name in _valid_lookup)
]
# Alternative kept from the original cell: restrict to the valid columns only
# rz_cols = [col for col in ml_features_cols if col in valid_cols]

In [None]:
# Display how many columns ended up in the model feature list `rz_cols`
len(rz_cols)

In [None]:
def run_hyperparameter_tuning(X, y, param_space, max_evals=5, direction = 'long', model_type='xgb', is_reg = False):
    """Search `param_space` with hyperopt (TPE) and return the best fitted LightGBM model.

    Parameters
    ----------
    X : DataFrame containing 'time_period_end', the feature columns listed in the
        module-level `rz_cols`, and the 7-day forward-return columns.
    y : unused; targets are rebuilt from the return columns in `X`. Kept so
        existing calls keep working.
    param_space : hyperopt search space; must provide 'n_estimators' and
        'max_depth' (both are cast to int before the model is built).
    max_evals : number of hyperopt trials.
    direction : 'long' uses 'forward_returns_7'; anything else is treated as a
        short and uses 'futures_forward_returns_7'.
    model_type : unused; a LightGBM model is always fitted. Kept for compatibility.
    is_reg : fit an LGBMRegressor on absolute returns instead of an LGBMClassifier
        on the sign of the return.

    Returns
    -------
    The fitted model stored in the trial with the lowest loss.

    Notes
    -----
    Classification trials minimise the negative Sortino-like ratio of the trades
    the model selects on the validation slice. Regression trials minimise the
    validation mean squared error (the classification trade metrics need
    `predict_proba`, which a regressor does not have).
    """
    is_long = direction == 'long'
    # Labels and trade metrics are taken from the same return series
    returns_col = 'forward_returns_7' if is_long else 'futures_forward_returns_7'

    # Chronological split (first 90% of the timeline trains, the rest validates).
    # It does not depend on the sampled hyper-parameters, so build it once.
    train_end_date = X['time_period_end'].quantile(0.9)
    X_train = X[X['time_period_end'] <= train_end_date]
    X_val = X[X['time_period_end'] > train_end_date]

    if is_reg:
        y_train = X_train[returns_col].abs()
        y_val = X_val[returns_col].abs()
    elif is_long:
        y_train = (X_train[returns_col] > 0).astype(int)
        y_val = (X_val[returns_col] > 0).astype(int)
    else:
        y_train = (X_train[returns_col] < 0).astype(int)
        y_val = (X_val[returns_col] < 0).astype(int)

    def objective(params):
        # hyperopt samples floats; LightGBM needs integers for these two
        params = dict(params)
        params['n_estimators'] = int(params['n_estimators'])
        params['max_depth'] = int(params['max_depth'])
        # params['min_child_samples'] = int(params['min_child_samples'])

        model = lgb.LGBMRegressor(**params) if is_reg else lgb.LGBMClassifier(**params)

        # Fit the model
        model.fit(X_train[rz_cols], y_train, eval_set=[(X_val[rz_cols], y_val)]) # callbacks=[lgb.early_stopping(stopping_rounds=30, verbose=False)])

        if is_reg:
            # A regressor has no predict_proba: score the trial by validation error
            y_pred = model.predict(X_val[rz_cols])
            mse = mean_squared_error(y_val, y_pred)
            mae = mean_absolute_error(y_val, y_pred)
            print(f'Hyperparameters: {params}')
            print()
            print(f'Validation MSE: {mse}, Validation MAE: {mae}')
            print()
            print(f'Optimization Metric: {mse}')
            print('=' * 80)
            return {'loss': float(mse), 'status': STATUS_OK, 'model': model}

        # Make predictions on the validation set
        y_pred_proba = model.predict_proba(X_val[rz_cols])[:, 1]
        y_pred = (y_pred_proba >= 0.5).astype(int)

        # Calculate Sortino-like metric of 7-day forward returns over the selected trades.
        # For shorts a falling price is a gain, so the sign flips and the
        # "downside" is the set of positive returns.
        pred_mask = (y_pred == 1)
        forward_returns = X_val[returns_col]
        adverse = (forward_returns < 0) if is_long else (forward_returns > 0)
        expectancy = np.mean(forward_returns[pred_mask]) if is_long else -np.mean(forward_returns[pred_mask])
        std_dev_neg = np.nan_to_num(np.std(forward_returns[pred_mask & adverse]), nan=0.0)
        sortino_like_metric = (expectancy / std_dev_neg) if std_dev_neg != 0 else 0

        # Same metric for "trade everything" as a baseline
        market_expectancy = np.mean(forward_returns) if is_long else -np.mean(forward_returns)
        market_std_dev_neg = np.std(forward_returns[adverse])

        # Calculate classification metrics
        print(f'Hyperparameters: {params}')
        print()
        print(classification_report(y_val, y_pred))
        print()
        print(f'Expectancy: {expectancy}, Market Expectancy: {market_expectancy}')
        print(f'Std Dev Negative: {std_dev_neg}, Market Std Dev Negative: {market_std_dev_neg}')
        print(f'Sortino-Like: {sortino_like_metric}, Market Sortino-Like: {market_expectancy / market_std_dev_neg if market_std_dev_neg != 0 else 0}')
        print()

        # Calculate the loss (negative Sortino-like metric); NaN (no trades) scores 0
        optimization_metric = np.nan_to_num(-sortino_like_metric, nan=0.0, posinf=10, neginf=-10)
        print(f'Optimization Metric: {optimization_metric}')
        print('=' * 80)
        return {'loss': optimization_metric, 'status': STATUS_OK, 'model': model}

    trials = Trials()
    fmin(objective, param_space, algo=tpe.suggest, max_evals=max_evals, trials=trials, show_progressbar=True)
    best = trials.best_trial['result']['model']
    return best

In [None]:
def _target_column(direction, period, year):
    """Name of the forward-return column that labels and trade metrics are built from."""
    horizon = '7' if period == '7d' else '1'
    if direction == 'long':
        return f'forward_returns_{horizon}'
    # Shorts use futures returns, except 7-day labels before 2020 which use spot
    if period == '7d' and year < 2020:
        return 'forward_returns_7'
    return f'futures_forward_returns_{horizon}'


def _price_column(direction, year):
    """Close-price column whose presence decides whether a row is usable."""
    if direction == 'long' or year < 2020:
        return 'close_spot'
    # For futures, we can use all data after 2019 as futures data is available
    return 'close_futures'


def train_model(min_year, max_year, is_reg, model_type, direction='long', period='7d'):
    """Walk-forward training: one LightGBM model per calendar month, saved to disk.

    For every (year, month) in ``min_year..max_year`` the model is fitted on all
    rows of the module-level ``ml_features`` up to that month's cutoff, evaluated
    on the rows of the following month, and dumped with joblib under a file name
    carrying the *test* year/month.

    Parameters
    ----------
    min_year, max_year : inclusive range of training years.
    is_reg : True fits an LGBMRegressor on the forward return itself; False fits
        an LGBMClassifier on its sign (> 0 for longs, < 0 for shorts).
    model_type : used in the saved file name; the importance plot is only drawn
        when it equals 'lgbm'. The fitted model is always LightGBM.
    direction : 'long', otherwise treated as short.
    period : '7d' uses the 7-day forward-return columns, anything else the 1-day ones.

    Returns
    -------
    None. Side effects: prints metrics, shows plots, writes one .pkl per month.

    Raises
    ------
    AssertionError if the newest training row is not strictly older than the
    oldest test row.
    """
    is_long = direction == 'long'

    for year in range(min_year, max_year + 1):
        # Train one LightGBM model for each month
        for month in range(1, 13):
            # curr_date is end of the month
            curr_date = pd.to_datetime(f'{year}-{month:02d}-01') + pd.offsets.MonthEnd(1)
            if period == '7d':
                # Pull the cutoff back so 7-day labels do not reach into the test month.
                # NOTE(review): this steps back 6 days, not 7 - confirm against how
                # 'time_period_end' and the forward returns are aligned.
                curr_date = curr_date - pd.Timedelta(days=6)

            # Test data is all data in the next month
            if month == 12:
                next_year, next_month = year + 1, 1
            else:
                next_year, next_month = year, month + 1

            target_col = _target_column(direction, period, year)

            train_filter = (
                (ml_features['time_period_end'] <= curr_date) &
                (~ml_features[_price_column(direction, year)].isna())
            )
            test_filter = (
                (ml_features['time_period_end'].dt.year == next_year) &
                (ml_features['time_period_end'].dt.month == next_month) &
                (~ml_features[_price_column(direction, next_year)].isna())
            )

            # Infinite values become NaN first so that rows with an infinite
            # target are dropped together with the missing ones.
            data_train = (
                ml_features[train_filter]
                .replace([np.inf, -np.inf], np.nan)
                .dropna(subset=[target_col])
            )
            data_test = (
                ml_features[test_filter]
                .replace([np.inf, -np.inf], np.nan)
                .dropna(subset=[target_col])
            )

            print(f'len data_train: {len(data_train)}')
            print(f'len data_test: {len(data_test)}')

            if data_train.empty or data_test.empty:
                continue

            # Ensure no data leakage
            assert data_train['time_period_end'].max() < data_test['time_period_end'].min(), 'Data leakage detected'

            # Split data into features and target
            if is_reg:
                y_train = data_train[target_col]
                y_test = data_test[target_col]
            elif is_long:
                y_train = (data_train[target_col] > 0).astype(int)
                y_test = (data_test[target_col] > 0).astype(int)
            else:
                y_train = (data_train[target_col] < 0).astype(int)
                y_test = (data_test[target_col] < 0).astype(int)

            print()
            print(f'Train Date Range: {data_train["time_period_end"].min()} - {data_train["time_period_end"].max()}')
            print(f'Number of observations (Train): {data_train.shape[0]}')
            print()

            print(f'Test Date Range: {data_test["time_period_end"].min()} - {data_test["time_period_end"].max()}')
            print(f'Number of observations (Test): {data_test.shape[0]}')
            print()

            lgbm_params = {
                # --- core booster -------------------------------------------------------
                "objective": "binary" if not is_reg else "regression",
                # --- learning rate / #iterations ---------------------------------------
                "learning_rate":  0.1,
                "n_estimators":   100,
                # --- tree complexity ----------------------------------------------------
                "max_depth": 5,
                # --- row / column subsampling ------------------------------------------
                "feature_fraction": 0.8,  # aka colsample_bytree
                "bagging_fraction": 0.8,  # aka subsample
                "bagging_freq": 5,        # aka subsample_freq
                # --- regularisation -----------------------------------------------------
                "lambda_l2": 5,
                # --- boilerplate --------------------------------------------------------
                "n_jobs":      -1,
                "verbosity":   -1
            }

            model = lgb.LGBMRegressor(**lgbm_params) if is_reg else lgb.LGBMClassifier(**lgbm_params)
            model.fit(
                data_train[rz_cols],
                y_train,
                eval_set=[(data_test[rz_cols], y_test)],
                # callbacks=[lgb.early_stopping(stopping_rounds=30, verbose=False)]
            )

            if is_reg:
                # Make predictions
                y_pred = model.predict(data_test[rz_cols])
                information_coefficient = np.corrcoef(y_test, y_pred)[0, 1]
                # Baseline: always predict the training-set mean
                baseline_pred = np.full(len(y_test), y_train.mean())
                mae = mean_absolute_error(y_test, y_pred)
                baseline_mae = mean_absolute_error(y_test, baseline_pred)
                mse = mean_squared_error(y_test, y_pred)
                baseline_mse = mean_squared_error(y_test, baseline_pred)
                print(f'Information Coefficient: {information_coefficient:.3f}')
                print(f'Mean Absolute Error: {mae:.3f}, Baseline MAE: {baseline_mae:.3f}')
                print(f'Mean Squared Error: {mse:.3f}, Baseline MSE: {baseline_mse:.3f}')
                print()
                # Scatter plot of predicted vs true values
                plt.figure(figsize=(8, 8))
                sns.regplot(x=y_pred, y=y_test, line_kws={"color": "red"})
                plt.xlabel('Predicted Values')
                plt.ylabel('True Values')
                plt.title('Predicted vs True Values')
                plt.show()
                # Feature importance plot. A plotting failure must not prevent
                # the model from being saved below, so it is reported and ignored.
                try:
                    if model_type == 'lgbm':
                        fig, ax = plt.subplots(figsize=(10, 7))
                        max_features = min(10, len(rz_cols))
                        lgb.plot_importance(model, max_num_features=max_features, importance_type='gain', title='Feature Importance (LightGBM)', ax=ax)
                        plt.show()
                except Exception as exc:
                    print(f'Feature importance plot skipped: {exc}')
            else:
                # Predict on the same representation the model was trained on
                # (NaN left in place; no zero-filling at prediction time only).
                y_pred_proba = model.predict_proba(data_test[rz_cols])[:, 1]
                y_pred = (y_pred_proba >= 0.5).astype(int)

                print('Class Distribution:')
                print(y_test.rename('y_true').value_counts(normalize = True))
                print()

                # Strategy performance: take a position (+1 long / -1 short)
                # wherever the classifier predicts the positive class.
                side = 1 if is_long else -1
                trade_side = np.where(y_pred == 1, side, 0)
                trade_pnl = trade_side * data_test[target_col].values
                trades_mask = trade_side == side
                expectancy = trade_pnl[trades_mask].mean()
                std_neg = trade_pnl[trades_mask & (trade_pnl < 0)].std()
                sortino_like = expectancy / std_neg if std_neg != 0 else 0
                hit_rate = (trade_pnl[trades_mask] > 0).mean()

                # Market Performance: the same statistics when every row is traded
                market_returns = data_test[target_col]
                if is_long:
                    market_expectancy = market_returns.mean()
                    market_std_neg = market_returns[market_returns < 0].std()
                    market_hit_rate = (market_returns > 0).mean()
                else:
                    market_expectancy = -market_returns.mean()
                    market_std_neg = market_returns[market_returns > 0].std()
                    market_hit_rate = (market_returns < 0).mean()
                market_sortino_like = market_expectancy / market_std_neg if market_std_neg != 0 else np.nan

                print(f'Expectancy: {expectancy:.3f}, Market Expectancy: {market_expectancy:.3f}')
                print(f'Std Negative Returns: {std_neg:.3f}, Market Std Negative Returns: {market_std_neg:.3f}')
                print(f'Sortino-Like: {sortino_like:.3f}, Market Sortino-Like: {market_sortino_like:.3f}')
                print(f'Hit Rate: {hit_rate:.3f}, Market Hit Rate: {market_hit_rate:.3f}')
                print()

                # Classification Report
                print('Classification Report:')
                print(classification_report(y_test, y_pred))
                print()

                # Calibration Curve
                disp = CalibrationDisplay.from_predictions(y_test, y_pred_proba)
                plt.show()

            # Delete old data from memory
            del data_train
            del data_test

            model_folder = 'regression' if is_reg else 'classification'
            model_path = f'/Users/louisspencer/Desktop/Trading-Bot/data/pretrained_models/{model_folder}/{model_type}_{direction}_model_{next_year}_{next_month}_{period}.pkl'

            # Save the model (create the target folder on first use)
            os.makedirs(os.path.dirname(model_path), exist_ok=True)
            joblib.dump(model, model_path)

In [None]:
def _trade_metrics(y_pred, returns, direction):
    """Summarise the PnL of the trades taken where the classifier predicted 1.

    Parameters
    ----------
    y_pred : array-like
        Predicted class labels; a trade is taken on every row equal to 1.
    returns : array-like
        Forward returns aligned row-by-row with ``y_pred``.
    direction : str
        ``'long'`` keeps the sign of the returns; any other value treats the
        trade as a short and flips the sign.

    Returns
    -------
    tuple
        ``(expectancy, sortino_like, hit_rate, n_trades)``. The three metrics
        are NaN when no trade was taken.
    """
    trade_sign = 1 if direction == 'long' else -1
    trades_mask = np.asarray(y_pred) == 1
    n_trades = int(trades_mask.sum())
    if n_trades == 0:
        # Nothing to average; return NaNs explicitly instead of relying on
        # the mean/std of an empty array.
        return np.nan, np.nan, np.nan, 0

    trade_pnl = trade_sign * np.asarray(returns)[trades_mask]
    expectancy = trade_pnl.mean()
    losing_pnl = trade_pnl[trade_pnl < 0]
    # Dispersion of losing trades only; NaN when there were no losers.
    std_neg = losing_pnl.std() if losing_pnl.size else np.nan
    sortino_like = expectancy / std_neg if std_neg != 0 else 0
    hit_rate = (trade_pnl > 0).mean()
    return expectancy, sortino_like, hit_rate, n_trades


def plot_model_performance(model_type, direction, period='7d', is_reg=False,
                           data_dir='/Users/louisspencer/Desktop/Trading-Bot/data'):
    """Score the saved monthly models and write metrics/predictions to CSV.

    For every (year, month) from 2018-01 through 2025-12 the function loads the
    pickled model for that month, predicts on that month's rows of the global
    ``ml_features`` frame and records one metrics row. Despite its name, the
    function does not draw anything; it prints progress and writes two files:

    * ``performance_metrics_{reg|cls}_{model_type}_{direction}_{period}.csv``
    * ``predictions_{reg|cls}_{model_type}_{direction}_{period}.csv``

    Parameters
    ----------
    model_type : str
        Model tag used in the pickle file name (e.g. ``'lgbm'``).
    direction : str
        ``'long'`` evaluates against the spot columns; any other value is
        treated as short and evaluates against the futures columns.
    period : str, default '7d'
        ``'7d'`` uses the 7-step forward return as the target; any other value
        uses the 1-step forward return.
    is_reg : bool, default False
        True: report the information coefficient (Pearson correlation of the
        prediction with the target). False: report expectancy, a sortino-like
        ratio and hit rate of the trades taken where the model predicted 1.
    data_dir : str
        Root directory that holds ``pretrained_models/`` and receives the CSVs.
        Defaults to the location previously hard-coded in this function.

    Returns
    -------
    None
    """
    horizon = 7 if period == '7d' else 1
    is_long = direction == 'long'
    # The target and the "instrument is trading" column depend only on direction/period.
    target_col = f'forward_returns_{horizon}' if is_long else f'futures_forward_returns_{horizon}'
    close_col = 'close_spot' if is_long else 'close_futures'
    model_folder = 'regression' if is_reg else 'classification'
    reg_tag = 'reg' if is_reg else 'cls'

    # Metric column names always carry the period, so placeholder rows for
    # empty months land in the same CSV columns as real rows.
    if is_reg:
        metric_names = [f'information_coefficient_{period}']
    else:
        metric_names = [f'expectancy_{period}', f'sortino_like_{period}']

    perf_metrics = []
    predictions = []
    for year in range(2018, 2026):
        for month in range(1, 13):
            model_path = f'{data_dir}/pretrained_models/{model_folder}/{model_type}_{direction}_model_{year}_{month}_{period}.pkl'
            try:
                # NOTE: joblib.load unpickles arbitrary objects; only point
                # data_dir at model files this project produced.
                model = joblib.load(model_path)
                input_features = model.feature_names_in_
                model.set_params(verbosity=-1)
            except Exception as e:
                # Best effort: months without a usable model are reported and skipped.
                print(e)
                continue

            month_mask = (
                (ml_features['time_period_end'].dt.year == year) &
                (ml_features['time_period_end'].dt.month == month) &
                (~ml_features[close_col].isna())
            )
            # Build a fresh frame (no in-place edits on a slice of ml_features):
            # infinities become NaN, then rows without a realised target are dropped.
            data_test = (
                ml_features[month_mask]
                .replace([np.inf, -np.inf], np.nan)
                .dropna(subset=[target_col])
            )

            # Checked after cleaning so the model is never asked to predict on zero rows.
            if data_test.empty:
                print(f'No data for {year}-{month:02d}')
                placeholder = {'year': year, 'month': month}
                placeholder.update({name: np.nan for name in metric_names})
                placeholder['n_preds'] = 0
                perf_metrics.append(placeholder)
                continue

            # Make predictions
            y_pred = model.predict(data_test[input_features])
            data_test = data_test.assign(y_pred=y_pred)
            # The saved frame keeps the spot forward-return column for every
            # direction, which is the column plot_rolling_ic reads back.
            predictions.append(data_test[['time_period_end', 'symbol_id', 'y_pred', f'forward_returns_{horizon}']])

            print(f'Model Performance for {year}-{month:02d}:')

            if is_reg:
                information_coefficient = np.corrcoef(data_test[target_col], y_pred)[0, 1]
                n = len(data_test)
                print(f'Information Coefficient ({period}): {information_coefficient:.3f}')
                perf_metrics.append({
                    'year': year,
                    'month': month,
                    f'information_coefficient_{period}': information_coefficient,
                    'n_preds': n
                })
            else:
                # Scored on the same target the rows were filtered on, so the
                # values match the period in the metric names.
                expectancy, sortino_like, hit_rate, n = _trade_metrics(
                    y_pred, data_test[target_col].values, direction
                )
                perf_metrics.append({
                    'year': year,
                    'month': month,
                    f'expectancy_{period}': expectancy,
                    f'sortino_like_{period}': sortino_like,
                    'n_preds': n,
                })
                print(f'Expectancy: {expectancy:.3f}, Sortino-Like: {sortino_like:.3f}, Hit Rate: {hit_rate:.3f}', 
                      f'Num. Positive Predictions: {n}')

            print('='*80)

    # Convert performance metrics to DataFrame
    perf_metrics = pd.DataFrame(perf_metrics)

    # Save performance metrics to CSV
    perf_metrics.to_csv(f'{data_dir}/performance_metrics_{reg_tag}_{model_type}_{direction}_{period}.csv', index=False)

    # Combine all predictions into a single DataFrame; pd.concat raises on an
    # empty list, which happens when no month produced predictions.
    if predictions:
        predictions = pd.concat(predictions, ignore_index=True)
        predictions.to_csv(f'{data_dir}/predictions_{reg_tag}_{model_type}_{direction}_{period}.csv', index=False)
    else:
        print('No predictions were generated; predictions CSV not written.')

In [None]:
# train_model(2018, 2025, is_reg=True, model_type='lgbm', direction='long', period='1d')

In [315]:
# Score the saved monthly 'lgbm' long regression models for the '1d' period;
# this prints per-month ICs and writes the metrics and predictions CSVs.
plot_model_performance('lgbm', 'long', period='1d', is_reg=True)

[Errno 2] No such file or directory: '/Users/louisspencer/Desktop/Trading-Bot/data/pretrained_models/regression/lgbm_long_model_2018_1_1d.pkl'
Model Performance for 2018-02:
Information Coefficient (1d): 0.069
Model Performance for 2018-03:
Information Coefficient (1d): -0.050
Model Performance for 2018-04:
Information Coefficient (1d): 0.008
Model Performance for 2018-05:
Information Coefficient (1d): 0.092
Model Performance for 2018-06:
Information Coefficient (1d): -0.021
Model Performance for 2018-07:
Information Coefficient (1d): -0.164
Model Performance for 2018-08:
Information Coefficient (1d): -0.109
Model Performance for 2018-09:
Information Coefficient (1d): 0.031
Model Performance for 2018-10:
Information Coefficient (1d): 0.148
Model Performance for 2018-11:
Information Coefficient (1d): 0.100
Model Performance for 2018-12:
Information Coefficient (1d): 0.047
Model Performance for 2019-01:
Information Coefficient (1d): 0.155
Model Performance for 2019-02:
Information Coeffi

In [317]:
# Load performance metrics (7d)
# Kept under 7d-specific names so the 7d figures are not stored in (and
# printed as) 1d values.
perf_metrics_7d = pd.read_csv('/Users/louisspencer/Desktop/Trading-Bot/data/performance_metrics_reg_lgbm_long_7d.csv')
perf_metrics_7d['date'] = pd.to_datetime(perf_metrics_7d[['year', 'month']].assign(day=1))
# Monthly ICs averaged with each month weighted by its number of predictions.
weighted_ic_7d = (perf_metrics_7d['information_coefficient_7d'] * perf_metrics_7d['n_preds']).sum() / perf_metrics_7d['n_preds'].sum()
# Unweighted mean of the monthly ICs.
ic_7d = perf_metrics_7d['information_coefficient_7d'].mean()

print(f'Weighted Information Coefficient (7d): {weighted_ic_7d:.3f}')
print(f'Information Coefficient (7d): {ic_7d:.3f}')
print()

# Load performance metrics (1d)
perf_metrics = pd.read_csv('/Users/louisspencer/Desktop/Trading-Bot/data/performance_metrics_reg_lgbm_long_1d.csv')
perf_metrics['date'] = pd.to_datetime(perf_metrics[['year', 'month']].assign(day=1))
# Same two summaries for the 1d horizon.
weighted_ic_1d = (perf_metrics['information_coefficient_1d'] * perf_metrics['n_preds']).sum() / perf_metrics['n_preds'].sum()
ic_1d = perf_metrics['information_coefficient_1d'].mean()

print(f'Weighted Information Coefficient (1d): {weighted_ic_1d:.3f}')
print(f'Information Coefficient (1d): {ic_1d:.3f}')

Weighted Information Coefficient (7d): 0.188
Information Coefficient (1d): 0.152

Weighted Information Coefficient (1d): 0.397
Information Coefficient (1d): 0.290


In [326]:
# Pull the timestamp, symbol and `dollar_volume_spot` columns out of ml_features
# and save them so they can be joined with the predictions.
volume_data = ml_features.loc[:, ['time_period_end', 'symbol_id', 'dollar_volume_spot']].copy()
volume_data.to_csv(
    '/Users/louisspencer/Desktop/Trading-Bot/data/volume_data.csv',
    index=False,
)

In [327]:
def plot_rolling_ic(window=30, period='1d'):
    """Plot a daily rolling information coefficient of the saved predictions.

    Reads ``predictions_reg_lgbm_long_{period}.csv`` and, for each calendar day
    between the first and last prediction timestamp, computes the Pearson
    correlation between ``y_pred`` and the forward-return column over a
    trailing window. The result is shown as a Plotly line chart with a dotted
    zero line.

    Parameters
    ----------
    window : int, default 30
        Length of the trailing window in days.
    period : str, default '1d'
        ``'7d'`` uses ``forward_returns_7`` and skips the most recent 7 days;
        any other value uses ``forward_returns_1`` and skips the most recent day.

    Returns
    -------
    None
    """
    # Load predictions
    predictions = pd.read_csv(f'/Users/louisspencer/Desktop/Trading-Bot/data/predictions_reg_lgbm_long_{period}.csv')
    predictions['time_period_end'] = pd.to_datetime(predictions['time_period_end'])

    # Everything below is constant across dates, so compute it once up front.
    skip_days = 7 if period == '7d' else 1
    ic_col = f'rolling_{window}d_ic'
    return_col = f'forward_returns_{skip_days}'
    lookback = pd.Timedelta(days=window + skip_days)
    embargo = pd.Timedelta(days=skip_days)
    timestamps = predictions['time_period_end']

    rolling_ic = []
    date_range = pd.date_range(start=timestamps.min(), end=timestamps.max(), freq='D')
    for current_date in date_range:
        # `window`-day window that excludes the most recent `skip_days` days
        # to prevent data leakage.
        mask = (timestamps >= current_date - lookback) & (timestamps < current_date - embargo)
        subset = predictions[mask]
        if len(subset) < 2:
            # A correlation needs at least two observations; record NaN.
            ic = np.nan
        else:
            ic = np.corrcoef(subset['y_pred'], subset[return_col])[0, 1]
        rolling_ic.append({'date': current_date, ic_col: ic})

    rolling_ic = pd.DataFrame(rolling_ic)
    fig = px.line(rolling_ic, x='date', y=ic_col, title=f'Rolling {window}-Day Information Coefficient ({period})', labels={'date': 'Date', ic_col: f'Rolling {window}-Day IC'})
    fig.add_hline(y=0, line_dash="dot", line_color="red")
    fig.show()
     

In [328]:
# Rolling 30-day IC chart for the saved '7d' predictions.
plot_rolling_ic(window=30, period='7d')

In [329]:
# Rolling 30-day IC chart for the saved '1d' predictions.
plot_rolling_ic(window=30, period='1d')