# Machine Learning Model Development for Statistical Arbitrage
This notebook demonstrates the full ML workflow: feature engineering, model selection, hyperparameter optimization, robust evaluation, and model persistence, all tailored for financial time series.

In [None]:
# 1. Feature Engineering Walkthrough (using src/features.py)
from src.features import FeatureEngineer
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the cleaned spread data written by the previous notebook.
spread_df = pd.read_parquet("../data/spread_features.parquet")  # or generate as before

# Build the feature set. The add_* calls are chained on the engineer and the
# result is read back from `fe` afterwards, so each step presumably mutates
# and returns the same FeatureEngineer instance (confirm in src/features.py).
fe = FeatureEngineer(spread_df)
(
    fe.add_zscore("spread")
    .add_half_life("spread")
    .add_hurst_exponent("spread")
    .add_volume_imbalance("buy_volume", "sell_volume")
    .add_bid_ask_spread("bid", "ask")
    .add_spread_autocorrelation("spread")
    .add_realized_volatility("spread")
)

features_df = fe.get_features()
print(features_df.head())

# Visualize feature distributions
features_df.hist(figsize=(16, 10), bins=30)
plt.tight_layout()
plt.show()

# Correlate numeric columns only: since pandas 2.0, DataFrame.corr() raises on
# non-numeric columns (timestamps, symbols, ...) instead of silently dropping
# them. select_dtypes works on every pandas version.
numeric_features = features_df.select_dtypes(include="number")
sns.heatmap(numeric_features.corr(), annot=False, cmap='coolwarm')
plt.title("Feature Correlation Matrix")
plt.show()

: 

In [None]:
# 2. Time-Series Cross-Validation & Leakage Prevention
from sklearn.model_selection import TimeSeriesSplit
import numpy as np

# Fail early with an actionable message: the label column is not created by the
# feature-engineering cell, so it has to be added to features_df beforehand.
if "target" not in features_df.columns:
    raise KeyError(
        "features_df has no 'target' column; define the target variable "
        "before building X and y."
    )

# Drop incomplete rows once and derive X and y from the same frame, so the two
# stay row-aligned even when the index contains duplicate labels.
clean_df = features_df.dropna()
X = clean_df.drop(columns=["target"])
y = clean_df["target"]

# Walk-forward splits: every test fold lies strictly after its training fold.
tscv = TimeSeriesSplit(n_splits=5)
plt.figure(figsize=(10, 2))
for i, (train_idx, test_idx) in enumerate(tscv.split(X)):
    plt.plot(train_idx, [i+1]*len(train_idx), '|', color='blue')
    plt.plot(test_idx, [i+1]*len(test_idx), '|', color='red')
plt.title("Walk-Forward TimeSeriesSplit Visualization")
plt.xlabel("Sample Index")
plt.ylabel("CV Fold")
plt.show()

# (Optional) Purging and embargoing with mlfinlab
from mlfinlab.cross_validation import PurgedKFold

# PurgedKFold takes the label time spans in its constructor as a Series
# (index = time the sample starts, value = time its label information ends)
# and the embargo as a fraction of the sample count via `pct_embargo`.
# NOTE(review): the placeholder below assumes each label ends at its own
# timestamp; replace the values with the real label end times - TODO confirm.
event_times = pd.Series(X.index, index=X.index)
pkf = PurgedKFold(n_splits=5, samples_info_sets=event_times, pct_embargo=0.01)
# Use pkf.split(X) for advanced leakage prevention

In [None]:
# 3. Advanced Model Training & Tuning (XGBoost, LightGBM, CatBoost)
from src.ml_models import MLModels

# Candidate gradient-boosting libraries, fitted one after another through the
# shared MLModels wrapper.
model_types = ["xgboost", "lightgbm", "catboost"]
ml = MLModels(model_dir="../models")

# results maps model type -> predictions for X.
# NOTE(review): every model is fitted on X and then asked to predict the same
# X, so these are in-sample predictions.
results = {}
for name in model_types:
    print(f"Training {name}...")
    ml.train(X, y, model_type=name)
    results[name] = ml.predict(X)

# Bayesian Optimization with Optuna (20 trials, XGBoost only)
best_params = ml.bayesian_optimization(X, y, model_type="xgboost", n_trials=20)
print("Best XGBoost params:", best_params)

In [None]:
# 4. Robust Model Evaluation
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc

# One shared Axes so the precision-recall curves of all models overlay.
fig, ax = plt.subplots()

for name, model_preds in results.items():
    # Text metrics for this model
    print(f"Results for {name}:")
    print(classification_report(y, model_preds))
    print(f"AUC: {roc_auc_score(y, model_preds):.4f}")

    # Precision-Recall Curve
    precision, recall, _ = precision_recall_curve(y, model_preds)
    ax.plot(recall, precision, label=name)

ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curve")
ax.legend()
plt.show()

In [None]:
# 5. Feature Importance Analysis (SHAP)
import shap

# Refit XGBoost so that ml.model is the estimator being explained.
ml.train(X, y, model_type="xgboost")

# X is used both as the explainer's background data and as the rows to explain.
explainer = shap.Explainer(ml.model, masker=X)
shap_values = explainer(X)

# Summary plot of the SHAP values across all rows of X.
shap.summary_plot(shap_values, features=X)

In [None]:
# 6. Model Selection Criteria

from sklearn.metrics import roc_auc_score

# Inputs: `results` ({model_type: predictions}) and the true target `y`.

# Score every candidate by ROC AUC against y.
model_scores = {
    name: roc_auc_score(y, model_preds) for name, model_preds in results.items()
}
for name, score in model_scores.items():
    print(f"{name} AUC: {score:.4f}")

# Winner = highest AUC; on a tie the first model in insertion order is kept.
best_model_type, best_auc = max(model_scores.items(), key=lambda item: item[1])
print(f"\nBest model by AUC: {best_model_type} (AUC={best_auc:.4f})")

# Retrain the winner so downstream cells can use it via ml.model.
ml.train(X, y, model_type=best_model_type)

In [None]:
# 7. Model Persistence & Versioning
from src.utils import save_joblib

# Destination of the serialized estimator; the "v1" suffix is the version tag.
MODEL_PATH = "../models/best_model_v1.joblib"

# ml.model is presumably the estimator left by the most recent ml.train(...)
# call, so run the model-selection cell first - TODO confirm.
final_model = ml.model
save_joblib(final_model, MODEL_PATH)
print("Model saved for deployment and backtesting.")

In [None]:
# 8. Scenario Analysis (at least 4 scenarios)
# Each scenario is a (label, feature frame) pair derived from X; the current
# ml.model scores every frame and the mean prediction is reported.

# Seeded generator: the "High Volatility" noise, and therefore its reported
# mean prediction, is identical on every Restart & Run All.
scenario_rng = np.random.default_rng(42)

scenarios = [
    # Baseline: the unmodified features.
    ("Normal Market", X),
    # Multiplicative Gaussian noise (mean 0, std 0.05) applied to every feature.
    ("High Volatility", X * (1 + scenario_rng.normal(0, 0.05, X.shape))),
    # "spread" column scaled down by 20%. assign() already returns a new
    # frame, so X itself is never modified.
    ("Flash Crash", X.assign(spread=X["spread"] * (1 - 0.2))),
    # "spread" column scaled up by a factor ramping linearly from 0% to 10%
    # across the rows.
    ("Regime Shift", X.assign(spread=X["spread"] * (1 + np.linspace(0, 0.1, len(X))))),
]

for name, scenario_X in scenarios:
    preds = ml.model.predict(scenario_X)
    print(f"Scenario: {name}")
    print("  Mean prediction:", np.mean(preds))
    # Add more scenario-specific evaluation as needed