In [8]:
import pandas as pd
import numpy as np
import xgboost as xgb
import optuna
import shap
import mlflow
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Load your processed data
DATA_PATH = '../data/processed/etf_features.parquet'
data = pd.read_parquet(DATA_PATH)

# Separate features (X) and target (y)
X = data.drop('target', axis=1)
y = data['target']

In [3]:
# Define the chronological split point
# For example, use data up to the end of 2021 for training, and 2022 onwards for testing.
split_date = '2022-01-01'
X_train, X_test = X.loc[:split_date], X.loc[split_date:]
y_train, y_test = y.loc[:split_date], y.loc[split_date:]

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

mlflow.set_experiment("ETF_Trend_Prediction")

2025/08/23 11:49:42 INFO mlflow.tracking.fluent: Experiment with name 'ETF_Trend_Prediction' does not exist. Creating a new experiment.


Training set size: 2380
Test set size: 908


<Experiment: artifact_location='file:///c:/Users/dawso/Dev/Personal/AIGrind/mlops-etf-forecasting/notebooks/mlruns/587245152497429123', creation_time=1755964182346, experiment_id='587245152497429123', last_update_time=1755964182346, lifecycle_stage='active', name='ETF_Trend_Prediction', tags={}>

In [4]:
# Train Logistic Regression
with mlflow.start_run(run_name="LogisticRegression_Baseline"):
    model_lr = LogisticRegression(max_iter=1000, random_state=42)
    model_lr.fit(X_train, y_train)
    y_pred_lr = model_lr.predict(X_test)
    
    # Log metrics
    mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred_lr))
    mlflow.log_metric("f1_score", f1_score(y_test, y_pred_lr))
    print(f"Logistic Regression F1 Score: {f1_score(y_test, y_pred_lr):.4f}")

# Train Random Forest
with mlflow.start_run(run_name="RandomForest_Baseline"):
    model_rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    model_rf.fit(X_train, y_train)
    y_pred_rf = model_rf.predict(X_test)

    # Log metrics
    mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred_rf))
    mlflow.log_metric("f1_score", f1_score(y_test, y_pred_rf))
    print(f"Random Forest F1 Score: {f1_score(y_test, y_pred_rf):.4f}")

Logistic Regression F1 Score: 0.6968
Random Forest F1 Score: 0.6242


In [5]:
def objective(trial):
    # Define the search space for hyperparameters
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'random_state': 42
    }
    
    model = xgb.XGBClassifier(**params)
    
    # Use TimeSeriesSplit for robust cross-validation
    tscv = TimeSeriesSplit(n_splits=5)
    score = cross_val_score(model, X_train, y_train, cv=tscv, scoring='f1', n_jobs=-1).mean()
    
    return score

In [6]:
# Run the study to find the best params
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50) # Run for 50 trials

best_params = study.best_params
print("Best XGBoost Params:", best_params)

# Train the final XGBoost model with the best parameters and log to MLflow
with mlflow.start_run(run_name="XGBoost_Tuned_Champion") as run:
    final_xgb_model = xgb.XGBClassifier(**best_params, random_state=42)
    final_xgb_model.fit(X_train, y_train)
    y_pred_xgb = final_xgb_model.predict(X_test)
    
    f1 = f1_score(y_test, y_pred_xgb)
    print(f"Final Tuned XGBoost F1 Score: {f1:.4f}")
    
    # Log everything to MLflow
    mlflow.log_params(best_params)
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred_xgb))
    mlflow.log_metric("roc_auc", roc_auc_score(y_test, final_xgb_model.predict_proba(X_test)[:, 1]))
    
    # Save the model
    mlflow.xgboost.log_model(final_xgb_model, "xgb-model")
    
    # Capture the run ID for later
    champion_run_id = run.info.run_id

[I 2025-08-23 11:50:05,846] A new study created in memory with name: no-name-947cf6b0-fb25-4850-9f4c-93c826c4667c
[I 2025-08-23 11:50:12,258] Trial 0 finished with value: 0.5479968395472907 and parameters: {'n_estimators': 920, 'max_depth': 9, 'learning_rate': 0.01128201100741125, 'subsample': 0.6153642174913793, 'colsample_bytree': 0.9961571665220094, 'gamma': 2.854272440093066}. Best is trial 0 with value: 0.5479968395472907.
[I 2025-08-23 11:50:15,171] Trial 1 finished with value: 0.5065589187226617 and parameters: {'n_estimators': 813, 'max_depth': 10, 'learning_rate': 0.1418325572862168, 'subsample': 0.9543419163932172, 'colsample_bytree': 0.846499658757347, 'gamma': 4.347146490084573}. Best is trial 0 with value: 0.5479968395472907.
[I 2025-08-23 11:50:18,154] Trial 2 finished with value: 0.5091366625747832 and parameters: {'n_estimators': 772, 'max_depth': 5, 'learning_rate': 0.1392702940553672, 'subsample': 0.7387203990206367, 'colsample_bytree': 0.6061308984965355, 'gamma': 2.

Best XGBoost Params: {'n_estimators': 215, 'max_depth': 3, 'learning_rate': 0.010338381774272903, 'subsample': 0.693272401944256, 'colsample_bytree': 0.7380256387178412, 'gamma': 1.509407674529806}
Final Tuned XGBoost F1 Score: 0.6862




In [9]:
# Explain the model's predictions using SHAP
explainer = shap.TreeExplainer(final_xgb_model)
shap_values = explainer.shap_values(X_test)

# Create and save the summary plot
fig, ax = plt.subplots()
shap.summary_plot(shap_values, X_test, show=False)
plt.title("SHAP Feature Importance for XGBoost Model")
plt.savefig("shap_summary.png")
plt.close()

# Log the SHAP plot as an artifact in the same MLflow run
with mlflow.start_run(run_id=champion_run_id):
    mlflow.log_artifact("shap_summary.png")

print("SHAP analysis complete and plot logged to MLflow.")

SHAP analysis complete and plot logged to MLflow.
