In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import optuna
import shap
import mlflow
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Record every run from this notebook in the local MLflow store
mlflow.set_tracking_uri("../mlruns")

# Read the processed feature table from disk
DATA_PATH = '../data/processed/etf_features.parquet'
data = pd.read_parquet(DATA_PATH)

# The 'target' column is the label (y); every remaining column is a feature (X)
y = data['target']
X = data.drop(columns=['target'])

In [None]:
# Define the chronological split point:
# rows strictly before split_date are used for training, rows on/after it for testing.
split_date = '2022-01-01'

# Label slices with .loc include BOTH endpoints, so `.loc[:split_date]` and
# `.loc[split_date:]` would each contain any row dated exactly on split_date,
# leaking test rows into the training set. Complementary boolean masks give
# two disjoint partitions that together cover every row.
# NOTE(review): assumes the index holds the observation dates — confirm.
is_train_row = X.index < split_date
X_train, X_test = X.loc[is_train_row], X.loc[~is_train_row]
y_train, y_test = y.loc[is_train_row], y.loc[~is_train_row]

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

mlflow.set_experiment("ETF_Trend_Prediction")

In [None]:
# Baseline 1: logistic regression, tracked as its own MLflow run
with mlflow.start_run(run_name="LogisticRegression_Baseline"):
    model_lr = LogisticRegression(max_iter=1000, random_state=42)
    model_lr.fit(X_train, y_train)
    y_pred_lr = model_lr.predict(X_test)

    # Score once on the held-out period, then log and report the same numbers
    lr_scores = {
        "accuracy": accuracy_score(y_test, y_pred_lr),
        "f1_score": f1_score(y_test, y_pred_lr),
    }
    for metric_name, metric_value in lr_scores.items():
        mlflow.log_metric(metric_name, metric_value)
    print(f"Logistic Regression F1 Score: {lr_scores['f1_score']:.4f}")

# Baseline 2: random forest, tracked as its own MLflow run
with mlflow.start_run(run_name="RandomForest_Baseline"):
    model_rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    model_rf.fit(X_train, y_train)
    y_pred_rf = model_rf.predict(X_test)

    # Score once on the held-out period, then log and report the same numbers
    rf_scores = {
        "accuracy": accuracy_score(y_test, y_pred_rf),
        "f1_score": f1_score(y_test, y_pred_rf),
    }
    for metric_name, metric_value in rf_scores.items():
        mlflow.log_metric(metric_name, metric_value)
    print(f"Random Forest F1 Score: {rf_scores['f1_score']:.4f}")

In [None]:
def objective(trial):
    """Optuna objective: mean F1 of an XGBoost classifier over time-ordered CV folds.

    Args:
        trial: The Optuna trial used to sample one hyperparameter configuration.

    Returns:
        The mean F1 score across 5 TimeSeriesSplit folds of the training data.
    """
    # Sample the tunable hyperparameters (ranges are inclusive search bounds)
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
    gamma = trial.suggest_float('gamma', 0, 5)

    # Fixed settings (objective, eval metric, seed) are shared by every trial
    classifier = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        gamma=gamma,
        random_state=42,
    )

    # TimeSeriesSplit keeps each validation fold after its training fold
    folds = TimeSeriesSplit(n_splits=5)
    fold_scores = cross_val_score(
        classifier, X_train, y_train, cv=folds, scoring='f1', n_jobs=-1
    )
    return fold_scores.mean()

In [None]:
# Run the study to find the best params.
# The sampler is seeded so that the sequence of sampled trials — and therefore
# best_params and every downstream "champion" metric — is the same on each
# Restart & Run All. The seed matches the random_state=42 used by the models.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=125)

best_params = study.best_params
print("Best XGBoost Params:", best_params)

# Train the final XGBoost model with the best parameters and log to MLflow
with mlflow.start_run(run_name="XGBoost_Tuned_Champion") as run:
    # best_params only holds the values Optuna searched over, so the fixed
    # settings used inside objective() are re-applied explicitly to keep the
    # final model configured exactly like the tuned candidates.
    final_xgb_model = xgb.XGBClassifier(
        **best_params,
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=42,
    )
    final_xgb_model.fit(X_train, y_train)
    y_pred_xgb = final_xgb_model.predict(X_test)
    # Probability of the positive class, needed for ROC AUC
    y_pred_proba_xgb = final_xgb_model.predict_proba(X_test)[:, 1]

    f1 = f1_score(y_test, y_pred_xgb)
    print(f"Final Tuned XGBoost F1 Score: {f1:.4f}")

    mlflow.log_params(best_params)
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred_xgb))
    mlflow.log_metric("roc_auc", roc_auc_score(y_test, y_pred_proba_xgb))

    mlflow.xgboost.log_model(final_xgb_model, "xgb-model")
    champion_run_id = run.info.run_id  # Kept so later cells can reference this run

    # --- SHAP plot generation and logging (inside the run so the artifact attaches to it) ---
    # 1. Explain the champion's predictions on the held-out period
    explainer = shap.TreeExplainer(final_xgb_model)
    shap_values = explainer.shap_values(X_test)

    # 2. Render the summary plot to a file.
    # The figure size matches plot_size so the two settings cannot disagree;
    # show=False keeps the figure open so it can be titled and saved.
    fig, ax = plt.subplots(figsize=(8, 6))
    shap.summary_plot(shap_values, X_test, show=False, plot_size=(8, 6))
    plt.title("SHAP Feature Importance for XGBoost Model")
    plot_filename = "shap_summary_champion.png"
    fig.savefig(plot_filename, bbox_inches='tight', dpi=300)
    plt.close(fig)  # Release the figure's memory

    # 3. Attach the saved plot to the currently active MLflow run
    mlflow.log_artifact(plot_filename)

    # Report success only after the plot has actually been produced and logged
    print("\nSHAP analysis complete and plot logged to MLflow.")

In [None]:
# --- MLP Challenger Model ---
# Step 1: Imports and Data Scaling
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler

# Neural networks are sensitive to feature scale, so the inputs are standardized.
# The scaler's statistics are learned from the training rows ONLY; the test rows
# are transformed with those same statistics to avoid leaking test information.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Data successfully scaled.")
print(f"Shape of scaled training data: {X_train_scaled.shape}")

In [None]:
# Convert numpy arrays to PyTorch tensors.
# Labels get a trailing dimension, (n,) -> (n, 1), to match the model's (n, 1) output.
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

# Pair features with labels
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Create DataLoaders to handle batching.
# Shuffling is disabled so batches follow the original row order.
BATCH_SIZE = 64

# The MLP uses BatchNorm1d, which raises in training mode when a batch holds a
# single sample. If the training set size leaves exactly one row in the final
# batch, drop that batch; in every other case all rows are kept.
drop_singleton_tail = len(train_dataset) % BATCH_SIZE == 1
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    drop_last=drop_singleton_tail,
)
# Evaluation runs in eval mode (BatchNorm uses running statistics), so no rows are dropped
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print("PyTorch Tensors and DataLoaders created.")

In [None]:
# Step 2: Define the MLP Architecture
class ETF_MLP(nn.Module):
    """Two-hidden-layer perceptron that outputs one sigmoid probability per row.

    Each hidden block applies Linear -> BatchNorm1d -> ReLU -> Dropout; the
    output layer is a single linear unit passed through a sigmoid.
    """

    def __init__(self, input_size, hidden_size_1=128, hidden_size_2=64, dropout_rate=0.5):
        """Build the layers.

        Args:
            input_size (int): Number of input features per row.
            hidden_size_1 (int): Width of the first hidden layer.
            hidden_size_2 (int): Width of the second hidden layer.
            dropout_rate (float): Dropout probability applied after each hidden block.
        """
        super().__init__()

        # Hidden block 1
        self.layer_1 = nn.Linear(in_features=input_size, out_features=hidden_size_1)
        self.bn_1 = nn.BatchNorm1d(num_features=hidden_size_1)

        # Hidden block 2
        self.layer_2 = nn.Linear(in_features=hidden_size_1, out_features=hidden_size_2)
        self.bn_2 = nn.BatchNorm1d(num_features=hidden_size_2)

        # Single output unit
        self.output_layer = nn.Linear(in_features=hidden_size_2, out_features=1)

        # Shared, parameter-free modules reused by both hidden blocks
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, x):
        """Run the forward pass and return sigmoid outputs of shape (batch, 1)."""
        hidden = x
        hidden_blocks = (
            (self.layer_1, self.bn_1),
            (self.layer_2, self.bn_2),
        )
        # Both hidden blocks share the same Linear -> BN -> ReLU -> Dropout pipeline
        for linear, batch_norm in hidden_blocks:
            hidden = self.dropout(self.relu(batch_norm(linear(hidden))))

        # Sigmoid squashes the single logit into (0, 1) for binary classification
        return torch.sigmoid(self.output_layer(hidden))

# Smoke-test the architecture: build one instance sized to the feature count
# and print its layer summary.
input_features = len(X_train.columns)
model_mlp = ETF_MLP(input_size=input_features)
print("MLP Model Architecture:", model_mlp, sep="\n")

In [None]:
# Step 3: Manual MLP Training and Evaluation

# --- Configuration ---
INPUT_SIZE = X_train.shape[1]
LEARNING_RATE = 0.001
EPOCHS = 50
SEED = 42  # Same seed the other models in this notebook use via random_state

# Seed torch BEFORE the model is built: weight initialisation and dropout masks
# both draw from torch's global RNG, so without this every run gives different results.
torch.manual_seed(SEED)

# --- Model, Loss, Optimizer (Demonstrates 5.2, 5.3) ---
model_mlp = ETF_MLP(input_size=INPUT_SIZE, dropout_rate=0.4)
criterion = nn.BCELoss()  # Binary cross-entropy on the model's sigmoid outputs
optimizer = torch.optim.Adam(model_mlp.parameters(), lr=LEARNING_RATE)
# Cosine-anneal the learning rate over the full training schedule
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

# --- MLflow Logging ---
with mlflow.start_run(run_name="MLP_Manual_Baseline") as run:
    mlflow.log_params({
        "learning_rate": LEARNING_RATE,
        "epochs": EPOCHS,
        "optimizer": "Adam",
        "seed": SEED,
    })

    # --- Training Loop ---
    for epoch in range(EPOCHS):
        model_mlp.train()  # Enable dropout and batch-statistics BatchNorm
        for features, labels in train_loader:
            # Forward pass
            outputs = model_mlp(features)
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Advance the learning-rate schedule once per epoch
        scheduler.step()

        # --- Evaluation on Test Set (monitoring only; not used for model selection) ---
        model_mlp.eval()  # Disable dropout; BatchNorm uses running statistics
        batch_preds = []
        batch_labels = []
        with torch.no_grad():
            for features, labels in test_loader:
                outputs = model_mlp(features)
                # Threshold the sigmoid output at 0.5 to get hard class labels
                batch_preds.append((outputs > 0.5).int())
                batch_labels.append(labels.int())

        # Flatten the (n, 1) batches into 1-D integer label vectors for sklearn
        all_preds = torch.cat(batch_preds).squeeze(1).numpy()
        all_labels = torch.cat(batch_labels).squeeze(1).numpy()

        # Calculate and log F1 score for the epoch
        f1 = f1_score(all_labels, all_preds)
        mlflow.log_metric("test_f1_score", f1, step=epoch)

    # Log the final-epoch scores under the same metric names the other runs
    # use, so this run is ranked alongside them when comparing on f1_score.
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("accuracy", accuracy_score(all_labels, all_preds))

    print(f"Final MLP F1 Score from manual run: {f1:.4f}")
    # Log the final model
    mlflow.pytorch.log_model(model_mlp, "mlp-model")

In [None]:
import mlflow
import pandas as pd

# Ensure MLflow is pointing to your tracking server/directory
# mlflow.set_tracking_uri("../mlruns") # Uncomment if running in a new session/script

# Look the experiment up by name, then tabulate every run recorded under it
experiment = mlflow.get_experiment_by_name("ETF_Trend_Prediction")

if experiment is None:
    print("Experiment 'ETF_Trend_Prediction' not found.")
else:
    # Fetch all runs, best F1 first; ties are broken by most recent start time
    runs_df = mlflow.search_runs(
        experiment_ids=[experiment.experiment_id],
        order_by=["metrics.f1_score DESC", "start_time DESC"],
        output_format="pandas",
    )

    # MLflow prefixes logged metrics with "metrics." and parameters with "params."
    column_names = runs_df.columns
    metric_cols = column_names[column_names.str.startswith("metrics.")].tolist()
    param_cols = column_names[column_names.str.startswith("params.")].tolist()

    # Core run info first ('tags.mlflow.runName' holds the run name),
    # followed by every metric and every parameter column
    selected_cols = ["tags.mlflow.runName", "start_time", "run_id", *metric_cols, *param_cols]

    # Friendlier headers for the core columns; parameter columns keep their
    # 'params.<name>' form because there may be too many to rename one by one
    readable_names = {
        "tags.mlflow.runName": "Run Name",
        "metrics.f1_score": "F1 Score",
        "metrics.accuracy": "Accuracy",
        "metrics.roc_auc": "ROC AUC",
    }
    metrics_and_params = runs_df[selected_cols].rename(columns=readable_names)

    print("Metrics and Parameters for 'ETF_Trend_Prediction' Experiment:")
    display(metrics_and_params)
