# 03. Train Models (Matches Script Structure)

This notebook mirrors the structure of `03_train_models.py`, including its MLflow setup.

In [None]:
# ============================================================================
# IMPORTS - Match script structure
# ============================================================================

import pandas as pd
import numpy as np
import os
import warnings
import sys

# Add project root to path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from dotenv import load_dotenv
load_dotenv()

# Suppress annoying warnings
warnings.filterwarnings("ignore", message="pkg_resources is deprecated")

# MLflow
import mlflow
import mlflow.sklearn
import mlflow.tensorflow
from mlflow.models import infer_signature

# Scikit-learn
from sklearn.linear_model import Ridge
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error

# ARIMA
from pmdarima import auto_arima

# TensorFlow
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input, Dropout

# Force CPU to avoid GPU issues.
# Best effort: if TensorFlow refuses the request, keep going with whatever
# devices it already has, but say so instead of silently swallowing every
# exception (a bare `except:` would also hide KeyboardInterrupt/SystemExit).
try:
    tf.config.set_visible_devices([], 'GPU')
except (RuntimeError, ValueError) as exc:
    warnings.warn(f"Could not hide GPUs from TensorFlow; continuing anyway: {exc}")

# Project imports
from utils.data_manager import DataManager

# Constants - MATCH SCRIPT
# Name of the MLflow experiment; passed to mlflow.set_experiment() below and
# looked up again by name in the final summary cell.
EXPERIMENT_NAME = "EURUSD_Experiments"

# Setup visualization
import matplotlib.pyplot as plt
import seaborn as sns
# IPython magic (notebook-only syntax, not valid in a plain .py script).
%matplotlib inline
sns.set_style("whitegrid")
# Default figure size; individual cells override it with figsize=...
plt.rcParams['figure.figsize'] = (15, 7)


In [None]:
# ============================================================================
# Evaluation Function - MATCH SCRIPT
# ============================================================================

def eval_metrics(actual, pred):
    """Score predictions against actual values.

    Mirrors the helper of the same name in 03_train_models.py.

    Args:
        actual: Array-like of observed target values.
        pred: Array-like of predicted values.

    Returns:
        Tuple ``(rmse, mae, accuracy)`` where ``accuracy`` is the directional
        accuracy: the fraction of elements whose sign (-1, 0 or +1) is the
        same in ``actual`` and ``pred``.

    Note:
        The sign comparison is a plain NumPy ``==``, so ``actual`` and
        ``pred`` should have the same shape; differently shaped inputs would
        be broadcast against each other.
    """
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)

    # Directional accuracy: compare the sign of each actual/predicted pair.
    same_direction = np.sign(np.array(actual)) == np.sign(np.array(pred))
    accuracy = np.mean(same_direction)

    return rmse, mae, accuracy


## 1. Load Data

In [None]:
# Load the processed train/test frames and the fitted scaler, then pick the
# feature columns each model will use.
print("Loading data via DataManager...")

dm = DataManager(data_type='processed')
train_df, test_df, scaler = dm.load_processed()

# Stop here when loading failed. Merely printing a message would let the cell
# fall through to len(train_df) below and die with an unrelated TypeError.
if train_df is None or test_df is None:
    raise RuntimeError("Could not load processed data. Run preprocessing first.")

print(f"✅ Loaded {len(train_df)} train rows, {len(test_df)} test rows")
print(f"Train: {train_df.index.min().date()} to {train_df.index.max().date()}")
print(f"Test:  {test_df.index.min().date()} to {test_df.index.max().date()}")

# Get local path for scaler (for logging artifact) - MATCH SCRIPT
scaler_path = dm.get_local_path('scaler.pkl')

# Feature selection - MATCH SCRIPT STRUCTURE
target_col = 'Target'

# Feature subsets - MATCH SCRIPT
FEATURES_LINREG = ['Return', 'MA_50', 'Return_20d', 'Lag_2', 'Lag_3']
FEATURES_LSTM = ['Return', 'MA_50', 'Return_20d', 'Lag_2', 'Lag_3', 'Open', 'High', 'Low', 'Close']

# Keep only the requested features that are actually present in the training
# frame; names missing from train_df are dropped without an error.
linreg_cols = [c for c in FEATURES_LINREG if c in train_df.columns]
lstm_cols = [c for c in FEATURES_LSTM if c in train_df.columns]

print(f"\nLinear Regression features ({len(linreg_cols)}): {linreg_cols}")
print(f"LSTM features ({len(lstm_cols)}): {lstm_cols}")


In [None]:
# ============================================================================
# MLflow Setup - MATCH SCRIPT EXACTLY
# ============================================================================
# Points MLflow at the tracking server named by MLFLOW_TRACKING_URI when that
# variable is set (and non-empty); otherwise uses a local SQLite store.

print("\nSetting up MLflow...")

tracking_uri = os.environ.get("MLFLOW_TRACKING_URI")
if not tracking_uri:
    print("Using Local MLflow (sqlite:///mlflow.db)")
    mlflow.set_tracking_uri("sqlite:///mlflow.db")
else:
    print(f"Using Remote MLflow Tracking URI: {tracking_uri}")
    mlflow.set_tracking_uri(tracking_uri)

# Select the experiment that every run in this notebook is logged under.
mlflow.set_experiment(EXPERIMENT_NAME)

# Tags attached to every run started by the training cells.
tags = dict(
    developer="User",
    project="EURUSD_Capstone",
    data_version="v1",
)


## 2. Train Ridge Regression (Improved Linear Regression)

In [None]:
# ============================================================================
# Model A: Ridge Regression (Improved Linear Regression)
# ============================================================================
# Tunes Ridge's `alpha` with a grid search over time-ordered CV folds of the
# training set, scores the best estimator on the test set, and logs params,
# metrics, the scaler artifact, the feature config and the model to a single
# MLflow run. Results are also kept in `ridge_results` for the later cells.

with mlflow.start_run(run_name="Ridge_Regression_CV") as run:
    mlflow.set_tags(tags)
    mlflow.set_tag("model_type", "RidgeRegression")

    print("Training Ridge Regression with TimeSeries Cross-Validation...")

    # Prepare data
    X_train_lr = train_df[linreg_cols]
    y_train_lr = train_df[target_col]
    X_test_lr = test_df[linreg_cols]
    y_test_lr = test_df[target_col]

    # Time Series Cross-Validation
    tscv = TimeSeriesSplit(n_splits=5)

    # Hyperparameter grid
    ridge = Ridge(random_state=42)
    param_grid = {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}

    # Grid search
    grid_search = GridSearchCV(
        ridge,
        param_grid,
        cv=tscv,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=0
    )

    grid_search.fit(X_train_lr, y_train_lr)

    # Best model
    best_ridge = grid_search.best_estimator_
    best_alpha = grid_search.best_params_['alpha']
    predictions = best_ridge.predict(X_test_lr)

    # Calculate metrics
    rmse, mae, da = eval_metrics(y_test_lr, predictions)

    print(f"  Best alpha: {best_alpha}")
    print(f"  RMSE: {rmse}")
    print(f"  MAE: {mae}")
    print(f"  Directional Accuracy: {da}")

    # Log Params - MATCH SCRIPT STRUCTURE
    mlflow.log_params(best_ridge.get_params())
    mlflow.log_param("features", linreg_cols)
    mlflow.log_param("best_alpha", best_alpha)
    # Read the fold count from the splitter itself so the logged value cannot
    # drift from the n_splits actually used above.
    mlflow.log_param("cv_splits", tscv.n_splits)

    # Log Metrics - MATCH SCRIPT
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("directional_accuracy", da)

    # Log Scaler as Artifact - MATCH SCRIPT
    mlflow.log_artifact(scaler_path, artifact_path="scaler")

    # Log Feature Config - MATCH SCRIPT
    feature_config = {
        "features": linreg_cols,
        "target": target_col,
        "model_type": "RidgeRegression"
    }
    mlflow.log_dict(feature_config, "feature_config.json")

    # Infer and log signature - MATCH SCRIPT
    # Inputs are taken from the training frame, outputs from the test
    # predictions.
    signature = infer_signature(X_train_lr, predictions)
    mlflow.sklearn.log_model(best_ridge, "model", signature=signature)

    # Keep this model's outputs under their own name: the ARIMA and LSTM
    # cells rebind `predictions`, `rmse`, `mae` and `da`.
    ridge_results = {
        'predictions': predictions,
        'rmse': rmse,
        'mae': mae,
        'da': da,
        'model': best_ridge
    }

    print("✅ Ridge Regression trained and logged to MLflow")


In [None]:
# Visualize Ridge results
# Read the predictions and directional accuracy from `ridge_results` rather
# than the bare `predictions` / `da` globals: the ARIMA and LSTM cells rebind
# those names, so re-running this cell after them would otherwise plot another
# model's output against the Ridge targets.
ridge_preds = ridge_results['predictions']
ridge_da = ridge_results['da']

plt.figure(figsize=(15, 5))

# Panel 1: predicted vs actual, with the y = x reference line.
plt.subplot(1, 3, 1)
plt.scatter(y_test_lr.values, ridge_preds, alpha=0.5)
plt.plot([y_test_lr.min(), y_test_lr.max()], [y_test_lr.min(), y_test_lr.max()],
         'r--', label='Perfect Prediction')
plt.xlabel('Actual Returns')
plt.ylabel('Predicted Returns')
plt.title(f'Ridge Regression\nDA: {ridge_da:.2%}')
plt.legend()

# Panel 2: coefficients sorted by absolute size, coloured by sign.
plt.subplot(1, 3, 2)
feature_importance = pd.DataFrame({
    'Feature': linreg_cols,
    'Coefficient': best_ridge.coef_
}).sort_values('Coefficient', key=abs, ascending=False)

colors = ['green' if x > 0 else 'red' for x in feature_importance['Coefficient']]
plt.barh(feature_importance['Feature'], feature_importance['Coefficient'], color=colors)
plt.xlabel('Coefficient Value')
plt.title(f'Feature Importance (alpha={grid_search.best_params_["alpha"]})')
# Largest coefficient at the top of the chart.
plt.gca().invert_yaxis()

# Panel 3: first 50 test rows, actual vs predicted.
plt.subplot(1, 3, 3)
plt.plot(y_test_lr.values[:50], label='Actual', alpha=0.7)
plt.plot(ridge_preds[:50], label='Predicted', alpha=0.7)
plt.xlabel('Test Days (First 50)')
plt.ylabel('Return')
plt.title('Ridge Predictions')
plt.legend()

plt.tight_layout()
plt.show()


## 3. Train ARIMA (Match Script Structure)

In [None]:
# ============================================================================
# Model B: ARIMA - MATCH SCRIPT STRUCTURE
# ============================================================================
# Fits a non-seasonal auto-ARIMA on the training 'Return_Unscaled' series,
# logs the fitted model, then walks through the test series one observation at
# a time (update, then one-step forecast) and scores those forecasts against
# the test target. Results are kept in `arima_results` for the later cells.

with mlflow.start_run(run_name="ARIMA") as run:
    mlflow.set_tags(tags)
    mlflow.set_tag("model_type", "ARIMA")

    print("Training ARIMA...")

    # For ARIMA, use the 'Return_Unscaled' series - MATCH SCRIPT
    train_series = train_df['Return_Unscaled']
    test_series = test_df['Return_Unscaled'].values  # Iterator as in script

    # Auto ARIMA with improved settings
    arima_model = auto_arima(
        train_series,
        seasonal=False,
        trend='c',
        start_p=1, start_q=1,
        max_p=5, max_q=5,
        d=None,
        test='adf',
        trace=True,
        error_action='ignore',
        suppress_warnings=True,
        stepwise=True
    )

    print(f"  Best Order: {arima_model.order}")

    # Capture the AIC now, while the model has only seen the training series.
    # The rolling forecast below updates the model with test observations, so
    # reading aic() afterwards would describe a different fit than the model
    # artifact logged in this run.
    train_aic = arima_model.aic()

    # Log model immediately (Before updating) - MATCH SCRIPT
    feature_config = {
        "features": ['Return_Unscaled'],
        "target": target_col,
        "model_type": "ARIMA"
    }
    mlflow.log_dict(feature_config, "feature_config.json")
    mlflow.sklearn.log_model(arima_model, "model")

    # Rolling Forecast - MATCH SCRIPT LOGIC
    # Each step feeds the observed value into the model first and then
    # forecasts one period ahead, so the k-th forecast is made after seeing
    # the k-th test observation.
    # NOTE(review): this lines up with `Target` only if Target at a row is the
    # following period's return - confirm against the preprocessing step.
    print("  Running Rolling Forecast for ARIMA...")

    predictions = []

    for obs in test_series:
        # Update with the new observation
        arima_model.update(obs)
        # Predict 1 step ahead. Convert to an ndarray before indexing so the
        # lookup is positional even if predict() returns a pandas Series
        # (where a bare [0] would be a label lookup).
        pred = np.asarray(arima_model.predict(n_periods=1))[0]
        predictions.append(pred)

    predictions = np.array(predictions)

    # Align with test target (truncate both to the shorter length)
    y_test_arima = test_df[target_col]
    min_len = min(len(predictions), len(y_test_arima))
    predictions = predictions[:min_len]
    y_test_arima = y_test_arima.values[:min_len]

    # Calculate metrics
    rmse, mae, da = eval_metrics(y_test_arima, predictions)

    print(f"  RMSE: {rmse}")
    print(f"  MAE: {mae}")
    print(f"  Directional Accuracy: {da}")

    # Log parameters - MATCH SCRIPT
    mlflow.log_param("order", str(arima_model.order))
    mlflow.log_param("seasonal_order", str(arima_model.seasonal_order))
    mlflow.log_param("aic", train_aic)

    # Log metrics - MATCH SCRIPT
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("directional_accuracy", da)

    # Keep this model's outputs under their own name: the LSTM cell rebinds
    # `predictions`, `rmse`, `mae` and `da`.
    arima_results = {
        'predictions': predictions,
        'rmse': rmse,
        'mae': mae,
        'da': da,
        'model': arima_model
    }

    print("✅ ARIMA trained and logged to MLflow")


In [None]:
# Visualize ARIMA results
# Read the predictions and directional accuracy from `arima_results` rather
# than the bare `predictions` / `da` globals: the Ridge and LSTM cells bind
# the same names, so this cell would otherwise plot whichever model ran last.
arima_preds = arima_results['predictions']
arima_da = arima_results['da']

plt.figure(figsize=(15, 5))

# Panel 1: predicted vs actual, with the y = x reference line.
plt.subplot(1, 3, 1)
plt.scatter(y_test_arima, arima_preds, alpha=0.5)
plt.plot([y_test_arima.min(), y_test_arima.max()], [y_test_arima.min(), y_test_arima.max()],
         'r--', label='Perfect Prediction')
plt.xlabel('Actual Returns')
plt.ylabel('Predicted Returns')
plt.title(f'ARIMA {arima_model.order}\nDA: {arima_da:.2%}')
plt.legend()

# Panel 2: first 50 test rows, actual vs predicted.
plt.subplot(1, 3, 2)
plt.plot(y_test_arima[:50], label='Actual', alpha=0.7)
plt.plot(arima_preds[:50], label='Predicted', alpha=0.7)
plt.xlabel('Test Days (First 50)')
plt.ylabel('Return')
plt.title('ARIMA Predictions')
plt.legend()

# Panel 3: every forecast over the test period, with a zero reference line.
plt.subplot(1, 3, 3)
plt.plot(arima_preds, alpha=0.7)
plt.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
plt.xlabel('Test Days')
plt.ylabel('Predicted Return')
plt.title('ARIMA All Predictions')

plt.tight_layout()
plt.show()


## 4. Train LSTM (Match Script Structure)

In [None]:
# ============================================================================
# Model C: LSTM - MATCH SCRIPT STRUCTURE
# ============================================================================
# Builds sliding windows of `time_steps` rows from the train and test frames,
# trains a stacked LSTM on the training windows, scores it on the test
# windows, and logs params, metrics, the scaler artifact, the feature config
# and the model to a single MLflow run. Results are kept in `lstm_results`.

with mlflow.start_run(run_name="LSTM") as run:
    mlflow.set_tags(tags)
    mlflow.set_tag("model_type", "LSTM")
    
    print("Training LSTM...")
    
    # Prepare data
    X_train_lstm = train_df[lstm_cols]
    y_train_lstm = train_df[target_col]
    X_test_lstm = test_df[lstm_cols]
    y_test_lstm = test_df[target_col]
    
    # Helper to create sequences - MATCH SCRIPT
    def create_sequences(data, target, time_steps):
        """Slice `data` into overlapping windows of `time_steps` rows.

        Window i covers rows i .. i + time_steps - 1 and is labelled with
        `target[i + time_steps - 1]`, i.e. the target at the window's last
        row. Returns `(X, y)` as NumPy arrays with
        `len(data) - time_steps + 1` entries each; both are empty when
        `data` has fewer than `time_steps` rows.
        """
        X, y = [], []
        for i in range(len(data) - time_steps + 1):
            X.append(data[i:(i + time_steps)])
            val = target[i + time_steps - 1]
            y.append(val)
        return np.array(X), np.array(y)
    
    # Config - MATCH SCRIPT
    time_steps = 60
    n_features = len(lstm_cols)
    
    # Generate Sequences
    # Train and test are windowed independently, so the first test label is
    # the target at test row `time_steps - 1`; the test predictions are
    # therefore shorter than test_df by `time_steps - 1` rows.
    # NOTE(review): if test_df has fewer than `time_steps` rows the test
    # arrays are empty - confirm the test split is always long enough.
    X_train_seq, y_train_seq = create_sequences(X_train_lstm.values, y_train_lstm.values, time_steps)
    X_test_seq, y_test_seq = create_sequences(X_test_lstm.values, y_test_lstm.values, time_steps)
    
    print(f"  LSTM Input Shape: {X_train_seq.shape}")
    
    epochs = 20
    batch_size = 32
    
    # Build model: four stacked LSTM layers (100, 100, 50, 50 units), each
    # followed by Dropout(0.2), then Dense(25) and a single-unit output.
    model = Sequential()
    model.add(Input(shape=(time_steps, n_features)))
    
    # Layer 1
    model.add(LSTM(100, return_sequences=True))
    model.add(Dropout(0.2))
    
    # Layer 2
    model.add(LSTM(100, return_sequences=True))
    model.add(Dropout(0.2))
    
    # Layer 3
    model.add(LSTM(50, return_sequences=True))
    model.add(Dropout(0.2))
    
    # Layer 4 (last recurrent layer: return_sequences=False)
    model.add(LSTM(50, return_sequences=False))
    model.add(Dropout(0.2))
    
    # Output
    model.add(Dense(25))
    model.add(Dense(1))
    
    model.compile(optimizer='adam', loss='mse')
    
    # Train (10% of the training windows are held out for validation)
    print(f"  Training for {epochs} epochs...")
    history = model.fit(
        X_train_seq, y_train_seq, 
        epochs=epochs, 
        batch_size=batch_size, 
        verbose=1,
        validation_split=0.1
    )
    
    # Predict
    # Flatten so the predictions are 1-D like y_test_seq.
    predictions = model.predict(X_test_seq)
    predictions = predictions.flatten()
    
    # Calculate metrics
    rmse, mae, da = eval_metrics(y_test_seq, predictions)
    
    print(f"  RMSE: {rmse}")
    print(f"  MAE: {mae}")
    print(f"  Directional Accuracy: {da}")
    
    # Log Architecture Details - MATCH SCRIPT STRUCTURE
    mlflow.log_param("input_shape", f"[{X_train_seq.shape[0]}, {time_steps}, {n_features}]")
    mlflow.log_param("time_steps", time_steps)
    mlflow.log_param("n_features", n_features)
    mlflow.log_param("epochs", epochs)
    mlflow.log_param("batch_size", batch_size)
    
    # Log Layer Details
    # One class/units/activation param triple per layer; layers whose config
    # has no 'units' or 'activation' key are logged as 'N/A'.
    model_summary = []
    for i, layer in enumerate(model.layers):
        layer_config = layer.get_config()
        layer_name = layer_config['name']
        layer_class = layer.__class__.__name__
        units = layer_config.get('units', 'N/A')
        activation = layer_config.get('activation', 'N/A')
        
        mlflow.log_param(f"layer_{i}_class", layer_class)
        mlflow.log_param(f"layer_{i}_units", units)
        mlflow.log_param(f"layer_{i}_activation", activation)
        
        model_summary.append(f"{layer_class}(units={units}, activation={activation})")
        
    mlflow.log_param("model_arch_summary", " -> ".join(model_summary))
    
    # Log metrics - MATCH SCRIPT
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("directional_accuracy", da)
    
    # Log Scaler as Artifact - MATCH SCRIPT
    mlflow.log_artifact(scaler_path, artifact_path="scaler")
    
    # Log Feature Config
    feature_config = {
        "features": lstm_cols,
        "target": target_col,
        "time_steps": time_steps,
        "n_features": n_features,
        "model_type": "LSTM"
    }
    mlflow.log_dict(feature_config, "feature_config.json")

    # Infer and log signature - MATCH SCRIPT
    # Inputs are taken from the training windows, outputs from the test
    # predictions.
    signature = infer_signature(X_train_seq, predictions)
    mlflow.tensorflow.log_model(model, "model", signature=signature)
    
    # Keep this model's outputs under their own name for the comparison cells.
    lstm_results = {
        'predictions': predictions,
        'rmse': rmse,
        'mae': mae,
        'da': da,
        'model': model
    }
    
    print("✅ LSTM trained and logged to MLflow")


In [None]:
# Visualize LSTM results
# Read the predictions and directional accuracy from `lstm_results` rather
# than the bare `predictions` / `da` globals: the Ridge and ARIMA cells bind
# the same names, so this cell would otherwise plot whichever model ran last.
lstm_preds = lstm_results['predictions']
lstm_da = lstm_results['da']

plt.figure(figsize=(15, 5))

# Panel 1: training and validation loss per epoch.
plt.subplot(1, 3, 1)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.xlabel('Epoch')
plt.ylabel('MSE Loss')
plt.title('LSTM Training History')
plt.legend()

# Panel 2: predicted vs actual, with the y = x reference line.
plt.subplot(1, 3, 2)
plt.scatter(y_test_seq, lstm_preds, alpha=0.5)
plt.plot([y_test_seq.min(), y_test_seq.max()], [y_test_seq.min(), y_test_seq.max()],
         'r--', label='Perfect Prediction')
plt.xlabel('Actual Returns')
plt.ylabel('Predicted Returns')
plt.title(f'LSTM Predictions\nDA: {lstm_da:.2%}')
plt.legend()

# Panel 3: first 50 scored test windows, actual vs predicted.
plt.subplot(1, 3, 3)
plt.plot(y_test_seq[:50], label='Actual', alpha=0.7)
plt.plot(lstm_preds[:50], label='Predicted', alpha=0.7)
plt.xlabel('Test Days (First 50)')
plt.ylabel('Return')
plt.title('LSTM Predictions')
plt.legend()

plt.tight_layout()
plt.show()


## 5. Model Comparison (Visual Only)

In [None]:
# ============================================================================
# Model Comparison Visualization
# ============================================================================
# Gathers the per-model result dicts and prints their headline metrics as a
# table indexed by model name.

all_results = {
    'Ridge': ridge_results,
    'ARIMA': arima_results,
    'LSTM': lstm_results
}

# One row per model, in the insertion order of `all_results`.
comparison_data = [
    {
        'Model': name,
        'RMSE': res['rmse'],
        'MAE': res['mae'],
        'Directional_Accuracy': res['da']
    }
    for name, res in all_results.items()
]

comparison_df = pd.DataFrame(comparison_data).set_index('Model')

separator = "=" * 80
print("\n" + separator)
print("MODEL COMPARISON")
print(separator)
print(comparison_df)


In [None]:
# Visual comparison
# Top row: one bar chart per metric. Bottom row: prediction traces, error
# histograms and a naive sign-following backtest.
#
# The LSTM is scored on fewer rows than Ridge/ARIMA: its first label is the
# target at the last row of the first `time_steps`-row window, so its
# predictions start that many rows into the test set. The time-based panels
# (4 and 6) therefore use only the window in which every model has a
# prediction, so that each point compares the same test day.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))

models = list(all_results.keys())
bar_colors = ['blue', 'orange', 'green']

# 1. Directional Accuracy
da_values = [all_results[m]['da'] for m in models]

axes[0, 0].bar(models, da_values, color=bar_colors)
axes[0, 0].axhline(y=0.5, color='red', linestyle='--', label='Random (50%)')
axes[0, 0].set_title('Directional Accuracy (Higher is Better)')
axes[0, 0].set_ylabel('Accuracy')
# Keep the original 0-0.6 range, but grow it when a model beats ~55% so the
# bar and its label are not clipped.
axes[0, 0].set_ylim([0, max(0.6, max(da_values) + 0.05)])
for i, v in enumerate(da_values):
    axes[0, 0].text(i, v + 0.01, f'{v:.2%}', ha='center')

# 2. RMSE
rmse_values = [all_results[m]['rmse'] for m in models]
axes[0, 1].bar(models, rmse_values, color=bar_colors)
axes[0, 1].set_title('RMSE (Lower is Better)')
axes[0, 1].set_ylabel('RMSE')
for i, v in enumerate(rmse_values):
    axes[0, 1].text(i, v + 0.0001, f'{v:.6f}', ha='center')

# 3. MAE
mae_values = [all_results[m]['mae'] for m in models]
axes[0, 2].bar(models, mae_values, color=bar_colors)
axes[0, 2].set_title('MAE (Lower is Better)')
axes[0, 2].set_ylabel('MAE')
for i, v in enumerate(mae_values):
    axes[0, 2].text(i, v + 0.0001, f'{v:.6f}', ha='center')

# Pair every model's predictions with the targets it was scored against.
scored = {}
for name, model_preds, model_actuals in (
    ('Ridge', ridge_results['predictions'], y_test_lr.values),
    ('ARIMA', arima_results['predictions'], y_test_arima),
    ('LSTM', lstm_results['predictions'], y_test_seq),
):
    model_preds = np.asarray(model_preds)
    scored[name] = (model_preds, np.asarray(model_actuals)[:len(model_preds)])

# Test-row position of each model's first prediction. Ridge and ARIMA start
# at row 0; the LSTM's windows end on the last test row, so its start is the
# difference in length.
n_test = len(y_test_lr)
offsets = {'Ridge': 0, 'ARIMA': 0, 'LSTM': n_test - len(scored['LSTM'][0])}
common_start = max(offsets.values())

# Trim each model to the rows from `common_start` onwards.
aligned = {}
for name, (model_preds, model_actuals) in scored.items():
    skip = common_start - offsets[name]
    aligned[name] = (model_preds[skip:], model_actuals[skip:])

# 4. Predictions vs Actual (first 50 days every model covers)
n_plot = 50
actual_window = y_test_lr.values[common_start:common_start + n_plot]
axes[1, 0].plot(np.arange(common_start, common_start + len(actual_window)), actual_window,
                label='Actual', color='black', linewidth=2, alpha=0.7)
for name in models:
    window = aligned[name][0][:n_plot]
    axes[1, 0].plot(np.arange(common_start, common_start + len(window)), window,
                    label=name, alpha=0.7)
axes[1, 0].set_xlabel('Test Days')
axes[1, 0].set_ylabel('Return')
axes[1, 0].set_title('Predictions Comparison (First 50 Aligned Days)')
axes[1, 0].legend()

# 5. Error distribution (each model over all the rows it was scored on)
errors = {name: scored[name][0] - scored[name][1] for name in models}

for model_name, error_vals in errors.items():
    axes[1, 1].hist(error_vals, bins=30, alpha=0.5, label=model_name, density=True)

axes[1, 1].axvline(x=0, color='black', linestyle='--')
axes[1, 1].set_xlabel('Prediction Error')
axes[1, 1].set_ylabel('Density')
axes[1, 1].set_title('Error Distribution')
axes[1, 1].legend()

# 6. Cumulative returns
# Go long when the prediction is positive, short otherwise, and compound the
# resulting per-row returns over the same days for every model.
# NOTE(review): this treats the target as a simple return; if `Target` is
# scaled, the curve is indicative only - confirm against preprocessing.
axes[1, 2].axhline(y=1, color='gray', linestyle='--', alpha=0.5)

for model_name in models:
    preds, actuals = aligned[model_name]

    positions = np.where(preds > 0, 1, -1)
    daily_returns = positions * actuals
    cumulative_returns = (1 + daily_returns).cumprod()

    axes[1, 2].plot(cumulative_returns[:100], label=model_name, alpha=0.7)

axes[1, 2].set_xlabel('Test Days')
axes[1, 2].set_ylabel('Cumulative Return')
axes[1, 2].set_title('Cumulative Returns (First 100 Aligned Days)')
axes[1, 2].legend()

plt.tight_layout()
plt.show()


In [None]:
# Summary
# Prints the closing banner and next steps, then lists the three most
# recently started runs of the experiment with their RMSE and directional
# accuracy.
print("\n" + "="*80)
print("TRAINING COMPLETE")
print("="*80)
print("\nAll models have been trained and logged to MLflow.")
print(f"Experiment: {EXPERIMENT_NAME}")
print("\nNext steps:")
print("1. Check MLflow UI to see all logged runs")
print("2. Run 04_evaluate_select.py to register and compare models")
print("3. Deploy the best model for predictions")

# MLflow run info
client = mlflow.tracking.MlflowClient()
experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
if experiment:
    # Ask the tracking store for only the three newest runs instead of
    # fetching the whole history and slicing it locally.
    runs = client.search_runs(
        experiment_ids=[experiment.experiment_id],
        order_by=["attribute.start_time DESC"],
        max_results=3
    )
    print("\nLatest runs in experiment:")
    # Loop names are deliberately distinct from `run`, `rmse` and `da`, which
    # the training and plotting cells use as notebook-level variables.
    for position, recent_run in enumerate(runs, start=1):
        model_type = recent_run.data.tags.get('model_type', 'Unknown')
        # Runs without the metric are shown as 0.
        run_rmse = recent_run.data.metrics.get('rmse', 0)
        run_da = recent_run.data.metrics.get('directional_accuracy', 0)
        print(f"  {position}. {model_type}: RMSE={run_rmse:.6f}, DA={run_da:.2%}")
