# 6. Machine Learning Models
## XGBoost and LSTM for Trade Prediction

In [None]:
# Add ../src to the module search path before the project imports below
# (presumably where data_utils, strategy, ml_models and backtest live —
# confirm against the repository layout).
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from data_utils import load_data
from strategy import EMAStrategy, TradeAnalyzer, apply_ml_filter
from ml_models import MLFeatureEngineer, XGBoostModel, LSTMModel, ModelEvaluator
from backtest import Backtester, BacktestVisualizer, generate_backtest_report
import warnings
# NOTE(review): this silences every warning category for the whole notebook,
# including deprecation and numerical warnings raised by the libraries above.
warnings.filterwarnings('ignore')

# Simple confirmation that the cell ran to completion.
print("Libraries imported successfully!")

In [None]:
# Read the dataset from the CSV under ../data and report its dimensions.
data_path = '../data/nifty_merged_5min.csv'
df = load_data(data_path)
print(f"Data shape: {df.shape}")

## 6.1 Generate Trading Signals

In [None]:
# Baseline strategy: EMA crossover object configured with fast_ema=5, slow_ema=15.
strategy = EMAStrategy(fast_ema=5, slow_ema=15)

# Signals are generated first (regime filter switched on); positions are then
# derived from the frame that generate_signals returns.
df = strategy.generate_positions(
    strategy.generate_signals(df, use_regime_filter=True)
)

# Report how many rows carry a non-zero value in the 'signal' column.
print(f"Generated {np.sum(df['signal'] != 0)} trading signals")

## 6.2 Create Target Variable

In [None]:
# Pull the individual trades out of the signal/position frame and summarise
# how many of them are flagged as profitable.
trades_df = TradeAnalyzer.extract_trades(df)
profitable_flags = trades_df['is_profitable']
print(f"Extracted {len(trades_df)} trades")
print(f"Profitable: {profitable_flags.sum()}, Unprofitable: {(~profitable_flags).sum()}")

# Build the target from the trades and the full frame, then store it as the
# 'target' column (the name `target` stays bound for later cells).
df['target'] = target = MLFeatureEngineer.create_target(trades_df, df)

# Show how many rows fall into each target value.
print("\nTarget distribution:")
print(df['target'].value_counts())

## 6.3 Feature Engineering for ML

In [None]:
# Step 1: extend the frame with the signal-strength features.
df = MLFeatureEngineer.add_signal_strength_features(df)

# Step 2: let the engineer finalise the frame and return the list of feature
# column names that the models will consume.
prepared = MLFeatureEngineer.prepare_ml_features(df)
df, feature_cols = prepared

# Print the feature count and a preview of the first ten names.
print(f"\nSelected {len(feature_cols)} features for ML:")
feature_preview = feature_cols[:10]
print(feature_preview, "...")

## 6.4 Prepare Train/Test Split

In [None]:
# Keep only the rows that actually have a target value.
df_ml = df[df['target'].notna()].copy()
print(f"\nML dataset: {len(df_ml)} samples")

# Positional 70/30 split: the first 70% of rows train, the remainder tests.
split_idx = int(0.7 * len(df_ml))
train_df, test_df = df_ml.iloc[:split_idx], df_ml.iloc[split_idx:]

# Feature matrices (missing values replaced by 0) and matching target vectors.
X_train, X_test = (
    part[feature_cols].fillna(0) for part in (train_df, test_df)
)
y_train, y_test = train_df['target'], test_df['target']

print(f"Train: {len(X_train)}, Test: {len(X_test)}")

## 6.5 Train XGBoost Model

In [None]:
# Fit the XGBoost wrapper.
# NOTE(review): the hold-out set is passed to train() as well; if the wrapper
# uses it for early stopping or model selection, the metrics printed below are
# optimistic — confirm against XGBoostModel.train.
xgb_model = XGBoostModel()
xgb_model.train(X_train, y_train, X_test, y_test)

# Class predictions and probabilities on the hold-out features.
xgb_pred = xgb_model.predict(X_test)
xgb_pred_proba = xgb_model.predict_proba(X_test)

# Score the predictions against the true hold-out targets.
xgb_metrics = ModelEvaluator.evaluate_model(
    y_test.values,
    xgb_pred,
    xgb_pred_proba,
)

# Print each metric to four decimal places, one per line.
print("\nXGBoost Performance:")
for label, key in (
    ("Accuracy", 'accuracy'),
    ("Precision", 'precision'),
    ("Recall", 'recall'),
    ("F1 Score", 'f1_score'),
    ("AUC", 'auc'),
):
    print(f"{label}: {xgb_metrics[key]:.4f}")

In [None]:
# Feature importance table and bar chart for the trained XGBoost model.
import os

importance_df = xgb_model.get_feature_importance(feature_cols)
print("\nTop 10 Important Features:")
print(importance_df.head(10))

# Plot the first 15 rows of the importance table as horizontal bars.
# Assumes importance_df is already ordered most-important-first, as the
# "Top 10"/"Top 15" labels imply — TODO confirm in get_feature_importance.
top_features = importance_df.head(15)
plt.figure(figsize=(10, 8))
plt.barh(top_features['feature'], top_features['importance'])
# barh places the first row at the bottom of the axis; flip it so the first
# row of the table is drawn at the top of the chart.
plt.gca().invert_yaxis()
plt.xlabel('Importance')
plt.title('XGBoost Feature Importance (Top 15)')
plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('../plots', exist_ok=True)
plt.savefig('../plots/xgboost_feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

## 6.6 Train LSTM Model

In [None]:
# Build the LSTM wrapper (10-step sequences, one input per feature column)
# and fit it; the object returned by train() is kept in `history`.
lstm_model = LSTMModel(
    sequence_length=10,
    n_features=len(feature_cols),
)
history = lstm_model.train(
    X_train, y_train,
    X_test, y_test,
    epochs=50,
    batch_size=32,
)

# Probabilities on the hold-out set, thresholded at 0.5 for class labels.
lstm_pred_proba = lstm_model.predict_proba(X_test)
lstm_pred = (lstm_pred_proba > 0.5).astype(int)

# Only positions with a non-NaN probability are scored.
valid_idx = np.logical_not(np.isnan(lstm_pred_proba))
lstm_metrics = ModelEvaluator.evaluate_model(
    y_test.values[valid_idx],
    lstm_pred[valid_idx],
    lstm_pred_proba[valid_idx],
)

# Print each metric to four decimal places, one per line.
print("\nLSTM Performance:")
for label, key in (
    ("Accuracy", 'accuracy'),
    ("Precision", 'precision'),
    ("Recall", 'recall'),
    ("F1 Score", 'f1_score'),
    ("AUC", 'auc'),
):
    print(f"{label}: {lstm_metrics[key]:.4f}")

## 6.7 Plot ROC Curves

In [None]:
# One ROC plot per model. XGBoost uses the full hold-out set; the LSTM uses
# only the positions selected by valid_idx (non-NaN probabilities).
xgb_roc_path = '../plots/xgboost_roc_curve.png'
ModelEvaluator.plot_roc_curve(y_test.values, xgb_pred_proba, save_path=xgb_roc_path)

lstm_roc_path = '../plots/lstm_roc_curve.png'
ModelEvaluator.plot_roc_curve(
    y_test.values[valid_idx],
    lstm_pred_proba[valid_idx],
    save_path=lstm_roc_path,
)

print("ROC curves saved")

## 6.8 Backtest with ML Filters

In [None]:
# Score every row of the dataset (not just the hold-out part) with both models.
df_full = df.copy()
X_full = df_full[feature_cols].fillna(0)

# XGBoost: probabilities for all rows, then the ML filter at threshold 0.5.
# NOTE(review): df_full is handed to apply_ml_filter twice; if that helper
# mutates its input in place the second call would see filtered data — confirm.
xgb_full_proba = xgb_model.predict_proba(X_full)
df_xgb = apply_ml_filter(
    df_full,
    xgb_full_proba,
    confidence_threshold=0.5,
)

# LSTM: same procedure, with NaN probabilities replaced by 0.5 beforehand.
lstm_full_proba = np.nan_to_num(lstm_model.predict_proba(X_full), nan=0.5)
df_lstm = apply_ml_filter(
    df_full,
    lstm_full_proba,
    confidence_threshold=0.5,
)

print("ML filters applied")

In [None]:
# Backtest the baseline and both ML-filtered variants on the hold-out period.
backtester = Backtester(initial_capital=100000)

# Align the backtest window with the ML train/test split.
# The models were split at 70% of the rows that HAVE a target, which is not
# the same row as 70% of all rows in df. Cutting df at int(len(df) * 0.7)
# could therefore start the "test" backtest before or after the first
# hold-out sample, letting training-period bars into the comparison.
# Instead, locate the position in df of the first labelled hold-out row.
labeled_positions = np.flatnonzero(df['target'].notna().to_numpy())
if labeled_positions.size:
    split_idx = int(labeled_positions[int(len(labeled_positions) * 0.7)])
else:
    # No labelled rows at all: fall back to a plain positional 70/30 cut.
    split_idx = int(len(df) * 0.7)

# All three frames are sliced at the same position so they cover the same
# bars (assumes apply_ml_filter preserves row count and order — TODO confirm).
test_df_baseline = df.iloc[split_idx:]
test_df_xgb = df_xgb.iloc[split_idx:]
test_df_lstm = df_lstm.iloc[split_idx:]

# Run backtests
results_baseline = backtester.backtest(test_df_baseline, "Baseline")
results_xgb = backtester.backtest(test_df_xgb, "XGBoost Enhanced")
results_lstm = backtester.backtest(test_df_lstm, "LSTM Enhanced")

# Combine the three result sets into a single comparison table.
comparison_df = backtester.compare_strategies([results_baseline, results_xgb, results_lstm])
print("\nStrategy Comparison:")
print(comparison_df)

In [None]:
# Draw the strategy comparison chart and write it to the plots directory.
comparison_plot_path = '../plots/ml_strategy_comparison.png'
BacktestVisualizer.plot_strategy_comparison(comparison_df, save_path=comparison_plot_path)
print("Comparison plot saved")

## 6.9 Save Models

In [None]:
# Persist both trained models and the backtest comparison table.
import os

# DataFrame.to_csv does not create missing directories (and the model savers
# may not either), so make sure both output directories exist first.
for output_dir in ('../models', '../results'):
    os.makedirs(output_dir, exist_ok=True)

# Save models
xgb_model.save_model('../models/xgboost_model.pkl')
lstm_model.save_model('../models/lstm_model.h5')
print("Models saved")

# Save comparison results (row index is not written to the file).
comparison_df.to_csv('../results/ml_models_comparison.csv', index=False)
print("Comparison results saved")

## Summary

In [None]:
# Final report: headline classification metrics for both models, followed by
# the total return of each backtested strategy relative to the baseline.
banner = "=" * 80
print(banner)
print("MACHINE LEARNING MODELS SUMMARY")
print(banner)

# Accuracy and AUC per model, four decimal places.
for model_name, metrics in (("XGBoost", xgb_metrics), ("LSTM", lstm_metrics)):
    print(f"\n{model_name} Model:")
    print(f"  Accuracy: {metrics['accuracy']:.4f}")
    print(f"  AUC: {metrics['auc']:.4f}")

print("\nBacktest Performance Improvement:")

# Look up each strategy's total return: first row whose 'Strategy' matches.
strategy_names = comparison_df['Strategy']
baseline_return = comparison_df.loc[strategy_names == 'Baseline', 'Total Return (%)'].iloc[0]
xgb_return = comparison_df.loc[strategy_names == 'XGBoost Enhanced', 'Total Return (%)'].iloc[0]
lstm_return = comparison_df.loc[strategy_names == 'LSTM Enhanced', 'Total Return (%)'].iloc[0]

# Signed differences show the change versus the unfiltered baseline.
print(f"  Baseline: {baseline_return:.2f}%")
print(f"  XGBoost: {xgb_return:.2f}% ({xgb_return - baseline_return:+.2f}%)")
print(f"  LSTM: {lstm_return:.2f}% ({lstm_return - baseline_return:+.2f}%)")
print("\nNext Step: Proceed to 07_outlier_analysis.ipynb")