# ML Mean Reversion Strategy - Model Training
## Russell 3000 Short-Term Mean Reversion with QPI and ML Probability

In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score, roc_auc_score
import joblib
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

## 1. Calculate 3-Day QPI (Quantitative Pressure Index)

In [None]:
def calculate_qpi_3day(df):
    """Add the 3-day QPI oversold indicator and its inputs to ``df``.

    The frame is modified in place and also returned. It must provide
    ``Close`` and ``Volume`` columns. Columns written, in this order:
    ``ret_1d``, ``ret_3d``, ``vol_ratio``, ``volatility``, ``qpi_3day``.

    ``qpi_3day`` is clipped to the 0-100 range; lower readings mean the
    instrument is more oversold.
    """
    close = df['Close']
    volume = df['Volume']

    # Short-horizon price momentum
    df['ret_1d'] = close.pct_change(1)
    df['ret_3d'] = close.pct_change(3)

    # Today's volume relative to its 20-bar average
    df['vol_ratio'] = volume / volume.rolling(20).mean()

    # 20-bar standard deviation of daily returns, used to scale momentum
    df['volatility'] = df['ret_1d'].rolling(20).std()

    # The 1e-6 keeps the division finite when volatility is zero
    momentum_term = (df['ret_3d'] / (df['volatility'] + 1e-6)) * 10
    volume_term = (df['vol_ratio'] - 1) * 5

    # Centre at 50: add volatility-scaled momentum, subtract excess volume
    df['qpi_3day'] = (50 + momentum_term - volume_term).clip(0, 100)

    return df

## 2. Feature Engineering

In [None]:
def create_features(df):
    """Attach the model's input features to ``df`` and return it.

    Runs ``calculate_qpi_3day`` first, then adds:

    * ``rsi_14``       - 14-period RSI of the close
    * ``bb_position``  - distance of the close from its 20-bar mean, in
      units of the 20-bar standard deviation
    * ``volume_surge`` - volume relative to its 5-bar average
    * ``mom_5`` / ``mom_10`` / ``mom_20`` - percentage change of the close
      over 5, 10 and 20 bars
    """
    df = calculate_qpi_3day(df)

    close, volume = df['Close'], df['Volume']

    # Price-based features
    df['rsi_14'] = calculate_rsi(close, 14)
    close_window = close.rolling(20)
    # The 1e-6 avoids dividing by zero on a perfectly flat window
    df['bb_position'] = (close - close_window.mean()) / (close_window.std() + 1e-6)

    # Volume-based feature
    df['volume_surge'] = volume / volume.rolling(5).mean()

    # Momentum over several lookbacks
    for lookback in (5, 10, 20):
        df[f'mom_{lookback}'] = close.pct_change(lookback)

    return df

def calculate_rsi(series, period=14):
    """Compute a Relative Strength Index using simple rolling means.

    Gains and losses are averaged with a plain ``period``-bar rolling mean
    (not Wilder's exponential smoothing).

    Args:
        series: pandas Series of prices.
        period: number of price changes averaged per RSI value.

    Returns:
        A Series aligned with ``series``. The first ``period`` entries are
        NaN because ``period`` real price changes are required, and the
        first element has no previous price to diff against.
    """
    delta = series.diff()

    # clip() keeps NaN as NaN. The previous where(..., 0) form turned the
    # undefined first change (and any gap in the data) into a fake "flat"
    # day, so the first RSI value was emitted one bar early and was biased
    # by a zero that never happened.
    gain = delta.clip(lower=0).rolling(window=period).mean()
    loss = (-delta).clip(lower=0).rolling(window=period).mean()

    # The 1e-6 keeps RS finite when there were no losses in the window
    rs = gain / (loss + 1e-6)
    return 100 - (100 / (1 + rs))

## 3. Target Definition: 6-Day Forward Return

In [None]:
def create_target(df, horizon=6):
    """Add forward-return labels to ``df`` (modified in place and returned).

    Columns written:

    * ``forward_return`` - percentage change of ``Close`` from each row to
      the row ``horizon`` bars later.
    * ``target_long``    - 1.0 if that forward return is > 0, else 0.0.
    * ``target_short``   - 1.0 if that forward return is < 0, else 0.0.

    For the last ``horizon`` rows the future close is not available, so
    ``forward_return`` and both targets are NaN. Previously those rows were
    labelled 0, which a later ``dropna`` on the target columns could not
    remove, so they were silently treated as negative examples.

    Args:
        df: DataFrame with a ``Close`` column.
        horizon: look-ahead in rows (bars).

    Returns:
        The same DataFrame with the three columns added.
    """
    df['forward_return'] = df['Close'].pct_change(horizon).shift(-horizon)

    # Only rows with a known forward return may carry a label
    known = df['forward_return'].notna()
    df['target_long'] = (df['forward_return'] > 0).astype(float).where(known)
    df['target_short'] = (df['forward_return'] < 0).astype(float).where(known)
    return df

## 4. Train Models (Long and Short)

In [None]:
# Sample training on a few Russell 3000 stocks
symbols = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'TSLA', 'JPM', 'BAC', 'WMT', 'XOM', 'CVX']
start_date = '2018-01-01'
end_date = '2023-12-31'

all_data = []
failed_symbols = []
for symbol in symbols:
    try:
        df = yf.download(symbol, start=start_date, end=end_date, progress=False)
        if df is None or df.empty:
            raise ValueError("no price data returned")
        # Some yfinance versions return (field, ticker) MultiIndex columns even
        # for a single symbol; flatten to plain field names so df['Close'] and
        # df['Volume'] are Series, as the feature code expects.
        if isinstance(df.columns, pd.MultiIndex):
            df.columns = df.columns.get_level_values(0)
        df = create_features(df)
        df = create_target(df)
        df['symbol'] = symbol
        all_data.append(df)
    except Exception as exc:
        # Best-effort per symbol, but never silently: report what was skipped.
        # print() is used because warnings are globally suppressed in this
        # notebook. Catching Exception (not a bare except) lets
        # KeyboardInterrupt still stop the loop.
        failed_symbols.append(symbol)
        print(f"Skipping {symbol}: {type(exc).__name__}: {exc}")

if not all_data:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate"
    raise RuntimeError(f"No data could be loaded for any symbol: {failed_symbols}")

combined_df = pd.concat(all_data)
print(f"Total samples: {len(combined_df)}")

In [None]:
# Prepare training data
feature_cols = ['qpi_3day', 'rsi_14', 'bb_position', 'volume_surge', 'mom_5', 'mom_10', 'mom_20', 'vol_ratio']

# Rows without a known forward return (the tail of each symbol) have no valid
# label, so drop them together with rows missing any feature.
train_df = combined_df.dropna(subset=feature_cols + ['forward_return', 'target_long', 'target_short'])
if train_df.empty:
    raise ValueError("No complete rows available for training after dropping NaNs")

# combined_df is a symbol-by-symbol concatenation. A positional split on it
# would test on the last tickers over the same dates used for training.
# Sort by the date index (stable, so ties keep symbol order) to make the
# split chronological.
train_df = train_df.sort_index(kind='mergesort')

X = train_df[feature_cols]
y_long = train_df['target_long'].astype(int)
y_short = train_df['target_short'].astype(int)

# Split: earliest ~80% of rows for training, the most recent ~20% for testing.
# The cut is moved back to the first row of its date so that one trading day
# never ends up on both sides.
# NOTE(review): labels of the last few training rows still look ahead into the
# test window (forward_return spans several bars) - consider purging them.
split_idx = int(len(X) * 0.8)
split_idx = int(train_df.index.searchsorted(train_df.index[split_idx], side='left'))
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_long_train, y_long_test = y_long.iloc[:split_idx], y_long.iloc[split_idx:]
y_short_train, y_short_test = y_short.iloc[:split_idx], y_short.iloc[split_idx:]

In [None]:
# Train Long Model
long_params = dict(n_estimators=100, max_depth=4, learning_rate=0.1, random_state=42)
model_long = GradientBoostingClassifier(**long_params)
model_long.fit(X_train, y_long_train)

# Second predict_proba column: score for the positive (long) class
y_long_pred_proba = model_long.predict_proba(X_test)[:, 1]

# Accuracy is measured at a 0.6 probability cut-off rather than 0.5
long_accuracy = accuracy_score(y_long_test, y_long_pred_proba > 0.6)
print(f"Long Model - Accuracy: {long_accuracy:.3f}")
long_auc = roc_auc_score(y_long_test, y_long_pred_proba)
print(f"Long Model - AUC: {long_auc:.3f}")

In [None]:
# Train Short Model
short_params = dict(n_estimators=100, max_depth=4, learning_rate=0.1, random_state=42)
model_short = GradientBoostingClassifier(**short_params)
model_short.fit(X_train, y_short_train)

# Second predict_proba column: score for the positive (short) class
y_short_pred_proba = model_short.predict_proba(X_test)[:, 1]

# Accuracy is measured at a 0.6 probability cut-off rather than 0.5
short_accuracy = accuracy_score(y_short_test, y_short_pred_proba > 0.6)
print(f"Short Model - Accuracy: {short_accuracy:.3f}")
short_auc = roc_auc_score(y_short_test, y_short_pred_proba)
print(f"Short Model - AUC: {short_auc:.3f}")

## 5. Save Models for LEAN

In [None]:
import os

# Make sure the output directory exists; otherwise joblib.dump raises
# FileNotFoundError only after all the training work has been done.
os.makedirs('../models', exist_ok=True)

# Persist both classifiers plus the exact feature order they were trained on,
# so that inference can build its input columns in the same order.
joblib.dump(model_long, '../models/ml_mean_reversion_long.pkl')
joblib.dump(model_short, '../models/ml_mean_reversion_short.pkl')
joblib.dump(feature_cols, '../models/feature_columns.pkl')
print("Models saved successfully!")

## 6. Feature Importance

In [None]:
import matplotlib.pyplot as plt

# One horizontal bar chart per model, side by side
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

panels = (
    ('Long Model Feature Importance', model_long),
    ('Short Model Feature Importance', model_short),
)
for ax, (title, model) in zip(axes, panels):
    # Bars are labelled with the training feature names, in training order
    ax.barh(feature_cols, model.feature_importances_)
    ax.set_title(title)
    ax.set_xlabel('Importance')

plt.tight_layout()
plt.show()