# Machine Learning Model Research for Statistical Arbitrage

This notebook explores various machine learning approaches for developing predictive models in statistical arbitrage strategies. We'll experiment with different algorithms, feature engineering techniques, and model validation approaches.

## Research Objectives
- Explore various ML algorithms for return prediction
- Develop robust feature engineering pipelines
- Implement proper cross-validation for time series data
- Compare model performance and stability
- Test ensemble methods and model combinations

## Models to Investigate
1. **Linear Models**: Ridge, Lasso, Elastic Net
2. **Tree-Based Models**: Random Forest, XGBoost, LightGBM
3. **Neural Networks**: Feed-forward, LSTM, GRU
4. **Ensemble Methods**: Voting, Stacking, Blending
5. **Time Series Models**: ARIMA-GARCH, State Space Models
6. **Clustering**: K-Means for regime identification

## Key Research Questions
- Can ML models predict short-term return patterns?
- Which features are most predictive of future returns?
- How do models perform across different market regimes?
- What is the optimal lookback period for features?
- How can we combine multiple models effectively?

---

In [None]:
# Import core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Machine Learning libraries
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline

# Advanced ML libraries
try:
    import xgboost as xgb
    XGB_AVAILABLE = True
except ImportError:
    XGB_AVAILABLE = False
    print("XGBoost not available")

try:
    import lightgbm as lgb
    LGB_AVAILABLE = True
except ImportError:
    LGB_AVAILABLE = False
    print("LightGBM not available")

# Deep learning (optional)
try:
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, LSTM, Dropout
    from tensorflow.keras.optimizers import Adam
    KERAS_AVAILABLE = True
except ImportError:
    KERAS_AVAILABLE = False
    print("Keras/TensorFlow not available")

# Financial data
import yfinance as yf
from scipy import stats
import talib

# Set plotting style
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    # Matplotlib < 3.6 only knows this style sheet under its old name,
    # 'seaborn'; the 'seaborn-v0_8' alias was introduced in 3.6.
    plt.style.use('seaborn')
sns.set_palette("husl")

# Report which optional back-ends were importable so later cells can be
# read with the right expectations.
print("Libraries imported successfully!")
print(f"XGBoost available: {XGB_AVAILABLE}")
print(f"LightGBM available: {LGB_AVAILABLE}")
print(f"Keras available: {KERAS_AVAILABLE}")

# Load data for experimentation
def load_experimental_data():
    """Load and prepare data for ML experiments.

    Downloads three years of daily bars for a fixed 15-stock universe with a
    single yfinance request. If the download raises, or comes back without
    any usable prices, seeded synthetic data is generated instead so the
    rest of the notebook can still run.

    Returns:
        tuple: ``(price_data, returns, log_returns, volume_data, stocks)``
        where the first four are DataFrames with one column per ticker and
        ``stocks`` is the list of ticker symbols.
    """

    # Define stock universe
    stocks = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'TSLA',
              'JPM', 'BAC', 'WFC', 'GS', 'MS',
              'XOM', 'CVX', 'COP', 'EOG', 'SLB']

    try:
        # Download 3 years of data
        print("Downloading market data...")
        # One request serves both prices and volume. auto_adjust=False keeps
        # the 'Adj Close' column on yfinance versions that adjust by default.
        raw = yf.download(stocks, period="3y", interval="1d", auto_adjust=False)

        available_fields = raw.columns.get_level_values(0)
        price_field = 'Adj Close' if 'Adj Close' in available_fields else 'Close'
        price_data = raw[price_field]

        # Volume data for additional features
        volume_data = raw['Volume']

        # yfinance can report failures by returning an empty/all-NaN frame
        # instead of raising; treat that as a failed download.
        if price_data.dropna(how='all').empty:
            raise ValueError("download returned no price data")

        # Calculate returns and other metrics
        returns = price_data.pct_change()
        log_returns = np.log(price_data / price_data.shift(1))

        print(f"Data loaded: {len(stocks)} stocks, {len(price_data)} days")
        return price_data, returns, log_returns, volume_data, stocks

    except Exception as e:  # deliberate best-effort: any failure -> synthetic data
        print(f"Error loading data: {e}")
        print("Creating synthetic data for experimentation...")

        # Generate synthetic data
        # NOTE: freq='D' is calendar-daily, so weekends are included.
        dates = pd.date_range(start='2021-01-01', end='2024-01-01', freq='D')
        np.random.seed(42)  # fixed seed keeps the fallback reproducible

        # Geometric random walk: cumulative sum of normal log-returns
        price_data = pd.DataFrame({
            stock: 100 * np.exp(np.cumsum(np.random.normal(0.0005, 0.02, len(dates))))
            for stock in stocks
        }, index=dates)

        returns = price_data.pct_change()
        log_returns = np.log(price_data / price_data.shift(1))

        # Synthetic volume
        volume_data = pd.DataFrame({
            stock: np.random.randint(1000000, 10000000, len(dates))
            for stock in stocks
        }, index=dates)

        return price_data, returns, log_returns, volume_data, stocks

# Load the experimental dataset
# The results are bound at module level; the feature-engineering code below
# reads price_data, returns, volume_data and stocks directly.
price_data, returns, log_returns, volume_data, stocks = load_experimental_data()

## 1. Feature Engineering for ML Models

This section builds a comprehensive set of predictive features from price, volume, and cross-sectional market data for our ML models. The prediction target is each stock's next-day return.

In [None]:
def _pandas_indicators(prices):
    """Compute RSI(14), MACD(12/26/9) and Bollinger position with pandas only."""
    indicators = pd.DataFrame(index=prices.index)

    # RSI using Wilder-style exponential smoothing (alpha = 1/14)
    change = prices.diff()
    avg_gain = change.clip(lower=0).ewm(alpha=1 / 14, adjust=False, min_periods=14).mean()
    avg_loss = change.clip(upper=0).abs().ewm(alpha=1 / 14, adjust=False, min_periods=14).mean()
    indicators['rsi_14'] = 100 - 100 / (1 + avg_gain / avg_loss)

    # MACD: fast EMA minus slow EMA, with an EMA signal line
    ema_fast = prices.ewm(span=12, adjust=False, min_periods=12).mean()
    ema_slow = prices.ewm(span=26, adjust=False, min_periods=26).mean()
    macd_line = ema_fast - ema_slow
    signal_line = macd_line.ewm(span=9, adjust=False, min_periods=9).mean()
    indicators['macd'] = macd_line
    indicators['macd_signal'] = signal_line
    indicators['macd_histogram'] = macd_line - signal_line

    # Bollinger position: 5-period SMA +/- 2 population standard deviations
    middle = prices.rolling(5).mean()
    spread = 2 * prices.rolling(5).std(ddof=0)
    lower = middle - spread
    upper = middle + spread
    indicators['bb_position'] = (prices - lower) / (upper - lower)

    return indicators


def create_features(price_data, returns, volume_data, stock_symbol, lookback_periods=(5, 10, 20)):
    """
    Create comprehensive feature set for ML models

    Args:
        price_data: DataFrame of prices, one column per stock, indexed by date.
        returns: DataFrame of simple returns aligned with ``price_data``.
        volume_data: DataFrame of volumes aligned with ``price_data``, or
            ``None`` to skip the volume features.
        stock_symbol: Column name of the stock to build features for.
        lookback_periods: Rolling-window lengths (in rows) used for the
            moving-average, volatility, momentum and volume features.

    Returns:
        DataFrame indexed like ``price_data`` with one column per feature.
        Warm-up rows contain NaN, and infinite values produced by divisions
        by zero are converted to NaN so that a later ``dropna()`` removes
        them. The technical-indicator columns are only added when the stock
        has more than 50 non-NaN prices.
    """
    features = pd.DataFrame(index=price_data.index)

    # Price-based features
    prices = price_data[stock_symbol]
    stock_returns = returns[stock_symbol]
    volumes = volume_data[stock_symbol] if volume_data is not None else None

    # 1. Lagged returns
    for lag in range(1, 6):  # 1-5 day lags
        features[f'return_lag_{lag}'] = stock_returns.shift(lag)

    # 2. Moving averages and ratios
    for period in lookback_periods:
        features[f'ma_{period}'] = prices.rolling(period).mean()
        features[f'price_to_ma_{period}'] = prices / features[f'ma_{period}']
        # Slope is the average per-row change of the MA over the last 5 rows
        features[f'ma_slope_{period}'] = (features[f'ma_{period}'] - features[f'ma_{period}'].shift(5)) / 5

    # 3. Volatility measures
    for period in lookback_periods:
        features[f'volatility_{period}'] = stock_returns.rolling(period).std()
        # Current volatility relative to the value one full window earlier
        features[f'volatility_ratio_{period}'] = (features[f'volatility_{period}'] /
                                                 features[f'volatility_{period}'].shift(period))

    # 4. Price momentum and mean reversion
    for period in lookback_periods:
        features[f'momentum_{period}'] = (prices / prices.shift(period) - 1)
        # Rolling z-score of price against its own window
        features[f'mean_reversion_{period}'] = (prices - prices.rolling(period).mean()) / prices.rolling(period).std()

    # 5. Technical indicators
    if len(prices.dropna()) > 50:  # Ensure sufficient data
        try:
            # talib only accepts float64 arrays
            price_array = prices.to_numpy(dtype='float64')

            rsi = talib.RSI(price_array, timeperiod=14)
            macd, macd_signal, macd_hist = talib.MACD(price_array)
            bb_upper, _bb_middle, bb_lower = talib.BBANDS(price_array)

            indicators = pd.DataFrame({
                'rsi_14': rsi,
                'macd': macd,
                'macd_signal': macd_signal,
                'macd_histogram': macd_hist,
            }, index=prices.index)
            indicators['bb_position'] = (prices - bb_lower) / (bb_upper - bb_lower)

        except Exception as e:
            print(f"Technical indicators failed: {e}")
            # Fill with simple alternatives. All-NaN placeholders would make
            # a later dropna() discard every row, so compute them in pandas.
            indicators = _pandas_indicators(prices)

        # Both paths produce the same five columns, in the same order
        for name in ('rsi_14', 'macd', 'macd_signal', 'macd_histogram', 'bb_position'):
            features[name] = indicators[name]

    # 6. Volume features (if available)
    if volumes is not None:
        for period in lookback_periods:
            features[f'volume_ma_{period}'] = volumes.rolling(period).mean()
            features[f'volume_ratio_{period}'] = volumes / features[f'volume_ma_{period}']

        # Price-Volume features
        features['price_volume_trend'] = (prices * volumes).rolling(20).sum()

    # 7. Cross-sectional features (relative to market)
    market_return = returns.mean(axis=1)  # Simple market proxy
    features['excess_return_1d'] = stock_returns - market_return
    features['beta_20d'] = stock_returns.rolling(20).cov(market_return) / market_return.rolling(20).var()

    # 8. Time-based features
    features['day_of_week'] = features.index.dayofweek
    features['month'] = features.index.month
    features['quarter'] = features.index.quarter

    # 9. Regime features
    market_vol = market_return.rolling(20).std()
    # 1 when 20-row market volatility exceeds its own 60-row 70th percentile.
    # Comparisons against NaN are False, so warm-up rows are 0.
    features['high_vol_regime'] = (market_vol > market_vol.rolling(60).quantile(0.7)).astype(int)

    # Ratio features can divide by zero; dropna() does not remove +/-inf
    return features.replace([np.inf, -np.inf], np.nan)

# Create features for the first few stocks
print("Creating features for ML models...")
feature_datasets = {}

for stock in stocks[:3]:  # Start with 3 stocks for testing
    print(f"Processing {stock}...")
    features = create_features(price_data, returns, volume_data, stock)

    # Create target variable (next day return): shift(-1) aligns tomorrow's
    # return with today's feature row.
    features['target'] = returns[stock].shift(-1)

    # Remove rows with NaN values (rolling-window warm-up rows and the last
    # row, which has no next-day target)
    features_clean = features.dropna()
    if features_clean.empty:
        # An all-NaN feature column wipes out every row; make that visible
        # instead of silently storing an empty dataset.
        print(f"  WARNING: no complete rows left for {stock} after dropna()")

    feature_datasets[stock] = features_clean
    print(f"  Features created: {features_clean.shape}")

# Display feature summary
sample_stock = stocks[0]
sample_features = feature_datasets[sample_stock]

print(f"\n=== FEATURE SUMMARY FOR {sample_stock} ===")
print(f"Total features: {sample_features.shape[1] - 1}")  # -1 for target
print(f"Data points: {sample_features.shape[0]}")
print(f"Date range: {sample_features.index.min()} to {sample_features.index.max()}")

# Categories are counted by substring match on the column name, so a column
# can be counted in more than one category.
print("\nFeature categories:")
feature_names = [col for col in sample_features.columns if col != 'target']
print(f"Price/Return features: {len([f for f in feature_names if 'return' in f or 'momentum' in f])}")
print(f"Technical indicators: {len([f for f in feature_names if any(x in f for x in ['rsi', 'macd', 'bb'])])}")
print(f"Volume features: {len([f for f in feature_names if 'volume' in f])}")
print(f"Cross-sectional: {len([f for f in feature_names if 'excess' in f or 'beta' in f])}")
print(f"Time features: {len([f for f in feature_names if any(x in f for x in ['day', 'month', 'quarter'])])}")

# Show sample of features
print(f"\nSample features for {sample_stock}:")
print(sample_features.head()[['target'] + feature_names[:10]].round(4))