In [13]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import sys
import os

# Link to your src/models.py
sys.path.append(os.path.abspath(os.path.join('..')))
from src.models import BaselineModel, LSTMModel

# Run banner
rule = "=" * 80
print(rule)
print("TRAINING ON ALL 5 PAIRS: Jan 2021 - Dec 2023")
print("TESTING ON: Jan 2024 - Dec 2025")
print(rule)

# 1. DATA LOADING
# The first CSV column becomes the index and is parsed as dates.
print("\nüì• Loading data...")
df = pd.read_csv(
    '../data/processed/04_ml_ready_features.csv',
    index_col=0,
    parse_dates=True,
)

first_day = df.index.min().date()
last_day = df.index.max().date()
print(f"   Total rows: {len(df)}")
print(f"   Date range: {first_day} to {last_day}")
print(f"   Unique pairs: {df['Pair_ID'].nunique()}")

# 2. DATE-BASED SPLIT (not random 80/20)
# Everything up to and including train_end is training data; later rows are test data.
train_end = '2023-12-31'
cutoff = pd.to_datetime(train_end)

print(f"\nüìÖ Date-based train/test split:")
print(f"   TRAIN: {first_day} to {cutoff.date()}")
print(f"   TEST:  {(cutoff + pd.Timedelta(days=1)).date()} to {last_day}")

df_train_full = df[df.index <= train_end].copy()
df_test_full = df[df.index > train_end].copy()

print(f"\n   Train samples: {len(df_train_full)}")
print(f"   Test samples: {len(df_test_full)}")

# Row count per pair, reported for each split in sorted pair order.
for header, frame in (
    (f"\n   Train breakdown by pair:", df_train_full),
    (f"\n   Test breakdown by pair:", df_test_full),
):
    print(header)
    for pair in sorted(frame['Pair_ID'].unique()):
        count = (frame['Pair_ID'] == pair).sum()
        print(f"      {pair}: {count}")

# 3. PREPARE FEATURES WITH PROPER SCALING (NO DATA LEAKAGE)
print(f"\nüîÑ Preparing features...")

feature_cols = ['Z_Score', 'Volatility']

# The scaler statistics come from the training rows only; the test rows are
# transformed with those same statistics so nothing from the test period leaks in.
scaler = StandardScaler()
X_train_scaled_vals = scaler.fit_transform(df_train_full[feature_cols])
X_test_scaled_vals = scaler.transform(df_test_full[feature_cols])

# Scaled training frame: scaled features plus target, pair label, timestamp and
# each row's position inside df_train_full (Original_Index).
df_train_scaled = pd.DataFrame(
    X_train_scaled_vals, columns=feature_cols, index=df_train_full.index
).assign(
    Target=df_train_full['Target_Direction'].values,
    Pair_ID=df_train_full['Pair_ID'].values,
    DateTime=df_train_full.index,
    Original_Index=np.arange(len(df_train_full)),
)

# Scaled test frame, laid out the same way; Original_Index is the position inside
# df_test_full so predictions can be mapped back with .iloc later.
df_test_scaled = pd.DataFrame(
    X_test_scaled_vals, columns=feature_cols, index=df_test_full.index
).assign(
    Target=df_test_full['Target_Direction'].values,
    Pair_ID=df_test_full['Pair_ID'].values,
    DateTime=df_test_full.index,
    Original_Index=np.arange(len(df_test_full)),
)
# 4. PAIR-AWARE SEQUENCE GENERATION
def create_pair_sequences(data_df, lookback=10, features=None):
    """Build fixed-length look-back windows separately for every pair.

    Windows never span two pairs: rows are first filtered by ``Pair_ID`` and
    windows are cut from each pair's rows in the order they appear in
    ``data_df``.
    NOTE(review): presumably rows are already in chronological order within
    each pair -- this function does not sort them; confirm against the caller.

    For a pair with ``n`` rows, window ``i`` holds feature rows
    ``i .. i+lookback-1`` and is labelled with the ``Target`` of row
    ``i+lookback``, so ``n - lookback`` windows are produced. Pairs with
    ``lookback`` rows or fewer contribute nothing.

    Args:
        data_df: DataFrame with the columns ``Pair_ID``, ``Target``,
            ``Original_Index`` and every feature column.
        lookback: Number of consecutive rows in each window.
        features: Optional list of feature column names. When ``None`` the
            module-level ``feature_cols`` list is used, as before.

    Returns:
        Tuple ``(X, y, indices)`` of NumPy arrays:
        ``X`` has shape ``(n_windows, lookback, n_features)``, ``y`` holds the
        target of the row right after each window, and ``indices`` holds that
        row's ``Original_Index``. When no window can be built, ``X`` still has
        shape ``(0, lookback, n_features)`` and ``indices`` is an empty integer
        array, so downstream tensor conversion and ``.iloc`` lookups keep
        working.
    """
    cols = feature_cols if features is None else list(features)

    X_seq, y_seq, indices = [], [], []

    for pair in data_df['Pair_ID'].unique():
        pair_df = data_df[data_df['Pair_ID'] == pair]
        X_vals = pair_df[cols].values
        y_vals = pair_df['Target'].values
        row_ids = pair_df['Original_Index'].values

        # Not enough history for even a single window plus its target row.
        n_windows = len(X_vals) - lookback
        if n_windows <= 0:
            continue

        for start in range(n_windows):
            stop = start + lookback
            X_seq.append(X_vals[start:stop])
            # The label / row id belong to the first row AFTER the window.
            y_seq.append(y_vals[stop])
            indices.append(row_ids[stop])

    if not X_seq:
        # np.array([]) would collapse to shape (0,) with float dtype; keep the
        # rank and an integer index dtype instead.
        return (
            np.empty((0, lookback, len(cols))),
            np.array(y_seq),
            np.array(indices, dtype=int),
        )

    return np.array(X_seq), np.array(y_seq), np.array(indices)

LOOKBACK = 10


def _final_step(windows):
    """Return the last row of every window as one array (one row per sequence)."""
    return np.array([window[-1] for window in windows])


# Training split: the 3-D windows feed the LSTM, the last step of each window
# feeds the baseline model.
print(f"\nüìä Generating training sequences (Lookback={LOOKBACK})...")
X_train_3d, y_train, train_indices = create_pair_sequences(df_train_scaled, lookback=LOOKBACK)
X_train_2d = _final_step(X_train_3d)

for label, arr in (("X_train_3d", X_train_3d), ("X_train_2d", X_train_2d), ("y_train", y_train)):
    print(f"   {label} shape: {arr.shape}")

# Test split, built the same way from the separately scaled test frame.
print(f"\nüìä Generating test sequences (Lookback={LOOKBACK})...")
X_test_3d, y_test, test_indices = create_pair_sequences(df_test_scaled, lookback=LOOKBACK)
X_test_2d = _final_step(X_test_3d)

for label, arr in (("X_test_3d", X_test_3d), ("X_test_2d", X_test_2d), ("y_test", y_test)):
    print(f"   {label} shape: {arr.shape}")

# 5. CLASS DISTRIBUTION
# Counts of label 0 ("down") and label 1 ("up") plus the share of 1s in each split.
print(f"\nüìà Target distribution:")
train_down, train_up = (y_train == 0).sum(), (y_train == 1).sum()
test_down, test_up = (y_test == 0).sum(), (y_test == 1).sum()
print(f"   Train: {train_down} down, {train_up} up ({100*y_train.mean():.1f}% up)")
print(f"   Test:  {test_down} down, {test_up} up ({100*y_test.mean():.1f}% up)")

# 6. CONVERT TO TENSORS (for LSTM)
print(f"\nüîß Converting to PyTorch tensors...")
X_train_t3d = torch.FloatTensor(X_train_3d)
X_test_t3d = torch.FloatTensor(X_test_3d)
# Targets become a column vector so they match a (batch, 1) model output in the loss.
y_train_t = torch.FloatTensor(y_train).view(-1, 1)
# 7. TRAINING FUNCTION
def train_torch_model(model, X_train, y_train_t, epochs=100):
    """Fit ``model`` in place with full-batch Adam on binary cross-entropy.

    Every epoch runs one forward/backward pass over the whole of ``X_train``
    (no mini-batching) and one optimizer step with a fixed learning rate of
    0.001. The loss is ``nn.BCELoss``, so the model output must already be a
    probability in [0, 1] with the same shape as ``y_train_t``. The current
    loss is printed every 20th epoch.

    Args:
        model: ``torch.nn.Module`` to train; it is switched to training mode.
        X_train: Input tensor passed to ``model`` unchanged.
        y_train_t: Float target tensor compared against the model output.
        epochs: Number of full-batch optimisation steps.

    Returns:
        The same ``model`` object, after training.
    """
    loss_fn = nn.BCELoss()
    adam = optim.Adam(model.parameters(), lr=0.001)
    model.train()

    for step in range(1, epochs + 1):
        adam.zero_grad()
        batch_loss = loss_fn(model(X_train), y_train_t)
        batch_loss.backward()
        adam.step()

        # Progress line on every 20th epoch only.
        if step % 20 == 0:
            print(f"      Epoch {step}/{epochs} - Loss: {batch_loss.item():.6f}")

    return model

# 8. TRAIN ALL MODELS
section_rule = '=' * 80
print(f"\n{section_rule}")
print("TRAINING MODELS")
print(f"{section_rule}")

# Baseline: fitted on the 2-D matrix that holds only the last time step of each window.
# NOTE(review): BaselineModel comes from src.models; the printed label suggests it wraps
# ridge regression -- confirm there.
print(f"\n1Ô∏è‚É£  Training Ridge Regression...")
model_ridge = BaselineModel(alpha=1.0)
model_ridge.fit(X_train_2d, y_train)
print(f"   ‚úÖ Complete")

# Sequence model: consumes the full 3-D windows; input_dim=2 matches the two feature columns.
print(f"\n2Ô∏è‚É£  Training LSTM...")
model_lstm = train_torch_model(
    LSTMModel(input_dim=2), X_train_t3d, y_train_t, epochs=100
)
print(f"   ‚úÖ Complete")

# 9. GENERATE & SAVE PREDICTIONS FOR ALL MODELS
print(f"\n{'='*80}")
print("GENERATING PREDICTIONS")
print(f"{'='*80}\n")

# test_indices are row positions inside df_test_full (the Original_Index values), so
# .iloc recovers, for every sequence, the original row whose target was predicted.
test_results = df_test_full.iloc[test_indices].copy()

# Re-expose the timestamp index under the name 'DateTime'.
test_results = test_results.assign(
    DateTime=pd.to_datetime(test_results.index)
).set_index('DateTime')

# Predictions are attached positionally: row k of test_results <-> sequence k.
print("   Ridge predictions...")
ridge_preds = model_ridge.predict(X_test_2d)
test_results['Ridge_Pred'] = ridge_preds

print("   LSTM predictions...")
model_lstm.eval()
with torch.no_grad():
    lstm_preds = model_lstm(X_test_t3d).numpy().flatten()
test_results['LSTM_Pred'] = lstm_preds

# Chronological order for the saved file.
# NOTE: this reorders the rows of test_results; the standalone ridge_preds / lstm_preds
# arrays keep sequence order and are no longer row-aligned with the frame afterwards.
test_results = test_results.sort_index()

output_file = '../data/processed/05_model_predictions.csv'
test_results.to_csv(output_file)

# 10. RESULTS SUMMARY
print(f"\n{'='*80}")
print(f"‚úÖ SUCCESS!")
print(f"{'='*80}")

earliest = test_results.index.min().date()
latest = test_results.index.max().date()
print(f"\nüìä Results saved to: {output_file}")
print(f"   Shape: {test_results.shape}")
print(f"   Date range: {earliest} to {latest}")
print(f"   Columns: {test_results.columns.tolist()}")

print(f"\n   Pair breakdown in predictions:")
saved_pairs = test_results['Pair_ID']
for pair in sorted(saved_pairs.unique()):
    count = (saved_pairs == pair).sum()
    print(f"      {pair}: {count} predictions")

# 11. QUICK PERFORMANCE SUMMARY
print(f"\n{'='*80}")
print("QUICK PERFORMANCE SUMMARY")
print(f"{'='*80}\n")

# test_results was re-ordered by sort_index() after the prediction columns were
# attached, whereas the standalone ridge_preds / lstm_preds arrays are still in
# sequence-generation order. Labels, predictions and group masks are therefore ALL
# read from test_results, so every row is scored against its own prediction.
y_test_true = test_results['Target_Direction'].values
ridge_eval = test_results['Ridge_Pred'].values
lstm_eval = test_results['LSTM_Pred'].values

for model_name, preds in [('Ridge', ridge_eval), ('LSTM', lstm_eval)]:
    # Convert to binary
    if model_name == 'Ridge':
        # NOTE(review): assumes BaselineModel.predict already returns 0/1 labels -- confirm.
        preds_binary = preds  # Already binary
    else:
        # LSTM outputs are probabilities; 0.5 is the decision threshold.
        preds_binary = (preds >= 0.5).astype(int)

    acc = accuracy_score(y_test_true, preds_binary)
    prec = precision_score(y_test_true, preds_binary, zero_division=0)
    rec = recall_score(y_test_true, preds_binary, zero_division=0)
    f1 = f1_score(y_test_true, preds_binary, zero_division=0)

    print(f"{model_name}:")
    print(f"   Accuracy:  {acc:.4f}")
    print(f"   Precision: {prec:.4f}")
    print(f"   Recall:    {rec:.4f}")
    print(f"   F1 Score:  {f1:.4f}\n")

# 12. PER-PAIR PERFORMANCE
print(f"\n{'='*80}")
print("PERFORMANCE BY PAIR")
print(f"{'='*80}\n")

for pair in sorted(test_results['Pair_ID'].unique()):
    # Plain boolean array in the same (sorted) row order as the arrays it indexes.
    pair_mask = (test_results['Pair_ID'] == pair).values
    y_pair = y_test_true[pair_mask]
    n = pair_mask.sum()

    ridge_binary = ridge_eval[pair_mask]
    lstm_binary = (lstm_eval[pair_mask] >= 0.5).astype(int)

    ridge_acc = accuracy_score(y_pair, ridge_binary)
    lstm_acc = accuracy_score(y_pair, lstm_binary)

    print(f"{pair} ({n} samples):")
    print(f"   Ridge: {ridge_acc:.4f} | LSTM: {lstm_acc:.4f}\n")

# 13. SECTOR PERFORMANCE
print(f"\n{'='*80}")
print("PERFORMANCE BY SECTOR")
print(f"{'='*80}\n")

for sector in sorted(test_results['Sector'].unique()):
    sector_mask = (test_results['Sector'] == sector).values
    y_sector = y_test_true[sector_mask]
    n = sector_mask.sum()

    ridge_binary = ridge_eval[sector_mask]
    lstm_binary = (lstm_eval[sector_mask] >= 0.5).astype(int)

    ridge_acc = accuracy_score(y_sector, ridge_binary)
    lstm_acc = accuracy_score(y_sector, lstm_binary)

    print(f"{sector.upper()} ({n} samples):")
    print(f"   Ridge: {ridge_acc:.4f} | LSTM: {lstm_acc:.4f}\n")

print(f"{'='*80}")
print("‚úÖ Training and predictions complete!")
print(f"{'='*80}")

TRAINING ON ALL 5 PAIRS: Jan 2021 - Dec 2023
TESTING ON: Jan 2024 - Dec 2025

üì• Loading data...
   Total rows: 5223
   Date range: 2021-02-02 to 2025-12-29
   Unique pairs: 5

üìÖ Date-based train/test split:
   TRAIN: 2021-02-02 to 2023-12-31
   TEST:  2024-01-01 to 2025-12-29

   Train samples: 2843
   Test samples: 2380

   Train breakdown by pair:
      ADSK-MSI: 482
      ADSK-NOW: 482
      AIG-CB: 664
      CMS-DUK: 733
      INTU-MSFT: 482

   Test breakdown by pair:
      ADSK-MSI: 460
      ADSK-NOW: 460
      AIG-CB: 500
      CMS-DUK: 500
      INTU-MSFT: 460

üîÑ Preparing features...

üìä Generating training sequences (Lookback=10)...
   X_train_3d shape: (2793, 10, 2)
   X_train_2d shape: (2793, 2)
   y_train shape: (2793,)

üìä Generating test sequences (Lookback=10)...
   X_test_3d shape: (2330, 10, 2)
   X_test_2d shape: (2330, 2)
   y_test shape: (2330,)

üìà Target distribution:
   Train: 1394 down, 1399 up (50.1% up)
   Test:  1157 down, 1173 up (50.3% up)
