This notebook trains the Ridge classifier and LSTM models on the previously created features.
It implements the core machine learning pipeline using the models defined in models.py.

The models take the following input features:
1. Z_Score
2. Range_Position
3. MR_Strength

and the following target variable:
1. Target_Direction: a binary label (1 or 0)

The models are trained to capture the relationship between the input features and the target variable.

In [29]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import joblib  #
import json    
import sys
import os

# --- Environment -------------------------------------------------------------
# The project package lives one directory above this notebook; put that
# directory on the import path so `src` can be resolved.
project_root = os.path.abspath('..')
sys.path.append(project_root)
from src.models import BaselineModel, LSTMModel

# Destination for the trained artefacts (nothing happens if it already exists).
os.makedirs('../models', exist_ok=True)

banner = "=" * 80
print(banner)
print("TRAINING PHASE")
print(banner)

# --- 1. Configuration --------------------------------------------------------
SELECTED_FEATURES = ['Z_Score', 'Range_Position', 'MR_Strength']
TRAIN_END = '2023-12-31'   # last date (inclusive) that belongs to the training set
LOOKBACK = 10              # number of rows per LSTM input window

# --- 2. Load data ------------------------------------------------------------
# The first CSV column becomes the (date-parsed) index.
df = pd.read_csv(
    '../data/processed/04_ml_ready_features.csv',
    index_col=0,
    parse_dates=True,
)
in_training_window = df.index <= TRAIN_END
df_train = df[in_training_window].copy()

# --- 3. Prepare & scale ------------------------------------------------------
# The scaler statistics are estimated from the training rows only, so nothing
# dated after TRAIN_END influences the scaling.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_vals = scaler.fit_transform(df_train[SELECTED_FEATURES])

# Re-attach the scaled features to the original index, together with the
# label, the pair identifier and a positional row counter, so that sequences
# can be cut per pair afterwards.
df_train_proc = pd.DataFrame(
    X_train_vals,
    columns=SELECTED_FEATURES,
    index=df_train.index,
)
df_train_proc['Target'] = df_train['Target_Direction'].values
df_train_proc['Pair_ID'] = df_train['Pair_ID'].values
df_train_proc['Original_Index'] = np.arange(len(df_train))
# 4. GENERATE SEQUENCES
def create_pair_sequences(data_df, feature_cols, lookback=10):
    """Build fixed-length look-back windows separately for each pair.

    Rows are grouped by ``Pair_ID`` so that a window never mixes rows from
    two different pairs. Pairs are visited in order of first appearance and
    rows keep their original order inside each pair. For a pair with ``n``
    rows, window ``i`` holds rows ``i .. i + lookback - 1`` and is labelled
    with the ``Target`` value of row ``i + lookback`` (the row right after
    the window). Pairs with ``lookback`` rows or fewer contribute nothing.

    Args:
        data_df: DataFrame holding ``feature_cols`` plus the ``Pair_ID`` and
            ``Target`` columns.
        feature_cols: List of feature column names; their order defines the
            last axis of the returned feature array.
        lookback: Number of consecutive rows in each window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. ``X`` has shape
        ``(n_windows, lookback, len(feature_cols))`` and ``y`` has shape
        ``(n_windows,)``. When no pair is long enough, empty arrays with
        these same ranks are returned so that downstream code expecting a
        3-D feature array keeps working.
    """
    X_seq, y_seq = [], []

    # A single groupby pass replaces one full boolean-mask scan per pair;
    # sort=False keeps the pairs in order of first appearance.
    for _, pair_df in data_df.groupby('Pair_ID', sort=False):
        n_rows = len(pair_df)
        if n_rows <= lookback:
            continue
        X_vals = pair_df[feature_cols].values
        y_vals = pair_df['Target'].values
        for start in range(n_rows - lookback):
            stop = start + lookback
            X_seq.append(X_vals[start:stop])
            y_seq.append(y_vals[stop])

    if not X_seq:
        # np.array([]) would be 1-D; return correctly shaped empties instead.
        return np.empty((0, lookback, len(feature_cols))), np.empty((0,))
    return np.array(X_seq), np.array(y_seq)

# Cut the scaled training frame into per-pair look-back windows.
X_train_3d, y_train = create_pair_sequences(df_train_proc, SELECTED_FEATURES, lookback=LOOKBACK)

# The Ridge baseline is fed only the most recent row of each window.
# Slicing the time axis extracts it in one vectorised step instead of a
# Python-level loop over every window; .copy() gives an independent,
# C-contiguous array rather than a view into X_train_3d.
X_train_2d = X_train_3d[:, -1, :].copy()

# 5. TRAIN MODELS
print("Training Ridge...")
model_ridge = BaselineModel(alpha=1.0)
model_ridge.fit(X_train_2d, y_train)

# Train the LSTM on the full set of look-back windows.
print("Training LSTM...")
# Convert the NumPy arrays to float tensors; the labels are reshaped into a
# single column (one row per window).
X_train_t3d = torch.FloatTensor(X_train_3d)
y_train_t = torch.FloatTensor(y_train).view(-1, 1)

# One input channel per selected feature; Adam with a learning rate of 0.001.
model_lstm = LSTMModel(input_dim=len(SELECTED_FEATURES))
optimizer = optim.Adam(model_lstm.parameters(), lr=0.001)
# NOTE(review): nn.BCELoss expects probabilities in [0, 1], so LSTMModel
# presumably ends in a sigmoid -- confirm in src/models.py.
criterion = nn.BCELoss()

# Switch the model to training mode before optimising.
model_lstm.train()
for epoch in range(100): # 100 full-batch epochs: every step uses all windows at once (no mini-batches)
    optimizer.zero_grad()                  # clear gradients left from the previous step
    output = model_lstm(X_train_t3d)       # forward pass over the whole training tensor
    loss = criterion(output, y_train_t)    # binary cross-entropy against the labels
    loss.backward()                        # back-propagate
    optimizer.step()                       # apply the Adam update


# Persist everything the prediction notebook needs to reproduce this setup.
print("\n Saving to ../models/ ...")

# Fitted models: the Ridge wrapper is pickled as a whole object, while only
# the weights (state dict) of the LSTM are stored.
joblib.dump(model_ridge, '../models/ridge_model.pkl')
lstm_weights = model_lstm.state_dict()
torch.save(lstm_weights, '../models/lstm_model.pth')

# Fitted scaler: required to put new data on the same scale as the
# training data before it is passed to either model.
joblib.dump(scaler, '../models/scaler.pkl')

# Run configuration: feature list, window length and training cut-off date.
config = dict(
    features=SELECTED_FEATURES,
    lookback=LOOKBACK,
    train_end_date=TRAIN_END,
)
with open('../models/config.json', 'w') as config_file:
    config_file.write(json.dumps(config))



TRAINING PHASE
Training Ridge...
Training LSTM...

 Saving to ../models/ ...
