In [None]:
"""
STEP 1: Generate predictions from all 4 models on Days 6-15
Complete standalone script - FIXED TCN BUG
"""
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import xgboost as xgb
import pickle
from pathlib import Path

# ============================================================
# PATHS
# ============================================================
# Directory holding the per-day gzipped book-update CSVs read by preprocess_day.
RAW_DATA_PATH = Path("C:/Users/wdkal/iex_data/book_snapshots")
# Directory containing the four saved model files loaded further down.
MODEL_PATH = Path("C:/Users/wdkal/Downloads/RL_Model")
# Destination for the per-day prediction CSVs; created here if it is missing.
OUTPUT_PATH = Path("C:/Users/wdkal/Downloads/NEW_PREDICTIONS")
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

# Days to process, as YYYYMMDD strings. Each value is substituted into the
# input file name and into the name of the prediction CSV that is written.
NEW_DAYS = [
    '20251027', '20251028', '20251029', '20251030', '20251031',
    '20251103', '20251104', '20251105', '20251106', '20251107'
]

# ============================================================
# MODEL ARCHITECTURES
# ============================================================
class LSTMModel(nn.Module):
    """Stacked LSTM followed by two linear layers.

    The attribute names (``lstm``, ``fc1``, ``fc2``) are part of the saved
    state-dict layout and must not change.
    """

    def __init__(self, input_size, hidden_size, num_layers, fc1_size, output_size):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
        )
        self.fc1 = nn.Linear(hidden_size, fc1_size)
        self.fc2 = nn.Linear(fc1_size, output_size)

    def forward(self, x):
        """Return the output of ``fc2(fc1(.))`` applied to the last time step.

        ``x`` is indexed as (batch, time, feature) because the LSTM is built
        with ``batch_first=True``.
        """
        sequence_out, _state = self.lstm(x)
        # Only the final time step feeds the head.
        final_step = sequence_out[:, -1, :]
        return self.fc2(self.fc1(final_step))

class TemporalBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, dilation):
        super(TemporalBlock, self).__init__()
        padding = (kernel_size - 1) * dilation
        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size, 
                              padding=padding, dilation=dilation)
        self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size, 
                              padding=padding, dilation=dilation)
        self.downsample = nn.Conv1d(in_channels, out_channels, 1) if in_channels != out_channels else None
        self.relu = nn.ReLU()
    
    def forward(self, x):
        out = self.relu(self.conv1(x))
        out = self.relu(self.conv2(out))  # FIXED: was self.conv2(x)
        
        if self.downsample is not None:
            res = self.downsample(x)
        else:
            res = x
        
        if out.size(2) != res.size(2):
            out = out[:, :, :res.size(2)]
        
        return self.relu(out + res)

class TCNModel(nn.Module):
    """Stack of ``TemporalBlock`` layers with a linear head.

    Block ``i`` uses dilation ``2 ** i``; its input width is ``num_inputs``
    for the first block and the previous block's width afterwards.
    """

    def __init__(self, num_inputs, num_channels, kernel_size, output_size=3):
        super().__init__()
        # Input width of each level: the raw feature count, then each
        # preceding level's output width.
        input_widths = [num_inputs, *num_channels[:-1]]
        blocks = [
            TemporalBlock(width_in, width_out, kernel_size, 2 ** level)
            for level, (width_in, width_out) in enumerate(zip(input_widths, num_channels))
        ]

        self.network = nn.Sequential(*blocks)
        self.fc = nn.Linear(num_channels[-1], output_size)

    def forward(self, x):
        """Run the blocks and classify from the last position of axis 2."""
        features = self.network(x)
        last_step = features[:, :, -1]
        return self.fc(last_step)

class PositionalEncoding(nn.Module):
    """Learned positional embedding added to the input.

    ``pe`` is a trainable parameter of shape ``(1, max_len, d_model)``
    initialised from a standard normal distribution.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        initial = torch.randn(1, max_len, d_model)
        self.pe = nn.Parameter(initial)

    def forward(self, x):
        """Add the first ``x.size(1)`` positional vectors to ``x``."""
        seq_len = x.size(1)
        return x + self.pe[:, :seq_len, :]

class TransformerModel(nn.Module):
    """Transformer encoder classifier over a feature sequence.

    Pipeline: linear projection to ``d_model`` -> learned positional
    encoding -> ``num_layers`` encoder layers -> last time step -> two
    linear layers. Attribute names match the saved state-dict keys.
    """

    def __init__(self, input_size, d_model, nhead, num_layers, dim_feedforward, fc1_size, output_size, max_len=5000):
        super().__init__()
        self.input_projection = nn.Linear(input_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_len)
        layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(layer, num_layers=num_layers)
        self.fc1 = nn.Linear(d_model, fc1_size)
        self.fc2 = nn.Linear(fc1_size, output_size)

    def forward(self, x):
        """Encode ``x`` and classify from the final time step."""
        projected = self.input_projection(x)
        encoded = self.transformer_encoder(self.pos_encoder(projected))
        final_step = encoded[:, -1, :]
        return self.fc2(self.fc1(final_step))

# ============================================================
# LOAD ALL 4 MODELS
# ============================================================
print("="*60)
print("LOADING ALL 4 MODELS")
print("="*60)

# NOTE(security): every torch.load call below passes weights_only=False, which
# lets the checkpoint unpickle arbitrary Python objects (and therefore run
# code). Only load checkpoint files that come from a trusted source.

# 1. XGBoost
print("\nLoading XGBoost...")
xgb_model = xgb.Booster()
xgb_model.load_model(MODEL_PATH / "xgb_best_model.json")
print("✓ XGBoost loaded")

# 2. LSTM
# The architecture is rebuilt from the tensor shapes stored in the checkpoint.
print("Loading LSTM...")
lstm_state = torch.load(MODEL_PATH / "lstm_best_model.pt", map_location='cpu', weights_only=False)
weight_ih = lstm_state['lstm.weight_ih_l0']
input_size = weight_ih.shape[1]
# nn.LSTM stacks its four gate matrices along dim 0 of weight_ih.
hidden_size = weight_ih.shape[0] // 4
fc1_size = lstm_state['fc1.weight'].shape[0]
output_size = lstm_state['fc2.weight'].shape[0]
# Derive the layer count from the checkpoint (one 'weight_ih_l<k>' entry per
# layer) instead of assuming two layers, so it stays consistent with the
# other dimensions that are already inferred from the file.
lstm_layer_prefix = 'lstm.weight_ih_l'
num_layers = sum(
    1 for key in lstm_state
    if key.startswith(lstm_layer_prefix) and key[len(lstm_layer_prefix):].isdigit()
)
lstm_model = LSTMModel(input_size, hidden_size, num_layers, fc1_size, output_size)
lstm_model.load_state_dict(lstm_state)
lstm_model.eval()
print(f"✓ LSTM loaded (input={input_size}, hidden={hidden_size})")

# 3. TCN
print("Loading TCN...")
tcn_state = torch.load(MODEL_PATH / "tcn_best_model.pt", map_location='cpu', weights_only=False)

# Two key layouts are accepted: convolutions stored under an extra '.conv.'
# level ('network.N.conv1.conv.weight') or stored directly
# ('network.N.conv1.weight'). Decide once and reuse the answer below.
tcn_has_nested_conv_keys = 'network.0.conv1.conv.weight' in tcn_state
if tcn_has_nested_conv_keys:
    conv_key_pattern = 'network.{}.conv1.conv.weight'
elif 'network.0.conv1.weight' in tcn_state:
    conv_key_pattern = 'network.{}.conv1.weight'
else:
    raise KeyError("Cannot find TCN conv weights in state dict")

first_conv_weight = tcn_state[conv_key_pattern.format(0)]
num_inputs = first_conv_weight.shape[1]
kernel_size = first_conv_weight.shape[2]

# One entry per consecutive block found in the checkpoint; the value is that
# block's conv1 output width.
num_channels = []
i = 0
while conv_key_pattern.format(i) in tcn_state:
    num_channels.append(tcn_state[conv_key_pattern.format(i)].shape[0])
    i += 1

tcn_model = TCNModel(num_inputs, num_channels, kernel_size, output_size=3)

if tcn_has_nested_conv_keys:
    # Strip the extra '.conv.' level so the keys match TemporalBlock's names.
    new_state = {
        key.replace('.conv1.conv.', '.conv1.').replace('.conv2.conv.', '.conv2.'): value
        for key, value in tcn_state.items()
    }
    tcn_model.load_state_dict(new_state)
else:
    tcn_model.load_state_dict(tcn_state)

tcn_model.eval()
print(f"✓ TCN loaded (input={num_inputs}, channels={num_channels}, kernel={kernel_size})")

# 4. Transformer
# This checkpoint is a dict holding both the weights and the hyperparameters.
# NOTE: fc1_size and output_size are rebound here and no longer describe the
# LSTM after this point.
print("Loading Transformer...")
checkpoint = torch.load(MODEL_PATH / "best_transformer_model.pth", map_location='cpu', weights_only=False)
transformer_state = checkpoint['model_state_dict']
hyperparams = checkpoint['hyperparameters']
fc1_size = transformer_state['fc1.weight'].shape[0]
output_size = transformer_state['fc2.weight'].shape[0]
max_len = transformer_state['pos_encoder.pe'].shape[1]
transformer_model = TransformerModel(
    hyperparams['input_size'], hyperparams['d_model'], hyperparams['nhead'],
    hyperparams['num_layers'], hyperparams['dim_feedforward'],
    fc1_size, output_size, max_len
)
transformer_model.load_state_dict(transformer_state)
transformer_model.eval()
print(f"✓ Transformer loaded")

print("\n✅ ALL 4 MODELS LOADED SUCCESSFULLY!\n")

# ============================================================
# FEATURE ENGINEERING
# ============================================================
def add_all_features(df):
    """Derive the model feature table from three-level order-book rows.

    Args:
        df: DataFrame with a ``date`` column and, for levels 1-3, the
            columns ``BID_PRICE_n``, ``BID_SIZE_n``, ``ASK_PRICE_n`` and
            ``ASK_SIZE_n``.

    Returns:
        A new DataFrame, aligned to ``df``'s index, holding ``date``, the
        raw level columns and the derived features. Missing values are
        forward-filled and then set to 0; infinities are treated as missing.
    """
    levels = (1, 2, 3)
    bid_px = {lvl: df[f"BID_PRICE_{lvl}"] for lvl in levels}
    bid_sz = {lvl: df[f"BID_SIZE_{lvl}"] for lvl in levels}
    ask_px = {lvl: df[f"ASK_PRICE_{lvl}"] for lvl in levels}
    ask_sz = {lvl: df[f"ASK_SIZE_{lvl}"] for lvl in levels}

    out = pd.DataFrame()
    out['date'] = df['date']

    # --- top-of-book features ---
    touch_depth = bid_sz[1] + ask_sz[1]
    out["mid_price"] = (bid_px[1] + ask_px[1]) / 2
    # Small epsilons keep the divisions finite when both sizes are zero.
    out["microprice"] = (bid_px[1] * ask_sz[1] + ask_px[1] * bid_sz[1]) / (touch_depth + 1e-10)
    out["spread"] = ask_px[1] - bid_px[1]
    out["vol_imbalance"] = (bid_sz[1] - ask_sz[1]) / (touch_depth + 1e-6)
    out["bid_ask_spread_ratio"] = out["spread"] / out["mid_price"]

    # --- raw book levels, copied through unchanged ---
    for lvl in levels:
        out[f"BID_PRICE_{lvl}"] = bid_px[lvl]
        out[f"BID_SIZE_{lvl}"] = bid_sz[lvl]
        out[f"ASK_PRICE_{lvl}"] = ask_px[lvl]
        out[f"ASK_SIZE_{lvl}"] = ask_sz[lvl]

    # --- aggregates across the three levels ---
    bid_depth = bid_sz[1] + bid_sz[2] + bid_sz[3]
    ask_depth = ask_sz[1] + ask_sz[2] + ask_sz[3]
    out["bid_price_mean"] = (bid_px[1] + bid_px[2] + bid_px[3]) / 3
    out["ask_price_mean"] = (ask_px[1] + ask_px[2] + ask_px[3]) / 3
    out["bid_qty_mean"] = bid_depth / 3
    out["ask_qty_mean"] = ask_depth / 3

    out["price_cum_diff"] = (ask_px[1] - bid_px[1] +
                             ask_px[2] - bid_px[2] +
                             ask_px[3] - bid_px[3])
    out["qty_cum_diff"] = (ask_sz[1] - bid_sz[1] +
                           ask_sz[2] - bid_sz[2] +
                           ask_sz[3] - bid_sz[3])

    # --- row-to-row changes ---
    out["time_delta"] = 1
    mid = out["mid_price"]
    out["mid_diff"] = mid.diff()
    out["mid_return"] = out["mid_diff"] / mid.shift(1)

    # Change in total bid size minus change in total ask size. The depth
    # totals are intermediates only and are not part of the output.
    out["OFI"] = bid_depth.diff() - ask_depth.diff()

    # --- rolling statistics (window lengths are row counts) ---
    for column, window in (("mv_1s", 1000), ("mv_5s", 5000)):
        out[column] = mid.rolling(window, min_periods=1).mean()
    returns = out["mid_return"]
    for column, window in (("vol_10", 10), ("vol_100", 100), ("vol_1s", 1000), ("vol_5s", 5000)):
        out[column] = returns.rolling(window, min_periods=1).std()

    # --- RSI-style oscillator on the microprice, 14-row window ---
    micro_step = out["microprice"].diff()
    avg_gain = (micro_step.where(micro_step > 0, 0)).rolling(14, min_periods=1).mean()
    avg_loss = (-micro_step.where(micro_step < 0, 0)).rolling(14, min_periods=1).mean()
    strength = avg_gain / (avg_loss + 1e-10)
    out["rsi_14"] = 100 - (100 / (1 + strength))

    # --- exponential moving averages of the mid price ---
    out["ema_fast"] = mid.ewm(span=12, adjust=False).mean()
    out["ema_slow"] = mid.ewm(span=26, adjust=False).mean()
    out["ema_diff"] = out["ema_fast"] - out["ema_slow"]

    # Fill gaps, turn infinities into gaps, then fill once more.
    out = out.ffill().fillna(0)
    out = out.replace([np.inf, -np.inf], np.nan)
    return out.ffill().fillna(0)

def add_labels(features, horizon=23):
    """Attach a three-class ``target`` column based on the future microprice.

    The microprice ``horizon`` rows ahead is compared with the current one:
    2 when it is higher, 0 when it is lower, 1 otherwise. Rows in the last
    ``horizon`` positions have no future value, so both comparisons are
    False and they keep the default label 1.

    ``features`` is modified in place and also returned.
    """
    # Stage the shifted series in a scratch column, then take it back out.
    features['future_price'] = features['microprice'].shift(-horizon)
    move = features.pop('future_price') - features['microprice']

    features['target'] = 1
    for label, mask in ((2, move > 0), (0, move < 0)):
        features.loc[mask, 'target'] = label
    return features

# ============================================================
# LOAD SCALER AND FEATURES
# ============================================================
print("Loading scaler and selected features...")
# Use a context manager so the file handle is closed as soon as the list is
# read, instead of being left open until garbage collection.
# NOTE(security): pickle.load executes whatever the file encodes; only point
# this at a file you produced yourself.
with open("C:/Users/wdkal/Downloads/IE421_XGBOOST_DATA/selected_features.pkl", "rb") as features_file:
    selected_features = pickle.load(features_file)
print(f"✓ Using {len(selected_features)} selected features")

class IdentityScaler:
    """Placeholder scaler whose ``transform`` applies no scaling."""

    def transform(self, X):
        """Return ``X.values`` for a DataFrame, otherwise ``X`` unchanged."""
        if isinstance(X, pd.DataFrame):
            return X.values
        return X


scaler = IdentityScaler()
print("⚠️ Using identity scaler (no normalization)\n")

# ============================================================
# PREPROCESS ONE DAY
# ============================================================
def preprocess_day(date):
    """Load one day of book updates and return model inputs and labels.

    Args:
        date: Day identifier used in the input file name
            (``<date>_book_updates.csv.gz`` under ``RAW_DATA_PATH``).

    Returns:
        ``(X_scaled, y)`` where ``X_scaled`` is a DataFrame with the
        ``selected_features`` columns and a fresh 0..n-1 index, and ``y`` is
        a NumPy array of the matching ``target`` labels.

    Raises:
        FileNotFoundError: If the day's file does not exist.
    """
    # Look-ahead (in rows) used to build the labels.
    horizon = 23

    file_path = RAW_DATA_PATH / f'{date}_book_updates.csv.gz'
    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    df = pd.read_csv(file_path, compression='gzip')
    df['date'] = date
    # between_time needs a DatetimeIndex, so index by timestamp for the
    # filter and then restore a positional index.
    df['COLLECTION_TIME'] = pd.to_datetime(df['COLLECTION_TIME'])
    df = df.set_index('COLLECTION_TIME')
    df = df.between_time("14:30", "21:00")
    df = df.reset_index()

    features = add_all_features(df)
    features = add_labels(features, horizon=horizon)

    X = features[selected_features]
    y = features['target']

    X_scaled = pd.DataFrame(scaler.transform(X), columns=selected_features)

    # The final `horizon` rows have no future microprice to compare against.
    # add_labels leaves them at the neutral default rather than NaN, so the
    # isna() check alone would keep them; exclude them by position so that
    # fabricated labels never reach the accuracy figures or the saved CSV.
    n_rows = len(features)
    has_future = np.arange(n_rows) < n_rows - horizon

    valid_idx = ~X_scaled.isna().any(axis=1) & ~y.isna() & has_future
    X_scaled = X_scaled[valid_idx].reset_index(drop=True)
    y = y[valid_idx].values

    return X_scaled, y

# ============================================================
# GENERATE PREDICTIONS WITH BATCHING
# ============================================================
def _stack_windows(X_array, seq_len, start, end):
    """Stack the windows ``X_array[k:k + seq_len]`` for ``k`` in ``[start, end)``.

    Returns a float32 array of shape ``(end - start, seq_len, n_features)``.
    """
    return np.array(
        [X_array[k:k + seq_len] for k in range(start, end)],
        dtype=np.float32,
    )


def generate_predictions_for_day(date, xgb_model, lstm_model, tcn_model, transformer_model):
    """Run all four models on one day and save their aligned predictions.

    The sequence models predict event ``i`` from the ``seq_len`` rows that
    precede it (``X[i - seq_len:i]``); XGBoost predicts event ``i`` from row
    ``i``. Because the models use different window lengths, each model's
    output is offset so that row ``k`` of the result always refers to event
    ``start_idx + k``, where ``start_idx`` is the longest window length.

    Args:
        date: Day identifier passed to ``preprocess_day`` and used in the
            output file name.
        xgb_model: Loaded ``xgb.Booster``.
        lstm_model, tcn_model, transformer_model: Loaded torch modules in
            eval mode.

    Returns:
        DataFrame with the true label (``actual``) and, per model, the
        predicted class and the three class probabilities. The same frame is
        written to ``OUTPUT_PATH / predictions_<date>.csv``.

    Raises:
        ValueError: If the day has too few events to build a single full
            window, or if the XGBoost output cannot be mapped to one row of
            three class scores per event.
    """
    import time
    import gc

    # Window length (in rows) each sequence model is fed.
    seq_len_lstm = 100
    seq_len_tcn = 30
    seq_len_transformer = 100
    # First event that every model can predict.
    start_idx = max(seq_len_lstm, seq_len_tcn, seq_len_transformer)

    print(f"\nProcessing {date}...")

    t0 = time.time()
    X, y = preprocess_day(date)
    print(f"  Events: {len(X):,} (preprocessing: {time.time()-t0:.1f}s)")

    n_events = len(X)
    if n_events <= start_idx:
        # Fail with a clear message instead of an IndexError further down.
        raise ValueError(
            f"Not enough events for {date}: got {n_events}, "
            f"need more than {start_idx} to build one full window"
        )

    # 1. XGBoost
    t0 = time.time()
    dmatrix = xgb.DMatrix(X)
    xgb_probs = xgb_model.predict(dmatrix)

    if xgb_probs.ndim == 1:
        print(f"    ⚠️ XGBoost output is 1D (shape={xgb_probs.shape}), reshaping...")
        # Decide by size, not by whether reshape happens to succeed: a vector
        # of class labels whose length is a multiple of 3 would otherwise be
        # silently folded into n/3 bogus probability rows.
        if xgb_probs.size == 3 * n_events:
            # Flattened per-class probabilities.
            xgb_probs = xgb_probs.reshape(n_events, 3)
        elif xgb_probs.size == n_events:
            # One class label per event -> one-hot "probabilities".
            xgb_probs_2d = np.zeros((n_events, 3))
            xgb_probs_2d[np.arange(n_events), xgb_probs.astype(int)] = 1.0
            xgb_probs = xgb_probs_2d
        else:
            raise ValueError(
                f"Unexpected XGBoost output size {xgb_probs.size} for {n_events} events"
            )

    print(f"  XGBoost done: {time.time()-t0:.1f}s (shape={xgb_probs.shape})")

    # 2-4. Neural networks with batching
    X_array = X.values
    batch_size = 10000

    # LSTM
    print(f"  Starting LSTM (batch_size={batch_size})...")
    t0 = time.time()
    lstm_probs = []

    num_sequences = len(X_array) - seq_len_lstm
    num_batches = (num_sequences + batch_size - 1) // batch_size
    print(f"    Processing {num_sequences:,} sequences in {num_batches} batches...")

    for batch_idx, start in enumerate(range(0, num_sequences, batch_size)):
        if batch_idx % 10 == 0:
            print(f"    LSTM batch {batch_idx+1}/{num_batches} ({100*batch_idx/num_batches:.0f}%)")

        end = min(start + batch_size, num_sequences)
        X_tensor = torch.FloatTensor(_stack_windows(X_array, seq_len_lstm, start, end))
        with torch.no_grad():
            output = lstm_model(X_tensor)
            lstm_probs.append(torch.softmax(output, dim=1).numpy())

    lstm_probs = np.vstack(lstm_probs)
    print(f"  LSTM done: {time.time()-t0:.1f}s")

    # TCN
    print(f"  Starting TCN...")
    t0 = time.time()
    tcn_probs = []

    num_sequences = len(X_array) - seq_len_tcn
    num_batches = (num_sequences + batch_size - 1) // batch_size

    for batch_idx, start in enumerate(range(0, num_sequences, batch_size)):
        if batch_idx % 10 == 0:
            print(f"    TCN batch {batch_idx+1}/{num_batches} ({100*batch_idx/num_batches:.0f}%)")

        end = min(start + batch_size, num_sequences)
        # Conv1d expects (batch, channels, time), hence the transpose.
        X_tensor = torch.FloatTensor(
            _stack_windows(X_array, seq_len_tcn, start, end)
        ).transpose(1, 2)
        with torch.no_grad():
            output = tcn_model(X_tensor)
            tcn_probs.append(torch.softmax(output, dim=1).numpy())

    tcn_probs = np.vstack(tcn_probs)
    print(f"  TCN done: {time.time()-t0:.1f}s")

    # Transformer (smaller batches; intermediates are released every batch)
    print(f"  Starting Transformer...")
    t0 = time.time()
    transformer_probs = []

    transformer_batch_size = 1000
    num_sequences = len(X_array) - seq_len_transformer
    num_batches = (num_sequences + transformer_batch_size - 1) // transformer_batch_size

    print(f"    Processing {num_sequences:,} sequences in {num_batches} batches (batch_size={transformer_batch_size})...")

    for batch_idx, start in enumerate(range(0, num_sequences, transformer_batch_size)):
        if batch_idx % 100 == 0 and batch_idx > 0:
            elapsed = time.time() - t0
            rate = batch_idx / elapsed
            eta = (num_batches - batch_idx) / rate
            pct_complete = 100 * batch_idx / num_batches
            print(f"    Progress: {batch_idx}/{num_batches} ({pct_complete:.0f}%) - ETA: {eta/60:.1f} min")

        end = min(start + transformer_batch_size, num_sequences)
        X_batch = _stack_windows(X_array, seq_len_transformer, start, end)

        X_tensor = torch.FloatTensor(X_batch)
        with torch.no_grad():
            output = transformer_model(X_tensor)
            probs = torch.softmax(output, dim=1).numpy()
            transformer_probs.append(probs)

        del X_batch, X_tensor, output, probs

        if batch_idx % 10 == 0:
            gc.collect()

    transformer_probs = np.vstack(transformer_probs)
    print(f"  Transformer done: {time.time()-t0:.1f}s ({len(transformer_probs):,} predictions)")

    # Align predictions.
    # A sequence model's j-th output covers rows j..j+seq_len-1 and therefore
    # belongs to event j + seq_len. To land on event start_idx + k, each
    # model must be read from offset (start_idx - seq_len). Without this the
    # 30-row TCN output would be paired with labels 70 events too late.
    lstm_off = start_idx - seq_len_lstm
    tcn_off = start_idx - seq_len_tcn
    transformer_off = start_idx - seq_len_transformer
    y_seq = y[start_idx:]

    min_len = min(
        len(xgb_probs) - start_idx,
        len(lstm_probs) - lstm_off,
        len(tcn_probs) - tcn_off,
        len(transformer_probs) - transformer_off,
    )

    print(f"\n  Alignment summary:")
    print(f"    XGBoost: {len(xgb_probs):,} predictions (using {start_idx}:{start_idx+min_len})")
    print(f"    LSTM: {len(lstm_probs):,} predictions (using {lstm_off}:{lstm_off+min_len})")
    print(f"    TCN: {len(tcn_probs):,} predictions (using {tcn_off}:{tcn_off+min_len})")
    print(f"    Transformer: {len(transformer_probs):,} predictions (using {transformer_off}:{transformer_off+min_len})")
    print(f"    Final aligned length: {min_len:,}")

    aligned = {
        'xgb': xgb_probs[start_idx:start_idx + min_len],
        'lstm': lstm_probs[lstm_off:lstm_off + min_len],
        'tcn': tcn_probs[tcn_off:tcn_off + min_len],
        'transformer': transformer_probs[transformer_off:transformer_off + min_len],
    }

    # Column order: actual, then pred/down/neutral/up for each model in turn.
    columns = {'actual': y_seq[:min_len]}
    for model_name, model_probs in aligned.items():
        columns[f'{model_name}_pred'] = model_probs.argmax(axis=1)
        columns[f'{model_name}_prob_down'] = model_probs[:, 0]
        columns[f'{model_name}_prob_neutral'] = model_probs[:, 1]
        columns[f'{model_name}_prob_up'] = model_probs[:, 2]
    predictions_df = pd.DataFrame(columns)

    output_file = OUTPUT_PATH / f"predictions_{date}.csv"
    predictions_df.to_csv(output_file, index=False)

    print(f"\n  Accuracies:")
    for model_name in ['xgb', 'lstm', 'tcn', 'transformer']:
        acc = (predictions_df[f'{model_name}_pred'] == predictions_df['actual']).mean()
        print(f"    {model_name.upper():12s}: {acc:.3f}")
    print(f"  ✓ Saved {output_file}")

    return predictions_df

# ============================================================
# GENERATE PREDICTIONS FOR ALL DAYS
# ============================================================
import gc
import traceback

print("="*60)
print("GENERATING PREDICTIONS ON DAYS 6-15")
print("="*60)
print(f"Processing {len(NEW_DAYS)} days: {NEW_DAYS[0]} to {NEW_DAYS[-1]}\n")

# Per-day result frames keyed by date, plus success/failure tallies.
all_predictions = {}
successful = 0
failed = 0

for date in NEW_DAYS:
    # Collect garbage before and after each day to keep peak memory down.
    gc.collect()
    try:
        all_predictions[date] = generate_predictions_for_day(
            date, xgb_model, lstm_model, tcn_model, transformer_model
        )
    except Exception as e:
        # One bad day must not stop the run: report it and move on.
        print(f"\n✗ ERROR processing {date}: {e}")
        traceback.print_exc()
        failed += 1
    else:
        successful += 1
    finally:
        gc.collect()

print("\n" + "="*60)
print("PREDICTION GENERATION COMPLETE!")
print("="*60)
print(f"Successfully processed: {successful}/{len(NEW_DAYS)} days")
print(f"Failed: {failed}/{len(NEW_DAYS)} days")
print(f"Output directory: {OUTPUT_PATH}")

if successful > 0:
    print("\n✅ Ready for Step 2: Combining predictions for RL training!")

Test 1: Imports...
