In [2]:
# Transform raw prices into a dataset ready for Ridge/MLP/LSTM

import pandas as pd
import numpy as np
import statsmodels.api as sm
import os

# Define paths
raw_dir = '../data/raw'  # per-sector price files ("<Sector>_prices.csv") are read from here
pairs_path = '../data/processed/valid_pairs.csv' # Or 'valid_pairs_with_metrics.csv'
output_path = '../data/processed/lstm_ready_data_multi.csv'

# 1. LOAD PAIRS
print(f"Loading pairs from {pairs_path}...")
valid_pairs = pd.read_csv(pairs_path)

# The sector name selects which price file to open, so it is mandatory.
# Abort via SystemExit rather than exit(): exit() is an interactive-shell helper
# and, called without arguments, reports success (status 0) even on this error path.
if 'Sector' not in valid_pairs.columns:
    raise SystemExit("Error: 'Sector' column missing in valid_pairs.csv. Cannot load correct prices.")

# The per-pair loop reads the two tickers from 'Stock1' and 'Stock2'; fail early
# with a clear message instead of a bare KeyError part-way through processing.
missing_cols = [col for col in ('Stock1', 'Stock2') if col not in valid_pairs.columns]
if missing_cols:
    raise SystemExit(f"Error: required column(s) {missing_cols} missing in valid_pairs.csv.")

print(f"Found {len(valid_pairs)} pairs across {valid_pairs['Sector'].nunique()} sectors.")

# One feature DataFrame per successfully processed pair is appended here.
all_pairs_data = []

# 2. ITERATE BY SECTOR (Efficiency Boost)
# Each sector's price file is loaded once and reused for every pair in that
# sector, instead of re-reading prices for each individual pair.
grouped = valid_pairs.groupby('Sector')

window = 20        # number of rows used for the rolling mean / std
min_history = 200  # minimum number of aligned (non-NaN) rows a pair needs

for sector_name, group in grouped:
    print(f"\n--- Processing Sector: {sector_name} ({len(group)} pairs) ---")

    # Dynamic Price Loading: the file name is derived from the sector name.
    price_file = f"{sector_name}_prices.csv"
    price_path = os.path.join(raw_dir, price_file)

    try:
        prices = pd.read_csv(price_path, index_col=0, parse_dates=True)
    except FileNotFoundError:
        print(f"   Warning: Could not find price file {price_path}. Skipping sector.")
        continue

    # Process each pair in this sector
    for _, row in group.iterrows():
        # Stock1 is the dependent series (Y), Stock2 the regressor (X).
        stock_y = row['Stock1']
        stock_x = row['Stock2']
        pair_name = f"{stock_y}-{stock_x}"

        # A. Get Prices & Clean
        try:
            Y = prices[stock_y]
            X = prices[stock_x]
        except KeyError:
            print(f"   Skipping {pair_name} (Ticker missing in price file)")
            continue

        # Align dates: keep only rows where both series have a value.
        df_pair = pd.concat([Y, X], axis=1).dropna()
        if len(df_pair) < min_history:
            # Report the skip so dropped pairs are visible in the run log.
            print(f"   Skipping {pair_name} (only {len(df_pair)} aligned rows, need {min_history})")
            continue

        Y, X = df_pair.iloc[:, 0], df_pair.iloc[:, 1]

        # B. Calculate Hedge Ratio (static OLS of Y on X with an intercept)
        # NOTE(review): the regression is fitted on the pair's full aligned
        # history, so the spread on any given row depends on prices that come
        # after it. If the dataset is later split by time this is look-ahead;
        # a rolling/expanding fit would avoid it.
        X_const = sm.add_constant(X)
        model = sm.OLS(Y, X_const).fit()

        # Take the slope by position to avoid depending on the parameter name.
        # Fewer than two parameters means no separate slope was estimated
        # (presumably add_constant skipped the intercept because X is itself
        # constant — confirm against statsmodels docs).
        if len(model.params) < 2:
            print(f"   Skipping {pair_name} (no slope estimated for hedge ratio)")
            continue
        hedge_ratio = model.params.iloc[1]

        # C. Create Spread
        spread = Y - (hedge_ratio * X)

        # D. Feature Engineering
        df = pd.DataFrame(index=spread.index)
        df['Spread'] = spread

        rolling_mean = df['Spread'].rolling(window).mean()
        rolling_std = df['Spread'].rolling(window).std()

        # Z-Score. A flat window has zero std; dividing by it would yield
        # +/-inf, which dropna() below does NOT remove. Map zero std to NaN so
        # those rows are dropped instead of leaking inf into the dataset.
        df['Z_Score'] = (df['Spread'] - rolling_mean) / rolling_std.replace(0, np.nan)

        # Volatility: the same rolling std used for the Z-Score (unmodified).
        df['Volatility'] = rolling_std

        # Momentum: absolute change in the spread over 5 rows (a difference,
        # not a percentage rate of change).
        df['Momentum'] = df['Spread'].diff(5)

        # TARGET GENERATION (The Answer Key)
        # Future_Return is the next row's spread minus the current one;
        # Target_Direction is 1 when that change is strictly positive, else 0.
        # The final row has no next value, so its Future_Return is NaN and the
        # row is removed by dropna() below.
        df['Future_Return'] = df['Spread'].shift(-1) - df['Spread']
        df['Target_Direction'] = (df['Future_Return'] > 0).astype(int)

        # Metadata
        df['Pair_ID'] = pair_name
        df['Sector'] = sector_name

        # Drop rows with NaNs (rolling warm-up, 5-row diff, last-row target,
        # and any zero-std Z-Scores).
        all_pairs_data.append(df.dropna())

# 3. COMBINE & SAVE
# Stack every per-pair frame into a single table and write it to output_path.
# When no pair produced any data, only report that and write nothing.
if not all_pairs_data:
    print("No data generated. Check your input files.")
else:
    giant_dataset = pd.concat(all_pairs_data)

    # Summary of what was built: overall shape and number of distinct pairs.
    print(f"\nSUCCESS! Generated dataset with shape: {giant_dataset.shape}")
    pair_ids = giant_dataset['Pair_ID']
    print(f"Unique Pairs: {pair_ids.nunique()}")

    giant_dataset.to_csv(output_path)
    print(f"Saved to {output_path}")

Loading pairs from ../data/processed/valid_pairs.csv...
Found 121 pairs across 11 sectors.

--- Processing Sector: communication_services (1 pairs) ---

--- Processing Sector: consumer_discretionary (15 pairs) ---

--- Processing Sector: consumer_staples (5 pairs) ---

--- Processing Sector: energy (1 pairs) ---

--- Processing Sector: financials (28 pairs) ---

--- Processing Sector: health_care (3 pairs) ---

--- Processing Sector: industrials (2 pairs) ---

--- Processing Sector: information_technology (16 pairs) ---

--- Processing Sector: materials (2 pairs) ---

--- Processing Sector: tech (30 pairs) ---

--- Processing Sector: utilities (18 pairs) ---

SUCCESS! Generated dataset with shape: (113693, 8)
Unique Pairs: 118
Saved to ../data/processed/lstm_ready_data_multi.csv
