In [11]:
#TAKE GOLD TIER PAIRS AND CONVERT THEIR DATA INTO ML-READY FEATURES

import pandas as pd
import numpy as np
import os

# INPUT / OUTPUT LOCATIONS
raw_dir = '../data/raw'
output_path = '../data/processed/04_ml_ready_features.csv'
# Input: the filtered Gold Tier file holding the top tier pairs
gold_pairs_path = '../data/processed/03_gold_tier_pairs.csv'

# 1. READ THE GOLD TIER PAIRS
print(f"Loading {gold_pairs_path}...")
gold_pairs = pd.read_csv(gold_pairs_path)

# Accumulates one feature frame per pair; concatenated once at the end
all_pairs_data = list()

# 2. ITERATE BY SECTOR TO RETRIEVE GOLD TIER DATA
# Bucketing the pairs by sector means each sector's price file
# only has to be read from disk a single time.
grouped = gold_pairs.groupby(by='Sector')

def _build_pair_features(spread, window):
    """Turn one pair's spread series into a feature/target frame.

    Args:
        spread: Spread series of a single pair (Stock1 minus hedge ratio
            times Stock2), indexed like the price data.
        window: Number of rows in the rolling lookback.

    Returns:
        DataFrame on the spread's index with columns ``Spread``,
        ``Z_Score``, ``Volatility``, ``Target_Return`` and
        ``Target_Direction``. Rows containing NaN (rolling warm-up, the
        final row, flat windows) are NOT dropped here; the caller does it.
    """
    df = pd.DataFrame(index=spread.index)
    df['Spread'] = spread

    rolling_mean = spread.rolling(window).mean()
    rolling_std = spread.rolling(window).std()

    # (A) Z-Score: Normalizing the spread for mean-reversion detection.
    # A flat window has zero std; dividing by it would yield +/-inf, which
    # dropna() does not remove. Mask zeros to NaN so those rows are dropped.
    df['Z_Score'] = (spread - rolling_mean) / rolling_std.replace(0, np.nan)

    # (B) Volatility: Captures market regime/risk
    df['Volatility'] = rolling_std

    # Target Generation (Look-ahead)
    # Next-day change of the spread for Ridge (Regression)
    df['Target_Return'] = spread.shift(-1) - spread
    # Binary direction for MLP/LSTM (Classification); the last row has a
    # NaN target, maps to 0 here, and is removed by the caller's dropna().
    df['Target_Direction'] = (df['Target_Return'] > 0).astype(int)

    return df


window = 20  # Standard lookback window; identical for every pair

for sector_name, group in grouped:
    print(f"\n--- Processing Sector: {sector_name} ({len(group)} pairs) ---")

    # Price data is loaded once per sector and shared by all of its pairs
    price_path = os.path.join(raw_dir, f"{sector_name}_prices.csv")
    try:
        prices = pd.read_csv(price_path, index_col=0, parse_dates=True)
    except FileNotFoundError:
        print(f"   Missing price file for {sector_name}. Skipping.")
        continue

    for _, row in group.iterrows():
        s1, s2 = row['Stock1'], row['Stock2']
        pair_name = f"{s1}-{s2}"

        # A. Construction using validated Hedge Ratio
        try:
            # Keep only the dates on which both legs have a price
            df_pair = prices[[s1, s2]].dropna()
            # Crucial: Use the beta already validated in your tradability analysis
            hedge_ratio = row['Hedge_Ratio']
        except KeyError as err:
            # Still best-effort, but report what was missing instead of
            # dropping the pair silently.
            print(f"   Skipping {pair_name}: missing {err}.")
            continue
        spread = df_pair[s1] - (hedge_ratio * df_pair[s2])

        # B./C. Features and look-ahead targets
        df = _build_pair_features(spread, window)

        # Metadata
        df['Pair_ID'] = pair_name
        df['Sector'] = sector_name

        df = df.dropna()
        if df.empty:
            # Too little overlapping history to fill one rolling window
            print(f"   Skipping {pair_name}: no usable rows after cleaning.")
            continue

        all_pairs_data.append(df)

# 3. COMBINE & SAVE
if all_pairs_data:
    # Stack the per-pair frames; the price-date index is written as the
    # first CSV column.
    ml_dataset = pd.concat(all_pairs_data)
    ml_dataset.to_csv(output_path)
    print(f"\n✅ SUCCESS! Generated dataset with {ml_dataset['Pair_ID'].nunique()} pairs.")
    print(f"Total Rows: {len(ml_dataset)}")
else:
    # Every sector/pair was skipped; report it instead of ending silently
    # so a stale or missing output file is not mistaken for a fresh one.
    print(f"\nNo pair data was generated; nothing written to {output_path}.")

Loading ../data/processed/03_gold_tier_pairs.csv...

--- Processing Sector: financials (1 pairs) ---

--- Processing Sector: tech (3 pairs) ---

--- Processing Sector: utilities (1 pairs) ---

✅ SUCCESS! Generated dataset with 5 pairs.
Total Rows: 5223
