this notebook generates the imput features on which our models will learn

we start with the 2 more obvious features, which are Z-Score (metric that standardize spread relative to its recent mean and volatility) and spread volatlity

More possible feature inputs are computed, organized into 3 groups:
-standardized deviation metrics
-Oscillator and postioning metrics
-volatility and dynamics metrics


All features values are computed for each gold tier pairs choosen in the previous stage


In [1]:
#TAKE GOLD TIER PAIRS AND CONVERT THEIR DATA INTO ML-READY FEATURES

# create many features an test their correlation to the target 

import pandas as pd
import numpy as np
import os

raw_dir = '../data/raw'
gold_pairs_path = '../data/processed/03_gold_tier_pairs.csv' 
output_path = '../data/processed/04_ml_ready_features.csv'

print(f"Loading {gold_pairs_path}...")
gold_pairs = pd.read_csv(gold_pairs_path)

all_pairs_data = []
grouped = gold_pairs.groupby('Sector')

for sector_name, group in grouped:
    print(f"\n--- Processing Sector: {sector_name} ({len(group)} pairs) ---")
    
    price_path = os.path.join(raw_dir, f"{sector_name}_prices.csv")
    try:
        prices = pd.read_csv(price_path, index_col=0, parse_dates=True)
    except FileNotFoundError:
        print(f"   Missing price file for {sector_name}. Skipping.")
        continue

    for _, row in group.iterrows():
        s1, s2 = row['Stock1'], row['Stock2']
        pair_name = f"{s1}-{s2}"
        
        try:
            df_pair = prices[[s1, s2]].dropna()
            hedge_ratio = row['Hedge_Ratio']
            spread = df_pair[s1] - (hedge_ratio * df_pair[s2])
        except KeyError:
            continue

        df = pd.DataFrame(index=spread.index)
        df['Spread'] = spread
        
        window = 20
        
        # MEAN REVERSION FOCUSED FEATURES
        
        rolling_mean = spread.rolling(window).mean()
        rolling_std = spread.rolling(window).std()
        rolling_min = spread.rolling(window).min()
        rolling_max = spread.rolling(window).max()
        
        # 1. Z_Score (original feature for the model, standadrized deviation metric)
        df['Z_Score'] = (spread - rolling_mean) / rolling_std
        
        # 2. Volatility (original feature for the model)
        df['Volatility'] = rolling_std
        
#ADDITIONAL FEATURE FOR MODEL PRECISION ENHANCEMENTS

    ##standarddized deviation metrics

        # 3. EXTREME Z_SCORE INDICATOR (binary: is spread at extreme?)
        # High Z correlates with reversion if mean reversion works
        df['Extreme_Z'] = ((df['Z_Score'].abs() > 1.5).astype(int))
        
        # 4. DISTANCE FROM MEAN (absolute deviation)
        # How far from equilibrium? Bigger distance = stronger reversion pressure
        df['Distance_From_Mean'] = (spread - rolling_mean).abs() / (rolling_std + 1e-6)
        
        # 5. SPREAD ITSELF (normalized)
        # Sometimes raw spread level matters
        df['Spread_Normalized'] = spread / (rolling_std + 1e-6)


    ##Oscillator & positioning metrics

        # 6. RECENT RANGE POSITION (where in recent min/max?)
        # Low = near recent low (might bounce up)
        # High = near recent high (might bounce down)
        df['Range_Position'] = (spread - rolling_min) / (rolling_max - rolling_min + 1e-6)

        # 7. EXTREME BOUNCE INDICATOR
        # Was spread recently extreme? (strong predictor of reversal)
        was_extreme = ((spread.shift(1).abs() > rolling_std.shift(1) * 2).astype(int))
        df['Recent_Extreme'] = was_extreme
        

    ##volatility & dynamics metrics

        # 5. MEAN REVERSION SPEED (how fast should it revert?)
        # Distance * Z_Score direction = strength & direction of reversion
        df['MR_Strength'] = np.sign(rolling_mean - spread) * ((spread - rolling_mean).abs() / (rolling_std + 1e-6))
        
        
        # 8. VOLATILITY EXPANSION (is vol increasing?)
        # High vol might suppress mean reversion
        vol_sma = rolling_std.rolling(10).mean()
        df['Vol_Expansion'] = rolling_std / (vol_sma + 1e-6)
        
        # 9. MEAN REVERSION VELOCITY (speed of change toward mean)
        # How fast is spread moving toward mean?
        days_to_mean_at_current_speed = (spread - rolling_mean) / (spread.diff() + 1e-6)
        df['Speed_To_Mean'] = days_to_mean_at_current_speed.rolling(5).mean()
        
        
        # TARGET OUTPUT GENERATION 
        df['Target_Return'] = spread.shift(-10) - spread
        df['Target_Direction'] = (df['Target_Return'] > 0).astype(int)
        
        # Metadata
        df['Pair_ID'] = pair_name
        df['Sector'] = sector_name
        
        all_pairs_data.append(df.dropna())

# COMBINE & SAVE
if all_pairs_data:
    ml_dataset = pd.concat(all_pairs_data)
    ml_dataset.to_csv(output_path)
    
    print(f"\n‚úÖ SUCCESS! Generated dataset with {ml_dataset['Pair_ID'].nunique()} pairs.")
    print(f"Total Rows: {len(ml_dataset)}")
    
    # Show all generated features
    feature_cols = [col for col in ml_dataset.columns 
                   if col not in ['Spread', 'Target_Return', 'Target_Direction', 'Pair_ID', 'Sector']]
    print(f"\nüìä Features ({len(feature_cols)} total):")
    for i, col in enumerate(feature_cols, 1):
        print(f"   {i:2d}. {col}")
    
    
    
    
else:
    print("‚ùå No pairs found.")


Loading ../data/processed/03_gold_tier_pairs.csv...

--- Processing Sector: financials (3 pairs) ---

--- Processing Sector: industrials (1 pairs) ---

--- Processing Sector: utilities (1 pairs) ---

‚úÖ SUCCESS! Generated dataset with 5 pairs.
Total Rows: 5868

üìä Features (10 total):
    1. Z_Score
    2. Volatility
    3. Extreme_Z
    4. Distance_From_Mean
    5. Spread_Normalized
    6. Range_Position
    7. Recent_Extreme
    8. MR_Strength
    9. Vol_Expansion
   10. Speed_To_Mean


these metrics relevance to output target are tested using the pearson correlation factor
Only feature with significant correlation will be kept as input to feed the leaning models

Since we are trying to predict binary values with LSTM and Ridge Classifier, compute correlation  between target direction and features, instead of target return

In [8]:
import pandas as pd

df = pd.read_csv('../data/processed/04_ml_ready_features.csv', index_col=0, parse_dates=True)
print(df.drop(['Pair_ID', 'Sector'], axis=1).corr()['Target_Direction'].sort_values(ascending=False))

correlations = df.drop(['Pair_ID', 'Sector', 'Target_Return', 'Spread'], axis=1).corr()['Target_Direction']

results_table = pd.DataFrame({
    'Feature': correlations.index,
    'Correlation': correlations.values,
    'Abs_Correlation': correlations.abs().values  
})

results_table = results_table.sort_values(by='Abs_Correlation', ascending=False)

print("\n Feature Correlation Table:")
print(results_table)

# 8. Save to CSV for your report
results_table.to_csv('../data/processed/05_feature_correlations.csv', index=False)
print("\n Table saved to: ../data/processed/05_feature_correlations.csv")

Target_Direction      1.000000
Target_Return         0.635097
MR_Strength           0.138193
Vol_Expansion         0.012738
Speed_To_Mean         0.001621
Spread_Normalized     0.000190
Distance_From_Mean   -0.013124
Volatility           -0.014182
Recent_Extreme       -0.021957
Spread               -0.023312
Extreme_Z            -0.025408
Z_Score              -0.138193
Range_Position       -0.151624
Name: Target_Direction, dtype: float64

 Feature Correlation Table:
               Feature  Correlation  Abs_Correlation
10    Target_Direction     1.000000         1.000000
5       Range_Position    -0.151624         0.151624
7          MR_Strength     0.138193         0.138193
0              Z_Score    -0.138193         0.138193
2            Extreme_Z    -0.025408         0.025408
6       Recent_Extreme    -0.021957         0.021957
1           Volatility    -0.014182         0.014182
3   Distance_From_Mean    -0.013124         0.013124
8        Vol_Expansion     0.012738         0.012738

In [11]:
# 1. Define the paths
source_path = '../data/processed/05_feature_correlations.csv'
output_dir = '../results'
output_path = os.path.join(output_dir, 'feature_correlations.csv')

# 2. Load the CSV 
df_final = pd.read_csv(source_path)

# 4. Save the table to the output folder
df_final.to_csv(output_path, index=False)

print(f"Table successfully saved to: {output_path}")

Table successfully saved to: ../results\feature_correlations.csv


In [75]:
import pandas as pd
df = pd.read_csv('../data/processed/04_ml_ready_features.csv', index_col=0, parse_dates=True)

print(f"Total rows: {len(df)}")
print(f"Unique pairs: {df['Pair_ID'].nunique()}")
print(f"Rows per pair:")
for pair in df['Pair_ID'].unique():
    n = (df['Pair_ID'] == pair).sum()
    print(f"  {pair}: {n}")

Total rows: 5868
Unique pairs: 5
Rows per pair:
  FITB-PNC: 1146
  AIG-CB: 1146
  MS-STT: 1146
  AME-ITW: 1215
  CMS-DUK: 1215
