# Foreign Market Lead-Lag ML Strategy
## Notebook 2: Feature Engineering

This notebook creates and analyzes predictive features:
- Create lagged weekly return features (188 features: 47 markets × 4 lags)
- Apply cross-sectional standardization (run after winsorization; see Sections 4–5)
- Winsorize extreme values (run first, on the raw lagged features)
- Align features with target returns
- Analyze feature distributions and correlations

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
import warnings
warnings.filterwarnings('ignore')

from feature_engineering import FeatureEngineering

plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

%matplotlib inline
%load_ext autoreload
%autoreload 2

## 1. Load Data and Configuration

In [None]:
# Read the strategy configuration (YAML) that lives one directory up.
with open('../config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# Both return panels are read the same way: the first CSV column becomes the
# index and is parsed as dates.
csv_options = dict(index_col=0, parse_dates=True)
sp500_returns = pd.read_csv('../data/sp500_daily_returns.csv', **csv_options)
foreign_returns = pd.read_csv('../data/foreign_weekly_returns.csv', **csv_options)

# Report the dimensions of each panel as a quick sanity check.
print(f"S&P 500 returns: {sp500_returns.shape}")
print(f"Foreign returns: {foreign_returns.shape}")

## 2. Create Lagged Features

In [None]:
# Build the project's feature-engineering helper from the loaded configuration.
feature_eng = FeatureEngineering(config)

# Derive the lagged feature matrix from the foreign-market returns.
lagged_features = feature_eng.create_lagged_features(foreign_returns)

# Summarize the result: dimensions, column count, and a peek at the column names.
feature_names = lagged_features.columns
print(f"\nLagged features shape: {lagged_features.shape}")
print(f"Number of features: {len(feature_names)}")
print(f"\nFirst 10 feature names:")
print(feature_names[:10].tolist())

## 3. Feature Distribution Before Standardization

In [None]:
# Plot the distribution of a few raw (lagged, not yet winsorized/standardized) features.
N_SAMPLE_FEATURES = 4
SAMPLE_SEED = 42  # fixed so Restart & Run All always plots the same features

# Draw the sample from a locally seeded generator rather than NumPy's global
# state: the selection is then reproducible and independent of whatever other
# cells have consumed random numbers before this one.
rng = np.random.default_rng(SAMPLE_SEED)
# Cap the sample size so sampling without replacement cannot fail when the
# frame has fewer columns than panels.
n_to_plot = min(N_SAMPLE_FEATURES, len(lagged_features.columns))
sample_features = rng.choice(lagged_features.columns, n_to_plot, replace=False)

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
panels = axes.ravel()  # row-major order: top-left, top-right, bottom-left, bottom-right

for ax, feature in zip(panels, sample_features):
    lagged_features[feature].dropna().hist(bins=50, ax=ax, edgecolor='black', alpha=0.7)
    ax.set_title(f'Distribution: {feature}', fontweight='bold')
    ax.set_xlabel('Return')
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)

# Hide any panel left without data when fewer than four features were available.
for ax in panels[len(sample_features):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

# Summary statistics
print("\nFeature Statistics (Before Standardization):")
print(lagged_features.describe())

## 4. Apply Winsorization

In [None]:
# Run the helper's winsorization step on the raw lagged features.
winsorized_features = feature_eng.winsorize_features(lagged_features)

# Use the first feature column as the example for the before/after comparison.
sample_feature = lagged_features.columns[0]

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# One entry per panel: (axis, source frame, bar color, panel title).
comparison_panels = [
    (axes[0], lagged_features, 'steelblue', f'Before Winsorization: {sample_feature}'),
    (axes[1], winsorized_features, 'coral', f'After Winsorization: {sample_feature}'),
]

for ax, frame, bar_color, panel_title in comparison_panels:
    ax.hist(frame[sample_feature].dropna(), bins=50,
            edgecolor='black', alpha=0.7, color=bar_color)
    ax.set_title(panel_title, fontweight='bold')
    ax.set_xlabel('Return')
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 5. Apply Cross-Sectional Standardization

In [None]:
# Run the helper's standardization step on the winsorized features.
standardized_features = feature_eng.standardize_features(winsorized_features)

print("\nFeature Statistics (After Standardization):")
print(standardized_features.describe())

# Positional row (one date) whose values across all feature columns are plotted.
SAMPLE_ROW = 100

# Clamp the row position to what is actually available: a hard-coded
# `.iloc[100]` raises IndexError on any frame with 100 rows or fewer.
n_rows = min(len(winsorized_features), len(standardized_features))
if n_rows == 0:
    raise ValueError("No feature rows available to plot a cross-sectional distribution")
row_pos = min(SAMPLE_ROW, n_rows - 1)

# Plot comparison
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# One entry per panel: (axis, row to plot, bar color, title, x-axis label).
cross_section_panels = [
    (axes[0], winsorized_features.iloc[row_pos], 'steelblue',
     'Cross-Sectional Distribution (Before)', 'Return'),
    (axes[1], standardized_features.iloc[row_pos], 'coral',
     'Cross-Sectional Distribution (After)', 'Standardized Return'),
]

for ax, row_values, bar_color, panel_title, x_label in cross_section_panels:
    row_values.plot(kind='hist', bins=50, ax=ax,
                    edgecolor='black', alpha=0.7, color=bar_color)
    ax.set_title(panel_title, fontweight='bold')
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Prepare Training Data for Sample Stock

In [None]:
# Use the first column of the S&P 500 panel as the example stock.
sample_stock = sp500_returns.columns[0]
print(f"Sample stock: {sample_stock}")

# Ask the helper for the feature matrix (X) and target (y) of that stock.
X, y = feature_eng.prepare_training_data(
    foreign_returns,
    sp500_returns,
    sample_stock,
)

# Report the dimensions of both objects and the span of the feature index.
print(f"\nTraining data shape:")
print(f"  Features (X): {X.shape}")
print(f"  Target (y): {y.shape}")
print(f"\nDate range: {X.index[0]} to {X.index[-1]}")

## 7. Feature Correlation Analysis

In [None]:
# Correlate every feature column with the target and rank by magnitude.
# Sorting on the signed value would list only the most positive correlations
# and silently drop strongly negative ones, which are just as informative.
# The sign is kept in the values; only the ordering uses |corr|.
feature_target_corr = X.corrwith(y).sort_values(key=np.abs, ascending=False)

# Plot the 20 features with the largest absolute correlation (signed bars).
fig, ax = plt.subplots(figsize=(12, 8))
feature_target_corr.head(20).plot(kind='barh', ax=ax, color='steelblue')
ax.invert_yaxis()  # barh draws the first row at the bottom; put the strongest on top
ax.set_title(f'Top 20 Features Correlated with {sample_stock} Returns', 
            fontsize=14, fontweight='bold')
ax.set_xlabel('Correlation')
ax.axvline(x=0, color='black', linestyle='--', alpha=0.5)  # separates negative from positive
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("\nTop 10 Most Correlated Features:")
for feature, corr in feature_target_corr.head(10).items():
    print(f"  {feature}: {corr:.4f}")

## 8. Feature Importance by Lag

In [None]:
# Average absolute feature/target correlation, grouped by lag.
lag_correlations = {}

for lag in config['features']['lags']:
    # Match the "_lag<lag>" tag only when no further digit follows. A plain
    # substring test would let lag 1 also collect the columns of lag 10, 11, ...
    is_lag_column = X.columns.str.contains(rf'_lag{lag}(?!\d)', regex=True)
    lag_features = X.columns[is_lag_column].tolist()
    # Mean of |corr| over this lag's columns (NaN if no column matched).
    lag_corr = X[lag_features].corrwith(y).abs().mean()
    lag_correlations[f'Lag {lag}'] = lag_corr

# Plot
fig, ax = plt.subplots(figsize=(10, 6))
pd.Series(lag_correlations).plot(kind='bar', ax=ax, color='steelblue', edgecolor='black')
ax.set_title(f'Average Absolute Correlation by Lag ({sample_stock})', 
            fontsize=14, fontweight='bold')
ax.set_xlabel('Lag')
ax.set_ylabel('Avg |Correlation|')
ax.grid(True, alpha=0.3)
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

print("\nAverage Correlation by Lag:")
for lag, corr in lag_correlations.items():
    print(f"  {lag}: {corr:.4f}")

## 9. Prepare Data for All Stocks

In [None]:
# Build the per-stock datasets in one call (this may take a few minutes).
print("Preparing data for all stocks...")
stock_data = feature_eng.prepare_all_stocks(foreign_returns, sp500_returns)

print(f"\nPrepared data for {len(stock_data)} stocks")

# Number of rows in each stock's feature matrix; each value of `stock_data`
# unpacks into a (features, target) pair.
sample_sizes = {
    ticker: len(features)
    for ticker, (features, _target) in stock_data.items()
}
sample_sizes_series = pd.Series(sample_sizes)

# Print the headline statistics in a fixed order, rounded to whole samples.
print(f"\nSample Size Statistics:")
size_statistics = (
    ("Mean", sample_sizes_series.mean),
    ("Median", sample_sizes_series.median),
    ("Min", sample_sizes_series.min),
    ("Max", sample_sizes_series.max),
)
for stat_label, compute_stat in size_statistics:
    print(f"  {stat_label}: {compute_stat():.0f}")

# Histogram of how many samples each stock ended up with.
fig, ax = plt.subplots(figsize=(12, 6))
sample_sizes_series.hist(bins=50, ax=ax, edgecolor='black', alpha=0.7, color='steelblue')
ax.set_title('Distribution of Sample Sizes Across Stocks', fontsize=14, fontweight='bold')
ax.set_xlabel('Number of Samples')
ax.set_ylabel('Number of Stocks')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## Summary

This notebook created and analyzed predictive features:
- Created 188 lagged features (47 markets × 4 lags)
- Applied winsorization to handle extreme values
- Applied cross-sectional standardization to prevent volatility bias
- Aligned features with target returns
- Analyzed feature correlations and importance

**Key Findings**:
- Features show varying correlations with target returns
- Different lags capture different predictive information
- Cross-sectional standardization ensures fair comparison across markets

**Next Steps**: Proceed to Notebook 3 for model training and validation.