# Feature Engineering Notebook

This notebook demonstrates feature engineering techniques for financial derivative pricing models.

In [None]:
import sys
import os
sys.path.append('../src')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split

# Import our custom modules
from data.data_generator import FinancialDataGenerator
from data.feature_engineer import FinancialFeatureEngineer
from data.preprocessor import FinancialDataPreprocessor
from utils.visualization import FinancialVisualizer

# Set up plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
%matplotlib inline

## 1. Data Generation and Initial Processing

In [None]:
# Build both synthetic datasets from a single generator created with seed=42
# (presumably so reruns produce the same samples -- confirm in the generator).
generator = FinancialDataGenerator(seed=42)
option_data = generator.generate_option_prices(n_samples=10000)
swaption_data = generator.generate_swaption_data(n_samples=5000)

# Summarise what came back: dimensions first, then the column names.
option_shape, swaption_shape = option_data.shape, swaption_data.shape
print(f"Option data shape: {option_shape}")
print(f"Swaption data shape: {swaption_shape}")

option_columns = list(option_data.columns)
swaption_columns = list(swaption_data.columns)
print(f"\nOption data columns: {option_columns}")
print(f"Swaption data columns: {swaption_columns}")

## 2. Basic Feature Engineering for Options

In [None]:
# Engineer instance reused by the following cells
feature_engineer = FinancialFeatureEngineer()

# Derive the option-specific columns from the raw option data
option_features = feature_engineer.create_option_features(option_data)

# Columns present after engineering that the raw data did not have
new_features = set(option_features.columns).difference(option_data.columns)

original_count = option_data.shape[1]
engineered_count = option_features.shape[1]
print(f"Original features: {original_count}")
print(f"After feature engineering: {engineered_count}")
print(f"\nNew features added:")
for name in sorted(new_features):
    print(f"  - {name}")

In [None]:
# Display summary statistics for the newly engineered columns only.
# `new_features` is a set, and the iteration order of a set of strings can
# differ between interpreter sessions (string hash randomization). Sorting the
# names keeps the column order of the printed table stable from run to run.
print("Feature Statistics:")
feature_stats = option_features[sorted(new_features)].describe()
print(feature_stats)

## 3. Swaption Feature Engineering

In [None]:
# Run the swaption data through the same engineer used for the options
swaption_features = feature_engineer.create_swaption_features(swaption_data)

# Columns that exist only after engineering
new_swaption_features = set(swaption_features.columns).difference(swaption_data.columns)

raw_swaption_count = swaption_data.shape[1]
engineered_swaption_count = swaption_features.shape[1]
print(f"Original swaption features: {raw_swaption_count}")
print(f"After feature engineering: {engineered_swaption_count}")
print(f"\nNew swaption features added:")
for name in sorted(new_swaption_features):
    print(f"  - {name}")

## 4. Polynomial and Interaction Features

In [None]:
# Columns to expand with degree-2 polynomial terms
poly_features = ['volatility', 'time_to_expiry', 'moneyness']
option_with_poly = feature_engineer.add_polynomial_features(
    option_features,
    poly_features,
    degree=2,
)

# Whatever the expansion appended on top of the option feature set
poly_new_features = set(option_with_poly.columns).difference(option_features.columns)

print(f"After polynomial features: {option_with_poly.shape[1]}")
print(f"\nPolynomial features added:")
for name in sorted(poly_new_features):
    print(f"  - {name}")

In [None]:
# Column pairs handed to the engineer's interaction-feature step
interaction_pairs = [
    ('volatility', 'time_to_expiry'),
    ('moneyness', 'volatility'),
    ('time_to_expiry', 'risk_free_rate'),
]

option_with_interactions = feature_engineer.add_interaction_features(
    option_with_poly,
    interaction_pairs,
)

# Columns that were not there before the interaction step
interaction_new_features = set(option_with_interactions.columns).difference(
    option_with_poly.columns
)

print(f"After interaction features: {option_with_interactions.shape[1]}")
print(f"\nInteraction features added:")
for name in sorted(interaction_new_features):
    print(f"  - {name}")

## 5. Feature Scaling and Normalization

In [None]:
# Raw model inputs passed to the engineer with method='standard'
features_to_scale = [
    'spot_price',
    'strike_price',
    'volatility',
    'time_to_expiry',
    'risk_free_rate',
]
scaled_data = feature_engineer.scale_features(
    option_with_interactions,
    features_to_scale,
    method='standard',
)

print(f"After scaling: {scaled_data.shape[1]}")
print(f"\nScaled features:")

# Expected output names follow the "<feature>_scaled" pattern; list only the
# ones that are really present in the result.
scaled_features = [f"{feature}_scaled" for feature in features_to_scale]
present_scaled = [name for name in scaled_features if name in scaled_data.columns]
for name in present_scaled:
    print(f"  - {name}")

In [None]:
# Side-by-side check of the first three features: raw values on the top row,
# their scaled counterparts directly underneath.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Original vs Scaled Feature Distributions', fontsize=16)

for feature, original_ax, scaled_ax in zip(features_to_scale[:3], axes[0], axes[1]):
    # Top row: histogram of the untouched column
    original_ax.hist(option_with_interactions[feature], bins=50, alpha=0.7, label='Original')
    original_ax.set_title(f'Original {feature}')
    original_ax.set_xlabel(feature)
    original_ax.set_ylabel('Frequency')
    original_ax.legend()

    # Bottom row: drawn only when the scaled column exists; otherwise the
    # axes are left empty.
    scaled_feature = f"{feature}_scaled"
    if scaled_feature not in scaled_data.columns:
        continue
    scaled_ax.hist(scaled_data[scaled_feature], bins=50, alpha=0.7, label='Scaled', color='orange')
    scaled_ax.set_title(f'Scaled {feature}')
    scaled_ax.set_xlabel(f'{feature} (scaled)')
    scaled_ax.set_ylabel('Frequency')
    scaled_ax.legend()

plt.tight_layout()
plt.show()

## 6. Dimensionality Reduction with PCA

In [None]:
# Candidate PCA inputs: every numeric column except the "*_scaled" copies.
# NOTE(review): this feeds the unscaled columns to apply_pca; whether it
# standardizes internally is not visible here -- confirm.
numerical_features = [
    column
    for column in scaled_data.select_dtypes(include=[np.number]).columns.tolist()
    if not column.endswith('_scaled')
]

# Keep the two price targets out of the decomposition
target_columns = ['call_price', 'put_price']
features_for_pca = [column for column in numerical_features if column not in target_columns]

print(f"Features for PCA: {len(features_for_pca)}")
print(f"Features: {features_for_pca[:10]}...")  # preview of the first 10 only

# Ask the engineer for 10 components computed from the chosen columns
pca_data = feature_engineer.apply_pca(scaled_data, features_for_pca, n_components=10)

added_components = [col for col in pca_data.columns if col.startswith('pca_component_')]
print(f"\nAfter PCA: {pca_data.shape[1]}")
print(f"PCA components added: {added_components}")

In [None]:
# Names of the component columns (collected here; not referenced again in this cell)
pca_cols = [col for col in pca_data.columns if col.startswith('pca_component_')]

plt.figure(figsize=(15, 6))

# --- Left panel: explained variance per component plus the running total ---
plt.subplot(1, 2, 1)
explained_variance = feature_engineer.pca_models['pca'].explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

# Components are numbered from 1 on the x axis
component_numbers = range(1, len(explained_variance) + 1)
plt.bar(component_numbers, explained_variance, alpha=0.7, label='Individual')
plt.plot(component_numbers, cumulative_variance, 'ro-', label='Cumulative')
plt.title('PCA Explained Variance')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.legend()
plt.grid(True)

# --- Right panel: samples projected on the first two components ---
plt.subplot(1, 2, 2)
first_component = pca_data['pca_component_1']
second_component = pca_data['pca_component_2']
plt.scatter(first_component, second_component, alpha=0.6, s=1)
plt.title('PCA Components Scatter Plot')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.grid(True)

plt.tight_layout()
plt.show()

## 7. Feature Selection

In [None]:
# Prepare data for feature selection: every column except the two price
# columns is a candidate predictor; the call price is the regression target.
X = pca_data.drop(['call_price', 'put_price'], axis=1)
y = pca_data['call_price']

# Remove non-numerical columns before scoring
X_numerical = X.select_dtypes(include=[np.number])

print(f"Features for selection: {X_numerical.shape[1]}")

# Select the top k features. k is clamped to the number of available columns:
# depending on the scikit-learn version, a k larger than the feature count is
# either rejected or only warned about.
n_top_features = min(20, X_numerical.shape[1])
selector = SelectKBest(score_func=f_regression, k=n_top_features)
X_selected = selector.fit_transform(X_numerical, y)

# Get selected feature names (kept in original column order)
selected_mask = selector.get_support()
selected_features = X_numerical.columns[selected_mask].tolist()

# The printed list is numbered like a ranking, so order it by score (highest
# first) rather than by column position.
feature_scores = dict(zip(X_numerical.columns, selector.scores_))
ranked_features = sorted(selected_features, key=feature_scores.__getitem__, reverse=True)

print(f"\nTop {len(ranked_features)} selected features:")
for i, feature in enumerate(ranked_features, 1):
    print(f"{i:2d}. {feature:<30} Score: {feature_scores[feature]:.2f}")

In [None]:
# Visualize feature importance as a horizontal bar chart of F-statistic scores
plt.figure(figsize=(12, 8))

# Get all feature scores
all_scores = selector.scores_
all_features = X_numerical.columns

# Sort by score, highest first. np.argsort places NaN at the end, so the
# [::-1] reversal would move any NaN score (possible on some scikit-learn
# versions, e.g. for constant columns -- confirm) to the *top* of the ranking.
# Ordering on a copy with NaN replaced by -inf keeps such entries last.
sort_keys = np.where(np.isnan(all_scores), -np.inf, all_scores)
sorted_indices = np.argsort(sort_keys)[::-1]
top_features = all_features[sorted_indices][:20]
top_scores = all_scores[sorted_indices][:20]

plt.barh(range(len(top_features)), top_scores)
plt.yticks(range(len(top_features)), top_features)
# barh draws position 0 at the bottom; flip the axis so the best-scoring
# feature is the first bar when reading from the top.
plt.gca().invert_yaxis()
plt.xlabel('F-Statistic Score')
plt.ylabel('Features')
plt.title('Top 20 Feature Importance Scores')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Rolling Statistics and Time-Based Features

In [None]:
# The generated samples carry no timestamps, so attach a synthetic running
# index to a copy of the option features and order the rows by it.
option_with_time = option_features.copy()
row_count = len(option_with_time)
option_with_time['time_index'] = range(row_count)
option_with_time = option_with_time.sort_values(by='time_index')

# Rolling features over three window lengths for the two chosen columns
rolling_columns = ['volatility', 'call_price']
rolling_windows = [5, 10, 20]
rolling_features = feature_engineer.create_rolling_features(
    option_with_time,
    rolling_columns,
    windows=rolling_windows,
)

# Columns that exist only after the rolling step
rolling_new_features = set(rolling_features.columns).difference(option_with_time.columns)

print(f"After rolling features: {rolling_features.shape[1]}")
print(f"\nRolling features added:")
for name in sorted(rolling_new_features):
    print(f"  - {name}")

## 9. Complete Feature Engineering Pipeline

In [None]:
def create_complete_feature_set(data, target_column='call_price', n_pca_components=10,
                                n_select_features=20,
                                leakage_columns=('call_price', 'put_price')):
    """
    Complete feature engineering pipeline.

    Runs, in order: domain-specific feature creation, degree-2 polynomial
    features, interaction features, standard scaling, PCA, and univariate
    feature selection (SelectKBest with f_regression). A progress line is
    printed after each step.

    Args:
        data: Input DataFrame. If it has a 'strike_price' column it is sent
            through the option feature builder; otherwise, if it has a
            'swap_rate' column, through the swaption feature builder. With
            neither column, step 1 is skipped.
        target_column: Name of the column to predict. When it is not present
            in the data, the feature selection step is skipped.
        n_pca_components: Requested number of PCA components. Clamped to the
            number of eligible input columns and to the number of rows.
        n_select_features: Maximum number of features kept by the selection
            step. Clamped to the number of candidate columns.
        leakage_columns: Price columns that must never be used as predictors.
            They are kept out of the PCA inputs and out of the selection
            candidates in addition to target_column, matching the
            step-by-step cells of this notebook, which remove both
            'call_price' and 'put_price'. Pass an empty tuple to exclude only
            target_column.

    Returns:
        Tuple (data, engineer): the processed DataFrame (selected features
        plus the target column when selection ran) and the
        FinancialFeatureEngineer instance that performed the transformations.
    """
    print("Starting complete feature engineering pipeline...")

    # Initialize feature engineer
    engineer = FinancialFeatureEngineer()

    # Columns that may not act as predictors: the target itself plus any other
    # price column, which would otherwise leak the answer into the features.
    excluded = set(leakage_columns or ()) | {target_column}

    # Step 1: Domain-specific features
    if 'strike_price' in data.columns:  # Option data
        data = engineer.create_option_features(data)
        print(f"✓ Created option-specific features: {data.shape[1]} total features")
    elif 'swap_rate' in data.columns:  # Swaption data
        data = engineer.create_swaption_features(data)
        print(f"✓ Created swaption-specific features: {data.shape[1]} total features")

    # Step 2: Polynomial features (only for the candidate columns that exist)
    poly_candidates = ['volatility', 'time_to_expiry', 'moneyness']
    available_poly_features = [f for f in poly_candidates if f in data.columns]
    data = engineer.add_polynomial_features(data, available_poly_features, degree=2)
    print(f"✓ Added polynomial features: {data.shape[1]} total features")

    # Step 3: Interaction features (only for pairs whose columns both exist)
    interaction_pairs = [
        ('volatility', 'time_to_expiry'),
        ('volatility', 'risk_free_rate')
    ]
    available_pairs = [(f1, f2) for f1, f2 in interaction_pairs
                       if f1 in data.columns and f2 in data.columns]
    data = engineer.add_interaction_features(data, available_pairs)
    print(f"✓ Added interaction features: {data.shape[1]} total features")

    # Step 4: Scaling
    features_to_scale = ['spot_price', 'strike_price', 'volatility', 'time_to_expiry', 'risk_free_rate']
    available_scale_features = [f for f in features_to_scale if f in data.columns]
    data = engineer.scale_features(data, available_scale_features, method='standard')
    print(f"✓ Scaled features: {data.shape[1]} total features")

    # Step 5: PCA on the numeric columns, leaving out the "*_scaled" copies
    # and every excluded price column.
    numerical_features = data.select_dtypes(include=[np.number]).columns.tolist()
    pca_features = [f for f in numerical_features
                    if not f.endswith('_scaled') and f not in excluded]

    if pca_features:
        # Never ask for more components than there are inputs or rows.
        n_components = min(n_pca_components, len(pca_features), len(data))
        data = engineer.apply_pca(data, pca_features, n_components=n_components)
        print(f"✓ Applied PCA: {data.shape[1]} total features")
    else:
        print("✓ Skipped PCA: no eligible numerical features")

    # Step 6: Feature selection (needs the target to score against)
    if target_column in data.columns:
        y = data[target_column]
        candidates = data.drop(columns=list(excluded), errors='ignore')
        X_numerical = candidates.select_dtypes(include=[np.number])

        selector = SelectKBest(score_func=f_regression,
                               k=min(n_select_features, X_numerical.shape[1]))
        # Only the support mask is needed, so fit without transforming.
        selector.fit(X_numerical, y)

        selected_feature_names = X_numerical.columns[selector.get_support()].tolist()

        # Keep only selected features plus target
        data = data[selected_feature_names + [target_column]]
        print(f"✓ Selected top features: {data.shape[1]} total features")

    print("\nFeature engineering complete!")
    print(f"Final dataset shape: {data.shape}")

    return data, engineer

# Exercise the pipeline end to end on a copy of the first 1000 option rows,
# which keeps this check small.
sample_data = option_data.head(1000).copy()
pipeline_result = create_complete_feature_set(sample_data)
processed_data, engineer = pipeline_result

# Numbered listing of the columns that made it through the pipeline
print(f"\nProcessed data columns:")
for position, column_name in enumerate(processed_data.columns, start=1):
    print(f"{position:2d}. {column_name}")

## 10. Feature Engineering Summary

In [None]:
# Column counts after each stage of the step-by-step walkthrough; the last
# figure comes from the pipeline function's output (processed_data).
print("=== FEATURE ENGINEERING SUMMARY ===")
print(f"Original option features: {option_data.shape[1]}")
print(f"After domain-specific features: {option_features.shape[1]}")
print(f"After polynomial features: {option_with_poly.shape[1]}")
print(f"After interaction features: {option_with_interactions.shape[1]}")
print(f"After scaling: {scaled_data.shape[1]}")
print(f"After PCA: {pca_data.shape[1]}")
print(f"After feature selection: {processed_data.shape[1]}")
print()

# Static recap sections, each followed by a blank line except the last
feature_type_lines = (
    "FEATURE TYPES CREATED:",
    "1. Domain-specific features (moneyness, time-value, etc.)",
    "2. Polynomial features (squared terms, etc.)",
    "3. Interaction features (feature products)",
    "4. Scaled features (standardized)",
    "5. PCA components (dimensionality reduction)",
    "6. Selected features (most important)",
)
for line in feature_type_lines:
    print(line)
print()

benefit_lines = (
    "KEY BENEFITS:",
    "- Captures non-linear relationships",
    "- Reduces dimensionality while preserving information",
    "- Improves model interpretability",
    "- Handles multicollinearity",
    "- Enables better generalization",
)
for line in benefit_lines:
    print(line)
print()

next_step_lines = (
    "NEXT STEPS:",
    "1. Model training with engineered features",
    "2. Cross-validation to prevent overfitting",
    "3. Feature importance analysis",
    "4. Model interpretation and validation",
)
for line in next_step_lines:
    print(line)