# Deep Learning Options Trading - Feature Analysis

This notebook analyzes the engineered features used by the LSTM model:
- Feature importance and predictive power
- Correlation with the straddle price target (`straddle_price`)
- Stationarity and time series properties
- Feature engineering validation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import mutual_info_regression
from statsmodels.tsa.stattools import adfuller
from pathlib import Path
import yaml
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib's seaborn-v0_8 stylesheet plus seaborn's "husl" palette
sns.set_palette("husl") if False else plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Parse the project configuration stored one directory above the notebook
config_stream = open('../config.yaml', 'r')
try:
    config = yaml.safe_load(config_stream)
finally:
    # Always release the file handle, even if the YAML is malformed
    config_stream.close()

## 1. Load Engineered Features

In [None]:
# Read the engineered feature table, then split it into inputs (X) and target (y)
try:
    features_df = pd.read_csv('../data/processed/features.csv')
    features_df['date'] = pd.to_datetime(features_df['date'])
    print(f"Loaded features: {len(features_df)} records")

    # Every column except the identifiers and the target is treated as a model input
    target_col = 'straddle_price'
    excluded_cols = {'date', 'ticker', target_col}
    feature_cols = [name for name in features_df.columns if name not in excluded_cols]

    X, y = features_df[feature_cols], features_df[target_col]

    print(f"Features: {len(feature_cols)}")
    print(f"Target: {target_col}")

except FileNotFoundError:
    # Later cells check `X is not None` before doing any work
    print("Features not found. Run feature engineering first.")
    X = y = None

## 2. Feature Correlation Analysis

In [None]:
if X is not None:
    # Correlation of every feature with the target, most positive first
    correlations = X.corrwith(y).sort_values(ascending=False)

    fig, ax = plt.subplots(figsize=(12, 6))
    correlations.plot(kind='bar', ax=ax)
    ax.set_title('Feature Correlations with Straddle Price')
    ax.set_xlabel('Feature')
    ax.set_ylabel('Correlation Coefficient')
    plt.setp(ax.get_xticklabels(), rotation=45)
    # Zero reference line separates positive from negative correlations
    ax.axhline(y=0, color='black', linestyle='--', alpha=0.5)
    fig.tight_layout()
    plt.show()

    # The series is sorted descending, so head/tail are the two extremes
    for heading, extract in (
        ("Top 5 positive correlations:", correlations.head()),
        ("\nTop 5 negative correlations:", correlations.tail()),
    ):
        print(heading)
        print(extract)

In [None]:
if X is not None:
    # Pairwise feature correlations; the upper triangle (diagonal included)
    # is masked so each pair is drawn exactly once
    corr_matrix = X.corr()
    upper_mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

    plt.figure(figsize=(12, 10))
    sns.heatmap(
        corr_matrix,
        mask=upper_mask,
        annot=True,
        cmap='coolwarm',
        center=0,
        fmt='.2f',
        square=True,
    )
    plt.title('Feature Correlation Matrix (Lower Triangle)')
    plt.tight_layout()
    plt.show()

    # Walk the strict upper triangle (row < column) so every unordered pair is
    # inspected once, in row-major order
    abs_corr = np.abs(corr_matrix).to_numpy()
    row_idx, col_idx = np.triu_indices_from(abs_corr, k=1)
    is_strong = abs_corr[row_idx, col_idx] > 0.8
    high_corr_pairs = [
        (corr_matrix.index[r], corr_matrix.columns[c])
        for r, c in zip(row_idx[is_strong], col_idx[is_strong])
    ]

    if not high_corr_pairs:
        print("No highly correlated feature pairs found.")
    else:
        print("Highly correlated feature pairs (|corr| > 0.8):")
        for left, right in high_corr_pairs:
            corr_value = corr_matrix.loc[left, right]
            print(f"{left} - {right}: {corr_value:.3f}")

## 3. Feature Importance Analysis

In [None]:
if X is not None:
    # Mutual information between each feature and the target.
    # mutual_info_regression rejects NaN input, so score only the rows where
    # every feature and the target are present.
    mi_valid = X.notna().all(axis=1) & y.notna()
    mi_dropped = int((~mi_valid).sum())
    if mi_dropped:
        print(f"Mutual information: ignoring {mi_dropped} rows with missing values")

    # The estimator is randomized; a fixed random_state makes the ranking
    # reproducible across Restart & Run All.
    mi_scores = mutual_info_regression(X[mi_valid], y[mi_valid], random_state=42)
    mi_scores = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)

    plt.figure(figsize=(12, 6))
    mi_scores.plot(kind='bar')
    plt.title('Feature Importance (Mutual Information)')
    plt.xlabel('Feature')
    plt.ylabel('Mutual Information Score')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    print("Mutual Information Scores:")
    for feature, score in mi_scores.items():
        print(f"{feature}: {score:.4f}")

In [None]:
if X is not None:
    # Linear regression on standardized features: with every input on the same
    # scale, coefficient magnitudes are comparable across features.
    # LinearRegression rejects NaN input, so fit only on complete rows.
    lr_valid = X.notna().all(axis=1) & y.notna()
    lr_dropped = int((~lr_valid).sum())
    if lr_dropped:
        print(f"Linear regression: ignoring {lr_dropped} rows with missing values")
    X_lr, y_lr = X[lr_valid], y[lr_valid]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_lr)

    lr = LinearRegression()
    lr.fit(X_scaled, y_lr)

    # Get coefficients, most positive first
    coefficients = pd.Series(lr.coef_, index=X.columns).sort_values(ascending=False)

    plt.figure(figsize=(12, 6))
    coefficients.plot(kind='bar')
    plt.title('Linear Regression Coefficients (Standardized)')
    plt.xlabel('Feature')
    plt.ylabel('Coefficient Value')
    plt.xticks(rotation=45)
    plt.axhline(y=0, color='black', linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.show()

    # In-sample fit quality (scored on the same rows used for fitting)
    print(f"R² Score: {lr.score(X_scaled, y_lr):.4f}")
    print("\nTop positive coefficients:")
    print(coefficients.head())
    print("\nTop negative coefficients:")
    print(coefficients.tail())

## 4. Stationarity Testing

In [None]:
if X is not None:
    # Augmented Dickey-Fuller test for stationarity, one feature at a time.
    # A feature is flagged stationary when the ADF p-value is below 0.05.
    # NOTE(review): each column is tested in file row order; if the file stacks
    # several tickers, the series is pooled across them - confirm this is intended.
    significance = 0.05
    stationarity_results = {}

    for col in X.columns:
        series = X[col].dropna()
        try:
            adf_result = adfuller(series)
        except Exception as exc:
            # Keep going with the other features, but record why this one failed
            # instead of silently hiding the cause.
            stationarity_results[col] = {'error': f'Could not test stationarity ({exc})'}
            continue

        stationarity_results[col] = {
            'adf_statistic': adf_result[0],
            'p_value': adf_result[1],
            'critical_values': adf_result[4],
            'stationary': adf_result[1] < significance
        }

    # Display results
    print("Stationarity Test Results (Augmented Dickey-Fuller):")
    print("=" * 60)

    for feature, result in stationarity_results.items():
        if 'error' not in result:
            print(f"{feature}:")
            print(f"  ADF Statistic: {result['adf_statistic']:.4f}")
            print(f"  p-value: {result['p_value']:.4f}")
            print(f"  Stationary: {result['stationary']}")
            print(f"  5% Critical Value: {result['critical_values']['5%']:.4f}")
            print()
        else:
            print(f"{feature}: {result['error']}")

    # Summary (features whose test failed appear in neither list)
    stationary_features = [f for f, r in stationarity_results.items()
                          if 'stationary' in r and r['stationary']]
    non_stationary_features = [f for f, r in stationarity_results.items()
                              if 'stationary' in r and not r['stationary']]

    print(f"\nStationary features: {len(stationary_features)}/{len(X.columns)}")
    print(f"Non-stationary features: {len(non_stationary_features)}/{len(X.columns)}")

    if non_stationary_features:
        print(f"\nNon-stationary features: {', '.join(non_stationary_features)}")
        print("Consider differencing or other transformations for LSTM input.")

## 5. Time Series Properties

In [None]:
if X is not None:
    # Autocorrelation analysis for key features
    from statsmodels.graphics.tsaplots import plot_acf

    max_lags = 30
    key_features = ['moneyness', 'time_to_expiry', 'premium_normalized', 'implied_volatility']
    # Only plot the key features that actually exist in this feature set
    key_features = [f for f in key_features if f in X.columns]

    if not key_features:
        # plt.subplots cannot create a grid with zero rows
        print("None of the key features are present; skipping autocorrelation plots.")
    else:
        # squeeze=False always returns a 2-D array of axes, so a single feature
        # needs no special case
        fig, axes = plt.subplots(len(key_features), 1,
                                 figsize=(12, 4*len(key_features)), squeeze=False)

        for ax, feature in zip(axes[:, 0], key_features):
            series = X[feature].dropna()
            if len(series) < 2:
                ax.set_title(f'Autocorrelation - {feature} (not enough data)')
                continue
            # Never request more lags than the series can support
            plot_acf(series, lags=min(max_lags, len(series) - 1), ax=ax,
                     title=f'Autocorrelation - {feature}')

        plt.tight_layout()
        plt.show()

In [None]:
if X is not None:
    # One panel per feature: raw values and rolling mean on the left axis,
    # rolling standard deviation on a twin right axis, to eyeball structural breaks
    window = 252  # 1 year of trading days
    n_panels = len(feature_cols)

    fig, axes = plt.subplots(n_panels, 1, figsize=(15, 3*n_panels))
    # A single panel comes back as a bare Axes; wrap it so the loop below works
    axes = [axes] if n_panels == 1 else axes

    for panel, feature in zip(axes, feature_cols):
        series = X[feature]
        roller = series.rolling(window=window)

        panel.plot(series.index, series.values, alpha=0.7, label='Feature')
        panel.plot(series.index, roller.mean(), color='red', label=f'{window}-day Mean')
        panel.set_title(f'{feature} with Rolling Mean')
        panel.legend()

        # The std lives on its own y-axis because its scale differs from the level
        std_axis = panel.twinx()
        std_axis.plot(series.index, roller.std(), color='green', alpha=0.7, label=f'{window}-day Std')
        std_axis.set_ylabel('Rolling Std', color='green')
        std_axis.legend(loc='upper right')

    plt.tight_layout()
    plt.show()

## 6. Feature Engineering Validation

In [None]:
if X is not None:
    # Check for outliers: one horizontal box plot per feature
    fig, axes = plt.subplots(len(feature_cols), 1, figsize=(12, 3*len(feature_cols)))
    if len(feature_cols) == 1:
        axes = [axes]

    for i, feature in enumerate(feature_cols):
        # Box plot for outlier detection
        axes[i].boxplot(X[feature].dropna(), vert=False)
        axes[i].set_title(f'Box Plot - {feature}')
        axes[i].set_xlabel(feature)

    plt.tight_layout()
    plt.show()

    # Outlier statistics: a value is an outlier when it lies more than
    # 1.5 * IQR below the first quartile or above the third quartile
    outlier_stats = {}
    for feature in feature_cols:
        data = X[feature].dropna()
        n_obs = len(data)
        q1, q3 = data.quantile([0.25, 0.75])
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        n_outliers = int(((data < lower_bound) | (data > upper_bound)).sum())
        outlier_stats[feature] = {
            'outlier_count': n_outliers,
            # An all-NaN column has no observations; report 0% rather than
            # dividing by zero
            'outlier_percentage': n_outliers / n_obs * 100 if n_obs else 0.0,
            'lower_bound': lower_bound,
            'upper_bound': upper_bound
        }

    print("Outlier Analysis:")
    for feature, stats in outlier_stats.items():
        print(f"{feature}: {stats['outlier_count']} outliers ({stats['outlier_percentage']:.2f}%)")

## 7. Feature Analysis Summary

In [None]:
# Generate comprehensive feature analysis report.
# Relies on results computed by earlier cells: correlations, mi_scores,
# stationarity_results, non_stationary_features and outlier_stats.
if X is not None:
    def _preview(names, limit=3):
        """Join the first `limit` names, adding '...' only when some were left out."""
        names = list(names)
        suffix = '...' if len(names) > limit else ''
        return ', '.join(names[:limit]) + suffix

    print("=== FEATURE ANALYSIS SUMMARY ===\n")

    # Feature overview
    print(f"Total features: {len(feature_cols)}")
    print(f"Total samples: {len(X)}")
    print(f"Missing values: {X.isnull().sum().sum()}")

    # Predictive power ranking: order by magnitude so that strongly negative
    # predictors are ranked too; the printed value keeps its sign
    print("\nTop 5 features by absolute correlation with target:")
    top_corr = correlations.sort_values(key=lambda s: s.abs(), ascending=False).head()
    for i, (feature, corr) in enumerate(top_corr.items(), 1):
        print(f"{i}. {feature}: {corr:.4f}")

    print("\nTop 5 features by mutual information:")
    top_mi = mi_scores.head()
    for i, (feature, mi) in enumerate(top_mi.items(), 1):
        print(f"{i}. {feature}: {mi:.4f}")

    # Stationarity summary (features whose test errored count as not stationary)
    stationary_count = sum(1 for r in stationarity_results.values() if r.get('stationary', False))
    print(f"\nStationary features: {stationary_count}/{len(feature_cols)}")

    # Data quality
    total_outliers = sum(stats['outlier_count'] for stats in outlier_stats.values())
    avg_outlier_pct = np.mean([stats['outlier_percentage'] for stats in outlier_stats.values()])
    print(f"Total outliers: {total_outliers}")
    print(f"Average outlier percentage: {avg_outlier_pct:.2f}%")

    # Recommendations
    print("\n=== RECOMMENDATIONS ===")

    if non_stationary_features:
        print(f"Consider differencing for non-stationary features: {_preview(non_stationary_features)}")

    if avg_outlier_pct > 5:
        print("High outlier percentage detected. Consider robust scaling or outlier treatment.")

    # Features whose linear relationship with the target is weak (|corr| < 0.1)
    low_corr_features = correlations[correlations.abs() < 0.1]
    if len(low_corr_features) > 0:
        print(f"Consider removing low-correlation features: {_preview(low_corr_features.index)}")

    print("\n=== ANALYSIS COMPLETE ===")
else:
    print("Features not available for analysis. Run feature engineering first.")