# Deep Learning Options Trading - Data Exploration

This notebook explores the S&P 100 options dataset used for training the LSTM model. We analyze:
- Universe composition and coverage
- Options data quality and liquidity
- Feature distributions and correlations
- Time series characteristics

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import yaml

# Plot appearance: apply the matplotlib style sheet, then the seaborn palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Parse project settings from config.yaml (resolved against the working directory)
config_path = Path('config.yaml')
with config_path.open('r') as config_file:
    config = yaml.safe_load(config_file)

## 1. Load and Inspect Raw Data

In [None]:
# Read the underlying price history and index it by (ticker, Date)
try:
    prices_df = (
        pd.read_csv('data/underlying_prices/underlying_prices.csv')
        .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
        .set_index(['ticker', 'Date'])
    )
    n_records = prices_df.shape[0]
    n_tickers = len(prices_df.index.levels[0])
    print(f"Loaded underlying prices: {n_records} records for {n_tickers} tickers")
except FileNotFoundError:
    # Later cells check `prices_df is not None` before using the data
    print("Underlying price data not found. Run data acquisition first.")
    prices_df = None

In [None]:
# Read the options records and parse both calendar columns as datetimes
try:
    options_df = pd.read_csv('data/options_data/options_data.csv')
    for date_col in ('date', 'expiry'):
        options_df[date_col] = pd.to_datetime(options_df[date_col])
    print(f"Loaded options data: {len(options_df)} records")
    first_day = options_df['date'].min()
    last_day = options_df['date'].max()
    print(f"Date range: {first_day} to {last_day}")
except FileNotFoundError:
    # Later cells check `options_df is not None` before using the data
    print("Options data not found. Run data acquisition first.")
    options_df = None

## 2. Universe Composition Analysis

In [None]:
if prices_df is not None:
    # Number of price rows per ticker, largest first
    ticker_counts = prices_df.groupby('ticker').size().sort_values(ascending=False)

    # Bar chart of the 20 tickers with the most rows
    plt.figure(figsize=(15, 8))
    coverage_ax = ticker_counts.head(20).plot(kind='bar')
    coverage_ax.set_title('Top 20 S&P 100 Tickers by Data Points')
    coverage_ax.set_xlabel('Ticker')
    coverage_ax.set_ylabel('Number of Trading Days')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Headline coverage numbers
    n_unique = len(ticker_counts)
    mean_days = ticker_counts.mean()
    median_days = ticker_counts.median()
    print(f"Total unique tickers: {n_unique}")
    print(f"Average trading days per ticker: {mean_days:.0f}")
    print(f"Median trading days per ticker: {median_days:.0f}")

In [None]:
if prices_df is not None:
    # Price distribution analysis on a 2x2 grid:
    #   [0,0] price levels, [0,1] daily returns,
    #   [1,0] per-ticker return box plots, [1,1] rolling average absolute return.
    # NOTE(review): assumes the price CSV provides 'Adj Close' and 'return_1d'
    # columns - confirm against the data acquisition step.
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Price levels
    prices_df['Adj Close'].hist(bins=50, ax=axes[0,0])
    axes[0,0].set_title('Distribution of Stock Prices')
    axes[0,0].set_xlabel('Price ($)')
    axes[0,0].set_ylabel('Frequency')

    # Daily returns
    prices_df['return_1d'].hist(bins=50, ax=axes[0,1])
    axes[0,1].set_title('Distribution of Daily Returns')
    axes[0,1].set_xlabel('Daily Return')
    axes[0,1].set_ylabel('Frequency')

    # Returns by ticker (top 10); relies on `ticker_counts` from the coverage cell.
    # unstack(level=0) moves the ticker level into columns, one box per ticker.
    top_tickers = ticker_counts.head(10).index
    returns_by_ticker = prices_df.loc[top_tickers]['return_1d'].unstack(level=0)
    returns_by_ticker.boxplot(ax=axes[1,0])
    axes[1,0].set_title('Daily Returns by Ticker (Top 10)')
    axes[1,0].set_xlabel('Ticker')
    axes[1,0].set_ylabel('Daily Return')
    axes[1,0].tick_params(axis='x', rotation=45)

    # Volatility clustering.
    # prices_df is indexed by (ticker, Date), so rolling directly over the raw
    # series would slide the window across ticker boundaries and plot against
    # row position. Average |return| across tickers for each date first, then
    # roll over 30 consecutive dates.
    abs_returns = prices_df['return_1d'].abs()
    daily_abs_returns = abs_returns.groupby(level='Date').mean().sort_index()
    daily_abs_returns.rolling(30).mean().plot(ax=axes[1,1])
    axes[1,1].set_title('30-Day Rolling Average Absolute Returns')
    axes[1,1].set_xlabel('Date')
    axes[1,1].set_ylabel('Absolute Return')

    plt.tight_layout()
    plt.show()

## 3. Options Data Quality Analysis

In [None]:
if options_df is not None:
    # Basic statistics
    print("Options Data Summary:")
    print(options_df.describe())

    # Liquidity analysis: per-ticker summary of traded volume and open interest
    liquidity_stats = options_df.groupby('ticker').agg({
        'volume': ['mean', 'median', 'min', 'max'],
        'open_interest': ['mean', 'median', 'min', 'max']
    })

    print("\nLiquidity Statistics by Ticker:")
    print(liquidity_stats.head())

    # Check liquidity filters using the thresholds from config.yaml
    min_volume = config['data']['min_volume']
    min_oi = config['data']['min_open_interest']

    # Rows meeting BOTH thresholds count as liquid
    liquid_options = options_df[
        (options_df['volume'] >= min_volume) &
        (options_df['open_interest'] >= min_oi)
    ]

    # An empty options table (e.g. a header-only CSV) would otherwise raise
    # ZeroDivisionError; report NaN instead of crashing the notebook.
    n_total = len(options_df)
    liquidity_ratio = len(liquid_options) / n_total if n_total else float('nan')

    print(f"\nLiquidity Filter Results:")
    print(f"Total options: {n_total}")
    print(f"Liquid options: {len(liquid_options)}")
    print(f"Liquidity ratio: {liquidity_ratio:.2%}")

In [None]:
if options_df is not None:
    # Options characteristics visualization: six histograms on a 2x3 grid.
    fig, axes = plt.subplots(2, 3, figsize=(18, 10))

    # One entry per panel, in row-major order:
    # (column, apply log10(x + 1)?, panel title, x-axis label)
    panels = [
        ('straddle_price', False, 'Distribution of Straddle Prices', 'Straddle Price ($)'),
        ('moneyness', False, 'Distribution of Moneyness', 'Moneyness (Strike/Spot)'),
        ('days_to_expiry', False, 'Distribution of Days to Expiry', 'Days to Expiry'),
        ('implied_vol', False, 'Distribution of Implied Volatility', 'Implied Volatility'),
        ('volume', True, 'Distribution of Log Volume', 'Log10(Volume + 1)'),
        ('open_interest', True, 'Distribution of Log Open Interest', 'Log10(Open Interest + 1)'),
    ]

    for panel_ax, (column, use_log, title, x_label) in zip(axes.flat, panels):
        values = options_df[column]
        if use_log:
            # The +1 shift keeps zero counts finite under the logarithm
            values = np.log10(values + 1)
        values.hist(bins=50, ax=panel_ax)
        panel_ax.set_title(title)
        panel_ax.set_xlabel(x_label)
        panel_ax.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()

## 4. Feature Engineering Validation

In [None]:
# Load engineered features if available and show their pairwise correlations.
# On a missing file, features_df is set to None so later cells can skip themselves.
try:
    features_df = pd.read_csv('data/processed/features.csv')
    features_df['date'] = pd.to_datetime(features_df['date'])
    print(f"Loaded engineered features: {len(features_df)} records")

    # Feature columns = everything except the identifiers and the straddle price,
    # restricted to numeric/bool dtypes: DataFrame.corr() raises on string columns
    # in pandas >= 2.0, and a correlation is only meaningful for numeric data.
    excluded_cols = {'date', 'ticker', 'straddle_price'}
    numeric_cols = set(features_df.select_dtypes(include=['number', 'bool']).columns)
    feature_cols = [
        col for col in features_df.columns
        if col not in excluded_cols and col in numeric_cols
    ]

    # Display feature correlations
    plt.figure(figsize=(12, 8))
    correlation_matrix = features_df[feature_cols].corr()
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f')
    plt.title('Feature Correlation Matrix')
    plt.tight_layout()
    plt.show()

except FileNotFoundError:
    print("Engineered features not found. Run feature engineering first.")
    features_df = None

In [None]:
if features_df is not None:
    # Feature distributions: one histogram per entry of `feature_cols`,
    # laid out on a grid three panels wide.
    n_features = len(feature_cols)
    n_cols = 3
    n_rows = (n_features + n_cols - 1) // n_cols  # ceiling division

    if n_features == 0:
        # plt.subplots rejects a zero-row grid, so skip plotting entirely
        print("No feature columns available to plot.")
    else:
        # squeeze=False always returns a 2-D array of Axes, whatever the grid shape
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4*n_rows), squeeze=False)
        axes = axes.flatten()

        # The grid holds n_rows * n_cols >= n_features panels, so zip never truncates
        for ax, feature in zip(axes, feature_cols):
            features_df[feature].hist(bins=50, ax=ax)
            ax.set_title(f'Distribution of {feature}')
            ax.set_xlabel(feature)
            ax.set_ylabel('Frequency')

        # Hide empty subplots left over in the last row
        for ax in axes[n_features:]:
            ax.set_visible(False)

        plt.tight_layout()
        plt.show()

## 5. Time Series Analysis

In [None]:
if options_df is not None:
    # Three stacked panels, each a per-date aggregate of the options table
    fig, axes = plt.subplots(3, 1, figsize=(15, 12))
    price_ax, iv_ax, volume_ax = axes
    by_date = options_df.groupby('date')

    # Panel 1: mean straddle price per date
    daily_avg_price = by_date['straddle_price'].mean()
    daily_avg_price.plot(ax=price_ax)
    price_ax.set(
        title='Average Daily Straddle Price Over Time',
        xlabel='Date',
        ylabel='Average Straddle Price ($)',
    )

    # Panel 2: mean implied volatility per date
    daily_avg_iv = by_date['implied_vol'].mean()
    daily_avg_iv.plot(ax=iv_ax)
    iv_ax.set(
        title='Average Daily Implied Volatility Over Time',
        xlabel='Date',
        ylabel='Average Implied Volatility',
    )

    # Panel 3: summed volume per date
    daily_volume = by_date['volume'].sum()
    daily_volume.plot(ax=volume_ax)
    volume_ax.set(
        title='Total Daily Options Volume Over Time',
        xlabel='Date',
        ylabel='Total Volume',
    )

    plt.tight_layout()
    plt.show()

## 6. Data Quality Summary

In [None]:
# Generate data quality report.
# Runs only when both the options and the underlying price data were loaded.
if options_df is not None and prices_df is not None:
    print("=== DATA QUALITY SUMMARY ===\n")

    # Coverage statistics (nunique ignores missing values)
    n_days = options_df['date'].nunique()
    n_tickers = options_df['ticker'].nunique()
    print(f"Date Range: {options_df['date'].min()} to {options_df['date'].max()}")
    print(f"Total Trading Days: {n_days}")
    print(f"Unique Tickers: {n_tickers}")

    # Data completeness: share of (date, ticker) combinations that have at least
    # one row. Counting distinct pairs rather than raw rows keeps the figure at
    # or below 100% even when a pair has several option records.
    observed_pairs = len(options_df[['date', 'ticker']].dropna().drop_duplicates())
    total_expected = n_days * n_tickers
    completeness = observed_pairs / total_expected if total_expected else float('nan')
    print(f"Data Completeness: {completeness:.2%}")

    # Liquidity assessment: recomputed here from the config thresholds so the
    # result does not depend on whether the liquidity cell was executed earlier.
    liquid_mask = (
        (options_df['volume'] >= config['data']['min_volume']) &
        (options_df['open_interest'] >= config['data']['min_open_interest'])
    )
    n_options = len(options_df)
    liquid_pct = liquid_mask.sum() / n_options if n_options else float('nan')
    print(f"Liquid Options (%): {liquid_pct:.2%}")

    # Statistical summary
    print("\nStatistical Summary:")
    stats_summary = options_df[['straddle_price', 'moneyness', 'days_to_expiry', 'implied_vol']].describe()
    print(stats_summary)

    print("\n=== ANALYSIS COMPLETE ===")
else:
    print("Data not available for quality assessment. Run data acquisition pipeline first.")