# Data Exploration Notebook

This notebook explores the financial data used for training pricing models in the Price Matrix system.

In [None]:
import sys
import os
sys.path.append('../src')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Import our custom modules
from data.data_generator import FinancialDataGenerator
from data.preprocessor import FinancialDataPreprocessor
from utils.visualization import FinancialVisualizer

# Set up plotting style
# NOTE(review): 'seaborn-v0_8' is a version-specific matplotlib style name;
# presumably it needs a matplotlib release that ships the renamed seaborn
# styles -- confirm against the pinned matplotlib version.
plt.style.use('seaborn-v0_8')
# Use seaborn's "husl" palette as the default colour cycle for later plots.
sns.set_palette("husl")
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

## 1. Data Generation

Let's start by generating synthetic financial data for our analysis.

In [None]:
# Tunable knobs for this cell, gathered in one place
SEED = 42
N_YIELD_CURVES = 1000
N_VOL_SURFACES = 1000
N_OPTIONS = 5000
N_SWAPTIONS = 2000

# One generator instance is shared by all four datasets, so the order of the
# calls below is kept fixed.
generator = FinancialDataGenerator(seed=SEED)

print("Generating financial data...")

# Yield curves
yield_curves = generator.generate_yield_curve(n_samples=N_YIELD_CURVES)
print(f"Yield curves shape: {yield_curves.shape}")

# Volatility surfaces
vol_surfaces = generator.generate_volatility_surface(n_samples=N_VOL_SURFACES)
print(f"Volatility surfaces shape: {vol_surfaces.shape}")

# Option prices (the main dataset explored in the rest of the notebook)
option_data = generator.generate_option_prices(n_samples=N_OPTIONS)
print(f"Option data shape: {option_data.shape}")

# Swaptions
swaption_data = generator.generate_swaption_data(n_samples=N_SWAPTIONS)
print(f"Swaption data shape: {swaption_data.shape}")

## 2. Basic Data Exploration

Let's examine the basic statistics and distributions of our data.

In [None]:
# Summary statistics of the option data, printed under a header line
print("Option Data Statistics:", option_data.describe(), sep="\n")

# Per-column count of missing entries, printed under its own header
print("\nMissing Values:", option_data.isnull().sum(), sep="\n")

In [None]:
# 2x3 grid of histograms: five single-column panels plus one call/put overlay
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle('Option Data Distributions', fontsize=16)

# (grid position, column, panel title, x-axis label) for the single-column panels
single_column_panels = [
    ((0, 0), 'spot_price', 'Spot Price Distribution', 'Spot Price'),
    ((0, 1), 'strike_price', 'Strike Price Distribution', 'Strike Price'),
    ((0, 2), 'volatility', 'Volatility Distribution', 'Volatility'),
    ((1, 0), 'time_to_expiry', 'Time to Expiry Distribution', 'Time to Expiry (years)'),
    ((1, 1), 'risk_free_rate', 'Risk-Free Rate Distribution', 'Risk-Free Rate'),
]

for grid_pos, column, panel_title, x_label in single_column_panels:
    panel = axes[grid_pos]
    panel.hist(option_data[column], bins=50, alpha=0.7)
    panel.set_title(panel_title)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')

# Last panel overlays call and put prices on the same axes
price_panel = axes[1, 2]
for column, series_label in (('call_price', 'Call'), ('put_price', 'Put')):
    price_panel.hist(option_data[column], bins=50, alpha=0.7, label=series_label)
price_panel.set_title('Option Price Distribution')
price_panel.set_xlabel('Option Price')
price_panel.set_ylabel('Frequency')
price_panel.legend()

plt.tight_layout()
plt.show()

## 3. Correlation Analysis

Let's examine the correlations between different variables.

In [None]:
# Calculate correlation matrix over the numeric columns only.
# A later cell adds the categorical 'moneyness_category' column to option_data;
# without this filter, re-running this cell afterwards makes DataFrame.corr()
# raise on recent pandas versions, which no longer drop non-numeric columns
# silently.
correlation_matrix = option_data.select_dtypes(include='number').corr()

# Plot correlation heatmap (diverging colour map centred on zero correlation)
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.title('Correlation Matrix - Option Data')
plt.tight_layout()
plt.show()

## 4. Moneyness Analysis

Let's analyze the relationship between moneyness and option prices.

In [None]:
# Create moneyness categories.
# Bins are right-closed: (0, 0.8], (0.8, 0.95], (0.95, 1.05], (1.05, 1.2], (1.2, inf).
# NOTE(review): the labels treat low moneyness as in-the-money; whether that is
# right depends on how the generator defines 'moneyness' -- confirm.
# This adds a column to option_data in place; the volatility cell reads it.
option_data['moneyness_category'] = pd.cut(option_data['moneyness'], 
                                         bins=[0, 0.8, 0.95, 1.05, 1.2, float('inf')],
                                         labels=['Deep ITM', 'ITM', 'ATM', 'OTM', 'Deep OTM'])

# Plot average prices by moneyness
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
# observed=False is passed explicitly: the grouping key is categorical, and
# relying on the implicit default emits a FutureWarning on recent pandas and
# would later drop empty buckets from the chart.
avg_prices = option_data.groupby('moneyness_category', observed=False)[['call_price', 'put_price']].mean()
avg_prices.plot(kind='bar', ax=plt.gca())
plt.title('Average Option Prices by Moneyness')
plt.xlabel('Moneyness Category')
plt.ylabel('Average Price')
plt.xticks(rotation=45)

plt.subplot(1, 2, 2)
sns.scatterplot(data=option_data, x='moneyness', y='call_price', alpha=0.6)
plt.title('Call Price vs Moneyness')
plt.xlabel('Moneyness')
plt.ylabel('Call Price')

plt.tight_layout()
plt.show()

## 5. Volatility Analysis

Let's examine the volatility patterns in our data.

In [None]:
# Plot volatility relationships on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Volatility vs time to expiry
axes[0, 0].scatter(option_data['time_to_expiry'], option_data['volatility'], alpha=0.6)
axes[0, 0].set_xlabel('Time to Expiry (years)')
axes[0, 0].set_ylabel('Volatility')
axes[0, 0].set_title('Volatility vs Time to Expiry')

# Volatility vs moneyness
axes[0, 1].scatter(option_data['moneyness'], option_data['volatility'], alpha=0.6)
axes[0, 1].set_xlabel('Moneyness')
axes[0, 1].set_ylabel('Volatility')
axes[0, 1].set_title('Volatility Smile')

# Average volatility by moneyness category.
# Requires the 'moneyness_category' column created in the moneyness cell.
# observed=False is passed explicitly: the key is categorical, and relying on
# the implicit default emits a FutureWarning on recent pandas and would later
# drop empty buckets from the chart.
vol_by_moneyness = option_data.groupby('moneyness_category', observed=False)['volatility'].mean()
vol_by_moneyness.plot(kind='bar', ax=axes[1, 0])
axes[1, 0].set_title('Average Volatility by Moneyness')
axes[1, 0].set_xlabel('Moneyness Category')
axes[1, 0].set_ylabel('Average Volatility')
axes[1, 0].tick_params(axis='x', rotation=45)

# Price vs volatility
axes[1, 1].scatter(option_data['volatility'], option_data['call_price'], alpha=0.6)
axes[1, 1].set_xlabel('Volatility')
axes[1, 1].set_ylabel('Call Price')
axes[1, 1].set_title('Call Price vs Volatility')

plt.tight_layout()
plt.show()

## 6. Data Preprocessing

Let's demonstrate data preprocessing techniques.

In [None]:
# Columns handled at each preprocessing stage
outlier_columns = ['volatility', 'call_price']
skewed_columns = ['volatility']
scaled_columns = ['spot_price', 'strike_price', 'volatility']

preprocessor = FinancialDataPreprocessor(random_state=42)

# Stage 1: outlier detection. The result is used as a mapping whose values are
# sized collections; the total count is the sum of their lengths.
print("Detecting outliers...")
outliers = preprocessor.detect_outliers(option_data, outlier_columns)
print(f"Outliers detected: {sum(map(len, outliers.values()))}")

# Stage 2: outlier treatment.
# NOTE(review): strategy='cap' presumably clips values rather than dropping
# rows, so the "removal" wording may be misleading -- confirm against the
# preprocessor implementation.
clean_data = preprocessor.remove_outliers(option_data, outliers, strategy='cap')
print(f"Data shape after outlier removal: {clean_data.shape}")

# Stage 3: skewness handling
print("\nHandling skewness...")
processed_data = preprocessor.handle_skewness(clean_data, skewed_columns)
print(f"Data shape after preprocessing: {processed_data.shape}")

# Stage 4: feature scaling
scaled_data = preprocessor.scale_features(processed_data, scaled_columns)
print(f"Data shape after scaling: {scaled_data.shape}")

## 7. Principal Component Analysis

Let's perform PCA to understand the main components of our data.

In [None]:
# Select numerical features for PCA
numerical_features = ['spot_price', 'strike_price', 'time_to_expiry', 'risk_free_rate', 'volatility']
X = option_data[numerical_features]

# Standardize the data so every feature contributes on the same scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Perform PCA, keeping all components
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Number the components from the fitted model rather than from the feature
# list: the two differ when there are fewer samples than features, and the
# loadings table below would then fail to build.
component_numbers = range(1, pca.n_components_ + 1)

# Plot explained variance
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(component_numbers, pca.explained_variance_ratio_, 'bo-', linewidth=2)
plt.title('Explained Variance by Principal Components')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.grid(True)

plt.subplot(1, 2, 2)
plt.plot(component_numbers, np.cumsum(pca.explained_variance_ratio_), 'ro-', linewidth=2)
plt.title('Cumulative Explained Variance')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)

plt.tight_layout()
plt.show()

# Display component loadings: rows are input features, columns are components
loadings = pd.DataFrame(
    pca.components_.T,
    columns=[f'PC{i}' for i in component_numbers],
    index=numerical_features
)

print("Principal Component Loadings:")
print(loadings.round(3))

## 8. Summary and Insights

Let's summarize our findings from the data exploration.

In [None]:
# The headline numbers are computed from objects built in earlier cells
# (option_data, outliers, pca) instead of being hard-coded, so the summary
# cannot drift from the data it describes.
call_prices = option_data['call_price']
moneyness = option_data['moneyness']
volatility = option_data['volatility']

# Same ATM band as the moneyness cell's (0.95, 1.05] bucket
atm_mask = (moneyness > 0.95) & (moneyness <= 1.05)
atm_share = atm_mask.mean()
atm_volatility = volatility[atm_mask].mean()
non_atm_volatility = volatility[~atm_mask].mean()

vol_price_corr = volatility.corr(call_prices)
top3_variance = pca.explained_variance_ratio_[:3].sum()
missing_total = int(option_data.isnull().sum().sum())
outlier_total = sum(len(v) for v in outliers.values())

print("=== DATA EXPLORATION SUMMARY ===")
print(f"Total option samples: {len(option_data):,}")
print(f"Total swaption samples: {len(swaption_data):,}")
print(f"Total yield curve samples: {len(yield_curves):,}")
print(f"Total volatility surface samples: {len(vol_surfaces):,}")
print()

print("KEY FINDINGS:")
# Only call prices are summarised here, so the label says so.
print(f"1. Call prices range from ${call_prices.min():.2f} to ${call_prices.max():.2f}")
print(f"2. {atm_share:.1%} of options are ATM (moneyness between 0.95-1.05)")
print(f"3. Mean volatility is {atm_volatility:.3f} for ATM options vs {non_atm_volatility:.3f} away from the money")
print(f"4. Correlation between volatility and call price: {vol_price_corr:.2f}")
print(f"5. First 3 principal components explain {top3_variance:.1%} of variance")
print()

print("DATA QUALITY:")
print(f"- Missing values in option data: {missing_total}")
print(f"- Outliers detected and handled: {outlier_total}")
# The next two points are qualitative judgements from the plots above, not
# computed checks.
print("- Volatility distribution is realistic")
print("- Correlations are consistent with financial theory")
print()

print("NEXT STEPS:")
print("1. Feature engineering for model training")
print("2. Model selection and training")
print("3. Cross-validation and hyperparameter tuning")
print("4. Model evaluation and comparison")