In [13]:
# Setup and Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

print("📊 Volatility Estimators Project")
print("=================================")
print("Notebook 3: Real Market Data Analysis")
print("Using high-frequency data as benchmark")

np.random.seed(42)
plt.style.use('seaborn-v0_8')

# Import our implementations
from volatility_estimators.core import VolatilityEstimators, DataSimulator, PerformanceMetrics

📊 Volatility Estimators Project
Notebook 3: Real Market Data Analysis
Using high-frequency data as benchmark


In [14]:
# Realized Variance Calculator
from volatility_estimators.data_loader import RealizedVariance

print("✅ Realized variance calculator implemented")

✅ Realized variance calculator implemented


In [15]:
# Data Download and Processing
print("📥 Downloading Real Market Data")
print("===============================")

assets = {
    'Equity': 'SPY',           # S&P 500 ETF
    'Tech Stock': 'AAPL',      # Apple stock
    'Currency': 'EURUSD=X',    # EUR/USD
    'Commodity': 'GC=F',       # Gold futures
    'Bond': 'TLT',             # 20+ Year Treasury ETF
    'Crypto': 'BTC-USD'        # Bitcoin
}


start_date = "2023-01-01"
end_date = "2024-10-22"  

print(f"Downloading data from {start_date} to {end_date}")

all_data = {}
failed_downloads = []

for asset_class, ticker in assets.items():
    print(f"Downloading {asset_class} ({ticker})...")
    
    try:
        
        daily_data = yf.download(ticker, start=start_date, end=end_date, interval='1d', progress=False)

        if not daily_data.empty and len(daily_data) > 50:
            print(f"  ✅ Daily: {len(daily_data)} bars")
            
            
            last_available_date = daily_data.index[-1]
            hf_start_date = (last_available_date - timedelta(days=30)).strftime('%Y-%m-%d')
            hf_end_date = last_available_date.strftime('%Y-%m-%d')
            
            try:
                
                hf_data = yf.download(ticker, start=hf_start_date, end=hf_end_date, interval='5m', progress=False)
                
                if hf_data.empty or len(hf_data) < 10:
                    
                    print(f"  ⚠️  No 5-min data, trying 1-hour data...")
                    hf_data = yf.download(ticker, start=hf_start_date, end=hf_end_date, interval='1h', progress=False)
                
            except Exception as hf_error:
                print(f"  ⚠️  High-frequency download failed: {str(hf_error)[:50]}")
                hf_data = None
            
            
            all_data[asset_class] = {
                'ticker': ticker,
                'daily': daily_data,
                'high_freq': hf_data
            }
            
            if hf_data is not None and not hf_data.empty:
                print(f"  ✅ HF: {len(hf_data)} bars")
            else:
                print(f"  ⚠️  No high-frequency data available")
                
        else:
            print(f"  ❌ Insufficient daily data: {len(daily_data) if not daily_data.empty else 0} bars")
            failed_downloads.append(asset_class)
            
    except Exception as e:
        print(f"  ❌ Download failed: {str(e)[:50]}")
        failed_downloads.append(asset_class)

print(f"\n✅ Successfully downloaded DAILY data for {len(all_data)}/{len(assets)} assets")

if all_data:
    print("\n📊 Data Summary:")
    print("================")
    for asset_class, data in all_data.items():
        ticker = data['ticker']
        daily_len = len(data['daily'])
        hf_len = len(data['high_freq']) if data['high_freq'] is not None else 0
        start_date = data['daily'].index[0].strftime('%Y-%m-%d')
        end_date = data['daily'].index[-1].strftime('%Y-%m-%d')
        
        hf_status = f"{hf_len} HF bars" if hf_len > 0 else "No HF data"
        print(f"  {asset_class} ({ticker}): {daily_len} daily bars, {hf_status}")
        
        try:
    
            close_prices = data['daily']['Close']
            returns = close_prices.pct_change().dropna()
            
            
            mean_return = returns.mean()
            std_return = returns.std()
            
            
            if hasattr(mean_return, 'iloc'):
                mean_return = mean_return.iloc[0]
            if hasattr(std_return, 'iloc'):
                std_return = std_return.iloc[0]
                
            print(f"        Returns: μ={float(mean_return):.4f}, σ={float(std_return):.4f}")
        except Exception as e:
            print(f"        Returns: Could not calculate - {str(e)[:50]}")

if failed_downloads:
    print(f"\n❌ Failed: {failed_downloads}")

📥 Downloading Real Market Data
Downloading data from 2023-01-01 to 2024-10-22
Downloading Equity (SPY)...
  ✅ Daily: 453 bars



1 Failed download:
['SPY']: YFPricesMissingError('possibly delisted; no price data found  (5m 2024-09-21 -> 2024-10-21) (Yahoo error = "5m data not available for startTime=1726891200 and endTime=1729483200. The requested range must be within the last 60 days.")')


  ⚠️  No 5-min data, trying 1-hour data...
  ✅ HF: 140 bars
Downloading Tech Stock (AAPL)...
  ✅ Daily: 453 bars



1 Failed download:
['AAPL']: YFPricesMissingError('possibly delisted; no price data found  (5m 2024-09-21 -> 2024-10-21) (Yahoo error = "5m data not available for startTime=1726891200 and endTime=1729483200. The requested range must be within the last 60 days.")')


  ⚠️  No 5-min data, trying 1-hour data...
  ✅ HF: 140 bars
Downloading Currency (EURUSD=X)...
  ✅ Daily: 471 bars



1 Failed download:
['EURUSD=X']: YFPricesMissingError('possibly delisted; no price data found  (5m 2024-09-21 -> 2024-10-21) (Yahoo error = "5m data not available for startTime=1726873200 and endTime=1729465200. The requested range must be within the last 60 days.")')


  ⚠️  No 5-min data, trying 1-hour data...
  ✅ HF: 476 bars
Downloading Commodity (GC=F)...
  ✅ Daily: 453 bars



1 Failed download:
['GC=F']: YFPricesMissingError('possibly delisted; no price data found  (5m 2024-09-21 -> 2024-10-21) (Yahoo error = "5m data not available for startTime=1726891200 and endTime=1729483200. The requested range must be within the last 60 days.")')


  ⚠️  No 5-min data, trying 1-hour data...
  ✅ HF: 466 bars
Downloading Bond (TLT)...
  ✅ Daily: 453 bars



1 Failed download:
['TLT']: YFPricesMissingError('possibly delisted; no price data found  (5m 2024-09-21 -> 2024-10-21) (Yahoo error = "5m data not available for startTime=1726891200 and endTime=1729483200. The requested range must be within the last 60 days.")')


  ⚠️  No 5-min data, trying 1-hour data...
  ✅ HF: 140 bars
Downloading Crypto (BTC-USD)...
  ✅ Daily: 660 bars



1 Failed download:
['BTC-USD']: YFPricesMissingError('possibly delisted; no price data found  (5m 2024-09-21 -> 2024-10-21) (Yahoo error = "5m data not available for startTime=1726876800 and endTime=1729468800. The requested range must be within the last 60 days.")')


  ⚠️  No 5-min data, trying 1-hour data...
  ✅ HF: 720 bars

✅ Successfully downloaded DAILY data for 6/6 assets

📊 Data Summary:
  Equity (SPY): 453 daily bars, 140 HF bars
        Returns: μ=0.0010, σ=0.0080
  Tech Stock (AAPL): 453 daily bars, 140 HF bars
        Returns: μ=0.0015, σ=0.0138
  Currency (EURUSD=X): 471 daily bars, 476 HF bars
        Returns: μ=0.0000, σ=0.0042
  Commodity (GC=F): 453 daily bars, 466 HF bars
        Returns: μ=0.0009, σ=0.0086
  Bond (TLT): 453 daily bars, 140 HF bars
        Returns: μ=-0.0000, σ=0.0104
  Crypto (BTC-USD): 660 daily bars, 720 HF bars
        Returns: μ=0.0024, σ=0.0254


In [16]:
# Calculate Realized Variance Benchmark
print("🎯 Calculating Realized Variance Benchmark")
print("=========================================")

rv_calculator = RealizedVariance()
benchmark_results = {}

for asset_class, data in all_data.items():
    print(f"Processing {asset_class}...")
    
    try:
        daily_rv = rv_calculator.calculate_daily_rv(data['high_freq'])
        annualized_rv = rv_calculator.annualize_rv(daily_rv)
        benchmark_results[asset_class] = {
            'daily_rv': daily_rv,
            'annualized_rv': annualized_rv,
            'mean_annualized_rv': np.mean(annualized_rv)
        }
        
        print(f"  ✅ Mean RV: {benchmark_results[asset_class]['mean_annualized_rv']:.4f}")
        
    except Exception as e:
        print(f"  ❌ Error calculating RV for {asset_class}: {e}")
        benchmark_results[asset_class] = None

print("\n📊 Realized Variance Summary:")
for asset_class, result in benchmark_results.items():
    if result is not None:
        print(f"  {asset_class}: {result['mean_annualized_rv']:.4f}")

🎯 Calculating Realized Variance Benchmark
Processing Equity...
  ✅ Mean RV: 0.0780
Processing Tech Stock...
  ✅ Mean RV: 0.1804
Processing Currency...
  ✅ Mean RV: 0.0476
Processing Commodity...
  ✅ Mean RV: 0.1003
Processing Bond...
  ✅ Mean RV: 0.1154
Processing Crypto...
  ✅ Mean RV: 0.3005

📊 Realized Variance Summary:
  Equity: 0.0780
  Tech Stock: 0.1804
  Currency: 0.0476
  Commodity: 0.1003
  Bond: 0.1154
  Crypto: 0.3005


In [None]:
# Calculate Estimators on Real Data
from volatility_estimators.core import VolatilityEstimators
print("🧮 Calculating Volatility Estimators on Real Data")
print("================================================")

window = 30
estimator = VolatilityEstimators(window=window)

real_world_results = {}

for asset_class, data in all_data.items():
    if asset_class not in benchmark_results or benchmark_results[asset_class] is None:
        continue
        
    print(f"Calculating estimators for {asset_class}...")
    
    daily_data = data['daily']
    results = {}
    
    test_period = daily_data.last('60D')
    
    if len(test_period) < window:
        print(f"  ❌ Insufficient data: {len(test_period)} days")
        continue
    
    estimators = {
        'Close-to-Close': lambda: estimator.close_to_close(test_period['Close']),
        'Parkinson': lambda: estimator.parkinson(test_period['High'], test_period['Low']),
        'Garman-Klass': lambda: estimator.garman_klass(test_period['Open'], test_period['High'], 
                                                      test_period['Low'], test_period['Close']),
        'Rogers-Satchell': lambda: estimator.rogers_satchell(test_period['Open'], test_period['High'],
                                                            test_period['Low'], test_period['Close']),
        'Yang-Zhang': lambda: estimator.yang_zhang(test_period['Open'], test_period['High'],
                                                  test_period['Low'], test_period['Close'])
    }
    
    for name, func in estimators.items():
        try:
            results[name] = func()
        except Exception as e:
            results[name] = np.nan
            print(f"  ❌ {name} failed: {e}")
    
    real_world_results[asset_class] = {
        'estimates': results,
        'benchmark': benchmark_results[asset_class]['mean_annualized_rv'],
        'data': test_period
    }
    
    print(f"  ✅ Benchmark RV: {benchmark_results[asset_class]['mean_annualized_rv']:.4f}")

print("\n✅ Real data analysis complete!")

🧮 Calculating Volatility Estimators on Real Data
Calculating estimators for Equity...
  ✅ Benchmark RV: 0.0780
Calculating estimators for Tech Stock...
  ✅ Benchmark RV: 0.1804
Calculating estimators for Currency...
  ✅ Benchmark RV: 0.0476
Calculating estimators for Commodity...
  ✅ Benchmark RV: 0.1003
Calculating estimators for Bond...
  ✅ Benchmark RV: 0.1154
Calculating estimators for Crypto...
  ✅ Benchmark RV: 0.3005

✅ Real data analysis complete!


In [18]:
# Performance Comparison on Real Data
print("📊 Performance on Real Market Data")
print("==================================")

# Calculate errors relative to realized variance
performance_real = []

for asset_class, results in real_world_results.items():
    # Extract scalar value from benchmark (if it's a Series)
    benchmark = results['benchmark']
    if hasattr(benchmark, 'iloc'):  # It's a pandas Series
        benchmark = benchmark.iloc[0] if len(benchmark) > 0 else np.nan
    elif hasattr(benchmark, 'item'):  # It's a numpy array
        benchmark = benchmark.item()
    
    # Skip if benchmark is not a valid number
    if benchmark is None or np.isnan(benchmark) or benchmark == 0:
        print(f"⚠️  Skipping {asset_class}: invalid benchmark {benchmark}")
        continue
    
    for estimator_name, estimate in results['estimates'].items():
        # Extract scalar value from estimate (if it's a Series)
        if hasattr(estimate, 'iloc'):  # It's a pandas Series
            estimate = estimate.iloc[0] if len(estimate) > 0 else np.nan
        elif hasattr(estimate, 'item'):  # It's a numpy array
            estimate = estimate.item()
        
        if estimate is not None and not np.isnan(estimate):
            error = estimate - benchmark
            error_pct = (error / benchmark) * 100
            
            performance_real.append({
                'Asset_Class': asset_class,
                'Estimator': estimator_name,
                'Estimate': estimate,
                'Benchmark': benchmark,
                'Error': error,
                'Error_Pct': error_pct,
                'Absolute_Error': np.abs(error)
            })

performance_df = pd.DataFrame(performance_real)

print(f"📋 Performance DataFrame shape: {performance_df.shape}")

if performance_df.empty:
    print("❌ No performance data to analyze!")
else:
    # Calculate summary statistics
    summary_stats = performance_df.groupby('Estimator').agg({
        'Absolute_Error': 'mean',
        'Error': 'mean',
        'Error_Pct': 'mean'
    }).round(4)

    print("📈 Performance Summary (vs Realized Variance):")
    print(summary_stats.sort_values('Absolute_Error'))

    # Best performer by asset class
    best_by_asset = performance_df.loc[performance_df.groupby('Asset_Class')['Absolute_Error'].idxmin()]
    print("\n🏆 Best Estimator by Asset Class:")
    print(best_by_asset[['Asset_Class', 'Estimator', 'Error_Pct']].round(2))

📊 Performance on Real Market Data
📋 Performance DataFrame shape: (30, 7)
📈 Performance Summary (vs Realized Variance):
                 Absolute_Error   Error  Error_Pct
Estimator                                         
Close-to-Close           0.0140  0.0138    11.3844
Parkinson                0.0197 -0.0074    -6.7015
Garman-Klass             0.0222 -0.0031    -1.9860
Rogers-Satchell          0.0228  0.0002     3.3065
Yang-Zhang               0.0270  0.0248    26.5660

🏆 Best Estimator by Asset Class:
   Asset_Class        Estimator  Error_Pct
20        Bond   Close-to-Close      -0.55
19   Commodity       Yang-Zhang      -6.65
25      Crypto   Close-to-Close       2.50
11    Currency        Parkinson      -1.41
0       Equity   Close-to-Close      19.70
8   Tech Stock  Rogers-Satchell      -0.42
