# Deep Learning Options Trading - Backtest Evaluation

This notebook evaluates the LSTM model's backtesting performance:
- Performance metrics and risk analysis
- Benchmark comparisons (buy-and-hold, momentum, mean-reversion)
- Factor attribution and capacity analysis
- Stress testing and robustness checks

In [None]:
import sys
import os
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pathlib import Path
import yaml
from scipy import stats
import warnings
# Suppress all warnings for the remainder of the session. This keeps cell output
# tidy but also hides deprecation/runtime warnings from pandas, numpy and matplotlib.
warnings.filterwarnings(action='ignore')

# Plot appearance: apply the matplotlib style sheet first, then let seaborn
# override the colour cycle (the order matters, the palette must be set last).
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Project configuration, read from the repository root one level above this notebook.
config_path = Path('../config.yaml')
with config_path.open('r') as config_file:
    config = yaml.safe_load(config_file)

## 1. Load Backtest Results

In [None]:
def format_backtest_period(results):
    """Return ``"<first date> to <last date>"`` for a backtest result mapping.

    Parameters
    ----------
    results : dict
        Parsed backtest results; the ``'dates'`` entry, if present, is a sequence.

    Returns
    -------
    str
        The first and last entries of ``results['dates']``. When the key is
        missing, ``None`` or an empty list, the text is ``"None to None"``
        instead of raising ``IndexError``/``TypeError``.
    """
    period_dates = results.get('dates') or [None]
    return f"{period_dates[0]} to {period_dates[-1]}"


# Load backtest results
try:
    with open('../results/backtest_results.json', 'r') as f:
        backtest_results = json.load(f)

    print("Backtest results loaded successfully")
    print(f"Backtest period: {format_backtest_period(backtest_results)}")

except FileNotFoundError:
    # Downstream cells test `if backtest_results:` and skip themselves when it is None.
    print("Backtest results not found. Run backtesting first.")
    backtest_results = None

## 2. Performance Metrics Analysis

In [None]:
if backtest_results:
    # Headline metrics pulled from the results file; any missing key falls back to 0.
    metrics = {
        'Total Return': backtest_results.get('total_return', 0),
        'Annual Return': backtest_results.get('annual_return', 0),
        'Annual Volatility': backtest_results.get('annual_volatility', 0),
        'Sharpe Ratio': backtest_results.get('sharpe_ratio', 0),
        'Max Drawdown': backtest_results.get('max_drawdown', 0),
        'Win Rate': backtest_results.get('win_rate', 0),
        'Total Trades': backtest_results.get('total_trades', 0),
        'Avg Daily Turnover': backtest_results.get('avg_daily_turnover', 0)
    }

    # Display metrics. Rates, returns and the drawdown are printed as percentages
    # (the summary cell also prints max drawdown with `.2%`); trade counts as
    # integers; everything else with two decimals.
    print("=== LSTM STRATEGY PERFORMANCE ===\n")
    for metric, value in metrics.items():
        if 'Rate' in metric or 'Return' in metric or 'Drawdown' in metric:
            print(f"{metric}: {value:.2%}")
        elif 'Trades' in metric:
            print(f"{metric}: {int(value)}")
        else:
            print(f"{metric}: {value:.2f}")

    # Check against research targets (if available)
    target_sharpe = 2.0  # Example target from research
    if metrics['Sharpe Ratio'] >= target_sharpe:
        print(f"\n Sharpe ratio ({metrics['Sharpe Ratio']:.2f}) meets or exceeds target ({target_sharpe:.2f})")
    else:
        print(f"\n Sharpe ratio ({metrics['Sharpe Ratio']:.2f}) below target ({target_sharpe:.2f})")

    # Risk-adjusted return analysis. Both denominators default to 0 when the key
    # is absent from the results file, so guard each division explicitly.
    print("\nRisk-Adjusted Return Analysis:")
    max_drawdown_size = abs(metrics['Max Drawdown'])
    annual_volatility = metrics['Annual Volatility']

    if max_drawdown_size > 0:
        print(f"Return/MaxDD Ratio: {metrics['Total Return'] / max_drawdown_size:.2f}")
    else:
        print("Return/MaxDD Ratio: n/a (max drawdown is zero or missing)")

    if annual_volatility > 0:
        print(f"Sharpe/Volatility: {metrics['Sharpe Ratio'] / annual_volatility:.4f}")
    else:
        print("Sharpe/Volatility: n/a (annual volatility is zero or missing)")

In [None]:
if backtest_results:
    # Portfolio value and returns visualization (2x2 grid):
    #   [0,0] portfolio value   [0,1] drawdown
    #   [1,0] return histogram  [1,1] rolling Sharpe ratio
    portfolio_values = backtest_results.get('portfolio_values', [])
    daily_returns = backtest_results.get('daily_returns', [])
    dates = pd.to_datetime(backtest_results.get('dates', []))

    # DatetimeIndex does not support `.any()` (pandas raises TypeError for
    # datetime-like indexes), so emptiness is checked by length instead.
    if portfolio_values and len(dates) > 0:
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        # Portfolio value over time
        axes[0,0].plot(dates, portfolio_values, linewidth=2, color='blue')
        axes[0,0].set_title('Portfolio Value Over Time')
        axes[0,0].set_xlabel('Date')
        axes[0,0].set_ylabel('Portfolio Value ($)')
        axes[0,0].grid(True, alpha=0.3)
        axes[0,0].tick_params(axis='x', rotation=45)

        # Drawdown chart: value relative to its running peak, as a fraction <= 0.
        cumulative = pd.Series(portfolio_values) / portfolio_values[0]
        running_max = cumulative.expanding().max()
        drawdowns = (cumulative - running_max) / running_max

        axes[0,1].fill_between(range(len(drawdowns)), 0, drawdowns, color='red', alpha=0.3)
        axes[0,1].set_title('Portfolio Drawdown')
        axes[0,1].set_xlabel('Time')
        axes[0,1].set_ylabel('Drawdown (%)')
        # Keep the -50% floor for shallow drawdowns but extend it when the worst
        # drawdown is deeper, so the trough is never clipped off the chart.
        # (-0.5 is listed first so a NaN minimum falls back to the default floor.)
        axes[0,1].set_ylim(bottom=min(-0.5, float(drawdowns.min()) * 1.05), top=0)
        axes[0,1].grid(True, alpha=0.3)

        if daily_returns:
            # Daily returns distribution
            mean_daily_return = np.mean(daily_returns)
            axes[1,0].hist(daily_returns, bins=50, alpha=0.7, color='green', edgecolor='black')
            axes[1,0].axvline(mean_daily_return, color='red', linestyle='--',
                            label=f'Mean: {mean_daily_return:.4%}')
            axes[1,0].set_title('Daily Returns Distribution')
            axes[1,0].set_xlabel('Daily Return')
            axes[1,0].set_ylabel('Frequency')
            axes[1,0].legend()
            axes[1,0].grid(True, alpha=0.3)

            # Rolling Sharpe ratio: rolling mean / rolling std, annualised with sqrt(252).
            # The first `rolling_window - 1` points are NaN and are simply not drawn.
            rolling_window = 252  # 1 year
            returns_series = pd.Series(daily_returns)
            rolling_sharpe = (returns_series.rolling(rolling_window).mean() /
                            returns_series.rolling(rolling_window).std()) * np.sqrt(252)
            overall_sharpe = backtest_results.get('sharpe_ratio', 0)

            axes[1,1].plot(dates, rolling_sharpe, linewidth=1, color='purple')
            axes[1,1].axhline(y=overall_sharpe, color='red', linestyle='--',
                            alpha=0.7, label=f'Overall: {overall_sharpe:.2f}')
            axes[1,1].set_title(f'Rolling Sharpe Ratio ({rolling_window} days)')
            axes[1,1].set_xlabel('Date')
            axes[1,1].set_ylabel('Sharpe Ratio')
            axes[1,1].legend()
            axes[1,1].grid(True, alpha=0.3)
            axes[1,1].tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.show()

## 3. Benchmark Comparison

In [None]:
if backtest_results:
    # Benchmark results are stored under the 'benchmarks' key, one mapping per benchmark.
    benchmarks = backtest_results.get('benchmarks', {})

    if benchmarks:
        print("=== BENCHMARK COMPARISON ===\n")

        # Comparison table: the LSTM strategy in the first row, then each benchmark.
        # Missing metrics fall back to 0.
        comparison_columns = {
            'Sharpe Ratio': 'sharpe_ratio',
            'Total Return': 'total_return',
            'Max Drawdown': 'max_drawdown',
            'Win Rate': 'win_rate',
        }
        comparison_data = {'Strategy': ['LSTM Model'] + list(benchmarks.keys())}
        for column_label, result_key in comparison_columns.items():
            comparison_data[column_label] = (
                [backtest_results.get(result_key, 0)]
                + [bench.get(result_key, 0) for bench in benchmarks.values()]
            )

        comparison_df = pd.DataFrame(comparison_data)
        print(comparison_df.to_string(index=False, float_format='%.2f'))

        # Statistical significance test (Diebold-Mariano).
        # NOTE: this is a simplified statistic: the mean of the daily return
        # differences divided by its standard error, compared against a standard
        # normal. It applies no autocorrelation correction.
        print("\nDiebold-Mariano Test Results:")
        lstm_returns = np.array(backtest_results.get('daily_returns', []))

        for bench_name, bench_results in benchmarks.items():
            bench_returns = np.array(bench_results.get('daily_returns', []))

            if len(lstm_returns) == 0 or len(bench_returns) == 0:
                print(f"LSTM vs {bench_name}: Cannot perform test (no daily returns available)")
                continue
            if len(lstm_returns) != len(bench_returns):
                print(f"LSTM vs {bench_name}: Cannot perform test (unequal lengths)")
                continue

            diff = lstm_returns - bench_returns
            diff_std = np.std(diff)
            # Identical (or constant-offset) series give zero variance; dividing
            # would yield inf/nan silently because warnings are suppressed.
            if not diff_std > 0:
                print(f"LSTM vs {bench_name}: Cannot perform test (return differences have zero variance)")
                continue

            dm_stat = np.mean(diff) / diff_std * np.sqrt(len(diff))
            p_value = 2 * (1 - stats.norm.cdf(abs(dm_stat)))  # two-sided

            print(f"LSTM vs {bench_name}: DM stat = {dm_stat:.2f}, p-value = {p_value:.4f}")
            if p_value < 0.05:
                print("  → Significant difference at 5% level")
            else:
                print("  → No significant difference at 5% level")
    else:
        print("No benchmark results available")

In [None]:
if backtest_results and benchmarks:
    # Four bar charts in a 2x2 grid, one metric per panel. In every panel the
    # first bar is the LSTM strategy (drawn in a darker shade), followed by one
    # bar per benchmark.
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    benchmark_names = list(benchmarks.keys())
    strategies = ['LSTM'] + benchmark_names

    def _lstm_then_benchmarks(result_key):
        """Return the LSTM value for `result_key` followed by each benchmark's value (0 if absent)."""
        lstm_value = backtest_results.get(result_key, 0)
        return [lstm_value] + [benchmarks[name].get(result_key, 0) for name in benchmark_names]

    sharpe_ratios = _lstm_then_benchmarks('sharpe_ratio')
    total_returns = _lstm_then_benchmarks('total_return')
    max_drawdowns = _lstm_then_benchmarks('max_drawdown')
    win_rates = _lstm_then_benchmarks('win_rate')

    # (axis, values, title, y-label, initial LSTM colour, benchmark colour, final LSTM colour)
    panel_specs = [
        (axes[0,0], sharpe_ratios, 'Sharpe Ratio Comparison', 'Sharpe Ratio',
         'blue', 'gray', 'darkblue'),
        (axes[0,1], total_returns, 'Total Return Comparison', 'Total Return (%)',
         'blue', 'gray', 'darkblue'),
        (axes[1,0], max_drawdowns, 'Maximum Drawdown Comparison', 'Max Drawdown (%)',
         'red', 'lightcoral', 'darkred'),
        (axes[1,1], win_rates, 'Win Rate Comparison', 'Win Rate (%)',
         'green', 'lightgreen', 'darkgreen'),
    ]

    for panel_ax, panel_values, panel_title, panel_ylabel, lead_color, bench_color, lstm_color in panel_specs:
        bars = panel_ax.bar(strategies, panel_values,
                            color=[lead_color] + [bench_color] * len(benchmarks))
        panel_ax.set_title(panel_title)
        panel_ax.set_ylabel(panel_ylabel)
        panel_ax.tick_params(axis='x', rotation=45)
        panel_ax.grid(True, alpha=0.3)
        # Highlight the LSTM bar
        bars[0].set_color(lstm_color)

    plt.tight_layout()
    plt.show()

## 4. Capacity Analysis

In [None]:
# Capacity analysis - test performance with increasing capital
if backtest_results:
    print("=== CAPACITY ANALYSIS ===\n")

    # Simulate different capital levels
    capital_levels = [1e6, 5e6, 10e6, 50e6, 100e6]  # $1M to $100M
    capacity_sharpe_floor = 1.0  # Sharpe below this is reported as the capacity limit
    capacity_results = []

    base_sharpe = backtest_results.get('sharpe_ratio', 0)
    base_return = backtest_results.get('total_return', 0)
    # Number of daily observations, used to annualise the decayed total return.
    n_return_days = len(backtest_results.get('daily_returns', []))

    for capital in capital_levels:
        # Simplified capacity decay model
        # In practice, this would require re-running backtest with different position sizes

        # Assume Sharpe decays with sqrt(capital) due to market impact
        decay_factor = np.sqrt(capital_levels[0] / capital)  # Relative to $1M
        decayed_sharpe = base_sharpe * decay_factor

        # Assume returns shrink by 1% of the base return per unit of log(capital / $1M)
        impact_cost = 0.01 * np.log(capital / 1e6)
        decayed_return = base_return * (1 - impact_cost)

        # Annualise over the backtest length; without any daily returns the
        # exponent 252 / 0 is undefined, so report NaN instead of crashing.
        if n_return_days > 0:
            annual_return = (1 + decayed_return) ** (252 / n_return_days) - 1
        else:
            annual_return = float('nan')

        capacity_results.append({
            'capital': capital,
            'sharpe_ratio': decayed_sharpe,
            'total_return': decayed_return,
            'annual_return': annual_return
        })

    capacity_df = pd.DataFrame(capacity_results)
    print("Capacity Analysis Results:")
    print(capacity_df.to_string(index=False, float_format='%.2f'))

    # Plot capacity decay
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    axes[0].plot(capacity_df['capital'], capacity_df['sharpe_ratio'], 'bo-', linewidth=2, markersize=8)
    axes[0].set_xlabel('Capital ($)')
    axes[0].set_ylabel('Sharpe Ratio')
    axes[0].set_title('Sharpe Ratio vs Capital')
    axes[0].set_xscale('log')
    axes[0].grid(True, alpha=0.3)

    axes[1].plot(capacity_df['capital'], capacity_df['annual_return'], 'ro-', linewidth=2, markersize=8)
    axes[1].set_xlabel('Capital ($)')
    axes[1].set_ylabel('Annual Return (%)')
    axes[1].set_title('Annual Return vs Capital')
    axes[1].set_xscale('log')
    axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Estimate optimal capital. idxmax returns an index *label*, which is what
    # `.loc` expects (np.argmax would return a position instead).
    optimal_idx = capacity_df['sharpe_ratio'].idxmax()
    optimal_capital = capacity_df.loc[optimal_idx, 'capital']
    optimal_sharpe = capacity_df.loc[optimal_idx, 'sharpe_ratio']

    print(f"\nEstimated optimal capital: ${optimal_capital:,.0f}")
    print(f"Optimal Sharpe ratio: {optimal_sharpe:.2f}")

    # Smallest tested capital whose modelled Sharpe drops below the floor. When
    # no level does, say so rather than printing "$nan".
    capital_below_floor = capacity_df.loc[capacity_df['sharpe_ratio'] < capacity_sharpe_floor, 'capital']
    if capital_below_floor.empty:
        print(f"Capacity limit not reached: Sharpe stays at or above {capacity_sharpe_floor:.1f} "
              f"up to ${capacity_df['capital'].max():,.0f}")
    else:
        print(f"Capacity limit reached at: ${capital_below_floor.min():,.0f}")

## 5. Stress Testing

In [None]:
# Stress testing during market crises
# Build the date index here instead of relying on `dates` left behind by the
# plotting cell, so this cell runs on its own after the results are loaded.
stress_dates = (
    pd.to_datetime(backtest_results.get('dates', []))
    if backtest_results else pd.DatetimeIndex([])
)

# `DatetimeIndex.any()` raises TypeError in pandas; check emptiness by length.
if backtest_results and len(stress_dates) > 0:
    print("=== STRESS TESTING ===\n")

    # Daily returns indexed by date so each stress window can be sliced by label.
    daily_returns = pd.Series(backtest_results.get('daily_returns', []), index=stress_dates)

    # Define stress periods
    stress_periods = {
        'COVID-19 Crash': ('2020-02-01', '2020-04-30'),
        'Financial Crisis': ('2008-09-01', '2009-03-31'),
        'Tech Bubble': ('2000-03-01', '2000-10-31')
    }

    stress_results = []

    for period_name, (start_date, end_date) in stress_periods.items():
        try:
            period_returns = daily_returns.loc[start_date:end_date]
        except KeyError:
            print(f"{period_name}: Period not in backtest data\n")
            continue

        if len(period_returns) == 0:
            print(f"{period_name}: No data available\n")
            continue

        # Annualised Sharpe; undefined (NaN) when the window has a single
        # observation or zero variance (the comparison is False for NaN std too).
        period_std = period_returns.std()
        if period_std > 0:
            period_sharpe = (period_returns.mean() / period_std) * np.sqrt(252)
        else:
            period_sharpe = float('nan')

        period_return = (1 + period_returns).prod() - 1

        # Max drawdown as a negative fraction: equity relative to its running
        # peak, minus one. The peak is floored at 1.0 (the starting equity) so a
        # loss on the first day of the window counts as a drawdown. The previous
        # form `(1 - equity / peak).min()` was always 0.
        equity_curve = (1 + period_returns).cumprod()
        running_peak = equity_curve.cummax().clip(lower=1.0)
        max_dd = (equity_curve / running_peak - 1).min()

        stress_results.append({
            'period': period_name,
            'sharpe_ratio': period_sharpe,
            'total_return': period_return,
            'max_drawdown': max_dd,
            'days': len(period_returns)
        })

        print(f"{period_name}:")
        print(f"  Sharpe Ratio: {period_sharpe:.2f}")
        print(f"  Total Return: {period_return:.2%}")
        print(f"  Max Drawdown: {max_dd:.2%}")
        print(f"  Days: {len(period_returns)}")
        print()

    # Overall stress test summary
    if stress_results:
        stress_df = pd.DataFrame(stress_results)
        # Periods with an undefined Sharpe are excluded from the Sharpe statistics.
        defined_sharpes = stress_df['sharpe_ratio'].dropna()

        print("Stress Test Summary:")
        if defined_sharpes.empty:
            print("Worst Sharpe ratio: n/a (no period has a defined Sharpe ratio)")
        else:
            print(f"Worst Sharpe ratio: {defined_sharpes.min():.2f} ({stress_df.loc[defined_sharpes.idxmin(), 'period']})")
        print(f"Worst drawdown: {stress_df['max_drawdown'].min():.2%} ({stress_df.loc[stress_df['max_drawdown'].idxmin(), 'period']})")
        if not defined_sharpes.empty:
            print(f"Average stress Sharpe: {defined_sharpes.mean():.2f}")

        # Check if strategy survives stress periods
        surviving_periods = stress_df[stress_df['sharpe_ratio'] > 0]['period'].tolist()
        failing_periods = stress_df[stress_df['sharpe_ratio'] <= 0]['period'].tolist()

        print(f"\nSurviving periods: {', '.join(surviving_periods) if surviving_periods else 'None'}")
        print(f"Failing periods: {', '.join(failing_periods) if failing_periods else 'None'}")
    else:
        print("No stress periods available for testing")

## 6. Transaction Cost Analysis

In [None]:
# Transaction cost sensitivity analysis
if backtest_results:
    print("=== TRANSACTION COST ANALYSIS ===\n")

    # Test different cost levels
    cost_levels = [0.005, 0.01, 0.02, 0.05]  # 0.5% to 5%
    cost_sensitivity = []

    base_sharpe = backtest_results.get('sharpe_ratio', 0)
    base_turnover = backtest_results.get('avg_daily_turnover', 0)

    for cost_pct in cost_levels:
        # Estimate cost drag on Sharpe ratio
        # Simplified: the drag is turnover times the cost level and is subtracted
        # from the base Sharpe. Written as a plain subtraction (the same value as
        # base * (1 - drag / base)) so that a zero or negative base Sharpe is
        # reported as-is instead of being forced to 0.
        cost_drag = base_turnover * cost_pct
        adjusted_sharpe = base_sharpe - cost_drag

        cost_sensitivity.append({
            'cost_pct': cost_pct,
            'adjusted_sharpe': adjusted_sharpe,
            'cost_drag': cost_drag
        })

        print(f"Cost level {cost_pct:.1%}: Adjusted Sharpe = {adjusted_sharpe:.2f}")

    # Plot cost sensitivity
    cost_df = pd.DataFrame(cost_sensitivity)

    plt.figure(figsize=(10, 6))
    plt.plot(cost_df['cost_pct'] * 100, cost_df['adjusted_sharpe'], 'ro-', linewidth=2, markersize=8)
    plt.axhline(y=base_sharpe, color='blue', linestyle='--', alpha=0.7,
                label=f'Base Sharpe: {base_sharpe:.2f}')
    plt.xlabel('Transaction Cost (% of trade value)')
    plt.ylabel('Adjusted Sharpe Ratio')
    plt.title('Sharpe Ratio vs Transaction Costs')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    # Breakeven analysis: the cost level at which the modelled drag equals the base Sharpe.
    breakeven_cost = base_sharpe / base_turnover if base_turnover > 0 else float('inf')
    print(f"\nBreakeven cost level: {breakeven_cost:.2%}")

    # The config entry is optional here: report it when present rather than
    # failing the whole cell on a missing key.
    cost_per_contract = ((config or {}).get('backtest') or {}).get('transaction_cost_per_contract')
    if cost_per_contract is not None:
        print(f"Current cost assumption: {cost_per_contract * 100:.1f}¢ per contract")
    else:
        print("Current cost assumption: not set (config['backtest']['transaction_cost_per_contract'] missing)")

    if breakeven_cost > 0.01:  # 1%
        print(" Strategy robust to typical transaction costs")
    else:
        print(" Strategy sensitive to transaction costs")

## 7. Backtest Evaluation Summary

In [None]:
# Generate comprehensive evaluation summary
if backtest_results:
    print("=== BACKTEST EVALUATION SUMMARY ===\n")

    # Overall assessment
    sharpe_ratio = backtest_results.get('sharpe_ratio', 0)
    total_return = backtest_results.get('total_return', 0)
    max_drawdown = backtest_results.get('max_drawdown', 0)

    print("Strategy Performance:")
    print(f"- Sharpe Ratio: {sharpe_ratio:.2f}")
    print(f"- Total Return: {total_return:.2%}")
    print(f"- Max Drawdown: {max_drawdown:.2%}")
    print(f"- Win Rate: {backtest_results.get('win_rate', 0):.2%}")

    # Benchmark comparison. Read the benchmarks from the results directly so this
    # cell does not depend on a variable left behind by an earlier cell.
    benchmarks = backtest_results.get('benchmarks', {})
    outperformance = None  # stays None when there is nothing to compare against
    if benchmarks:
        best_benchmark_sharpe = max([bench.get('sharpe_ratio', 0) for bench in benchmarks.values()])
        outperformance = sharpe_ratio - best_benchmark_sharpe
        print(f"\nBenchmark Outperformance: {outperformance:.2f} Sharpe points")

        if outperformance > 0.5:
            print(" Strong outperformance vs benchmarks")
        elif outperformance > 0:
            print(" Moderate outperformance vs benchmarks")
        else:
            print(" Underperformance vs benchmarks")

    # Risk assessment. Only the drawdown is taken in absolute value so that a
    # negative total return yields a negative ratio (same form as the
    # Return/MaxDD figure in the metrics cell).
    return_to_dd = total_return / abs(max_drawdown) if max_drawdown != 0 else float('inf')
    print(f"\nRisk Metrics:")
    print(f"- Return/MaxDD Ratio: {return_to_dd:.2f}")
    print(f"- Annual Turnover: {backtest_results.get('avg_daily_turnover', 0) * 252:.2f}")

    # Capacity assessment (only if the capacity cell has been run)
    if 'capacity_df' in locals():
        optimal_capital = capacity_df.loc[capacity_df['sharpe_ratio'].idxmax(), 'capital']
        print(f"\nCapacity Estimate: ${optimal_capital:,.0f} optimal capital")

    # Stress test results (only if the stress cell produced a table)
    stress_survival_rate = None  # stays None when no stress period was evaluated
    if 'stress_df' in locals() and not stress_df.empty:
        stress_survival_rate = (stress_df['sharpe_ratio'] > 0).mean()
        print(f"\nStress Test Survival Rate: {stress_survival_rate:.1%}")

    # Overall recommendation
    print("\n=== FINAL ASSESSMENT ===")

    # One point per criterion met. Criteria whose inputs are unavailable (no
    # benchmarks, no stress data) score nothing instead of raising NameError.
    score = 0
    if sharpe_ratio > 1.5:
        score += 1
    if return_to_dd > 1.0:
        score += 1
    if outperformance is not None and outperformance > 0:
        score += 1
    if stress_survival_rate is not None and stress_survival_rate > 0.5:
        score += 1

    if score >= 3:
        print("🟢 RECOMMEND: Strong candidate for live trading")
    elif score >= 2:
        print("🟡 CAUTION: Moderate performance, further testing recommended")
    else:
        print(" REJECT: Insufficient performance or risk-adjusted returns")

    print("\n=== EVALUATION COMPLETE ===")
else:
    print("Backtest results not available for evaluation.")