# Backtest Evaluation: Strategy Performance Analysis

Evaluate strategy performance, compare to benchmarks, and validate failure modes.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from data_acquisition import DataAcquisition
from feature_engineering import FeatureEngineer
from ml_model import CryptoMLModel
from backtester import CryptoBacktester

# Apply matplotlib's 'seaborn-v0_8-darkgrid' style sheet to every figure below.
plt.style.use('seaborn-v0_8-darkgrid')
# IPython magic (notebook-only, not plain Python): draw figures inline in the cell output.
%matplotlib inline

# Run the whole pipeline once: raw data -> features/target -> walk-forward
# model validation -> strategy backtest -> buy-and-hold benchmark.
CONFIG_PATH = 'config.yaml'  # every pipeline stage is constructed from this file

# Raw dataset
data_acq = DataAcquisition(CONFIG_PATH)
dataset = data_acq.fetch_full_dataset()

# Model inputs
engineer = FeatureEngineer(CONFIG_PATH)
features = engineer.engineer_all_features(
    dataset['prices'],
    dataset['events'],
)
target = engineer.create_target_variable(dataset['prices'])

# Walk-forward validation with hyper-parameter optimisation switched off
model = CryptoMLModel(CONFIG_PATH)
ml_results = model.walk_forward_validation(
    features,
    target,
    optimize_hyperparams=False,
)

# Strategy backtest driven by the walk-forward results
backtester = CryptoBacktester(CONFIG_PATH)
strategy_results = backtester.run_backtest(
    ml_results['results_df'],
    dataset['prices'],
    features,
)

# Benchmark: buy-and-hold between the first and last row of the results frame
results_index = ml_results['results_df'].index
start_date, end_date = results_index[0], results_index[-1]
benchmark_results = backtester.backtest_buy_and_hold(
    dataset['prices'],
    start_date,
    end_date,
)

print("Data and backtests loaded successfully.")

## 1. Cumulative Returns Comparison

In [None]:
def _cumulative_return_pct(portfolio_value):
    """Return the percentage gain of every point relative to the first value."""
    return (portfolio_value / portfolio_value.iloc[0] - 1) * 100


# Convert both equity curves into cumulative percentage returns.
strategy_pv = strategy_results['backtest_df']['portfolio_value']
benchmark_pv = benchmark_results['backtest_df']['portfolio_value']
strategy_returns = _cumulative_return_pct(strategy_pv)
benchmark_returns = _cumulative_return_pct(benchmark_pv)

fig, ax = plt.subplots(figsize=(14, 6))

# (series, line styling) in draw order: strategy first, benchmark on top of it.
curves = [
    (strategy_returns, {'label': 'Strategy', 'color': 'darkblue'}),
    (benchmark_returns, {'label': 'BTC Buy-and-Hold', 'color': 'orange', 'alpha': 0.7}),
]
for series, line_style in curves:
    ax.plot(series.index, series, linewidth=2, **line_style)

# Break-even reference line
ax.axhline(0, color='black', linestyle='--', alpha=0.5)

ax.set_title('Cumulative Returns: Strategy vs Buy-and-Hold', fontsize=14, fontweight='bold')
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Cumulative Return (%)', fontsize=12)
ax.grid(True, alpha=0.3)
ax.legend(loc='best')
plt.tight_layout()
plt.show()

# The last point of each curve is the total return over the backtest.
final_strategy = strategy_returns.iloc[-1]
final_benchmark = benchmark_returns.iloc[-1]
print("\nFinal Returns:")
print(f"  Strategy: {final_strategy:.2f}%")
print(f"  Buy-and-Hold: {final_benchmark:.2f}%")

## 2. Performance Metrics Comparison

In [None]:
# One column per backtest, one row per metric.
comparison_df = pd.DataFrame({
    'Strategy': strategy_results['metrics'],
    'BTC_Buy_Hold': benchmark_results['metrics']
})

# Transposed copy for printing: every metric except the day count is
# rendered with four decimals.
display_df = comparison_df.T
four_decimals = "{:.4f}".format
for col in display_df.columns:
    if col == 'n_days':
        continue
    display_df[col] = display_df[col].apply(four_decimals)

rule = "=" * 80
print("\n" + rule)
print("PERFORMANCE METRICS COMPARISON")
print(rule)
print(display_df.to_string())

# Bar charts: one panel per headline metric, strategy next to benchmark.
metrics_to_plot = ['sharpe_ratio', 'sortino_ratio', 'calmar_ratio', 'win_rate']
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for panel, metric in zip(axes, metrics_to_plot):
    pretty_name = metric.replace('_', ' ').title()
    bar_heights = [
        strategy_results['metrics'][metric],
        benchmark_results['metrics'][metric],
    ]

    panel.bar(['Strategy', 'Buy-and-Hold'], bar_heights,
              color=['darkblue', 'orange'], alpha=0.7)
    panel.set_ylabel(pretty_name, fontsize=11)
    panel.set_title(pretty_name, fontsize=12, fontweight='bold')
    panel.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## 3. Drawdown Analysis

In [None]:
# Drawdowns scaled to percent. They are negated when plotted so the shaded
# areas hang below the zero line.
strategy_dd = strategy_results['backtest_df']['drawdown'] * 100
benchmark_dd = benchmark_results['backtest_df']['drawdown'] * 100

fig, ax = plt.subplots(figsize=(14, 6))

for dd_pct, opacity, legend_text, colour in (
    (strategy_dd, 0.5, 'Strategy', 'darkblue'),
    (benchmark_dd, 0.3, 'BTC Buy-and-Hold', 'orange'),
):
    ax.fill_between(dd_pct.index, 0, -dd_pct, alpha=opacity, label=legend_text, color=colour)

# Reference line at the 10% level used for the stop-loss check below
ax.axhline(-10, color='red', linestyle='--', alpha=0.5, label='10% Stop-Loss Threshold')

ax.set_title('Drawdown Analysis', fontsize=14, fontweight='bold')
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Drawdown (%)', fontsize=12)
ax.grid(True, alpha=0.3)
ax.legend(loc='best')
plt.tight_layout()
plt.show()

print("\nMax Drawdown:")
for who, result in (('Strategy', strategy_results), ('Buy-and-Hold', benchmark_results)):
    print(f"  {who}: {result['metrics']['max_drawdown']:.2%}")

# Stop-loss check: keep only the rows where the strategy drawdown is above 10%.
breaches = strategy_dd[strategy_dd > 10]
stop_loss_triggered = not breaches.empty
if not stop_loss_triggered:
    print("\n✓ No stop-loss triggered")
else:
    first_trigger = breaches.index[0]
    print(f"\n⚠️  Stop-loss triggered on {first_trigger}")

## 4. Position Evolution

In [None]:
# Position series recorded by the strategy backtest.
positions = strategy_results['backtest_df']['position']

fig, axes = plt.subplots(2, 1, figsize=(14, 10), sharex=True)
timeline_ax, hist_ax = axes

# --- Top panel: position through time, with flat / 1x / 2x guide lines ---
timeline_ax.plot(positions.index, positions, linewidth=1.5, color='darkgreen')
guide_lines = [
    (0, 'black', {}),
    (1, 'blue', {'label': '1x Leverage'}),
    (2, 'red', {'label': '2x Max Leverage'}),
    (-1, 'blue', {}),
    (-2, 'red', {}),
]
for level, colour, extra in guide_lines:
    timeline_ax.axhline(level, color=colour, linestyle='--', alpha=0.5, **extra)
timeline_ax.set_ylabel('Position Size', fontsize=12)
timeline_ax.set_title('Position Evolution Over Time', fontsize=13, fontweight='bold')
timeline_ax.legend(loc='best')
timeline_ax.grid(True, alpha=0.3)

# --- Bottom panel: distribution of position sizes ---
hist_ax.hist(positions, bins=50, alpha=0.7, color='steelblue', edgecolor='black')
hist_ax.axvline(0, color='black', linestyle='--', alpha=0.5)
hist_ax.set_xlabel('Position Size', fontsize=12)
hist_ax.set_ylabel('Frequency', fontsize=12)
hist_ax.set_title('Position Size Distribution', fontsize=13, fontweight='bold')
hist_ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

# Row counts by direction, and by how close |position| is to the 1x / 2x levels.
exposure = np.abs(positions)
long_days = (positions > 0).sum()
short_days = (positions < 0).sum()
neutral_days = (positions == 0).sum()
days_near_2x = (exposure >= 1.9).sum()
days_near_1x = ((exposure >= 0.9) & (exposure < 1.1)).sum()

print("\nPosition Statistics:")
print(f"  Mean Position: {positions.mean():.4f}")
print(f"  Max Long: {positions.max():.4f}")
print(f"  Max Short: {positions.min():.4f}")
print(f"  Long Days: {long_days}")
print(f"  Short Days: {short_days}")
print(f"  Neutral Days: {neutral_days}")
print(f"\n  At 2x Leverage: {days_near_2x} days")
print(f"  At 1x Leverage: {days_near_1x} days")

## 5. Transaction Cost Impact

In [None]:
# Per-period return columns from the strategy backtest.
backtest_frame = strategy_results['backtest_df']
gross_returns = backtest_frame['gross_return']
net_returns = backtest_frame['net_return']
transaction_costs = backtest_frame['transaction_cost']

# Compound the per-period returns into growth factors.
cumulative_gross = (1 + gross_returns).cumprod()
cumulative_net = (1 + net_returns).cumprod()

fig, axes = plt.subplots(2, 1, figsize=(14, 10), sharex=True)
returns_ax, costs_ax = axes

# --- Top panel: gross vs net cumulative return ---
for growth, legend_text, colour in (
    (cumulative_gross, 'Gross Returns', 'darkblue'),
    (cumulative_net, 'Net Returns', 'darkgreen'),
):
    returns_ax.plot(growth.index, (growth - 1) * 100, label=legend_text, linewidth=2, color=colour)
returns_ax.set_ylabel('Cumulative Return (%)', fontsize=12)
returns_ax.set_title('Gross vs Net Returns', fontsize=13, fontweight='bold')
returns_ax.legend(loc='best')
returns_ax.grid(True, alpha=0.3)

# --- Bottom panel: running sum of transaction costs, in percent ---
costs_ax.plot(transaction_costs.index, transaction_costs.cumsum() * 100, linewidth=2, color='darkred')
costs_ax.set_xlabel('Date', fontsize=12)
costs_ax.set_ylabel('Cumulative Transaction Costs (%)', fontsize=12)
costs_ax.set_title('Cumulative Transaction Costs', fontsize=13, fontweight='bold')
costs_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Gross Sharpe: mean and standard deviation annualised with a factor of 252.
# NOTE(review): the net figure comes from the backtester's own 'sharpe_ratio'
# metric; confirm it uses the same annualisation so the difference is meaningful.
annualised_mean = gross_returns.mean() * 252
annualised_vol = gross_returns.std() * np.sqrt(252)
gross_sharpe = annualised_mean / annualised_vol
net_sharpe = strategy_results['metrics']['sharpe_ratio']

print("\nTransaction Cost Impact:")
print(f"  Total Costs: {transaction_costs.sum():.4%}")
print(f"  Gross Sharpe: {gross_sharpe:.4f}")
print(f"  Net Sharpe: {net_sharpe:.4f}")
print(f"  Sharpe Degradation: {gross_sharpe - net_sharpe:.4f}")

## 6. Statistical Tests

In [None]:
# Significance test: does the strategy's daily net return exceed buy-and-hold's?
#
# The two return series are observed on the same dates, so they are paired,
# not independent samples. The comparison therefore:
#   1. aligns both series on their common index instead of relying on the
#      raw arrays happening to line up,
#   2. drops dates where either side is NaN (one NaN would otherwise turn
#      the whole test into NaN and silently report "not significant"),
#   3. runs a one-sided paired t-test on the per-date differences.
strategy_net = strategy_results['backtest_df']['net_return']
benchmark_net = benchmark_results['backtest_df']['net_return']

common_dates = strategy_net.index.intersection(benchmark_net.index)
strategy_aligned = strategy_net.loc[common_dates]
benchmark_aligned = benchmark_net.loc[common_dates]
complete_pairs = (strategy_aligned.notna() & benchmark_aligned.notna()).values

# Same names as before, now guaranteed equal-length and NaN-free.
strategy_returns = strategy_aligned.values[complete_pairs]
benchmark_returns = benchmark_aligned.values[complete_pairs]
n_pairs = len(strategy_returns)

# One-sided paired t-test (H1: mean(strategy - benchmark) > 0).
# A t-test needs at least two observations; report NaN rather than fail.
if n_pairs >= 2:
    t_stat, p_value = stats.ttest_rel(strategy_returns, benchmark_returns, alternative='greater')
else:
    t_stat, p_value = np.nan, np.nan
# A NaN p-value compares False, so an untestable sample is never "significant".
is_significant = bool(p_value < 0.05)

print("\n" + "="*60)
print("STATISTICAL TEST: Strategy vs Buy-and-Hold")
print("="*60)
print(f"\nOne-sided t-test (Strategy > Benchmark):")
print(f"  Paired observations: {n_pairs}")
print(f"  t-statistic: {t_stat:.4f}")
print(f"  p-value: {p_value:.4f}")
print(f"  Significant (α=0.05): {'✓ YES' if is_significant else '✗ NO'}")
print(f"\n  Mean Strategy Return: {strategy_returns.mean():.6f}")
print(f"  Mean Benchmark Return: {benchmark_returns.mean():.6f}")
print(f"  Mean Difference: {strategy_returns.mean() - benchmark_returns.mean():.6f}")

if is_significant:
    print("\n✓ Strategy significantly outperforms buy-and-hold at 95% confidence level.")
else:
    print("\n✗ No significant outperformance detected.")

## 7. Stress Test Analysis

In [None]:
# Define stress periods
stress_periods = [
    ('COVID', '2020-03-01', '2020-04-01'),
    ('Fed Tightening', '2022-01-01', '2022-12-31'),
    ('FTX Collapse', '2022-11-01', '2022-12-01')
]

print("\n" + "="*80)
print("STRESS TEST RESULTS")
print("="*80)

for name, start, end in stress_periods:
    strategy_period = strategy_results['backtest_df'].loc[start:end]
    benchmark_period = benchmark_results['backtest_df'].loc[start:end]
    
    if len(strategy_period) > 0 and len(benchmark_period) > 0:
        strategy_ret = ((strategy_period['portfolio_value'].iloc[-1] / strategy_period['portfolio_value'].iloc[0]) - 1) * 100
        benchmark_ret = ((benchmark_period['portfolio_value'].iloc[-1] / benchmark_period['portfolio_value'].iloc[0]) - 1) * 100
        
        print(f"\n{name} ({start} to {end}):")
        print(f"  Strategy Return: {strategy_ret:.2f}%")
        print(f"  Benchmark Return: {benchmark_ret:.2f}%")
        print(f"  Outperformance: {strategy_ret - benchmark_ret:.2f}%")
        print(f"  Strategy Max DD: {strategy_period['drawdown'].max():.2%}")
        print(f"  Benchmark Max DD: {benchmark_period['drawdown'].max():.2%}")

## 8. Capacity Analysis

In [None]:
# Capacity table produced by the backtester from the strategy backtest frame.
capacity_df = backtester.capacity_analysis(strategy_results['backtest_df'])

rule = "=" * 80
print("\n" + rule)
print("CAPACITY ANALYSIS")
print(rule)
print(capacity_df.to_string(index=False))

# Both panels share the same x-axis: capital expressed in millions.
capital_millions = capacity_df['capital'] / 1e6
baseline_sharpe = strategy_results['metrics']['sharpe_ratio']

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
sharpe_ax, degradation_ax = axes

# --- Left panel: Sharpe ratio at each capital level vs the baseline ---
sharpe_ax.plot(capital_millions, capacity_df['sharpe_ratio'], marker='o', linewidth=2, markersize=8)
sharpe_ax.axhline(baseline_sharpe, color='green', linestyle='--', alpha=0.7, label='Baseline')
sharpe_ax.set_xlabel('Capital (Millions USD)', fontsize=12)
sharpe_ax.set_ylabel('Sharpe Ratio', fontsize=12)
sharpe_ax.set_title('Sharpe Ratio vs Capital', fontsize=13, fontweight='bold')
sharpe_ax.legend(loc='best')
sharpe_ax.grid(True, alpha=0.3)

# --- Right panel: degradation percentage with the 10% guide line ---
degradation_ax.plot(capital_millions, capacity_df['degradation_pct'], marker='s', linewidth=2, markersize=8, color='darkred')
degradation_ax.axhline(10, color='orange', linestyle='--', alpha=0.7, label='10% Degradation Threshold')
degradation_ax.set_xlabel('Capital (Millions USD)', fontsize=12)
degradation_ax.set_ylabel('Sharpe Degradation (%)', fontsize=12)
degradation_ax.set_title('Sharpe Degradation vs Capital', fontsize=13, fontweight='bold')
degradation_ax.legend(loc='best')
degradation_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Capacity limit: the first table row whose degradation is above 10%.
threshold_exceeded = capacity_df[capacity_df['degradation_pct'] > 10]
if threshold_exceeded.empty:
    print(f"\n✓ No significant degradation detected up to ${capacity_df['capital'].max() / 1e6:.1f}M")
else:
    capacity_limit = threshold_exceeded.iloc[0]['capital']
    print(f"\n⚠️  Strategy capacity (10% Sharpe degradation): ${capacity_limit / 1e6:.1f}M")

## 9. Failure Mode Validation

In [None]:
rule = "=" * 80
print("\n" + rule)
print("FAILURE MODE VALIDATION")
print(rule)

# 1. Regime change: 126-row rolling correlation between the macro feature and
#    BTC returns. Runs only when the 'external_macro' feature exists.
if 'external_macro' in features.columns:
    btc_returns = dataset['prices']['BTC-USD'].pct_change()
    common_idx = features.index.intersection(btc_returns.index)
    macro_signal = features.loc[common_idx, 'external_macro']
    rolling_corr = macro_signal.rolling(126).corr(btc_returns.loc[common_idx])

    # Number of rows where the correlation magnitude is below 0.1
    low_corr_periods = (rolling_corr.abs() < 0.1).sum()
    print("\n1. Regime Change Detection:")
    print(f"   6-month rolling correlation (macro vs BTC) < 0.1: {low_corr_periods} days")
    print(f"   Minimum correlation: {rolling_corr.min():.4f}")
    print(
        "   ⚠️  Significant regime change detected - consider pausing macro component"
        if low_corr_periods > 20
        else "   ✓ Macro signal remains correlated with BTC"
    )

# 2. Stablecoin peg stability: nothing is computed here, only a reminder.
print("\n2. Stablecoin Peg Stability:")
print("   ✓ Using placeholder data - implement real-time peg monitoring in production")

# 3. Overfitting evidence: spread of the per-fold Sharpe ratios from the
#    walk-forward run, flagged when the standard deviation is above 0.5.
fold_df = pd.DataFrame(ml_results['fold_results'])
fold_sharpes = fold_df['sharpe']
sharpe_std = fold_sharpes.std()
print("\n3. Overfitting Check:")
print(f"   Sharpe std across folds: {sharpe_std:.4f}")
print(f"   Mean fold Sharpe: {fold_sharpes.mean():.4f}")
print(f"   Overall Sharpe: {ml_results['overall_sharpe']:.4f}")
print(
    "   ⚠️  High variability - potential overfitting"
    if sharpe_std > 0.5
    else "   ✓ Consistent performance across folds"
)

# 4. Exchange risk: conceptual reminders only, nothing is measured.
print("\n4. Exchange Risk:")
print("   ✓ Strategy designed for multi-exchange deployment")
print("   ✓ Implement position limits per venue in production")

# 5. Black swan protection: smallest net return in the backtest, and whether
#    the drawdown column ever rose above 0.1.
strategy_frame = strategy_results['backtest_df']
max_daily_loss = strategy_frame['net_return'].min()
stop_loss_hit = 'Yes' if (strategy_frame['drawdown'] > 0.1).any() else 'No'
print("\n5. Black Swan Protection:")
print(f"   Worst single-day loss: {max_daily_loss:.2%}")
print(f"   Stop-loss triggered: {stop_loss_hit}")
print("   ✓ Hard stop-loss active at 10% drawdown")

## 10. Summary and Conclusions

In [None]:
rule = "=" * 80
print("\n" + rule)
print("STRATEGY EVALUATION SUMMARY")
print(rule)

strat_metrics = strategy_results['metrics']
bench_metrics = benchmark_results['metrics']

# Targets next to the values the strategy backtest actually reported.
objectives = {
    'Sharpe Ratio': {'Target': '>1.0', 'Actual': f"{strat_metrics['sharpe_ratio']:.4f}"},
    'Max Drawdown': {'Target': '<40%', 'Actual': f"{strat_metrics['max_drawdown']:.2%}"},
    'Win Rate': {'Target': '>55%', 'Actual': f"{strat_metrics['win_rate']:.2%}"},
    'Volatility': {'Target': '~15%', 'Actual': f"{strat_metrics['volatility']:.2%}"}
}

print("\nObjectives vs Actuals:")
for metric, values in objectives.items():
    print(f"  {metric}: Target {values['Target']} | Actual {values['Actual']}")

# Headline comparisons. `is_significant` and `transaction_costs` are read
# from variables set by earlier cells of this notebook.
drawdown_gap_pp = (bench_metrics['max_drawdown'] - strat_metrics['max_drawdown']) * 100
print("\nKey Findings:")
print(f"  1. Strategy Sharpe ({strat_metrics['sharpe_ratio']:.2f}) vs "
      f"Buy-Hold Sharpe ({bench_metrics['sharpe_ratio']:.2f})")
print(f"  2. Max drawdown reduced by "
      f"{drawdown_gap_pp:.1f}pp")
print(f"  3. Statistical significance: {'✓ YES' if is_significant else '✗ NO'}")
print(f"  4. Transaction costs: {transaction_costs.sum():.2%} total impact")

# Factor contribution: fit one model on every row shared by features and
# target, then list its three largest feature importances.
print("\nFactor Contribution (Top 3):")
common_idx = features.index.intersection(target.index)
X = features.loc[common_idx].values
y = target.loc[common_idx].values
params = {
    'n_estimators': 100,
    'max_depth': 5,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
}
trained_model = model.train_xgboost(X, y, None, None, params)
importance = model.get_feature_importance(trained_model)
# Relabel with the feature column names (assumes the importances come back
# in column order - TODO confirm against get_feature_importance).
importance.index = features.columns
importance = importance.sort_values(ascending=False)
for i, (feature, imp) in enumerate(importance.head(3).items(), 1):
    print(f"  {i}. {feature}: {imp:.4f}")

print("\n" + rule)