# Day 5: Realistic Backtesting

## Learning Objectives
- Avoid common backtesting biases
- Implement proper validation frameworks
- Apply statistical significance testing to backtest results

---

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global generator so every stochastic cell below repeats
# exactly on Restart Kernel -> Run All.
np.random.seed(42)

# The status glyphs were stored as mis-decoded bytes; restored to the
# intended emoji so the output is readable.
print("✅ Libraries loaded!")
print("📚 Day 5: Realistic Backtesting")

## Part 1: Backtesting Biases

In [None]:
# ============================================================
# DEMONSTRATING LOOK-AHEAD BIAS
# ============================================================
# Two pipelines are trained on the same synthetic data. The "wrong" one fits
# the scaler on the full sample (train + test) before splitting; the "right"
# one fits the scaler on the training slice only and merely transforms the
# test slice.
#
# NOTE(review): a random forest splits on feature ordering, so it is largely
# insensitive to standardisation; expect the measured gap to be small here.
# The point of the cell is the pattern (fit preprocessing on train only).

print("LOOK-AHEAD BIAS DEMONSTRATION")
print("="*60)

np.random.seed(42)
n_days = 1000

# Generate data: i.i.d. normal daily returns compounded into a price path
returns = np.random.normal(0.0003, 0.015, n_days)
prices = 100 * np.cumprod(1 + returns)
df = pd.DataFrame({'price': prices, 'returns': returns})

# Features (both use only past/current data)
df['ret_5d'] = df['price'].pct_change(5)
df['vol_5d'] = df['returns'].rolling(5).std()

# Target: 1 if the NEXT day's return is positive.
# The last row has no next-day return; `NaN > 0` evaluates to False, which
# would silently label it as class 0. Mask it to NaN so dropna() removes it.
next_return = df['returns'].shift(-1)
df['target'] = (next_return > 0).astype(int).where(next_return.notna())

# WRONG: Using future information (scaled with test data)
df_wrong = df.dropna()
X_wrong = df_wrong[['ret_5d', 'vol_5d']].values
y_wrong = df_wrong['target'].astype(int).values

scaler_wrong = StandardScaler()
X_scaled_wrong = scaler_wrong.fit_transform(X_wrong)  # Fits on ALL data!

# Chronological 70/30 split (no shuffling); shared by both pipelines
split = int(len(X_wrong) * 0.7)
model_wrong = RandomForestClassifier(n_estimators=50, random_state=42)
model_wrong.fit(X_scaled_wrong[:split], y_wrong[:split])
acc_wrong = model_wrong.score(X_scaled_wrong[split:], y_wrong[split:])

# RIGHT: Proper train/test split
df_right = df.dropna()
X_right = df_right[['ret_5d', 'vol_5d']].values
y_right = df_right['target'].astype(int).values

X_train = X_right[:split]
X_test = X_right[split:]
y_train = y_right[:split]
y_test = y_right[split:]

scaler_right = StandardScaler()
X_train_scaled = scaler_right.fit_transform(X_train)  # Fit only on train
X_test_scaled = scaler_right.transform(X_test)  # Transform test

model_right = RandomForestClassifier(n_estimators=50, random_state=42)
model_right.fit(X_train_scaled, y_train)
acc_right = model_right.score(X_test_scaled, y_test)

print(f"\nAccuracy with look-ahead bias: {acc_wrong:.3f}")
print(f"Accuracy without bias: {acc_right:.3f}")
print(f"Bias inflation: {(acc_wrong - acc_right) / acc_right * 100:.1f}%")

## Part 2: Multiple Hypothesis Testing

In [None]:
# ============================================================
# DATA SNOOPING / MULTIPLE TESTING
# ============================================================

print("\nDATA SNOOPING DEMONSTRATION")
print("="*60)

def run_random_strategy(n=100, rng=None):
    """Simulate one random long/short strategy and return its annualized Sharpe.

    Draws `n` random +1/-1 signals and `n` normally distributed market
    returns (mean 0.0003, std 0.015), multiplies them element-wise, and
    annualizes the resulting mean/std ratio with sqrt(252).

    Parameters
    ----------
    n : int
        Number of simulated periods.
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted, NumPy's global generator is used,
        so a preceding ``np.random.seed(...)`` makes the result reproducible.

    Returns
    -------
    float
        Annualized Sharpe ratio of the simulated strategy.
    """
    # Do NOT reseed here: calling np.random.seed() with no argument pulls
    # fresh OS entropy on every call and defeats any seed set by the caller.
    source = np.random if rng is None else rng
    signals = source.choice([-1, 1], n)
    market_returns = source.normal(0.0003, 0.015, n)
    strategy_returns = signals * market_returns
    sharpe = np.sqrt(252) * strategy_returns.mean() / strategy_returns.std()
    return sharpe

# "Test" 1000 random strategies, each simulated over 500 periods
n_strategies = 1000
sharpes = [run_random_strategy(500) for _ in range(n_strategies)]

# Find "best" strategy: the maximum is what a data-snooper would report
best_sharpe = max(sharpes)
top_5_pct = np.percentile(sharpes, 95)

# Count with a vectorized comparison rather than Python's sum() over an array
n_above_one = int((np.asarray(sharpes) > 1).sum())

print(f"Tested {n_strategies} random strategies")
print(f"Best Sharpe: {best_sharpe:.2f}")
print(f"Top 5% threshold: {top_5_pct:.2f}")
print(f"Strategies with Sharpe > 1: {n_above_one}")

# Warning glyph restored from mis-decoded bytes
print("\n⚠️ WARNING: Even random strategies can look profitable!")

In [None]:
# Histogram of the simulated Sharpe ratios with three vertical markers:
# zero, the best simulated value, and the Sharpe = 1 level.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(sharpes, bins=50, density=True, alpha=0.7, color='steelblue')

# (x position, line styling) in the order they should appear in the legend
reference_lines = [
    (0, {'color': 'gray', 'linestyle': '--', 'label': 'Zero'}),
    (best_sharpe, {'color': 'red', 'linewidth': 2, 'label': f'Best: {best_sharpe:.2f}'}),
    (1, {'color': 'green', 'linestyle': '--', 'label': 'Sharpe=1'}),
]
for x_position, line_style in reference_lines:
    ax.axvline(x_position, **line_style)

ax.set(xlabel='Sharpe Ratio', ylabel='Density')
ax.set_title('Distribution of Random Strategy Sharpe Ratios', fontweight='bold')
ax.legend()
fig.tight_layout()
plt.show()

## Part 3: Statistical Significance

In [None]:
# ============================================================
# SHARPE RATIO SIGNIFICANCE TEST
# ============================================================

print("STATISTICAL SIGNIFICANCE TESTING")
print("="*60)

def sharpe_t_stat(returns, target_sharpe=0, periods_per_year=252):
    """
    Test whether the annualized Sharpe ratio differs from a target value.

    Under the null hypothesis H0: SR = target_sharpe
    t-stat = (SR - target) / SE(SR)

    For i.i.d. normal returns the standard error of the PER-PERIOD Sharpe
    ratio estimated from n periods is
        SE(SR_period) ≈ sqrt((1 + SR_period^2 / 2) / n)
    Annualizing multiplies both the Sharpe ratio and its standard error by
    sqrt(periods_per_year). Plugging the annualized Sharpe directly into the
    per-period formula (while n counts periods) overstates the t-statistic
    by roughly sqrt(periods_per_year).

    Parameters
    ----------
    returns : array-like with .mean() and .std()
        Per-period strategy returns.
    target_sharpe : float
        Annualized Sharpe ratio under the null hypothesis.
    periods_per_year : int
        Number of return periods per year used for annualization.

    Returns
    -------
    (sr, t_stat, p_value)
        Annualized Sharpe ratio, its t-statistic against `target_sharpe`,
        and the two-sided p-value from a Student-t with n-1 degrees of freedom.
    """
    n = len(returns)
    annualizer = np.sqrt(periods_per_year)

    # Per-period Sharpe drives the standard error; annualized one is reported
    sr_period = returns.mean() / returns.std()
    sr = annualizer * returns.mean() / returns.std()

    # Standard error of the annualized Sharpe ratio
    se = annualizer * np.sqrt((1 + sr_period**2 / 2) / n)

    t_stat = (sr - target_sharpe) / se
    # Survival function avoids the precision loss of 1 - cdf for large |t|
    p_value = 2 * stats.t.sf(abs(t_stat), df=n - 1)

    return sr, t_stat, p_value

# Test our strategy: 500 simulated daily returns with a small positive mean
np.random.seed(42)
strategy_returns = np.random.normal(0.0005, 0.012, 500)  # Slightly positive drift

sr, t_stat, p_val = sharpe_t_stat(strategy_returns)

print(f"Strategy Sharpe: {sr:.2f}")
print(f"t-statistic: {t_stat:.2f}")
print(f"p-value: {p_val:.4f}")
# Check/cross glyphs restored from mis-decoded bytes
print(f"Significant at 5%: {'Yes ✓' if p_val < 0.05 else 'No ✗'}")

In [None]:
# ============================================================
# MINIMUM TRACK RECORD LENGTH
# ============================================================

print("\nMINIMUM TRACK RECORD LENGTH")
print("="*60)

def min_track_record(target_sharpe, significance=0.05, periods_per_year=252):
    """
    Calculate the minimum number of periods (days by default) needed to
    confirm that an annualized Sharpe ratio is different from zero.

    From Bailey & López de Prado (2012), for normally distributed returns:
    n* = 1 + (1 + SR^2/2) * (z_alpha / SR)^2
    where SR is the PER-PERIOD Sharpe ratio and n* counts periods. The
    annualized input is therefore divided by sqrt(periods_per_year) before
    the formula is applied; feeding the annualized value in directly would
    understate n* by roughly a factor of periods_per_year.

    Parameters
    ----------
    target_sharpe : float
        Annualized Sharpe ratio to be confirmed (must be non-zero).
    significance : float
        Significance level; z_alpha is the two-sided critical value
        norm.ppf(1 - significance/2).
    periods_per_year : int
        Periods per year used to de-annualize `target_sharpe`. Pass 1 if
        `target_sharpe` is already expressed per period.

    Returns
    -------
    int
        Minimum track record length in periods, rounded up.

    Raises
    ------
    ValueError
        If `target_sharpe` is zero (no finite track record can confirm it).
    """
    if target_sharpe == 0:
        raise ValueError("target_sharpe must be non-zero")

    z_alpha = stats.norm.ppf(1 - significance/2)
    sr_period = target_sharpe / np.sqrt(periods_per_year)
    n_star = 1 + (1 + sr_period**2/2) * (z_alpha / sr_period)**2
    return int(np.ceil(n_star))

# Tabulate the minimum track record for a range of annualized Sharpe ratios
print("Minimum days needed to confirm:")
# A dedicated loop name keeps the notebook-level `sr` from an earlier cell intact
for sharpe_level in [0.5, 1.0, 1.5, 2.0, 2.5]:
    days = min_track_record(sharpe_level)
    years = days / 252
    print(f"  Sharpe {sharpe_level:.1f}: {days:>4} days ({years:.1f} years)")

# Light-bulb glyph restored from mis-decoded bytes
print("\n💡 Higher Sharpe requires less data to confirm")

## Part 4: Robust Validation Framework

In [None]:
# ============================================================
# ROBUST VALIDATION FRAMEWORK
# ============================================================

print("\nROBUST VALIDATION FRAMEWORK")
print("="*60)

class RobustValidator:
    """Comprehensive backtesting validation.

    Wraps a series of per-period (daily) strategy returns and offers
    performance metrics, statistical tests, a bootstrap confidence interval
    for the Sharpe ratio, and a printed report combining all three.
    """

    def __init__(self, returns, benchmark_returns=None):
        """Store the returns as a 1-D float array.

        Parameters
        ----------
        returns : array-like
            Per-period strategy returns (list, ndarray, or pandas Series).
        benchmark_returns : array-like, optional
            Stored as-is on `self.benchmark`; not used by any method here.

        Raises
        ------
        ValueError
            If fewer than two observations are supplied.
        """
        # Coerce to ndarray so positional indexing (r[-1], r[idx]) works for
        # lists and pandas Series, not just NumPy arrays.
        values = np.asarray(returns, dtype=float).ravel()
        if values.size < 2:
            raise ValueError("RobustValidator needs at least 2 return observations")
        self.returns = values
        self.benchmark = benchmark_returns

    def performance_metrics(self):
        """Calculate key metrics.

        Returns a dict with annualized Sharpe, annualized compound return,
        annualized volatility, maximum drawdown (a non-positive fraction),
        Calmar ratio, and the number of observations.
        """
        r = self.returns
        n = len(r)

        sharpe = np.sqrt(252) * r.mean() / r.std()

        # Drawdown: fractional drop of the equity curve from its running peak
        cumulative = np.cumprod(1 + r)
        running_max = np.maximum.accumulate(cumulative)
        max_dd = ((cumulative - running_max) / running_max).min()

        # Calmar ratio (return / max drawdown); 0 when there is no drawdown
        annual_return = (cumulative[-1] ** (252/n)) - 1
        calmar = annual_return / abs(max_dd) if max_dd != 0 else 0

        return {
            'sharpe': sharpe,
            'annual_return': annual_return,
            'volatility': r.std() * np.sqrt(252),
            'max_drawdown': max_dd,
            'calmar': calmar,
            'n_observations': n
        }

    def statistical_tests(self):
        """Run statistical tests.

        Returns a dict with the Sharpe t-statistic and p-value (from the
        notebook-level `sharpe_t_stat`), a 5% significance flag, the p-value
        of scipy's `normaltest`, and the lag-1 autocorrelation of returns.
        """
        r = self.returns

        # Sharpe significance
        sr, t_stat, p_val = sharpe_t_stat(r)

        # Normality test
        _, normality_p = stats.normaltest(r)

        # Lag-1 autocorrelation: correlation of the series with itself shifted by one
        autocorr = np.corrcoef(r[:-1], r[1:])[0, 1]

        return {
            'sharpe_t_stat': t_stat,
            'sharpe_p_value': p_val,
            'significant_5pct': p_val < 0.05,
            'normality_p': normality_p,
            'autocorrelation': autocorr
        }

    def bootstrap_sharpe(self, n_bootstrap=1000, block_size=1):
        """Bootstrap confidence interval for Sharpe.

        Parameters
        ----------
        n_bootstrap : int
            Number of bootstrap resamples.
        block_size : int
            Length of the contiguous blocks that are resampled. The default
            of 1 is the plain i.i.d. bootstrap; values above 1 give a
            moving-block bootstrap that keeps short-range dependence.
            Values larger than the sample are capped at the sample length.

        Returns
        -------
        dict
            Mean of the bootstrapped annualized Sharpe ratios and the 2.5th /
            97.5th percentiles as `ci_lower` / `ci_upper`.

        Raises
        ------
        ValueError
            If `block_size` is smaller than 1.
        """
        if block_size < 1:
            raise ValueError("block_size must be >= 1")

        r = self.returns
        n = len(r)
        block_size = min(int(block_size), n)
        n_blocks = int(np.ceil(n / block_size))
        offsets = np.arange(block_size)
        bootstrap_sharpes = []

        for _ in range(n_bootstrap):
            if block_size == 1:
                # i.i.d. resampling with replacement
                idx = np.random.choice(n, n, replace=True)
            else:
                # Moving blocks: random start points, each expanded into a
                # contiguous run, then trimmed back to the original length
                starts = np.random.randint(0, n - block_size + 1, size=n_blocks)
                idx = (starts[:, None] + offsets).ravel()[:n]
            boot_r = r[idx]
            boot_sharpe = np.sqrt(252) * boot_r.mean() / boot_r.std()
            bootstrap_sharpes.append(boot_sharpe)

        return {
            'mean': np.mean(bootstrap_sharpes),
            'ci_lower': np.percentile(bootstrap_sharpes, 2.5),
            'ci_upper': np.percentile(bootstrap_sharpes, 97.5)
        }

    def full_report(self):
        """Generate comprehensive report.

        Prints performance metrics, statistical tests, and the bootstrap
        95% confidence interval for the Sharpe ratio. Returns nothing.
        """
        metrics = self.performance_metrics()
        tests = self.statistical_tests()
        bootstrap = self.bootstrap_sharpe()

        print("\n" + "="*50)
        print("VALIDATION REPORT")
        print("="*50)

        print("\nPERFORMANCE:")
        print(f"  Sharpe Ratio: {metrics['sharpe']:.2f}")
        print(f"  Annual Return: {metrics['annual_return']*100:.1f}%")
        print(f"  Volatility: {metrics['volatility']*100:.1f}%")
        print(f"  Max Drawdown: {metrics['max_drawdown']*100:.1f}%")
        print(f"  Calmar Ratio: {metrics['calmar']:.2f}")

        print("\nSTATISTICAL TESTS:")
        print(f"  Sharpe t-stat: {tests['sharpe_t_stat']:.2f}")
        print(f"  Sharpe p-value: {tests['sharpe_p_value']:.4f}")
        print(f"  Significant (5%): {tests['significant_5pct']}")
        print(f"  Autocorrelation: {tests['autocorrelation']:.3f}")

        print("\nBOOTSTRAP 95% CI:")
        print(f"  Sharpe: [{bootstrap['ci_lower']:.2f}, {bootstrap['ci_upper']:.2f}]")

        print("="*50)

# Smoke-test the validator on 750 simulated daily returns
# (mean 0.0004, standard deviation 0.012) and print its report.
np.random.seed(42)
test_returns = np.random.normal(loc=0.0004, scale=0.012, size=750)

validator = RobustValidator(returns=test_returns)
validator.full_report()

In [None]:
# Closing summary banner. The box-drawing and check-mark glyphs were stored as
# mis-decoded bytes; the box is rebuilt from a fixed inner width so the
# borders always line up.
banner_width = 66
banner_title = "DAY 5 COMPLETE: REALISTIC BACKTESTING"
banner_items = [
    "Look-ahead bias demonstration",
    "Multiple hypothesis testing / data snooping",
    "Sharpe ratio significance testing",
    "Minimum track record calculation",
    "Robust validation framework",
]

banner_lines = [
    "",  # leading blank line
    "╔" + "═" * banner_width + "╗",
    "║" + banner_title.center(banner_width) + "║",
    "╠" + "═" * banner_width + "╣",
]
banner_lines += ["║" + f"  ✓ {item}".ljust(banner_width) + "║" for item in banner_items]
banner_lines += [
    "╚" + "═" * banner_width + "╝",
    "",
    "Tomorrow: Day 6 - Model Deployment",
    "",  # trailing blank line
]
print("\n".join(banner_lines))