# Performance Evaluation for DRL Portfolio

This notebook compares the DRL agent against three benchmarks (Markowitz, 60/40, Equal Weight) and runs statistical tests on the differences in their returns.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from data_acquisition import DataAcquisition
from portfolio_env import PortfolioEnv
from rl_agent import DRLAgent
from benchmark import MarkowitzOptimizer, Classic6040, EqualWeight, BenchmarkBacktester
from backtester import Backtester

# Notebook-wide plotting defaults: seaborn grid style first, then a wide
# default figure size for every figure that does not pass its own figsize.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 6)})

## 1. Load Data and Models

In [None]:
# Fetch the full dataset configured in config.yaml and keep the test split,
# which every later cell uses for backtesting.
data_acq = DataAcquisition('config.yaml')
dataset = data_acq.fetch_full_dataset()

test_prices, test_returns = (dataset['test'][key] for key in ('prices', 'returns'))

# Quick sanity check on the evaluation window.
first_day = test_prices.index[0]
last_day = test_prices.index[-1]
print(f"Test period: {first_day} to {last_day}")
print(f"Test days: {len(test_prices)}")

In [None]:
# Load trained DRL agent
# Note: this assumes training has already been run and saved a checkpoint
# under 'models/best_model'.
test_env = PortfolioEnv(test_prices, test_returns)
agent = DRLAgent(test_env, algorithm='ppo')

# Try to load the best model. On failure the notebook deliberately continues
# with the freshly constructed agent so the remaining cells still run.
try:
    agent.load('models/best_model')
except Exception as exc:
    # `except Exception` (not a bare `except:`) so that KeyboardInterrupt and
    # SystemExit still propagate; the reason is printed so a corrupt or
    # incompatible checkpoint is distinguishable from a missing one.
    print("\nWarning: Could not load trained model. Using untrained agent.")
    print(f"Reason: {type(exc).__name__}: {exc}")
    print("Run 'python main.py --mode train' first.")
else:
    print("\nLoaded best model from training")

## 2. Backtest All Strategies

In [None]:
# Evaluate the (possibly untrained) DRL agent on the test window.
# `backtester` is reused by later cells for metrics and comparisons.
print("Backtesting DRL agent...")
backtester = Backtester()
drl_results = backtester.run_backtest(
    agent,
    test_prices,
    test_returns,
)
print("✓ DRL complete")

In [None]:
# Backtest benchmarks
# Each benchmark is converted to a DataFrame with 'portfolio_value' and
# 'returns' columns, indexed like test_returns, so it can be passed to
# backtester.calculate_metrics and compared with the DRL results.
print("\nBacktesting benchmarks...")
benchmark_backtester = BenchmarkBacktester()

# Raw asset returns; assumed to be (n_days, n_assets) -- TODO confirm.
asset_returns = test_returns.values

# Markowitz
print("  - Markowitz MVO...")
mvo_results = benchmark_backtester.backtest_markowitz(test_prices, test_returns)
# Weights per day with the trailing entry dropped, mirroring how the first
# portfolio value is dropped below.
# Assumes one weight vector per test day -- TODO confirm against benchmark.py.
mvo_weights = np.asarray(mvo_results['weights_history'][:-1])
mvo_df = pd.DataFrame({
    'portfolio_value': mvo_results['portfolio_history'][1:],
    # Row-wise dot product: day t's return uses day t's weights only.
    # The previous `returns @ weights.T` produced a 2-D (days x days) matrix,
    # which is not a valid DataFrame column; the wanted values are its diagonal.
    'returns': (asset_returns * mvo_weights).sum(axis=1)
}, index=test_returns.index)
mvo_metrics = backtester.calculate_metrics(mvo_df)

# 60/40
print("  - 60/40...")
classic = Classic6040()
classic_results = benchmark_backtester.backtest_static(test_prices, test_returns, classic)
classic_df = pd.DataFrame({
    'portfolio_value': classic_results['portfolio_history'][1:],
    # Static weights: a single weight vector applied to every day.
    'returns': asset_returns @ classic.get_weights()
}, index=test_returns.index)
classic_metrics = backtester.calculate_metrics(classic_df)

# Equal Weight
print("  - Equal Weight...")
# Derive the asset count from the data instead of hard-coding 4, so the
# weight vector always matches the width of the returns matrix.
equal = EqualWeight(n_assets=asset_returns.shape[1])
equal_results = benchmark_backtester.backtest_static(test_prices, test_returns, equal)
equal_df = pd.DataFrame({
    'portfolio_value': equal_results['portfolio_history'][1:],
    'returns': asset_returns @ equal.get_weights()
}, index=test_returns.index)
equal_metrics = backtester.calculate_metrics(equal_df)

print("✓ All backtests complete")

## 3. Comparison Table

In [None]:
# Create comparison
# Collect each strategy's per-day results and summary metrics under one
# dict; later cells iterate over this mapping for plots and tests.
import os

all_results = {
    'DRL': {'results_df': drl_results['results_df'], 'metrics': drl_results['metrics']},
    'Markowitz': {'results_df': mvo_df, 'metrics': mvo_metrics},
    '60/40': {'results_df': classic_df, 'metrics': classic_metrics},
    'Equal Weight': {'results_df': equal_df, 'metrics': equal_metrics}
}

comparison_df = backtester.compare_strategies(all_results)

print("\n" + "="*80)
print("PERFORMANCE COMPARISON")
print("="*80)
print(comparison_df)

# Save to CSV
# Create the output directory first: to_csv does not create missing parent
# directories, so a fresh checkout without results/ would fail here.
os.makedirs('results', exist_ok=True)
comparison_df.to_csv('results/comparison.csv')
print("\n✓ Comparison saved to results/comparison.csv")

## 4. Portfolio Evolution

In [None]:
# Overlay every strategy's portfolio value path on one axis.
fig, ax = plt.subplots(figsize=(16, 8))

for strategy, results in all_results.items():
    frame = results['results_df']
    ax.plot(
        frame.index,
        frame['portfolio_value'].values,
        linewidth=2,
        label=strategy,
    )

# Titles, axis labels and cosmetics.
ax.set_title('Portfolio Value Evolution (All Strategies)', fontsize=16, fontweight='bold')
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Portfolio Value', fontsize=12)
ax.grid(True, alpha=0.3)
ax.legend(fontsize=12, loc='best')
plt.tight_layout()
plt.show()

## 5. Drawdown Analysis

In [None]:
# Calculate drawdowns
# For each strategy, drawdown is the relative distance of the portfolio
# value below its running maximum; it is plotted as a negative percentage.
fig, ax = plt.subplots(figsize=(16, 8))

for strategy, results in all_results.items():
    portfolio_values = results['results_df']['portfolio_value'].values
    cummax = np.maximum.accumulate(portfolio_values)
    drawdowns = (cummax - portfolio_values) / cummax

    ax.plot(results['results_df'].index, -drawdowns * 100, linewidth=2, label=strategy)

# Draw the reference line BEFORE building the legend: legend() only picks up
# artists that already exist, so adding it afterwards silently dropped the
# 'Circuit Breaker (-15%)' entry.
ax.axhline(y=-15, color='red', linestyle='--', alpha=0.5, label='Circuit Breaker (-15%)')

ax.set_title('Drawdown Evolution (All Strategies)', fontsize=16, fontweight='bold')
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Drawdown (%)', fontsize=12)
ax.legend(fontsize=12, loc='lower right')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 6. DRL Weight Evolution

In [None]:
# Plot DRL weight evolution
# Only runs when the backtest result exposes a 'weights_history' entry.
if 'weights_history' in drl_results:
    # Assumes one row per date in results_df and one column per asset in
    # test_prices.columns -- TODO confirm against Backtester.run_backtest.
    weights_array = np.array(drl_results['weights_history'])

    fig, ax = plt.subplots(figsize=(16, 8))

    symbols = test_prices.columns
    for i, symbol in enumerate(symbols):
        ax.plot(drl_results['results_df'].index, weights_array[:, i], linewidth=2, label=symbol)

    # Add the constraint line BEFORE building the legend: legend() only picks
    # up artists that already exist, so adding it afterwards silently dropped
    # the 'Max Weight Constraint' entry.
    ax.axhline(y=0.4, color='red', linestyle='--', alpha=0.5, label='Max Weight Constraint')

    ax.set_title('DRL Agent - Weight Evolution', fontsize=16, fontweight='bold')
    ax.set_xlabel('Date', fontsize=12)
    ax.set_ylabel('Weight', fontsize=12)
    ax.legend(fontsize=12, loc='best')
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

## 7. Statistical Tests

In [None]:
# Compare the DRL daily returns with each benchmark's daily returns using
# backtester.statistical_tests and print the reported fields.
banner = "=" * 80
print("\n" + banner)
print("STATISTICAL TESTS (DRL vs Benchmarks)")
print(banner)

drl_returns = all_results['DRL']['results_df']['returns'].values

for benchmark in ('Markowitz', '60/40', 'Equal Weight'):
    benchmark_returns = all_results[benchmark]['results_df']['returns'].values
    test_results = backtester.statistical_tests(drl_returns, benchmark_returns)

    # Turn the significance flag into the human-readable verdict.
    if test_results['is_significant']:
        verdict = 'Yes ✓'
    else:
        verdict = 'No ✗'

    print(f"\nDRL vs {benchmark}:")
    print(f"  Mean return difference: {test_results['mean_diff']:.6f}")
    print(f"  T-statistic: {test_results['t_statistic']:.3f}")
    print(f"  P-value: {test_results['p_value']:.4f}")
    print(f"  Statistically significant (p<0.05): {verdict}")

## 8. Return Distribution Comparison

In [None]:
# One histogram panel per strategy on a 2x2 grid, with the mean daily
# return marked by a dashed vertical line.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.flatten()

for idx, (strategy, results) in enumerate(all_results.items()):
    ax = axes[idx]
    returns = results['results_df']['returns'].values
    mean_return = np.mean(returns)

    ax.hist(returns, bins=50, alpha=0.7, edgecolor='black', density=True)
    ax.axvline(
        mean_return,
        color='red',
        linestyle='--',
        linewidth=2,
        label=f'Mean: {mean_return:.6f}',
    )

    ax.set_title(f'{strategy} - Return Distribution', fontsize=12, fontweight='bold')
    ax.set_xlabel('Daily Return', fontsize=10)
    ax.set_ylabel('Density', fontsize=10)
    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 9. Risk-Return Scatter

In [None]:
# Place every strategy on a volatility (x) vs. annualized return (y) plane,
# both expressed in percent.
fig, ax = plt.subplots(figsize=(10, 8))

for strategy, results in all_results.items():
    metrics = results['metrics']

    # Convert the metric fractions to percentages for the axes.
    vol = metrics['volatility'] * 100
    ret = metrics['annualized_return'] * 100
    point = (vol, ret)

    ax.scatter(*point, s=200, alpha=0.7, label=strategy)
    ax.annotate(strategy, point, fontsize=11, ha='right')

ax.set_title('Risk-Return Profile', fontsize=16, fontweight='bold')
ax.set_xlabel('Annualized Volatility (%)', fontsize=12)
ax.set_ylabel('Annualized Return (%)', fontsize=12)
ax.legend(fontsize=10)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 10. Summary and Conclusions

In [None]:
# Print the written research summary: objective, methodology, the comparison
# table, a Sharpe-ratio headline versus Markowitz, limitations and next steps.
print("\n" + "="*80)
print("RESEARCH SUMMARY")
print("="*80)

print("\n1. Research Objective:")
print("   - Test if DRL can outperform Markowitz MVO by +10% return, -15% volatility")

print("\n2. Methodology:")
print("   - Agent: PPO with Actor-Critic architecture [128, 128]")
print("   - State: Current weights + 20-day features + correlations")
print("   - Action: Target portfolio weights (0-40% per asset)")
print("   - Reward: Log return - 0.5*volatility - transaction costs")

print("\n3. Results:")
print(comparison_df)

print("\n4. Key Findings:")
drl_sharpe = all_results['DRL']['metrics']['sharpe_ratio']
mvo_sharpe = all_results['Markowitz']['metrics']['sharpe_ratio']

if drl_sharpe > mvo_sharpe:
    if mvo_sharpe != 0:
        # Normalize by the magnitude of the benchmark Sharpe: dividing by a
        # negative value would flip the sign and report a negative "% higher".
        improvement = (drl_sharpe - mvo_sharpe) / abs(mvo_sharpe) * 100
        print(f"   ✓ DRL achieved {improvement:.1f}% higher Sharpe ratio vs Markowitz")
    else:
        # A relative improvement is undefined against a zero baseline, so
        # report the two ratios directly instead of dividing by zero.
        print(f"   ✓ DRL achieved a higher Sharpe ratio vs Markowitz ({drl_sharpe:.2f} vs {mvo_sharpe:.2f})")
else:
    print("   ✗ DRL did not outperform Markowitz (may need more training)")

print("\n5. Limitations:")
print("   - Overfitting risk (test on out-of-sample data)")
print("   - Non-stationarity (market regime changes)")
print("   - Transaction costs (real-world implementation gap)")
print("   - Black box nature (addressed via SHAP/LIME explainability)")

print("\n6. Next Steps:")
print("   - Run full training (500k timesteps)")
print("   - Hyperparameter optimization (Optuna)")
print("   - Stress testing (2008, 2020, 2022 crises)")
print("   - Explainability analysis (SHAP feature importance)")
print("   - Ablation studies (reward functions, architectures)")