# Phase 2: Measure Risk for Each Cluster

This notebook contains the analysis for the Risk & Portfolio Interaction study.

Phase 2: Measure Risk for Each Cluster
======================================
Goal: Quantify risk per cluster using volatility, Sharpe/Sortino/Calmar ratios, maximum drawdown, VaR/CVaR, skewness, and kurtosis.

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Figure styling applied to every plot produced later in the notebook.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

# Section banner (three lines: rule, title, rule).
print("=" * 80, "PHASE 2: MEASURE RISK FOR EACH CLUSTER", "=" * 80, sep="\n")

# Daily PnL per cluster, as written by Phase 1. The first CSV column becomes
# the (date-parsed) index.
daily_pnl = pd.read_csv('daily_pnl_per_cluster.csv', index_col=0, parse_dates=True)

# Trade-level records used for the per-trade analysis; both timestamp columns
# are converted from text to datetimes right after loading.
trades_df = pd.read_csv('trades_with_clusters.csv').assign(
    entry_time=lambda frame: pd.to_datetime(frame['entry_time']),
    exit_time=lambda frame: pd.to_datetime(frame['exit_time']),
)

# Per-cluster PnL columns: any column whose name mentions 'cluster' but not
# 'total' (aggregate columns are handled separately).
cluster_cols = [
    name for name in daily_pnl.columns
    if 'cluster' in name and 'total' not in name
]

PHASE 2: MEASURE RISK FOR EACH CLUSTER


In [None]:
# RISK METRICS CALCULATION

In [7]:
def calculate_max_drawdown(pnl_series):
    """Return the maximum peak-to-trough drawdown of the cumulative PnL.

    The equity curve is the running sum of ``pnl_series``. The peak it is
    compared against includes the starting level of 0 (the equity before the
    first observation), so a series that opens with losses reports the full
    decline from the starting point instead of ignoring it.

    Parameters
    ----------
    pnl_series : pandas.Series
        Per-period PnL values in chronological order.

    Returns
    -------
    float
        Largest drawdown as a non-negative amount in PnL units. NaN for an
        empty series.
    """
    cumulative = pnl_series.cumsum()
    # Without the floor at 0 the first observation would always be its own
    # peak, hiding any drawdown that begins on day one.
    running_max = cumulative.cummax().clip(lower=0)
    drawdown = running_max - cumulative
    return drawdown.max()

def calculate_calmar_ratio(pnl_series, annual_factor=252):
    """Return the Calmar ratio: annualized mean PnL over maximum drawdown.

    The annualized figure is the mean of ``pnl_series`` multiplied by
    ``annual_factor``. NaN is returned when the maximum drawdown is exactly
    zero, since the ratio is undefined in that case.
    """
    max_dd = calculate_max_drawdown(pnl_series)
    if max_dd == 0:
        return np.nan
    return pnl_series.mean() * annual_factor / max_dd

def calculate_sortino_ratio(pnl_series, target=0):
    """Return the Sortino ratio of ``pnl_series`` relative to ``target``.

    The numerator is the mean PnL in excess of ``target``. The denominator is
    the root-mean-square shortfall below ``target``, taken over the
    observations that fall below it. Both are measured relative to
    ``target``, so a non-zero target is handled consistently; with the
    default ``target=0`` the result is the plain mean divided by the RMS of
    the negative observations.

    Parameters
    ----------
    pnl_series : pandas.Series
        Per-period PnL values.
    target : float, default 0
        Minimum acceptable PnL per period.

    Returns
    -------
    float
        The ratio, or NaN when no observation falls below ``target`` (the
        downside deviation is then undefined).
    """
    excess_mean = pnl_series.mean() - target
    # Shortfalls are distances below the target, not raw PnL values.
    shortfalls = pnl_series[pnl_series < target] - target
    if len(shortfalls) == 0:
        return np.nan
    # NOTE: the mean is taken over the below-target observations only, not
    # over the full sample.
    downside_std = np.sqrt(np.mean(shortfalls ** 2))
    return excess_mean / downside_std if downside_std != 0 else np.nan

def calculate_var(pnl_series, confidence=0.95):
    """Return the historical Value at Risk at the given confidence level.

    The result is the negated ``(1 - confidence)`` percentile of
    ``pnl_series``, so a loss-side percentile is reported as a positive
    number.
    """
    tail_pct = (1 - confidence) * 100
    loss_cutoff = np.percentile(pnl_series, tail_pct)
    return -loss_cutoff

def calculate_cvar(pnl_series, confidence=0.95):
    """Return the Conditional VaR (Expected Shortfall) at ``confidence``.

    This is the negated mean of all observations at or below the VaR cutoff,
    i.e. the average PnL of the tail that the VaR threshold delimits,
    reported with the sign flipped.
    """
    # calculate_var returns the cutoff with its sign flipped; undo that to
    # compare against raw PnL values.
    loss_threshold = -calculate_var(pnl_series, confidence)
    tail_losses = pnl_series[pnl_series <= loss_threshold]
    return -tail_losses.mean()

print("\n Computing Risk Metrics for Each Cluster...")
print("-" * 80)

# One dict of metrics per PnL column: every cluster column plus 'total_pnl'.
risk_metrics = {}

for col in [*cluster_cols, 'total_pnl']:
    series = daily_pnl[col]

    # Mean and standard deviation feed several metrics; compute them once.
    daily_mean = series.mean()
    daily_std = series.std()
    has_volatility = daily_std != 0

    # Sharpe is undefined (NaN) when the series has zero standard deviation.
    if has_volatility:
        daily_sharpe = daily_mean / daily_std
        annualized_sharpe = (daily_mean / daily_std) * np.sqrt(252)
    else:
        daily_sharpe = np.nan
        annualized_sharpe = np.nan

    # Key order below defines the column order of risk_df and of the CSV.
    risk_metrics[col] = {
        'Total PnL': series.sum(),
        'Mean Daily PnL': daily_mean,
        'Std Dev (Volatility)': daily_std,
        'Sharpe Ratio (Daily)': daily_sharpe,
        'Annualized Sharpe': annualized_sharpe,
        'Max Drawdown': calculate_max_drawdown(series),
        'Calmar Ratio': calculate_calmar_ratio(series),
        'Sortino Ratio': calculate_sortino_ratio(series),
        'Skewness': stats.skew(series),
        'Kurtosis': stats.kurtosis(series),
        'VaR 95%': calculate_var(series, 0.95),
        'CVaR 95%': calculate_cvar(series, 0.95),
        'Hit Ratio (Days > 0)': (series > 0).mean() * 100,
        'Best Day': series.max(),
        'Worst Day': series.min(),
        'Days Active': (series != 0).sum(),
    }

# Rows = PnL columns, columns = metrics.
risk_df = pd.DataFrame(risk_metrics).T.rename_axis('Cluster')

print("\n" + "=" * 80)
print("CLUSTER RISK METRICS SUMMARY")
print("=" * 80)

# Subset of metrics shown in the printed summary; the CSV keeps all of them.
display_metrics = [
    'Total PnL',
    'Mean Daily PnL',
    'Std Dev (Volatility)',
    'Annualized Sharpe',
    'Max Drawdown',
    'Sortino Ratio',
    'Skewness',
    'Kurtosis',
    'VaR 95%',
    'CVaR 95%',
    'Hit Ratio (Days > 0)',
]

summary_table = risk_df[display_metrics].round(3)
print(summary_table.to_string())

# Persist the full metric table.
risk_df.to_csv('cluster_risk_metrics.csv')
print("\n Risk metrics saved to: cluster_risk_metrics.csv")


 Computing Risk Metrics for Each Cluster...
--------------------------------------------------------------------------------

CLUSTER RISK METRICS SUMMARY
               Total PnL  Mean Daily PnL  Std Dev (Volatility)  Annualized Sharpe  Max Drawdown  Sortino Ratio  Skewness  Kurtosis  VaR 95%  CVaR 95%  Hit Ratio (Days > 0)
Cluster                                                                                                                                                                    
cluster_0_pnl    2318.39          13.883                38.213              5.767        211.25          0.320     0.543     6.535   28.714    66.948                50.898
cluster_1_pnl    1630.88           9.766                28.715              5.399         92.50          0.301     1.767     4.589    9.260    37.637                29.341
cluster_2_pnl     298.43           1.787                25.636              1.107        200.05          0.035    -0.120    14.680   21.879    58.893       

In [None]:
# TRADE-LEVEL RISK ANALYSIS

In [3]:
print("\n" + "=" * 80, "TRADE-LEVEL RISK ANALYSIS", "=" * 80, sep="\n")

# Per-cluster statistics computed from individual trade profits.
trade_risk = {}
for cluster in sorted(trades_df['cluster'].unique()):
    cluster_trades = trades_df.loc[trades_df['cluster'] == cluster, 'profit']
    wins = cluster_trades[cluster_trades > 0]
    losses = cluster_trades[cluster_trades < 0]

    # Profit factor = |gross wins / gross losses|; undefined (NaN) when the
    # cluster has no losing trades or the losses sum to zero.
    gross_loss = losses.sum()
    if not losses.empty and gross_loss != 0:
        profit_factor = abs(wins.sum() / gross_loss)
    else:
        profit_factor = np.nan

    trade_risk[f'Cluster {int(cluster)}'] = {
        'Trade Count': len(cluster_trades),
        'Win Rate (%)': (cluster_trades > 0).mean() * 100,
        # Averages fall back to 0 when there are no winning / losing trades.
        'Avg Win': 0 if wins.empty else wins.mean(),
        'Avg Loss': 0 if losses.empty else losses.mean(),
        'Profit Factor': profit_factor,
        'Max Win': cluster_trades.max(),
        'Max Loss': cluster_trades.min(),
        'Expectancy': cluster_trades.mean(),
        'Std Dev': cluster_trades.std(),
    }

# Rows = clusters, columns = statistics.
trade_risk_df = pd.DataFrame(trade_risk).T
print(trade_risk_df.round(2).to_string())

trade_risk_df.to_csv('trade_level_risk.csv')


TRADE-LEVEL RISK ANALYSIS
           Trade Count  Win Rate (%)  Avg Win  Avg Loss  Profit Factor  Max Win  Max Loss  Expectancy  Std Dev
Cluster 0        200.0         70.50    24.89    -20.20           2.95    99.36   -100.56       11.59    30.03
Cluster 1         93.0         76.34    28.72    -18.55           5.00    99.33    -60.55       17.54    29.62
Cluster 2         76.0         57.89    22.72    -21.92           1.43    76.51   -100.41        3.93    31.03
Cluster 3         61.0         60.66    20.35    -23.96           1.37    39.28   -100.41        3.31    28.68


In [None]:
# VISUALIZATIONS

In [8]:
# Six-panel risk overview, saved to phase2_risk_metrics.png.
fig, axes = plt.subplots(3, 2, figsize=(16, 18))

# Every bar panel lists the clusters in cluster_cols order, so the bar
# positions and tick labels are built once and shared.
cluster_labels = [c.replace('_pnl', '') for c in cluster_cols]
x = np.arange(len(cluster_cols))
width = 0.35

# Plot 1: Sharpe Ratio Comparison
ax1 = axes[0, 0]
sharpe_data = risk_df.loc[cluster_cols, 'Annualized Sharpe']
colors = ['green' if val > 0 else 'red' for val in sharpe_data]
bars = ax1.bar(x, sharpe_data, color=colors, alpha=0.7, edgecolor='black')
ax1.set_xticks(x)
ax1.set_xticklabels(cluster_labels)
ax1.set_title('Annualized Sharpe Ratio by Cluster', fontsize=14, fontweight='bold')
ax1.set_ylabel('Sharpe Ratio')
ax1.axhline(y=0, color='black', linestyle='--', alpha=0.5)
ax1.axhline(y=1, color='green', linestyle='--', alpha=0.3, label='Sharpe = 1')
ax1.legend()
for bar, val in zip(bars, sharpe_data):
    # Place the label just past the end of the bar on the side it extends to,
    # so labels of negative bars are not drawn on top of the bar itself.
    points_up = val >= 0
    ax1.text(bar.get_x() + bar.get_width() / 2,
             bar.get_height() + (0.05 if points_up else -0.05),
             f'{val:.2f}', ha='center',
             va='bottom' if points_up else 'top', fontweight='bold')
ax1.grid(True, alpha=0.3)

# Plot 2: Max Drawdown Comparison
ax2 = axes[0, 1]
dd_data = risk_df.loc[cluster_cols, 'Max Drawdown']
ax2.bar(x, dd_data, color='red', alpha=0.7, edgecolor='black')
ax2.set_xticks(x)
ax2.set_xticklabels(cluster_labels)
ax2.set_title('Maximum Drawdown by Cluster', fontsize=14, fontweight='bold')
ax2.set_ylabel('Max Drawdown ($)')
for i, val in enumerate(dd_data):
    ax2.text(i, val + 5, f'${val:.0f}', ha='center', va='bottom', fontweight='bold')
ax2.grid(True, alpha=0.3)

# Plot 3: Skewness and Kurtosis (side-by-side bars per cluster)
ax3 = axes[1, 0]
skew_data = risk_df.loc[cluster_cols, 'Skewness']
kurt_data = risk_df.loc[cluster_cols, 'Kurtosis']
ax3.bar(x - width / 2, skew_data, width, label='Skewness', color='blue', alpha=0.7)
ax3.bar(x + width / 2, kurt_data, width, label='Kurtosis', color='orange', alpha=0.7)
ax3.set_xticks(x)
ax3.set_xticklabels(cluster_labels)
ax3.set_title('Skewness & Kurtosis by Cluster (Tail Risk Indicators)', fontsize=14, fontweight='bold')
ax3.axhline(y=0, color='black', linestyle='--', alpha=0.5)
ax3.legend()
ax3.grid(True, alpha=0.3)
# Add interpretation
ax3.text(0.02, 0.98, 'Negative skew = Left tail (big losses)', transform=ax3.transAxes,
         fontsize=9, verticalalignment='top', style='italic')

# Plot 4: VaR and CVaR Comparison (side-by-side bars per cluster)
ax4 = axes[1, 1]
var_data = risk_df.loc[cluster_cols, 'VaR 95%']
cvar_data = risk_df.loc[cluster_cols, 'CVaR 95%']
ax4.bar(x - width / 2, var_data, width, label='VaR 95%', color='coral', alpha=0.7)
ax4.bar(x + width / 2, cvar_data, width, label='CVaR 95%', color='darkred', alpha=0.7)
ax4.set_xticks(x)
ax4.set_xticklabels(cluster_labels)
ax4.set_title('Value at Risk & Expected Shortfall by Cluster', fontsize=14, fontweight='bold')
ax4.set_ylabel('Risk ($)')
ax4.legend()
ax4.grid(True, alpha=0.3)

# Plot 5: Daily PnL Distributions
ax5 = axes[2, 0]
for col, label in zip(cluster_cols, cluster_labels):
    data = daily_pnl[col][daily_pnl[col] != 0]  # Exclude zero days
    ax5.hist(data, bins=30, alpha=0.5, label=label, density=True)
ax5.axvline(x=0, color='black', linestyle='--', alpha=0.5)
ax5.set_title('Daily PnL Distribution (Non-Zero Days)', fontsize=14, fontweight='bold')
ax5.set_xlabel('Daily PnL ($)')
ax5.set_ylabel('Density')
ax5.legend()
ax5.grid(True, alpha=0.3)

# Plot 6: Cumulative Drawdown
ax6 = axes[2, 1]
for col, label in zip(cluster_cols, cluster_labels):
    cumulative = daily_pnl[col].cumsum()
    # The peak is floored at the starting level of 0 so that losses taken
    # before the first profit show up as drawdown instead of being hidden.
    running_max = cumulative.cummax().clip(lower=0)
    drawdown = running_max - cumulative
    # Drawn below the zero line: deeper fill = larger drawdown.
    ax6.fill_between(daily_pnl.index, 0, -drawdown, alpha=0.3, label=label)
ax6.set_title('Drawdown Over Time by Cluster', fontsize=14, fontweight='bold')
ax6.set_xlabel('Date')
ax6.set_ylabel('Drawdown ($)')
ax6.legend()
ax6.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('phase2_risk_metrics.png', dpi=150, bbox_inches='tight')
# The figure is only written to disk; close it explicitly to free it and
# keep it from rendering inline.
plt.close(fig)
print("\n Phase 2 visualization saved to: phase2_risk_metrics.png")


 Phase 2 visualization saved to: phase2_risk_metrics.png


In [None]:
# KEY FINDINGS SUMMARY

In [9]:
print("\n" + "=" * 80)
print("PHASE 2 KEY FINDINGS")
print("=" * 80)

# Thresholds behind the qualitative labels printed below.
LOW_RISK_MIN_SHARPE = 1         # annualized Sharpe above this can qualify as LOW risk
MODERATE_RISK_MIN_SHARPE = 0.5  # annualized Sharpe above this is at worst MODERATE
LEFT_SKEW_ALERT = -0.5          # skewness below this flags a heavy left tail
# NOTE(review): risk_df['Kurtosis'] is produced by scipy.stats.kurtosis with
# default arguments, which reports excess kurtosis (normal distribution = 0).
# A cutoff of 3 is therefore stricter than the usual "kurtosis > 3" rule for
# Pearson kurtosis; confirm this is intended.
FAT_TAIL_KURTOSIS = 3

# Rank clusters by Sharpe, best first
sharpe_ranking = risk_df.loc[cluster_cols, 'Annualized Sharpe'].sort_values(ascending=False)
print("\n Cluster Ranking by Sharpe Ratio:")
for i, (cluster, sharpe) in enumerate(sharpe_ranking.items(), 1):
    print(f"   {i}. {cluster.replace('_pnl', '')}: {sharpe:.3f}")

# Risk Assessment: one qualitative risk level and tail-risk flag per cluster
print("\n  Risk Assessment:")
for col in cluster_cols:
    skew = risk_df.loc[col, 'Skewness']
    kurt = risk_df.loc[col, 'Kurtosis']
    sharpe = risk_df.loc[col, 'Annualized Sharpe']

    # LOW needs both a strong Sharpe and no pronounced left skew; otherwise
    # the level is decided by Sharpe alone. A NaN Sharpe falls through to HIGH.
    if sharpe > LOW_RISK_MIN_SHARPE and skew > LEFT_SKEW_ALERT:
        risk_level = " LOW"
    elif sharpe > MODERATE_RISK_MIN_SHARPE:
        risk_level = " MODERATE"
    else:
        risk_level = " HIGH"

    # Either a heavy left tail or fat tails marks the tail risk as significant.
    if skew < LEFT_SKEW_ALERT or kurt > FAT_TAIL_KURTOSIS:
        tail_risk = " Significant"
    else:
        tail_risk = " Normal"

    print(f"   {col.replace('_pnl', '')}:")
    print(f"      Risk Level: {risk_level}, Tail Risk: {tail_risk}")
    print(f"      Skewness: {skew:.2f} {'(left tail)' if skew < 0 else '(right tail)'}")

# This cell performs no version-control action, so the closing message only
# reports completion.
print("\n PHASE 2 COMPLETED")


PHASE 2 KEY FINDINGS

 Cluster Ranking by Sharpe Ratio:
   1. cluster_0: 5.767
   2. cluster_1: 5.399
   3. cluster_2: 1.107
   4. cluster_3: 1.070

  Risk Assessment:
   cluster_0:
      Risk Level:  LOW, Tail Risk:  Significant
      Skewness: 0.54 (right tail)
   cluster_1:
      Risk Level:  LOW, Tail Risk:  Significant
      Skewness: 1.77 (right tail)
   cluster_2:
      Risk Level:  LOW, Tail Risk:  Significant
      Skewness: -0.12 (left tail)
   cluster_3:
      Risk Level:  MODERATE, Tail Risk:  Significant
      Skewness: -1.27 (left tail)

 PHASE 2 COMPLETED
