# Phase 3: Covariance & Correlation Between Clusters

This notebook contains the analysis for the Risk & Portfolio Interaction study.

Phase 3: Estimate Covariance & Correlation Between Clusters
===========================================================
Goal: See how strategies interact in a portfolio context

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Global plot appearance for every figure produced below.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

banner = "=" * 80
print(banner)
print("PHASE 3: COVARIANCE & CORRELATION BETWEEN CLUSTERS")
print(banner)

# Daily PnL table: first CSV column becomes the (date-parsed) index.
daily_pnl = pd.read_csv('daily_pnl_per_cluster.csv', index_col=0, parse_dates=True)

# Trade-level table; both timestamp columns are converted to datetimes.
trades_df = pd.read_csv('trades_with_clusters.csv')
for time_col in ('entry_time', 'exit_time'):
    trades_df[time_col] = pd.to_datetime(trades_df[time_col])

# Keep columns whose name mentions 'cluster' but not 'total'.
cluster_cols = [
    name for name in daily_pnl.columns
    if ('cluster' in name) and not ('total' in name)
]
cluster_pnl = daily_pnl[cluster_cols]

PHASE 3: COVARIANCE & CORRELATION BETWEEN CLUSTERS


In [None]:
# CORRELATION MATRIX

In [2]:
# Full-sample correlation between the clusters' daily PnL columns.
# The status strings previously contained mojibake (UTF-8 emoji decoded as
# cp1252); they are restored to the intended emoji here.
print("\n📊 Computing Correlation Matrix...")
print("-" * 80)

# DataFrame.corr() with default arguments (pandas default method is Pearson).
correlation_matrix = cluster_pnl.corr()
print("\n🔗 Correlation Matrix:")
print(correlation_matrix.round(4).to_string())


📊 Computing Correlation Matrix...
--------------------------------------------------------------------------------

🔗 Correlation Matrix:
               cluster_0_pnl  cluster_1_pnl  cluster_2_pnl  cluster_3_pnl
cluster_0_pnl         1.0000         0.2765         0.0334        -0.1148
cluster_1_pnl         0.2765         1.0000        -0.0274         0.0392
cluster_2_pnl         0.0334        -0.0274         1.0000         0.0627
cluster_3_pnl        -0.1148         0.0392         0.0627         1.0000


In [None]:
# COVARIANCE MATRIX

In [3]:
# Full-sample covariance between the clusters' daily PnL columns.
# The status strings previously contained mojibake (UTF-8 emoji decoded as
# cp1252); they are restored to the intended emoji here.
print("\n📊 Computing Covariance Matrix...")
covariance_matrix = cluster_pnl.cov()
print("\n📉 Covariance Matrix:")
print(covariance_matrix.round(2).to_string())

# Persist both matrices (unrounded) next to the notebook.
correlation_matrix.to_csv('correlation_matrix.csv')
covariance_matrix.to_csv('covariance_matrix.csv')


📊 Computing Covariance Matrix...

📉 Covariance Matrix:
               cluster_0_pnl  cluster_1_pnl  cluster_2_pnl  cluster_3_pnl
cluster_0_pnl        1460.20         303.39          32.68         -78.65
cluster_1_pnl         303.39         824.57         -20.14          20.16
cluster_2_pnl          32.68         -20.14         657.21          28.83
cluster_3_pnl         -78.65          20.16          28.83         321.38


In [None]:
# ROLLING CORRELATION ANALYSIS

In [4]:
# Rolling pairwise correlations between cluster PnL series.
# `window` is the single source of truth for the look-back length; the
# messages below are derived from it so they cannot drift from the computation.
# The window counts rows of cluster_pnl (labelled "days" in the messages).
window = 20

# The status strings previously contained mojibake (UTF-8 emoji decoded as
# cp1252); they are restored to the intended emoji here.
print(f"\n📊 Computing Rolling Correlations ({window}-day window)...")

rolling_corr = {}
for i, col1 in enumerate(cluster_cols):
    # Pair col1 only with later columns: each unordered pair appears once
    # (upper triangle of the correlation matrix, diagonal excluded).
    for col2 in cluster_cols[i + 1:]:
        rolling_corr[f'{col1}_vs_{col2}'] = cluster_pnl[col1].rolling(window).corr(cluster_pnl[col2])

rolling_corr_df = pd.DataFrame(rolling_corr)

# Summary statistics of rolling correlations
print(f"\n📈 Rolling Correlation Statistics ({window}-day window):")
print(rolling_corr_df.describe().round(3).to_string())


📊 Computing Rolling Correlations (20-day window)...

📈 Rolling Correlation Statistics (20-day window):
       cluster_0_pnl_vs_cluster_1_pnl  cluster_0_pnl_vs_cluster_2_pnl  cluster_0_pnl_vs_cluster_3_pnl  cluster_1_pnl_vs_cluster_2_pnl  cluster_1_pnl_vs_cluster_3_pnl  cluster_2_pnl_vs_cluster_3_pnl
count                         148.000                         146.000                         146.000                         146.000                         146.000                         144.000
mean                            0.248                           0.087                          -0.032                          -0.023                           0.057                          -0.048
std                             0.286                           0.286                           0.251                           0.217                           0.145                           0.271
min                            -0.579                          -0.696                          -0.

In [None]:
# REGIME-DEPENDENT CORRELATIONS

In [10]:
# Regime-dependent correlations: split days into high/low ATR and high/low ADX
# regimes and compute the cluster PnL correlation matrix within each regime.
print("\n" + "=" * 80)
print("REGIME-DEPENDENT CORRELATION ANALYSIS")
print("=" * 80)

# Calendar day (timestamp floored to midnight) on which each trade exited.
trades_df['date'] = trades_df['exit_time'].dt.floor('D')

# Regime thresholds: medians taken over all rows of trades_df.
median_atr = trades_df['entry_ATR(14)'].median()
median_adx = trades_df['entry_ADX(14)'].median()

print(f"\nMedian ATR: {median_atr:.4f}")
print(f"Median ADX: {median_adx:.2f}")

# Get daily regime indicators (average of all trades that day)
daily_regime = trades_df.groupby('date').agg({
    'entry_ATR(14)': 'mean',
    'entry_ADX(14)': 'mean'
}).rename(columns={'entry_ATR(14)': 'avg_ATR', 'entry_ADX(14)': 'avg_ADX'})

# Left join keeps every row of daily_pnl; days without a matching 'date' get
# NaN regime values and therefore fall into neither the high nor the low mask.
daily_pnl_with_regime = daily_pnl.join(daily_regime, how='left')

# NOTE(review): the thresholds are medians over trades_df rows, while the values
# compared against them are per-day means, so the high/low regimes are not
# guaranteed to contain the same number of days. Confirm this is intended.
high_vol_days = daily_pnl_with_regime['avg_ATR'] > median_atr
low_vol_days = daily_pnl_with_regime['avg_ATR'] <= median_atr
high_adx_days = daily_pnl_with_regime['avg_ADX'] > median_adx
low_adx_days = daily_pnl_with_regime['avg_ADX'] <= median_adx


def _regime_corr(heading, day_mask):
    """Print `heading`, then compute, print and return the cluster PnL
    correlation matrix restricted to the rows where `day_mask` is True."""
    print(heading)
    corr = cluster_pnl[day_mask].corr()
    print(corr.round(4).to_string())
    return corr


# Calculate correlations by regime
high_vol_corr = _regime_corr("\n HIGH VOLATILITY (ATR > median) Correlation Matrix:", high_vol_days)
low_vol_corr = _regime_corr("\n  LOW VOLATILITY (ATR <= median) Correlation Matrix:", low_vol_days)
high_adx_corr = _regime_corr("\n HIGH TREND (ADX > median) Correlation Matrix:", high_adx_days)
low_adx_corr = _regime_corr("\n LOW TREND (ADX <= median) Correlation Matrix:", low_adx_days)

# Save regime correlations. Each matrix is flattened row by row, so the index
# is labelled with the matching '<row>_vs_<column>' pair; without it the rows
# of the CSV could not be mapped back to cluster pairs.
pair_labels = [
    f'{row}_vs_{col}'
    for row in high_vol_corr.index
    for col in high_vol_corr.columns
]
pd.DataFrame({
    'High_Vol': high_vol_corr.values.flatten(),
    'Low_Vol': low_vol_corr.values.flatten(),
    'High_ADX': high_adx_corr.values.flatten(),
    'Low_ADX': low_adx_corr.values.flatten()
}, index=pair_labels).to_csv('regime_correlations.csv')


REGIME-DEPENDENT CORRELATION ANALYSIS

Median ATR: 3.2868
Median ADX: 31.30

 HIGH VOLATILITY (ATR > median) Correlation Matrix:
               cluster_0_pnl  cluster_1_pnl  cluster_2_pnl  cluster_3_pnl
cluster_0_pnl         1.0000        -0.0419         0.0600        -0.2343
cluster_1_pnl        -0.0419         1.0000        -0.0319         0.1030
cluster_2_pnl         0.0600        -0.0319         1.0000         0.0748
cluster_3_pnl        -0.2343         0.1030         0.0748         1.0000

  LOW VOLATILITY (ATR <= median) Correlation Matrix:
               cluster_0_pnl  cluster_1_pnl  cluster_2_pnl  cluster_3_pnl
cluster_0_pnl         1.0000         0.4241         0.0407         0.0818
cluster_1_pnl         0.4241         1.0000        -0.0012         0.0092
cluster_2_pnl         0.0407        -0.0012         1.0000        -0.1809
cluster_3_pnl         0.0818         0.0092        -0.1809         1.0000

 HIGH TREND (ADX > median) Correlation Matrix:
               cluster_0_pnl

In [None]:
# PORTFOLIO RISK DECOMPOSITION

In [11]:
# Equal-weight portfolio risk decomposition from the cluster covariance matrix.
print("\n" + "=" * 80)
print("PORTFOLIO VARIANCE DECOMPOSITION")
print("=" * 80)

# One identical weight per cluster.
n_clusters = len(cluster_cols)
weights = np.full(n_clusters, 1 / n_clusters)

# Portfolio variance w' C w and its square root.
cov_matrix = covariance_matrix.values
portfolio_variance = weights @ cov_matrix @ weights
portfolio_std = np.sqrt(portfolio_variance)

print(f"\n Equal-Weighted Portfolio:")
print(f"   Portfolio Variance: {portfolio_variance:.2f}")
print(f"   Portfolio Std Dev: {portfolio_std:.2f}")

# Display names with the '_pnl' suffix removed.
cluster_names = [name.replace('_pnl', '') for name in cluster_cols]

# Marginal contribution to risk: (C w) / sigma_p
mcr = cov_matrix.dot(weights) / portfolio_std
print("\n Marginal Contribution to Risk (MCR):")
for label, marginal in zip(cluster_names, mcr):
    print(f"   {label}: {marginal:.4f}")

# Total contribution to risk: w_i * MCR_i, also expressed as a share of sigma_p
tcr = weights * mcr
risk_share_pct = (tcr / portfolio_std) * 100
print("\n Total Contribution to Risk (TCR):")
for label, contribution, share in zip(cluster_names, tcr, risk_share_pct):
    print(f"   {label}: {contribution:.4f} ({share:.1f}%)")

# Sanity check: the contributions add up to the portfolio standard deviation.
total_tcr = sum(tcr)
print(f"\n   Sum of TCR: {total_tcr:.4f} (should equal portfolio std: {portfolio_std:.4f})")

# Persist the per-cluster breakdown.
cluster_variances = np.diag(cov_matrix)
risk_decomp_df = pd.DataFrame({
    'Cluster': cluster_names,
    'Weight': weights,
    'Variance': cluster_variances,
    'Std_Dev': np.sqrt(cluster_variances),
    'MCR': mcr,
    'TCR': tcr,
    'Risk_Contribution_Pct': risk_share_pct
})
risk_decomp_df.to_csv('risk_decomposition.csv', index=False)


PORTFOLIO VARIANCE DECOMPOSITION

 Equal-Weighted Portfolio:
   Portfolio Variance: 239.74
   Portfolio Std Dev: 15.48

 Marginal Contribution to Risk (MCR):
   cluster_0: 27.7328
   cluster_1: 18.2124
   cluster_2: 11.2795
   cluster_3: 4.7102

 Total Contribution to Risk (TCR):
   cluster_0: 6.9332 (44.8%)
   cluster_1: 4.5531 (29.4%)
   cluster_2: 2.8199 (18.2%)
   cluster_3: 1.1775 (7.6%)

   Sum of TCR: 15.4837 (should equal portfolio std: 15.4837)


In [None]:
# VISUALIZATIONS

In [12]:
# Six-panel summary figure (2 rows x 3 columns). The figure is written to
# 'phase3_correlation.png' and then closed, so it is not rendered inline.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# Plot 1: Correlation Heatmap (full matrix, colour scale fixed to [-1, 1])
ax1 = axes[0, 0]
sns.heatmap(correlation_matrix, annot=True, fmt='.3f', cmap='RdYlGn', center=0,
            ax=ax1, vmin=-1, vmax=1, square=True)
ax1.set_title('Cluster Correlation Matrix', fontsize=14, fontweight='bold')
ax1.set_xticklabels([c.replace('_pnl', '') for c in correlation_matrix.columns], rotation=45)
ax1.set_yticklabels([c.replace('_pnl', '') for c in correlation_matrix.index], rotation=0)

# Plot 2: Covariance Heatmap
ax2 = axes[0, 1]
sns.heatmap(covariance_matrix, annot=True, fmt='.0f', cmap='coolwarm', center=0,
            ax=ax2, square=True)
ax2.set_title('Cluster Covariance Matrix', fontsize=14, fontweight='bold')
ax2.set_xticklabels([c.replace('_pnl', '') for c in covariance_matrix.columns], rotation=45)
ax2.set_yticklabels([c.replace('_pnl', '') for c in covariance_matrix.index], rotation=0)

# Plot 3: Rolling Correlations (one line per cluster pair)
ax3 = axes[0, 2]
for col in rolling_corr_df.columns:
    ax3.plot(rolling_corr_df.index, rolling_corr_df[col], label=col.replace('cluster_', 'C').replace('_pnl', '').replace('_vs_', ' vs '), alpha=0.7)
ax3.axhline(y=0, color='black', linestyle='--', alpha=0.5)
# Title follows the `window` used to compute rolling_corr_df instead of a
# hard-coded "20", so it stays correct if the window is changed.
ax3.set_title(f'Rolling {window}-Day Correlations', fontsize=14, fontweight='bold')
ax3.set_xlabel('Date')
ax3.set_ylabel('Correlation')
ax3.legend(fontsize=8, loc='upper left')
ax3.grid(True, alpha=0.3)
ax3.set_ylim(-1, 1)

# Plot 4: Regime Correlation Comparison
ax4 = axes[1, 0]
regimes = ['High Vol', 'Low Vol', 'High ADX', 'Low ADX']
corr_matrices = [high_vol_corr, low_vol_corr, high_adx_corr, low_adx_corr]

# Three example pairs (first three clusters by position) across the regimes
c0_c1_corrs = [m.iloc[0, 1] for m in corr_matrices]
c0_c2_corrs = [m.iloc[0, 2] for m in corr_matrices]
c1_c2_corrs = [m.iloc[1, 2] for m in corr_matrices]

x = np.arange(len(regimes))
width = 0.25
ax4.bar(x - width, c0_c1_corrs, width, label='C0-C1', alpha=0.7)
ax4.bar(x, c0_c2_corrs, width, label='C0-C2', alpha=0.7)
ax4.bar(x + width, c1_c2_corrs, width, label='C1-C2', alpha=0.7)
ax4.set_xticks(x)
ax4.set_xticklabels(regimes)
ax4.set_title('Correlation by Market Regime', fontsize=14, fontweight='bold')
ax4.set_ylabel('Correlation')
ax4.axhline(y=0, color='black', linestyle='--', alpha=0.5)
ax4.legend()
ax4.grid(True, alpha=0.3)

# Plot 5: Risk Contribution by Cluster
ax5 = axes[1, 1]
risk_pcts = (tcr / portfolio_std) * 100
risk_labels = [c.replace('_pnl', '') for c in cluster_cols]
colors = plt.cm.Set3(np.linspace(0, 1, len(cluster_cols)))
if (risk_pcts < 0).any():
    # Axes.pie raises ValueError on negative wedge sizes. A cluster whose PnL
    # offsets the rest of the portfolio has a negative contribution, so fall
    # back to a bar chart that can show the sign.
    ax5.bar(risk_labels, risk_pcts, color=colors)
    ax5.axhline(y=0, color='black', linestyle='--', alpha=0.5)
    ax5.set_ylabel('Risk Contribution (%)')
else:
    ax5.pie(risk_pcts, labels=risk_labels,
            autopct='%1.1f%%', colors=colors, startangle=90)
ax5.set_title('Portfolio Risk Contribution by Cluster\n(Equal-Weighted)', fontsize=14, fontweight='bold')

# Plot 6: Scatter of Daily PnL - Pairs
ax6 = axes[1, 2]
ax6.scatter(cluster_pnl['cluster_0_pnl'], cluster_pnl['cluster_1_pnl'], alpha=0.5, label='C0 vs C1')
ax6.scatter(cluster_pnl['cluster_2_pnl'], cluster_pnl['cluster_3_pnl'], alpha=0.5, label='C2 vs C3')
ax6.axhline(y=0, color='black', linestyle='--', alpha=0.3)
ax6.axvline(x=0, color='black', linestyle='--', alpha=0.3)
ax6.set_xlabel('Cluster X Daily PnL ($)')
ax6.set_ylabel('Cluster Y Daily PnL ($)')
ax6.set_title('Daily PnL Scatter (Selected Pairs)', fontsize=14, fontweight='bold')
ax6.legend()
ax6.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('phase3_correlation.png', dpi=150, bbox_inches='tight')
plt.close()
print(f"\n Phase 3 visualization saved to: phase3_correlation.png")


 Phase 3 visualization saved to: phase3_correlation.png


In [13]:
# KEY FINDINGS

In [14]:
# Text summary of the full-sample pairwise correlations.
print("\n" + "=" * 80)
print("PHASE 3 KEY FINDINGS")
print("=" * 80)

print("\n Correlation Insights:")
pair_count = len(cluster_cols)
for i in range(pair_count):
    for j in range(i + 1, pair_count):  # each unordered pair once
        corr = correlation_matrix.iloc[i, j]

        # Strength label is based on the absolute correlation.
        strength = abs(corr)
        if strength > 0.5:
            level = " HIGH"
        elif strength > 0.3:
            level = " MODERATE"
        else:
            level = " LOW"

        # Diversification label is based on the signed correlation.
        if corr > 0.3:
            diversification = "Limited diversification"
        elif corr < 0.1:
            diversification = "Good diversification"
        else:
            diversification = "Some diversification"

        first = cluster_cols[i].replace('_pnl', '')
        second = cluster_cols[j].replace('_pnl', '')
        print(f"   {first} vs {second}: {corr:.3f} ({level}) - {diversification}")

# Fixed narrative lines; these are not computed from the data.
print("\n Regime-Dependent Behavior:")
print("   Correlations tend to change based on market conditions:")
print("   - High vol environments show different clustering behavior")
print("   - Trend strength (ADX) affects strategy co-movement")

print("\n PHASE 3 COMPLETED")


PHASE 3 KEY FINDINGS

 Correlation Insights:
   cluster_0 vs cluster_1: 0.276 ( LOW) - Some diversification
   cluster_0 vs cluster_2: 0.033 ( LOW) - Good diversification
   cluster_0 vs cluster_3: -0.115 ( LOW) - Good diversification
   cluster_1 vs cluster_2: -0.027 ( LOW) - Good diversification
   cluster_1 vs cluster_3: 0.039 ( LOW) - Good diversification
   cluster_2 vs cluster_3: 0.063 ( LOW) - Good diversification

 Regime-Dependent Behavior:
   Correlations tend to change based on market conditions:
   - High vol environments show different clustering behavior
   - Trend strength (ADX) affects strategy co-movement

 PHASE 3 COMPLETED
