# 7. High-Performance Trade Analysis (Outlier Detection)
## Identify and Analyze Exceptional Profitable Trades

In [None]:
# Add ../src to the module search path before importing the local modules below.
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# Local modules (presumably located in ../src -- confirm against the repo layout).
from data_utils import load_data
from strategy import TradeAnalyzer
import warnings
# NOTE: this suppresses every warning for the rest of the session, so
# deprecation and numerical warnings raised by later cells will not be shown.
warnings.filterwarnings('ignore')

# Use seaborn's white-grid theme for all plots in this notebook.
sns.set_style('whitegrid')
print("Libraries imported successfully!")

In [None]:
# Read the baseline strategy's trade log, report its size and preview it.
trades_path = '../results/baseline_strategy_trades.csv'
trades_df = pd.read_csv(trades_path)
total_trades = len(trades_df)
print(f"Total trades: {total_trades}")
trades_df.head()

## 7.1 Identify Outlier Trades (|Z-score| > 3)

In [None]:
# Calculate absolute Z-scores for PnL across all trades.
# nan_policy='omit' stops a single missing PnL from turning every score into
# NaN (the default 'propagate' would then silently report zero outliers);
# rows with a missing PnL keep a NaN score and are never flagged.
trades_df['pnl_zscore'] = np.abs(stats.zscore(trades_df['pnl'], nan_policy='omit'))

# Identify outliers (3-sigma)
outlier_threshold = 3.0
trades_df['is_outlier'] = trades_df['pnl_zscore'] > outlier_threshold

# Filter to profitable trades only, then split them into outlier / normal groups
profitable_trades = trades_df[trades_df['is_profitable']].copy()
outlier_trades = profitable_trades[profitable_trades['is_outlier']].copy()
normal_trades = profitable_trades[~profitable_trades['is_outlier']].copy()


def _pct_of_profitable(group):
    """Return len(group) as a percentage of profitable trades (0.0 if there are none)."""
    if profitable_trades.empty:
        # Avoid ZeroDivisionError when the strategy produced no profitable trade.
        return 0.0
    return len(group) / len(profitable_trades) * 100


print(f"\nProfitable Trades: {len(profitable_trades)}")
print(f"Outlier Trades (Z-score > {outlier_threshold}): {len(outlier_trades)} ({_pct_of_profitable(outlier_trades):.2f}%)")
print(f"Normal Profitable Trades: {len(normal_trades)} ({_pct_of_profitable(normal_trades):.2f}%)")

In [None]:
# Outlier trade details
print("\nOutlier Trades:")
outlier_trades[['entry_time', 'exit_time', 'direction', 'pnl', 'pnl_pct', 'duration', 'regime']]

## 7.2 Statistical Comparison

In [None]:
# Compare outliers vs normal
comparison = pd.DataFrame({
    'Metric': ['Count', 'Avg PnL', 'Median PnL', 'Std PnL', 'Avg Duration', 'Median Duration'],
    'Outlier Trades': [
        len(outlier_trades),
        outlier_trades['pnl'].mean(),
        outlier_trades['pnl'].median(),
        outlier_trades['pnl'].std(),
        outlier_trades['duration'].mean(),
        outlier_trades['duration'].median()
    ],
    'Normal Trades': [
        len(normal_trades),
        normal_trades['pnl'].mean(),
        normal_trades['pnl'].median(),
        normal_trades['pnl'].std(),
        normal_trades['duration'].mean(),
        normal_trades['duration'].median()
    ]
})

print("\nStatistical Comparison:")
print(comparison)

## 7.3 Feature Analysis

In [None]:
# Load full data to get features for trades
df = load_data('../data/nifty_with_regimes.csv')

# Keep only the candidate features that the loaded dataset actually provides.
feature_cols = ['avg_iv', 'iv_spread', 'pcr_oi', 'futures_basis', 'ema_gap', 'hour']
feature_cols = [col for col in feature_cols if col in df.columns]


def _entry_features(trades, data, columns):
    """Return the *columns* of *data* at each trade's entry row.

    ``trades['entry_idx']`` is used as a positional row number into *data*.
    Trades whose index is missing, negative, or past the end of *data* are
    skipped, so the result can have fewer rows than *trades*. The result
    carries a fresh 0..n-1 index, in trade order.
    """
    positions = pd.to_numeric(trades['entry_idx'], errors='coerce')
    # A NaN or float-typed index would make a per-row ``iloc`` lookup raise,
    # and a negative one would silently wrap around to the end of *data*.
    in_range = positions.notna() & (positions >= 0) & (positions < len(data))
    rows = positions[in_range].astype(int).to_numpy()
    return data[columns].iloc[rows].reset_index(drop=True)


# Get entry features for each trade (one vectorised lookup per group)
outlier_features_df = _entry_features(outlier_trades, df, feature_cols)
normal_features_df = _entry_features(normal_trades, df, feature_cols)

print("Feature comparison:")
for col in feature_cols:
    outlier_mean = outlier_features_df[col].mean()
    normal_mean = normal_features_df[col].mean()
    print(f"{col}: Outlier={outlier_mean:.4f}, Normal={normal_mean:.4f}, Diff={outlier_mean-normal_mean:.4f}")

## 7.4 Regime Analysis

In [None]:
# Break each trade group down by market regime.
regime_names = {-1: 'Downtrend', 0: 'Sideways', 1: 'Uptrend'}


def _print_regime_breakdown(counts, total):
    """Print one line per regime: its label, trade count and share of *total*."""
    for regime_code, n_trades in counts.items():
        label = regime_names.get(regime_code, regime_code)
        share = n_trades / total * 100
        print(f"  {label}: {n_trades} ({share:.1f}%)")


outlier_regime_dist = outlier_trades['regime'].value_counts()
normal_regime_dist = normal_trades['regime'].value_counts()

print("\nOutlier Trades by Regime:")
_print_regime_breakdown(outlier_regime_dist, len(outlier_trades))

print("\nNormal Trades by Regime:")
_print_regime_breakdown(normal_regime_dist, len(normal_trades))

## 7.5 Time-of-Day Analysis

In [None]:
# Add an 'hour' column (hour of the entry timestamp) to both trade groups,
# then print how many trades each hour holds, in clock order.
for group in (outlier_trades, normal_trades):
    entry_timestamps = pd.to_datetime(group['entry_time'])
    group['hour'] = entry_timestamps.dt.hour

hourly_sections = (
    ("\nTime Distribution - Outlier Trades:", outlier_trades),
    ("\nTime Distribution - Normal Trades:", normal_trades),
)
for heading, group in hourly_sections:
    print(heading)
    hourly_counts = group['hour'].value_counts()
    print(hourly_counts.sort_index())

## 7.6 Visualizations

In [None]:
# PnL against trade duration, with outlier trades drawn as red stars on top
# of the normal profitable trades.
scatter_fig, scatter_ax = plt.subplots(figsize=(12, 6))

scatter_ax.scatter(
    normal_trades['duration'],
    normal_trades['pnl'],
    alpha=0.5,
    label='Normal',
    s=50,
)
scatter_ax.scatter(
    outlier_trades['duration'],
    outlier_trades['pnl'],
    alpha=0.8,
    label='Outlier',
    s=100,
    marker='*',
    color='red',
)

scatter_ax.set_xlabel('Trade Duration (periods)')
scatter_ax.set_ylabel('PnL')
scatter_ax.set_title('PnL vs Duration: Outlier vs Normal Trades')
scatter_ax.legend()
scatter_ax.grid(True, alpha=0.3)

scatter_fig.tight_layout()
scatter_fig.savefig('../plots/outlier_pnl_vs_duration.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Box plots for feature distributions: one panel per feature, comparing the
# entry-time feature values of normal vs. outlier trades.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# The grid has a fixed number of panels; plot at most that many features.
plotted_cols = feature_cols[:len(axes)]

for ax, col in zip(axes, plotted_cols):
    data = [normal_features_df[col].dropna(), outlier_features_df[col].dropna()]
    ax.boxplot(data)
    # Label the boxes via the tick labels instead of boxplot's ``labels=``
    # keyword, which Matplotlib 3.9 deprecates; this form works on old and
    # new releases alike.
    ax.set_xticklabels(['Normal', 'Outlier'])
    ax.set_title(col)
    ax.grid(True, alpha=0.3)

# feature_cols is filtered by what the dataset provides, so there may be
# fewer features than panels; hide the leftovers rather than show empty frames.
for ax in axes[len(plotted_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.savefig('../plots/outlier_feature_boxplots.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Correlation heatmap of PnL, duration and entry-time features for outlier trades.
#
# The feature values are looked up from df by each trade's own entry_idx, so
# pnl/duration and the features on a row always describe the same trade.
# Copying outlier_features_df by position is only valid when no trade was
# skipped during feature extraction; once one is skipped the lengths differ
# and the column assignment raises ValueError.
entry_pos = pd.to_numeric(outlier_trades['entry_idx'], errors='coerce')
has_entry_row = entry_pos.notna() & (entry_pos >= 0) & (entry_pos < len(df))
entry_rows = df[feature_cols].iloc[entry_pos[has_entry_row].astype(int).to_numpy()]

outlier_corr_data = outlier_trades.loc[has_entry_row, ['pnl', 'duration']].copy()
for col in feature_cols:
    # to_numpy() assigns by position; entry_rows is indexed by df's labels,
    # not by the trade index, so label alignment must be bypassed.
    outlier_corr_data[col] = entry_rows[col].to_numpy()

plt.figure(figsize=(10, 8))
sns.heatmap(outlier_corr_data.corr(), annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.title('Outlier Trades - Feature Correlation')
plt.tight_layout()
plt.savefig('../plots/outlier_correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Entry-hour histograms, outlier trades on the left and normal trades on the right.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

hour_panels = [
    (outlier_trades, 'Outlier Trades - Time Distribution', {'label': 'Outlier'}),
    (normal_trades, 'Normal Trades - Time Distribution', {'label': 'Normal', 'color': 'green'}),
]

for panel_ax, (group, panel_title, series_style) in zip(axes, hour_panels):
    # Bin edges run from 9 to 16, one bin per hour.
    panel_ax.hist(group['hour'], bins=range(9, 17), alpha=0.7, **series_style)
    panel_ax.set_xlabel('Hour of Day')
    panel_ax.set_ylabel('Frequency')
    panel_ax.set_title(panel_title)
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../plots/outlier_time_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

## 7.7 Statistical Tests

In [None]:
# T-tests for feature differences between outlier and normal trades.
def _significance_stars(p_value):
    """Return '***', '**' or '*' for p < 0.001 / 0.01 / 0.05, else ''."""
    for cutoff, stars in ((0.001, "***"), (0.01, "**"), (0.05, "*")):
        if p_value < cutoff:
            return stars
    return ""


# A variance estimate needs at least two observations per group; with fewer
# the test statistic is NaN.
min_observations = 2

print("\nStatistical Tests (t-test):")
print("=" * 60)

for col in feature_cols:
    if col in outlier_features_df.columns and col in normal_features_df.columns:
        outlier_vals = outlier_features_df[col].dropna()
        normal_vals = normal_features_df[col].dropna()

        if len(outlier_vals) < min_observations or len(normal_vals) < min_observations:
            print(f"{col}: skipped (need at least {min_observations} observations per group)")
            continue

        # Welch's t-test (equal_var=False): the outlier group is a small
        # 3-sigma tail compared with the normal group, so the pooled-variance
        # assumption of the default Student's test is not safe here.
        t_stat, p_value = stats.ttest_ind(outlier_vals, normal_vals, equal_var=False)
        significance = _significance_stars(p_value)
        print(f"{col}: t={t_stat:.4f}, p={p_value:.4f} {significance}")

## 7.8 Generate Insights Report

In [None]:
# Create comprehensive report: build the text line by line, save it to
# ../results/outlier_analysis_report.txt and echo it to the notebook.
def _pct(part, whole):
    """Return part/whole as a percentage, or 0.0 when *whole* is zero."""
    return part / whole * 100 if whole else 0.0


n_profitable = len(profitable_trades)
n_outliers = len(outlier_trades)
n_normal = len(normal_trades)
outlier_avg_pnl = outlier_trades['pnl'].mean()
normal_avg_pnl = normal_trades['pnl'].mean()
heavy_rule = "=" * 80
light_rule = "-" * 80

report = [
    heavy_rule,
    "HIGH-PERFORMANCE TRADE ANALYSIS REPORT",
    heavy_rule,
    "",
    "1. OUTLIER IDENTIFICATION",
    light_rule,
    f"Total Profitable Trades: {n_profitable}",
    # Report the threshold actually used instead of a hard-coded 3.
    f"Outlier Trades (Z-score > {outlier_threshold:g}): {n_outliers} ({_pct(n_outliers, n_profitable):.2f}%)",
    f"Normal Trades: {n_normal} ({_pct(n_normal, n_profitable):.2f}%)",
    "",
    "2. AVERAGE PnL COMPARISON",
    light_rule,
    f"Outlier Trades Avg PnL: {outlier_avg_pnl:.2f}",
    f"Normal Trades Avg PnL: {normal_avg_pnl:.2f}",
    f"Difference: {outlier_avg_pnl - normal_avg_pnl:.2f}",
    "",
    "3. REGIME PATTERNS",
    light_rule,
    "Outlier Trades:",
]
for regime, count in outlier_regime_dist.items():
    report.append(f"  {regime_names.get(regime, regime)}: {count} ({_pct(count, n_outliers):.1f}%)")
report.append("")

report.append("4. TIME-OF-DAY PATTERNS")
report.append(light_rule)
outlier_hour_dist = outlier_trades['hour'].value_counts().sort_index()
report.append("Most common hours for outlier trades:")
# Rank by trade count: taking head(3) of the hour-sorted counts would list
# the three earliest hours, not the three most common ones.
for hour, count in outlier_hour_dist.nlargest(3).items():
    report.append(f"  {hour}:00 - {count} trades")
report.append("")

report.append("5. DISTINGUISHING FEATURES")
report.append(light_rule)
for col in feature_cols:
    if col in outlier_features_df.columns and col in normal_features_df.columns:
        outlier_mean = outlier_features_df[col].mean()
        normal_mean = normal_features_df[col].mean()
        # Relative difference versus the normal-trade mean; reported as 0 when
        # that mean is exactly zero to avoid dividing by it.
        diff_pct = ((outlier_mean - normal_mean) / abs(normal_mean) * 100) if normal_mean != 0 else 0
        report.append(f"{col}: {diff_pct:+.2f}% difference")
report.append("")

report.append(heavy_rule)

report_text = '\n'.join(report)

# Save report (explicit encoding so the file does not depend on the platform default)
with open('../results/outlier_analysis_report.txt', 'w', encoding='utf-8') as f:
    f.write(report_text)

print(report_text)

## Summary

In [None]:
# Final summary of the headline numbers.
n_profitable_total = len(profitable_trades)
# Guard the share: with no profitable trades the division would raise ZeroDivisionError.
outlier_share = len(outlier_trades) / n_profitable_total * 100 if n_profitable_total else 0.0

outlier_pnl_mean = outlier_trades['pnl'].mean()
normal_pnl_mean = normal_trades['pnl'].mean()
# The ratio is only meaningful when both means exist and the normal mean is non-zero.
pnl_ratio = outlier_pnl_mean / normal_pnl_mean if normal_pnl_mean else float('nan')

print("=" * 80)
print("OUTLIER ANALYSIS COMPLETE")
print("=" * 80)
print("\nKey Findings:")
print(f"  • {outlier_share:.1f}% of profitable trades are outliers")
if np.isfinite(pnl_ratio):
    print(f"  • Outlier trades have {pnl_ratio:.1f}x higher average PnL")
else:
    print("  • Average PnL ratio unavailable (one of the trade groups is empty or the normal average is zero)")
print(f"  • Average duration: Outlier={outlier_trades['duration'].mean():.1f}, Normal={normal_trades['duration'].mean():.1f}")
print("\nAll visualizations and reports saved to plots/ and results/ directories")