In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 10

# Create output directories
os.makedirs('csv_files', exist_ok=True)
os.makedirs('outputs', exist_ok=True)


# Load raw data files
print("Loading raw data...")
fear_greed = pd.read_csv('fear_greed_index - fear_greed_index.csv')
print(f"Fear & Greed Index loaded: {len(fear_greed)} records")

trades = pd.read_csv('historical_data - historical_data.csv')
print(f"Historical Trading Data loaded: {len(trades):,} records")

# Standardize data types for Fear & Greed Index
print("Standardizing data types...")
fear_greed['date'] = pd.to_datetime(fear_greed['date'], errors='coerce')
fear_greed['timestamp'] = pd.to_numeric(fear_greed['timestamp'], errors='coerce')
fear_greed['value'] = pd.to_numeric(fear_greed['value'], errors='coerce')
fear_greed['classification'] = fear_greed['classification'].astype('category')

# Standardize timestamps in trading data
trades['Timestamp IST'] = pd.to_datetime(trades['Timestamp IST'], format='%d-%m-%Y %H:%M', errors='coerce')
trades['date'] = trades['Timestamp IST'].dt.date
trades['date'] = pd.to_datetime(trades['date'])
trades['Timestamp'] = pd.to_numeric(trades['Timestamp'], errors='coerce')

# Standardize numeric columns in trading data
numeric_cols = ['Execution Price', 'Size Tokens', 'Size USD', 'Closed PnL',
                'Fee', 'Start Position', 'Order ID', 'Trade ID']
for col in numeric_cols:
    if col in trades.columns:
        trades[col] = pd.to_numeric(trades[col], errors='coerce')

# Standardize categorical columns in trading data
categorical_cols = ['Account', 'Coin', 'Side', 'Direction', 'Crossed']
for col in categorical_cols:
    if col in trades.columns:
        trades[col] = trades[col].astype('category')

print("Data types standardized")

# Handle missing values in Fear & Greed Index
print("Handling missing values...")
fear_greed = fear_greed.dropna(subset=['date', 'value', 'classification'])
print(f"Fear & Greed cleaned: {len(fear_greed)} records")

# Handle missing values in trading data
trades_before = len(trades)
trades = trades.dropna(subset=['date', 'Account', 'Coin', 'Size USD'])
trades['Closed PnL'] = trades['Closed PnL'].fillna(0)
trades['Fee'] = trades['Fee'].fillna(0)
print(f"Trading data cleaned: {len(trades):,} records (removed {trades_before-len(trades):,})")

# Align timestamps and merge datasets
print("Aligning timestamps and merging datasets...")
trades_aligned = pd.merge(trades, fear_greed[['date', 'value', 'classification']],
                         on='date', how='inner')
print(f"Aligned dataset: {len(trades_aligned):,} records")
print(f"Date range: {trades_aligned['date'].min()} to {trades_aligned['date'].max()}")

# Save cleaned data
print("Saving cleaned data...")
fear_greed.to_csv('csv_files/cleaned_fear_greed_index.csv', index=False)
trades.to_csv('csv_files/cleaned_trading_data.csv', index=False)
trades_aligned.to_csv('csv_files/aligned_trades_sentiment.csv', index=False)
print("All cleaned data saved to csv_files/")

print("STEP 3: EXPLORATORY DATA ANALYSIS")

# Calculate trader-level metrics
print("Calculating trader metrics...")
trader_metrics = trades.groupby('Account').agg({
    'Closed PnL': ['sum', 'mean', 'std', 'count'],
    'Size USD': ['sum', 'mean', 'max'],
    'Trade ID': 'count',
    'Fee': 'sum'
}).reset_index()

trader_metrics.columns = ['Account', 'total_pnl', 'avg_pnl_per_trade', 'pnl_std',
                          'trades_with_pnl', 'total_volume', 'avg_position_size',
                          'max_position_size', 'total_trades', 'total_fees']

# Calculate win rate for each trader
trades_with_pnl = trades[trades['Closed PnL'] != 0].copy()
winning_trades = trades_with_pnl[trades_with_pnl['Closed PnL'] > 0].groupby('Account').size()
total_closed_trades = trades_with_pnl.groupby('Account').size()
trader_metrics['win_rate'] = ((winning_trades / total_closed_trades) * 100).fillna(0)

# Calculate additional performance metrics
trader_metrics['profit_factor'] = trader_metrics['total_pnl'] / (trader_metrics['total_fees'] + 0.01)
trader_metrics['sharpe_proxy'] = trader_metrics['total_pnl'] / (trader_metrics['pnl_std'] + 1)

# Save trader metrics
trader_metrics.to_csv('csv_files/trader_metrics.csv', index=False)
print(f"Trader metrics calculated for {len(trader_metrics)} traders")
print("Saved: csv_files/trader_metrics.csv")

# Calculate daily sentiment metrics
print("Analyzing sentiment distribution...")
daily_sentiment_metrics = trades_aligned.groupby(['date', 'classification']).agg({
    'Closed PnL': 'sum',
    'Size USD': 'sum',
    'Trade ID': 'count',
    'Account': 'nunique'
}).reset_index()

daily_sentiment_metrics.columns = ['date', 'classification', 'daily_pnl',
                                    'daily_volume', 'trade_count', 'active_traders']

# Save daily sentiment metrics
daily_sentiment_metrics.to_csv('csv_files/daily_sentiment_metrics.csv', index=False)
print("Daily sentiment metrics calculated")
print("Saved: csv_files/daily_sentiment_metrics.csv")

# Display sentiment distribution
sentiment_dist = trades_aligned['classification'].value_counts()
print("Sentiment Distribution:")
for sentiment, count in sentiment_dist.items():
    pct = (count / len(trades_aligned)) * 100
    print(f"  {sentiment:15s}: {count:7,} trades ({pct:5.2f}%)")

# Create visualizations
print("Creating visualizations...")

# Visualization 1: Sentiment distribution pie chart
print("1. Creating sentiment distribution pie chart...")
fig, ax = plt.subplots(figsize=(10, 8))
colors = ['#e74c3c', '#f39c12', '#95a5a6', '#3498db', '#2ecc71']
sentiment_dist.plot(kind='pie', autopct='%1.1f%%', colors=colors, ax=ax)
ax.set_title('Sentiment Distribution Across All Trades', fontsize=16, fontweight='bold', pad=20)
ax.set_ylabel('')
plt.tight_layout()
plt.savefig('outputs/1_sentiment_distribution.png', dpi=300, bbox_inches='tight')
plt.close()
print("  Saved: outputs/1_sentiment_distribution.png")

# Visualization 2: Top 10 traders by PnL
print("2. Creating top traders chart...")
fig, ax = plt.subplots(figsize=(12, 8))
top10_traders = trader_metrics.nlargest(10, 'total_pnl')
trader_labels = [f"Trader {i+1}" for i in range(len(top10_traders))]
bars = ax.barh(trader_labels, top10_traders['total_pnl'].values, color='#3498db')
ax.set_xlabel('Total PnL ($)', fontsize=12, fontweight='bold')
ax.set_ylabel('Traders', fontsize=12, fontweight='bold')
ax.set_title('Top 10 Traders by Total PnL', fontsize=16, fontweight='bold', pad=20)
ax.grid(axis='x', alpha=0.3)
for bar, val in zip(bars, top10_traders['total_pnl'].values):
    ax.text(val, bar.get_y() + bar.get_height()/2, f'${val:,.0f}',
            va='center', ha='left', fontsize=10, fontweight='bold')
plt.tight_layout()
plt.savefig('outputs/2_top_traders_pnl.png', dpi=300, bbox_inches='tight')
plt.close()
print("  Saved: outputs/2_top_traders_pnl.png")

# Visualization 3: PnL by sentiment
print("3. Creating PnL by sentiment chart...")
fig, ax = plt.subplots(figsize=(12, 7))
sentiment_pnl = trades_aligned.groupby('classification')['Closed PnL'].sum().sort_values(ascending=False)
colors_map = {'Extreme Fear': '#e74c3c', 'Fear': '#f39c12', 'Neutral': '#95a5a6',
              'Greed': '#3498db', 'Extreme Greed': '#2ecc71'}
bar_colors = [colors_map.get(s, '#95a5a6') for s in sentiment_pnl.index]
bars = ax.bar(sentiment_pnl.index, sentiment_pnl.values, color=bar_colors,
              edgecolor='black', linewidth=1.5)
ax.set_ylabel('Total PnL ($)', fontsize=12, fontweight='bold')
ax.set_xlabel('Sentiment Category', fontsize=12, fontweight='bold')
ax.set_title('Total PnL by Market Sentiment', fontsize=16, fontweight='bold', pad=20)
ax.grid(axis='y', alpha=0.3)
ax.tick_params(axis='x', rotation=45)
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'${height:,.0f}', ha='center', va='bottom', fontsize=11, fontweight='bold')
plt.tight_layout()
plt.savefig('outputs/3_pnl_by_sentiment.png', dpi=300, bbox_inches='tight')
plt.close()
print("  Saved: outputs/3_pnl_by_sentiment.png")

# Visualization 4: Position size by sentiment
print("4. Creating position sizing chart...")
fig, ax = plt.subplots(figsize=(12, 7))
position_by_sentiment = trades_aligned.groupby('classification')['Size USD'].agg(['mean', 'median'])
position_by_sentiment = position_by_sentiment.sort_values('mean', ascending=False)
x = np.arange(len(position_by_sentiment))
width = 0.35
bars1 = ax.bar(x - width/2, position_by_sentiment['mean'], width,
               label='Mean', color='#3498db', alpha=0.8)
bars2 = ax.bar(x + width/2, position_by_sentiment['median'], width,
               label='Median', color='#e74c3c', alpha=0.8)
ax.set_ylabel('Position Size ($)', fontsize=12, fontweight='bold')
ax.set_xlabel('Sentiment Category', fontsize=12, fontweight='bold')
ax.set_title('Average Position Sizing by Market Sentiment', fontsize=16, fontweight='bold', pad=20)
ax.set_xticks(x)
ax.set_xticklabels(position_by_sentiment.index, rotation=45, ha='right')
ax.legend(fontsize=11)
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig('outputs/4_position_size_by_sentiment.png', dpi=300, bbox_inches='tight')
plt.close()
print("  Saved: outputs/4_position_size_by_sentiment.png")

# Visualization 5: Daily volume time series
print("5. Creating daily volume time series...")
fig, ax = plt.subplots(figsize=(14, 6))
daily_volume = trades_aligned.groupby('date')['Size USD'].sum()
ax.plot(daily_volume.index, daily_volume.values, linewidth=2, color='#3498db', alpha=0.7)
ax.fill_between(daily_volume.index, daily_volume.values, alpha=0.3, color='#3498db')
ax.set_xlabel('Date', fontsize=12, fontweight='bold')
ax.set_ylabel('Daily Volume ($)', fontsize=12, fontweight='bold')
ax.set_title('Daily Trading Volume Over Time', fontsize=16, fontweight='bold', pad=20)
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('outputs/5_daily_volume_timeseries.png', dpi=300, bbox_inches='tight')
plt.close()
print("  Saved: outputs/5_daily_volume_timeseries.png")

# Visualization 6: Sentiment timeline
print("6. Creating sentiment timeline...")
fig, ax = plt.subplots(figsize=(14, 7))
sentiment_daily = trades_aligned.groupby(['date', 'classification']).size().unstack(fill_value=0)
sentiment_daily.plot(kind='area', stacked=True, ax=ax,
                     color=['#e74c3c', '#f39c12', '#95a5a6', '#3498db', '#2ecc71'],
                     alpha=0.7)
ax.set_xlabel('Date', fontsize=12, fontweight='bold')
ax.set_ylabel('Number of Trades', fontsize=12, fontweight='bold')
ax.set_title('Trading Activity Timeline by Sentiment', fontsize=16, fontweight='bold', pad=20)
ax.legend(title='Sentiment', fontsize=10, title_fontsize=11)
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('outputs/6_sentiment_timeline.png', dpi=300, bbox_inches='tight')
plt.close()
print("  Saved: outputs/6_sentiment_timeline.png")

# Visualization 7: Position size distribution
print("7. Creating position size distribution...")
fig, ax = plt.subplots(figsize=(12, 7))
position_cap = trades_aligned['Size USD'].quantile(0.99)
filtered_positions = trades_aligned[trades_aligned['Size USD'] <= position_cap]['Size USD']
ax.hist(filtered_positions, bins=50, color='#3498db', alpha=0.7, edgecolor='black')
ax.set_xlabel('Position Size ($)', fontsize=12, fontweight='bold')
ax.set_ylabel('Frequency', fontsize=12, fontweight='bold')
ax.set_title('Distribution of Position Sizes (up to 99th percentile)',
             fontsize=16, fontweight='bold', pad=20)
ax.grid(axis='y', alpha=0.3)
mean_pos = filtered_positions.mean()
median_pos = filtered_positions.median()
ax.axvline(mean_pos, color='red', linestyle='--', linewidth=2,
           label=f'Mean: ${mean_pos:,.2f}')
ax.axvline(median_pos, color='green', linestyle='--', linewidth=2,
           label=f'Median: ${median_pos:,.2f}')
ax.legend(fontsize=11)
plt.tight_layout()
plt.savefig('outputs/7_position_size_distribution.png', dpi=300, bbox_inches='tight')
plt.close()
print("  Saved: outputs/7_position_size_distribution.png")

# Visualization 8: Trader performance scatter
print("8. Creating trader performance scatter...")
fig, ax = plt.subplots(figsize=(12, 8))
scatter = ax.scatter(trader_metrics['total_volume'],
                    trader_metrics['total_pnl'],
                    s=trader_metrics['total_trades']/50,
                    c=trader_metrics['total_pnl'],
                    cmap='RdYlGn',
                    alpha=0.6,
                    edgecolors='black',
                    linewidth=1)
ax.set_xlabel('Total Volume ($)', fontsize=12, fontweight='bold')
ax.set_ylabel('Total PnL ($)', fontsize=12, fontweight='bold')
ax.set_title('Trader Performance: Volume vs Profitability',
             fontsize=16, fontweight='bold', pad=20)
ax.grid(True, alpha=0.3)
cbar = plt.colorbar(scatter, ax=ax)
cbar.set_label('Total PnL ($)', fontsize=11, fontweight='bold')
ax.axhline(y=0, color='red', linestyle='--', linewidth=1, alpha=0.5)
plt.tight_layout()
plt.savefig('outputs/8_trader_performance_scatter.png', dpi=300, bbox_inches='tight')
plt.close()
print("  Saved: outputs/8_trader_performance_scatter.png")

# Print summary
print("ANALYSIS COMPLETE")
print("CSV Files Created (csv_files/):")
print("  1. cleaned_fear_greed_index.csv")
print("  2. cleaned_trading_data.csv")
print("  3. aligned_trades_sentiment.csv")
print("  4. trader_metrics.csv")
print("  5. daily_sentiment_metrics.csv")
print("Visualizations Created (outputs/):")
print("  1. sentiment_distribution.png")
print("  2. top_traders_pnl.png")
print("  3. pnl_by_sentiment.png")
print("  4. position_size_by_sentiment.png")
print("  5. daily_volume_timeseries.png")
print("  6. sentiment_timeline.png")
print("  7. position_size_distribution.png")
print("  8. trader_performance_scatter.png")
print("Ready for submission")

Loading raw data...
Fear & Greed Index loaded: 2644 records
Historical Trading Data loaded: 211,224 records
Standardizing data types...
Data types standardized
Handling missing values...
Fear & Greed cleaned: 2644 records
Trading data cleaned: 211,224 records (removed 0)
Aligning timestamps and merging datasets...
Aligned dataset: 211,218 records
Date range: 2023-05-01 00:00:00 to 2025-05-01 00:00:00
Saving cleaned data...
All cleaned data saved to csv_files/
STEP 3: EXPLORATORY DATA ANALYSIS
Calculating trader metrics...
Trader metrics calculated for 32 traders
Saved: csv_files/trader_metrics.csv
Analyzing sentiment distribution...
Daily sentiment metrics calculated
Saved: csv_files/daily_sentiment_metrics.csv
Sentiment Distribution:
  Fear           :  61,837 trades (29.28%)
  Greed          :  50,303 trades (23.82%)
  Extreme Greed  :  39,992 trades (18.93%)
  Neutral        :  37,686 trades (17.84%)
  Extreme Fear   :  21,400 trades (10.13%)
Creating visualizations...
1. Creating s

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from statsmodels.tsa.stattools import grangercausalitytests, adfuller
from scipy import signal
import warnings
warnings.filterwarnings('ignore')

# Load cleaned data from previous steps
print("Loading cleaned data...")
trades_aligned = pd.read_csv('csv_files/aligned_trades_sentiment.csv')
trades_aligned['date'] = pd.to_datetime(trades_aligned['date'])
trader_metrics = pd.read_csv('csv_files/trader_metrics.csv')
print(f"Loaded {len(trades_aligned):,} trades")
print(f"Loaded {len(trader_metrics)} trader profiles")

print("SENTIMENT-BEHAVIOR CORRELATION")

# Calculate daily aggregated performance by sentiment
print("Profitability Analysis by Sentiment")
daily_perf = trades_aligned.groupby(['date', 'classification']).agg({
    'Closed PnL': ['sum', 'mean', 'count'],
    'Size USD': ['sum', 'mean'],
    'Fee': 'sum',
    'Account': 'nunique'
}).reset_index()

daily_perf.columns = ['date', 'classification', 'total_pnl', 'avg_pnl', 'trade_count',
                      'total_volume', 'avg_position', 'total_fees', 'active_traders']

# Compare Fear vs Greed profitability
fear_profit = daily_perf[daily_perf['classification'] == 'Fear']['total_pnl']
greed_profit = daily_perf[daily_perf['classification'] == 'Greed']['total_pnl']

print("Fear Periods:")
print(f"  Avg Daily PnL: ${fear_profit.mean():,.2f}")
print(f"  Median Daily PnL: ${fear_profit.median():,.2f}")
print(f"  Std Dev: ${fear_profit.std():,.2f}")

print("Greed Periods:")
print(f"  Avg Daily PnL: ${greed_profit.mean():,.2f}")
print(f"  Median Daily PnL: ${greed_profit.median():,.2f}")
print(f"  Std Dev: ${greed_profit.std():,.2f}")

# Statistical test: T-test
t_stat, p_value = stats.ttest_ind(fear_profit.dropna(), greed_profit.dropna())
print("T-Test (Fear vs Greed):")
print(f"  t-statistic: {t_stat:.4f}")
print(f"  p-value: {p_value:.6f}")
if p_value < 0.05:
    print("  SIGNIFICANT difference (p < 0.05)")
else:
    print("  Not significant (p >= 0.05)")

# Save correlation results
correlation_results = daily_perf.groupby('classification').agg({
    'total_pnl': ['mean', 'median', 'std'],
    'total_volume': 'mean',
    'avg_position': 'mean',
    'active_traders': 'mean'
}).round(2)
correlation_results.to_csv('csv_files/sentiment_correlation_results.csv')
print("Saved: csv_files/sentiment_correlation_results.csv")

# Volume patterns before sentiment shifts
print(" Volume Patterns Before Sentiment Shifts")
daily_sentiment = trades_aligned.groupby('date')['classification'].first().reset_index()
daily_sentiment['sentiment_shift'] = daily_sentiment['classification'] != daily_sentiment['classification'].shift(1)
shift_dates = daily_sentiment[daily_sentiment['sentiment_shift']]['date'].values

# Calculate volume 3 days before each shift
volume_before_shift = []
for shift_date in shift_dates:
    shift_dt = pd.to_datetime(shift_date)
    pre_period = trades_aligned[
        (trades_aligned['date'] >= shift_dt - pd.Timedelta(days=3)) &
        (trades_aligned['date'] < shift_dt)
    ]['Size USD'].sum()
    volume_before_shift.append(pre_period)

normal_volume = trades_aligned.groupby('date')['Size USD'].sum().mean()
avg_pre_shift_volume = np.mean(volume_before_shift) if volume_before_shift else 0

print("Volume Analysis:")
print(f"  Normal Daily Volume: ${normal_volume:,.2f}")
print(f"  Avg Volume 3-days Before Shift: ${avg_pre_shift_volume:,.2f}")
print(f"  Ratio: {avg_pre_shift_volume/normal_volume:.2f}x")

if avg_pre_shift_volume > normal_volume * 1.2:
    print("  VOLUME SURGE detected before sentiment shifts (+20%)")
else:
    print("  No significant volume change before shifts")

# Granger causality test
print("Advanced: Granger Causality Test")
ts_data = trades_aligned.groupby('date').agg({
    'Closed PnL': 'sum',
    'value': 'first'
}).reset_index()
ts_data.columns = ['date', 'daily_pnl', 'sentiment_value']

# Check stationarity
adf_pnl = adfuller(ts_data['daily_pnl'].dropna())
adf_sentiment = adfuller(ts_data['sentiment_value'].dropna())

print("Stationarity Tests:")
print(f"  PnL: ADF={adf_pnl[0]:.4f}, p-value={adf_pnl[1]:.4f}")
print(f"  Sentiment: ADF={adf_sentiment[0]:.4f}, p-value={adf_sentiment[1]:.4f}")

# Difference if needed for stationarity
if adf_pnl[1] > 0.05:
    ts_data['daily_pnl'] = ts_data['daily_pnl'].diff()
if adf_sentiment[1] > 0.05:
    ts_data['sentiment_value'] = ts_data['sentiment_value'].diff()

ts_data = ts_data.dropna()

# Run Granger test
if len(ts_data) > 50:
    try:
        gc_result = grangercausalitytests(ts_data[['daily_pnl', 'sentiment_value']],
                                          maxlag=5, verbose=False)

        print("Granger Causality Results:")
        for lag in [1, 2, 3]:
            p_val = gc_result[lag][0]['ssr_ftest'][1]
            causal = "YES" if p_val < 0.05 else "NO"
            print(f"  Lag {lag}: p-value={p_val:.4f} | Causal: {causal}")

        best_lag = min(gc_result.keys(), key=lambda x: gc_result[x][0]['ssr_ftest'][1])
        print(f"  Best predictive lag: {best_lag} days")
    except Exception as e:
        print(f"  Note: {str(e)[:80]}")

# Cross-correlation analysis
print(" Cross-Correlation (Lead-Lag Analysis)")

def calculate_cross_corr(series1, series2, max_lag=10):
    s1 = (series1 - np.mean(series1)) / (np.std(series1) * len(series1))
    s2 = (series2 - np.mean(series2)) / np.std(series2)
    ccf = np.correlate(s1, s2, 'full')
    lags = signal.correlation_lags(len(series1), len(series2))
    center = len(ccf) // 2
    return lags[center-max_lag:center+max_lag+1], ccf[center-max_lag:center+max_lag+1]

ts_fresh = trades_aligned.groupby('date').agg({
    'Closed PnL': 'sum',
    'value': 'first'
}).reset_index()

lags, ccf = calculate_cross_corr(ts_fresh['value'].values,
                                  ts_fresh['Closed PnL'].values,
                                  max_lag=7)

max_corr_idx = np.argmax(np.abs(ccf))
optimal_lag = lags[max_corr_idx]
max_corr = ccf[max_corr_idx]

print("Cross-Correlation Results:")
print(f"  Maximum correlation: {max_corr:.4f} at lag {optimal_lag} days")
if optimal_lag < 0:
    print(f"  Sentiment LEADS profitability by {abs(optimal_lag)} days")
elif optimal_lag > 0:
    print(f"  Profitability LEADS sentiment by {optimal_lag} days")
else:
    print(f"  Contemporaneous relationship (same day)")

print(" PATTERN DISCOVERY & INSIGHTS")

# K-Means clustering
print("K-Means Clustering (Basic)")
features_df = trader_metrics[['total_pnl', 'avg_position_size', 'total_trades',
                               'total_volume', 'profit_factor']].fillna(0)

scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_df)

kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
trader_metrics['cluster_kmeans'] = kmeans.fit_predict(features_scaled)

print("K-Means clustering complete")
print("Cluster Distribution:")
print(trader_metrics['cluster_kmeans'].value_counts().sort_index())

cluster_profiles = trader_metrics.groupby('cluster_kmeans').agg({
    'total_pnl': ['mean', 'sum'],
    'total_trades': 'mean',
    'avg_position_size': 'mean',
    'total_volume': 'mean',
    'Account': 'count'
}).round(2)

print("Cluster Profiles:")
print(cluster_profiles)

# Characterize each cluster
for cluster_id in range(3):
    cluster_data = trader_metrics[trader_metrics['cluster_kmeans'] == cluster_id]
    avg_trades = cluster_data['total_trades'].mean()
    avg_pos = cluster_data['avg_position_size'].mean()

    if avg_trades > 15000:
        profile = "HIGH FREQUENCY SCALPERS"
    elif avg_pos > 15000:
        profile = "WHALE TRADERS (Large Positions)"
    else:
        profile = "BALANCED TRADERS"

    print(f"  Cluster {cluster_id}: {profile}")
    print(f"    Members: {len(cluster_data)}")
    print(f"    Avg PnL: ${cluster_data['total_pnl'].mean():,.2f}")

# DBSCAN clustering
print("DBSCAN Clustering (Advanced - Density-based)")
dbscan = DBSCAN(eps=0.5, min_samples=2)
trader_metrics['cluster_dbscan'] = dbscan.fit_predict(features_scaled)

n_clusters = len(set(trader_metrics['cluster_dbscan'])) - (1 if -1 in trader_metrics['cluster_dbscan'] else 0)
n_outliers = (trader_metrics['cluster_dbscan'] == -1).sum()

print("DBSCAN clustering complete")
print(f"  Clusters found: {n_clusters}")
print(f"  Outliers detected: {n_outliers}")

if n_outliers > 0:
    outliers = trader_metrics[trader_metrics['cluster_dbscan'] == -1]
    print("  Outlier traders (exceptional performance):")
    for idx, row in outliers.nlargest(3, 'total_pnl').iterrows():
        print(f"    Account {row['Account'][:12]}...: ${row['total_pnl']:,.2f}")

# PCA analysis
print(" PCA Analysis (Advanced)")
pca = PCA(n_components=2)
pca_features = pca.fit_transform(features_scaled)
trader_metrics['pca1'] = pca_features[:, 0]
trader_metrics['pca2'] = pca_features[:, 1]

explained_var = pca.explained_variance_ratio_
print("PCA complete")
print(f"  PC1 explains: {explained_var[0]*100:.2f}% variance")
print(f"  PC2 explains: {explained_var[1]*100:.2f}% variance")
print(f"  Total explained: {sum(explained_var)*100:.2f}%")

# Profitable trader behavior extraction
print(" Profitable Trader Patterns")
profit_threshold = trader_metrics['total_pnl'].quantile(0.75)
trader_metrics['is_profitable'] = trader_metrics['total_pnl'] > profit_threshold

profitable_traders = trader_metrics[trader_metrics['is_profitable']]
unprofitable_traders = trader_metrics[~trader_metrics['is_profitable']]

print("Comparison: Profitable vs Others")
print(f"  Profitable Traders: {len(profitable_traders)}")
print(f"  Others: {len(unprofitable_traders)}")

comparison_metrics = {
    'Avg Position Size': [profitable_traders['avg_position_size'].mean(),
                          unprofitable_traders['avg_position_size'].mean()],
    'Avg Trade Count': [profitable_traders['total_trades'].mean(),
                        unprofitable_traders['total_trades'].mean()],
    'Avg Volume': [profitable_traders['total_volume'].mean(),
                   unprofitable_traders['total_volume'].mean()],
    'Profit Factor': [profitable_traders['profit_factor'].mean(),
                      unprofitable_traders['profit_factor'].mean()]
}

comparison_df = pd.DataFrame(comparison_metrics,
                             index=['Profitable (Top 25%)', 'Others']).T
print(comparison_df)

# Statistical significance tests
print(" Statistical Significance Tests")

# Test 1: Mann-Whitney U test
stat_pos, p_pos = stats.mannwhitneyu(
    profitable_traders['avg_position_size'],
    unprofitable_traders['avg_position_size']
)
print("Position Size Difference:")
print(f"  Mann-Whitney U: {stat_pos:.2f}")
print(f"  p-value: {p_pos:.6f}")
print(f"  Result: {'SIGNIFICANT' if p_pos < 0.05 else 'Not Significant'}")

# Test 2: Chi-Square test
sentiment_by_profit = pd.crosstab(
    trades_aligned['Account'].isin(profitable_traders['Account']),
    trades_aligned['classification']
)
chi2, p_chi2, dof, expected = stats.chi2_contingency(sentiment_by_profit)
print("Sentiment Association with Profitability:")
print(f"  Chi-Square: {chi2:.2f}")
print(f"  p-value: {p_chi2:.6f}")
print(f"  Result: {'SIGNIFICANT' if p_chi2 < 0.05 else 'Not Significant'}")

# Test 3: ANOVA
clusters_list = [trader_metrics[trader_metrics['cluster_kmeans'] == i]['total_pnl']
                 for i in range(3)]
f_stat, p_anova = stats.f_oneway(*clusters_list)
print("Cluster Performance Difference:")
print(f"  F-statistic: {f_stat:.2f}")
print(f"  p-value: {p_anova:.6f}")
print(f"  Result: {'SIGNIFICANT' if p_anova < 0.05 else 'Not Significant'}")

# Behavioral patterns by sentiment
print(" Behavioral Patterns in Different Market Conditions")
trades_with_profit = trades_aligned.merge(
    trader_metrics[['Account', 'is_profitable']],
    on='Account',
    how='left'
)

behavior_patterns = trades_with_profit.groupby(['classification', 'is_profitable']).agg({
    'Size USD': 'mean',
    'Trade ID': 'count',
    'Closed PnL': 'mean'
}).round(2)

print("Average Position Size by Sentiment & Profitability:")
print(behavior_patterns['Size USD'].unstack())

print("Trade Frequency by Sentiment & Profitability:")
print(behavior_patterns['Trade ID'].unstack())

# Save results
print("Saving results...")
trader_metrics.to_csv('csv_files/trader_metrics_clustered.csv', index=False)
print("Saved: csv_files/trader_metrics_clustered.csv")

behavior_patterns.to_csv('csv_files/behavioral_patterns.csv')
print("Saved: csv_files/behavioral_patterns.csv")

comparison_df.to_csv('csv_files/profitable_comparison.csv')
print("Saved: csv_files/profitable_comparison.csv")

# Create visualizations
print("Creating visualizations...")

# Visualization 1: PnL by sentiment box plot
print("1. Creating sentiment profitability boxplot...")
fig, ax = plt.subplots(figsize=(12, 7))
sentiment_order = ['Extreme Fear', 'Fear', 'Neutral', 'Greed', 'Extreme Greed']
sns.boxplot(data=daily_perf, x='classification', y='total_pnl',
            order=sentiment_order, palette='Set2', ax=ax)
ax.set_xlabel('Sentiment Category', fontsize=12, fontweight='bold')
ax.set_ylabel('Daily Total PnL ($)', fontsize=12, fontweight='bold')
ax.set_title('Profitability Distribution by Market Sentiment',
             fontsize=16, fontweight='bold', pad=20)
ax.tick_params(axis='x', rotation=45)
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig('outputs/9_sentiment_profitability_boxplot.png', dpi=300, bbox_inches='tight')
plt.close()
print("  Saved: outputs/9_sentiment_profitability_boxplot.png")

# Visualization 2: Cluster visualization with PCA
print("2. Creating trader clusters PCA plot...")
fig, ax = plt.subplots(figsize=(12, 8))
colors = ['#e74c3c', '#3498db', '#2ecc71']
for i in range(3):
    cluster_data = trader_metrics[trader_metrics['cluster_kmeans'] == i]
    ax.scatter(cluster_data['pca1'], cluster_data['pca2'],
              c=colors[i], label=f'Cluster {i}', s=100, alpha=0.6, edgecolors='black')
ax.set_xlabel(f'PC1 ({explained_var[0]*100:.1f}% variance)', fontsize=12, fontweight='bold')
ax.set_ylabel(f'PC2 ({explained_var[1]*100:.1f}% variance)', fontsize=12, fontweight='bold')
ax.set_title('Trader Clusters (PCA Visualization)', fontsize=16, fontweight='bold', pad=20)
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('outputs/10_trader_clusters_pca.png', dpi=300, bbox_inches='tight')
plt.close()
print("  Saved: outputs/10_trader_clusters_pca.png")

# Visualization 3: Profitable vs others comparison
print("3. Creating profitable comparison chart...")
fig, ax = plt.subplots(figsize=(12, 7))
comparison_df.plot(kind='bar', ax=ax, color=['#2ecc71', '#e74c3c'], alpha=0.8)
ax.set_xlabel('Metrics', fontsize=12, fontweight='bold')
ax.set_ylabel('Value', fontsize=12, fontweight='bold')
ax.set_title('Profitable Traders vs Others: Key Metrics Comparison',
             fontsize=16, fontweight='bold', pad=20)
ax.legend(['Profitable (Top 25%)', 'Others'], fontsize=11)
ax.tick_params(axis='x', rotation=45)
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig('outputs/11_profitable_comparison.png', dpi=300, bbox_inches='tight')
plt.close()
print("  Saved: outputs/11_profitable_comparison.png")

# Visualization 4: Volume before sentiment shifts
print("4. Creating volume sentiment shift chart...")
fig, ax = plt.subplots(figsize=(12, 7))
categories = ['Normal Days', '3-Days Before Sentiment Shift']
volumes = [normal_volume, avg_pre_shift_volume]
bars = ax.bar(categories, volumes, color=['#3498db', '#e74c3c'], alpha=0.8,
              edgecolor='black', linewidth=2)
ax.set_ylabel('Average Daily Volume ($)', fontsize=12, fontweight='bold')
ax.set_title('Trading Volume: Normal vs Pre-Sentiment Shift',
             fontsize=16, fontweight='bold', pad=20)
ax.grid(axis='y', alpha=0.3)
for bar, vol in zip(bars, volumes):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'${vol:,.0f}', ha='center', va='bottom', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.savefig('outputs/12_volume_sentiment_shift.png', dpi=300, bbox_inches='tight')
plt.close()
print("  Saved: outputs/12_volume_sentiment_shift.png")

print("STEPS 4 & 5 COMPLETE")
print("New CSV Files:")
print("  - sentiment_correlation_results.csv")
print("  - trader_metrics_clustered.csv")
print("  - behavioral_patterns.csv")
print("  - profitable_comparison.csv")
print("New Visualizations:")
print("  - 9_sentiment_profitability_boxplot.png")
print("  - 10_trader_clusters_pca.png")
print("  - 11_profitable_comparison.png")
print("  - 12_volume_sentiment_shift.png")

Loading cleaned data...
Loaded 211,218 trades
Loaded 32 trader profiles
SENTIMENT-BEHAVIOR CORRELATION
Profitability Analysis by Sentiment
Fear Periods:
  Avg Daily PnL: $36,891.82
  Median Daily PnL: $1,412.31
  Std Dev: $96,611.85
Greed Periods:
  Avg Daily PnL: $11,140.57
  Median Daily PnL: $678.48
  Std Dev: $62,427.96
T-Test (Fear vs Greed):
  t-statistic: 2.6983
  p-value: 0.007389
  SIGNIFICANT difference (p < 0.05)
Saved: csv_files/sentiment_correlation_results.csv
 Volume Patterns Before Sentiment Shifts
Volume Analysis:
  Normal Daily Volume: $2,486,636.27
  Avg Volume 3-days Before Shift: $9,400,344.29
  Ratio: 3.78x
  VOLUME SURGE detected before sentiment shifts (+20%)
Advanced: Granger Causality Test
Stationarity Tests:
  PnL: ADF=-4.0488, p-value=0.0012
  Sentiment: ADF=-3.1799, p-value=0.0212
Granger Causality Results:
  Lag 1: p-value=0.0597 | Causal: NO
  Lag 2: p-value=0.0372 | Causal: YES
  Lag 3: p-value=0.1110 | Causal: NO
  Best predictive lag: 2 days
 Cross-Cor