# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind, mannwhitneyu, skew, kurtosis
from statsmodels.tsa.stattools import adfuller, grangercausalitytests
from statsmodels.tsa.api import VAR
from statsmodels.stats.multitest import multipletests
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
import warnings
# Notebook-wide display settings: silence all warnings, use seaborn's
# "whitegrid" theme, and make 10x5 inches the default figure size.
warnings.filterwarnings(action="ignore")
sns.set_style("whitegrid")
plt.rcParams.update({"figure.figsize": (10, 5)})

# Paths

In [None]:
# Locations of the two input CSV files: the Fear & Greed index export and
# the historical trade log.
fear_greed_path, trades_path = (
    "/content/fear_greed_index.csv",
    "/content/historical_data.csv",
)

# Load and parse Fear & Greed / BTC data

In [None]:
# Read both raw CSVs as strings so that nothing is re-typed implicitly by the
# parser; every column used later is converted explicitly.
fg = pd.read_csv(fear_greed_path, dtype=str)
trades = pd.read_csv(trades_path, dtype=str)
fg.columns = [c.strip() for c in fg.columns]

# Primary date source: the textual 'date' column, parsed day-first.
# When the column is absent we start from an all-NaT series instead of
# failing with a KeyError further down.
if 'date' in fg.columns:
    fg_dates = pd.to_datetime(fg['date'], dayfirst=True, errors='coerce')
else:
    fg_dates = pd.Series(pd.NaT, index=fg.index, dtype='datetime64[ns]')

# Fallback date source: 'timestamp' interpreted as epoch seconds in UTC and
# converted to tz-naive. It only fills rows whose 'date' could not be parsed.
# errors='coerce' turns out-of-range values into NaT instead of raising.
if 'timestamp' in fg.columns:
    epoch_seconds = pd.to_numeric(fg['timestamp'], errors='coerce')
    ts_dates = pd.to_datetime(
        epoch_seconds, unit='s', errors='coerce', utc=True
    ).dt.tz_convert(None)
    fg_dates = fg_dates.fillna(ts_dates)

# Calendar day (midnight) used as the join key with the daily trade metrics.
fg['date'] = fg_dates.dt.normalize()

# Sentiment label (title-cased) and numeric index value; NaN when the source
# column does not exist.
fg['fg_class'] = fg['classification'].str.strip().str.title() if 'classification' in fg.columns else np.nan
fg['fg_value'] = pd.to_numeric(fg['value'], errors='coerce') if 'value' in fg.columns else np.nan

# Take the first column whose name contains a price-like keyword as the BTC
# price; NaN when no such column exists.
price_cols = [c for c in fg.columns if any(k in c.lower() for k in ['price','close','close_usd','btc'])]
fg['btc_price'] = pd.to_numeric(fg[price_cols[0]], errors='coerce') if price_cols else np.nan

# Keep only the tidy columns and drop rows without a usable date.
fg = fg[['date','fg_value','fg_class','btc_price']].dropna(subset=['date']).reset_index(drop=True)

# Load and parse trades

In [None]:
# Normalise column names once: trimmed, lower-case, with spaces and dots
# replaced by underscores (e.g. "Size USD" -> "size_usd").
trades.columns = [
    c.strip().replace(' ', '_').replace('.', '_').lower() for c in trades.columns
]

# Pick the first column with 'timestamp' in its name that yields at least one
# parseable datetime.
time_cols = [c for c in trades.columns if 'timestamp' in c]
time_col = None
for c in time_cols:
    parsed = pd.to_datetime(trades[c], dayfirst=True, errors='coerce')
    if parsed.notna().any():
        time_col = c
        break
if time_col is None:
    # Fail with an explicit message instead of a NameError on the next line.
    raise ValueError(
        "No parseable timestamp column found in trades; "
        f"candidates were: {time_cols}"
    )

# Rewrite a dot between a 1-2 digit group and a 2 digit group as a colon
# (e.g. "22.50" -> "22:50") before parsing.
raw_times = trades[time_col].astype(str).str.replace(r'(\d{1,2})\.(\d{2})(?!\d)', r'\1:\2', regex=True)
parsed_times = pd.to_datetime(raw_times, dayfirst=True, errors='coerce')
# Treat the parsed wall-clock times as Asia/Kolkata and convert them to
# tz-naive UTC; ambiguous or non-existent local times become NaT.
parsed_times = parsed_times.dt.tz_localize('Asia/Kolkata', ambiguous='NaT', nonexistent='NaT').dt.tz_convert(None)
# If more than half of the rows failed, fall back to a plain parse of the
# untouched column (no day-first hint, no timezone conversion).
if parsed_times.isna().sum() > 0.5 * len(parsed_times):
    parsed_times = pd.to_datetime(trades[time_col], errors='coerce')
trades['trade_datetime'] = parsed_times
trades['trade_date'] = pd.to_datetime(trades['trade_datetime']).dt.normalize()

# Convert the numeric columns that are present, removing thousands separators
# first; unparseable values become NaN.
num_cols = ['execution_price','size_tokens','size_usd','closed_pnl','fee','leverage']
for col in num_cols:
    if col in trades.columns:
        trades[col] = pd.to_numeric(trades[col].astype(str).str.replace(',',''), errors='coerce')

# Canonical forms for the categorical identifiers.
if 'side' in trades.columns:
    trades['side'] = trades['side'].str.strip().str.upper()
if 'account' in trades.columns:
    trades['account'] = trades['account'].astype(str).str.strip()

# Feature engineering

In [None]:
# Absolute notional per trade; used for volume and as the return denominator.
trades['size_usd_abs'] = trades['size_usd'].abs()

# PnL per dollar of notional. Dividing by the absolute size keeps the sign of
# the ratio equal to the sign of the PnL even if a size is recorded as
# negative; zero-size trades yield NaN rather than +/-inf.
pnl_ratio = trades['closed_pnl'] / trades['size_usd_abs'].replace({0: np.nan})
trades['return_pct'] = pnl_ratio
trades['pnl_per_usd'] = pnl_ratio

# Daily aggregation

In [None]:
# Per-day trade metrics. win_rate is the share of trades with positive closed
# PnL among trades whose PnL is known (NaN PnL is excluded, matching how the
# mean/std aggregations skip NaN).
daily_spec = {
    'total_volume_usd': ('size_usd_abs', 'sum'),
    'mean_closed_pnl': ('closed_pnl', 'mean'),
    'pnl_std': ('closed_pnl', 'std'),
    'mean_return_pct': ('return_pct', 'mean'),
    'win_rate': ('closed_pnl', lambda x: (x.dropna() > 0).mean()),
}
if 'leverage' in trades.columns:
    daily_spec['median_leverage'] = ('leverage', 'median')
daily = (
    trades.groupby('trade_date')
    .agg(**daily_spec)
    .reset_index()
    .rename(columns={'trade_date': 'date'})
)
# Keep the column available for later cells, but as NaN rather than a trade
# count stored under a leverage name.
if 'median_leverage' not in daily.columns:
    daily['median_leverage'] = np.nan

# One sentiment row per calendar day, left-joined onto the trading days.
fg_daily = fg[['date','fg_value','fg_class','btc_price']].drop_duplicates(subset=['date'])
merged = pd.merge(daily, fg_daily, on='date', how='left').sort_values('date').reset_index(drop=True)

# Carry the last known sentiment/price forward over days without an index
# reading. Only the sentiment columns are filled: the trade metrics are left
# untouched so that a missing value is not replaced by the previous day's.
sentiment_cols = ['fg_value', 'fg_class', 'btc_price']
merged[sentiment_cols] = merged[sentiment_cols].ffill()

# Day-over-day BTC return; the first row (no previous price) is set to 0.
if merged['btc_price'].notna().any():
    merged['btc_ret'] = merged['btc_price'].pct_change().fillna(0)

# Visualization: Sentiment trend

In [None]:
# Fear & Greed index value over time, one coloured line per sentiment class.
_, trend_ax = plt.subplots(figsize=(12, 5))
sns.lineplot(
    data=fg,
    x='date',
    y='fg_value',
    hue='fg_class',
    palette='coolwarm',
    ax=trend_ax,
)
trend_ax.set_title("Fear & Greed Index Over Time")
plt.show()

# Visualization: Trader metrics vs sentiment

In [None]:
# Two stacked panels sharing the date axis: daily volume on top, mean closed
# PnL below.
fig, axes = plt.subplots(2, 1, figsize=(12, 8), sharex=True)
panels = (
    ('total_volume_usd', "Total Trading Volume Over Time"),
    ('mean_closed_pnl', "Mean Closed PnL Over Time"),
)
for panel_ax, (column, title) in zip(axes, panels):
    sns.lineplot(data=merged, x='date', y=column, ax=panel_ax)
    panel_ax.set_title(title)
plt.show()

# Distribution comparison Fear vs Greed

In [None]:
# One box plot per daily metric, grouped by sentiment class. Metrics that are
# not present in `merged` are skipped.
metrics_to_plot = ['total_volume_usd','mean_closed_pnl','mean_return_pct','win_rate']
for metric in metrics_to_plot:
    if metric not in merged.columns:
        continue
    _, box_ax = plt.subplots()
    sns.boxplot(data=merged, x='fg_class', y=metric, palette='coolwarm', ax=box_ax)
    box_ax.set_title(f"{metric} by Sentiment Phase")
    plt.show()

# Event study visualization

In [None]:
# Sentiment class per day in chronological order, so that "previous day"
# below really is the previous observation.
fg_series = fg.set_index('date')['fg_class'].dropna().sort_index()

# A regime change is a day whose class differs from the previous observation.
# The first observation has no predecessor and is therefore not an event.
previous_class = fg_series.shift(1)
regime_changes = fg_series[(fg_series != previous_class) & previous_class.notna()].index

k = 7  # half-width of the event window, in days
# Build the date-indexed frame once; label slicing needs a sorted index.
merged_by_date = merged.set_index('date').sort_index()
events = []
for d in regime_changes:
    window = merged_by_date.loc[d - pd.Timedelta(days=k) : d + pd.Timedelta(days=k)].reset_index()
    if not window.empty:
        # Signed distance in days from the regime change (0 = switch day).
        window['offset'] = (window['date'] - d).dt.days
        window['event_date'] = d
        events.append(window[['event_date','offset','total_volume_usd','mean_closed_pnl']])
if events:
    event_df = pd.concat(events, ignore_index=True)
    plt.figure()
    # seaborn aggregates across events at each offset (mean with a CI band).
    sns.lineplot(data=event_df, x='offset', y='total_volume_usd')
    plt.axvline(0, color='red', linestyle='--')
    plt.title("Event Study: Volume Around Sentiment Switch")
    plt.show()

# Clustering visualization

In [None]:
# Segment accounts with k-means on standardised per-account activity and
# performance features, then plot volume against average PnL by cluster.
if 'account' in trades.columns:
    # win_rate: share of positive-PnL trades among trades with a known PnL.
    acct = trades.groupby('account').agg(
        total_volume = ('size_usd_abs', 'sum'),
        trade_count = ('size_usd', 'count'),
        avg_trade_size = ('size_usd_abs', 'mean'),
        avg_pnl = ('closed_pnl','mean'),
        win_rate = ('closed_pnl', lambda x: (x.dropna() > 0).mean())
    ).reset_index().fillna(0)

    # KMeans needs n_clusters <= number of samples, so cap the requested 4
    # clusters at the number of accounts and skip the step when there are none.
    n_clusters = min(4, len(acct))
    if n_clusters > 0:
        X = acct[['total_volume','trade_count','avg_trade_size','avg_pnl','win_rate']].values
        # Standardise so that large-magnitude features (volume) do not
        # dominate the Euclidean distances.
        Xs = StandardScaler().fit_transform(X)
        km = KMeans(n_clusters=n_clusters, random_state=42).fit(Xs)
        acct['cluster'] = km.labels_
        plt.figure()
        sns.scatterplot(data=acct, x='total_volume', y='avg_pnl', hue='cluster', palette='tab10')
        plt.title("Trader Segmentation Clusters")
        plt.show()

# Feature importance visualization

In [None]:
# Fit a random forest that predicts the day's sentiment phase (greed vs. not
# greed) from the previous day's 3-day rolling volume and PnL, and plot the
# resulting feature importances.
dfp = merged.copy()

# Days without a sentiment label (e.g. trading days before the first index
# reading) cannot be used as training targets.
has_label = dfp['fg_class'].notna()

# 1 when the class name contains "greed", else 0. Casting to str first keeps
# this working when the column holds NaN; those rows get 0 here but are
# excluded from the model below.
dfp['fg_binary'] = (
    dfp['fg_class'].astype(str).str.lower().str.contains('greed', regex=False).astype(int)
)
dfp['vol_3d_mean'] = dfp['total_volume_usd'].rolling(3).mean()
dfp['pnl_3d_mean'] = dfp['mean_closed_pnl'].rolling(3).mean()

# shift(1): features for day t only use information up to day t-1.
X = dfp[['vol_3d_mean','pnl_3d_mean']].shift(1).dropna()
X = X.loc[has_label.loc[X.index]]
y = dfp.loc[X.index, 'fg_binary']

# Only fit when both classes are present in the target.
if y.nunique() == 2:
    clf = RandomForestClassifier(random_state=42).fit(X,y)
    fi = pd.Series(clf.feature_importances_, index=X.columns)
    plt.figure()
    fi.sort_values().plot(kind='barh')
    plt.title("Feature Importance for Predicting Sentiment Phase")
    plt.show()