In [None]:
import pandas as pd
import numpy as np

# Notebook display settings: lift the cap on displayed columns and widen the
# text representation to 200 characters so wide frames are not cut short.
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.width", 200),
):
    pd.set_option(option_name, option_value)


In [None]:
# Load the cleaned trader and sentiment tables, converting their timestamp
# columns ("time" and "Date") to pandas datetimes right after the read.
trader_df = pd.read_csv("../data/processed/cleaned_trader_data.csv").assign(
    time=lambda frame: pd.to_datetime(frame["time"])
)
sentiment_df = pd.read_csv("../data/processed/cleaned_sentiment_data.csv").assign(
    Date=lambda frame: pd.to_datetime(frame["Date"])
)

# Print (rows, columns) of both tables as a quick sanity check.
print(trader_df.shape, sentiment_df.shape)


In [None]:
# Trade value: execution price multiplied by traded size.
trader_df["trade_value"] = trader_df["execution price"].mul(trader_df["size"])

# Direction encoding: +1 when the side reads "buy" (case-insensitive), -1 for
# every other value.
trader_df["side_binary"] = trader_df["side"].map(
    lambda side: -1 if str(side).lower() != "buy" else 1
)

# Notional exposure: trade value scaled by leverage. When there is no
# "leverage" column the factor falls back to 1.
trader_df["notional_exposure"] = trader_df["trade_value"].mul(trader_df.get("leverage", 1))

# PnL normalization: closed PnL divided by (trade value + 1). Without a
# "closedPnL" column the feature is filled with 0.
if "closedPnL" not in trader_df.columns:
    trader_df["normalized_pnl"] = 0
else:
    trader_df["normalized_pnl"] = trader_df["closedPnL"].div(trader_df["trade_value"].add(1))


In [None]:
# Risk score: notional exposure relative to (trade value + 1).
trader_df["risk_score"] = trader_df["notional_exposure"].div(trader_df["trade_value"].add(1))

# Leverage-adjusted return: normalized PnL divided by (leverage + 1), with
# leverage falling back to 1 when the column is absent.
trader_df["leverage_adj_return"] = trader_df["normalized_pnl"].div(
    trader_df.get("leverage", 1) + 1
)

# Profit flag: 1 for a strictly positive normalized PnL, 0 otherwise.
trader_df["is_profitable"] = trader_df["normalized_pnl"].map(lambda pnl: int(pnl > 0))


In [None]:
# Hour of day of each trade.
trader_df["trade_hour"] = trader_df["time"].dt.hour

# Overtrading indicator (high-frequency trader proxy): number of non-null
# "time" entries for the row's account, broadcast back to every row.
trader_df["overtrading_score"] = (
    trader_df["time"].groupby(trader_df["account"]).transform("count")
)

# Aggression score: trade size scaled by leverage (factor 1 when the
# "leverage" column is absent).
trader_df["aggression_score"] = trader_df["size"].mul(trader_df.get("leverage", 1))

# Discipline proxy: the profit flag weighted by 1 / (leverage + 1), so only
# profitable rows score above zero and lower leverage scores higher.
trader_df["discipline_score"] = trader_df["is_profitable"].mul(
    1 / (trader_df.get("leverage", 1) + 1)
)


In [None]:
# Calendar-day join key on both tables.
trader_df["date"] = trader_df["time"].dt.date
sentiment_df["date"] = sentiment_df["Date"].dt.date

# Attach the daily sentiment label to every trade. The left join keeps trades
# whose date has no sentiment row (their Classification is NaN).
# validate="many_to_one" makes pandas raise a MergeError when sentiment_df
# holds more than one row for a date, instead of silently duplicating trade
# rows and inflating every per-account count and mean computed afterwards.
merged_df = trader_df.merge(
    sentiment_df[["date", "Classification"]],
    on="date",
    how="left",
    validate="many_to_one",
)

# Binary sentiment flag: 1 only for the exact label "Greed"; every other
# label, and trades without a sentiment match, get 0.
merged_df["sentiment_binary"] = merged_df["Classification"].eq("Greed").astype("int64")


In [None]:
def sentiment_regime(x):
    """Translate a sentiment classification into a market-regime label.

    Args:
        x: Sentiment label, compared for exact equality with "Fear" and
            "Greed".

    Returns:
        "Risk-Off" for "Fear", "Risk-On" for "Greed", and "Neutral" for any
        other value.
    """
    if x == "Greed":
        return "Risk-On"
    if x == "Fear":
        return "Risk-Off"
    # NOTE(review): every other value lands here, which would include labels
    # such as "Extreme Fear" / "Extreme Greed" if the data contains them --
    # confirm that mapping them to "Neutral" is intended.
    return "Neutral"

# Label each row with the market regime derived from its sentiment classification.
merged_df["market_regime"] = merged_df["Classification"].map(sentiment_regime)


In [None]:
# Per-account trader profile: one row per account with trade count, mean size,
# mean leverage, mean normalized PnL, win rate and the mean risk, aggression
# and discipline scores.
#
# The feature cells treat "leverage" as optional (trader_df.get("leverage", 1)).
# Mirror that here: when the column is absent, aggregate over a constant
# leverage of 1 so "avg_leverage" is still produced instead of the whole
# aggregation failing with a KeyError.
profile_df = (
    (merged_df if "leverage" in merged_df.columns else merged_df.assign(leverage=1))
    .groupby("account")
    .agg(
        total_trades=("size", "count"),
        avg_trade_size=("size", "mean"),
        avg_leverage=("leverage", "mean"),
        avg_pnl=("normalized_pnl", "mean"),
        win_rate=("is_profitable", "mean"),
        avg_risk=("risk_score", "mean"),
        avg_aggression=("aggression_score", "mean"),
        avg_discipline=("discipline_score", "mean"),
    )
    .reset_index()
)


In [None]:
# Persist the row-level labeled data and the per-account profiles as CSV,
# leaving the index out of both files.
merged_df.to_csv("../data/processed/sentiment_labeled_data.csv", index=False)
profile_df.to_csv("../data/processed/trader_profiles.csv", index=False)

# The check mark (U+2705) is written as an escape sequence so it survives
# re-encoding of the notebook; it had previously been garbled into mojibake
# (its UTF-8 bytes decoded as Windows-1252).
print("\u2705 Feature-engineered datasets saved.")
