In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Notebook display settings: never truncate columns, and allow wide text output.
pd.options.display.max_columns = None
pd.options.display.width = 200


In [None]:
# Load both raw CSVs and parse their timestamp columns in one pass.
# errors="coerce" turns unparseable timestamps into NaT instead of raising.
trader_df = pd.read_csv("../data/raw/historical_trader_data.csv").assign(
    time=lambda frame: pd.to_datetime(frame["time"], errors="coerce")
)
sentiment_df = pd.read_csv("../data/raw/fear_greed_index.csv").assign(
    Date=lambda frame: pd.to_datetime(frame["Date"], errors="coerce")
)

print(trader_df.shape, sentiment_df.shape)


In [None]:
# Basic validity filters, applied in order:
#   1. drop rows missing any of price / size / side / time
#   2. keep only strictly positive price and size
#   3. drop exact duplicate rows
trader_df = (
    trader_df
    .dropna(subset=["execution price", "size", "side", "time"])
    .loc[lambda t: t["execution price"].gt(0) & t["size"].gt(0)]
    .drop_duplicates()
)

print("After basic cleaning:", trader_df.shape)


In [None]:
def remove_outliers_iqr(df: pd.DataFrame, col: str, k: float = 1.5) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``col`` value lies within the IQR fences.

    The fences are ``[Q1 - k * IQR, Q3 + k * IQR]`` (both ends inclusive),
    where Q1 and Q3 are the 25th and 75th percentiles of ``df[col]``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to filter; it is not modified.
    col : str
        Name of the numeric column used to compute the fences.
    k : float, default 1.5
        Fence multiplier. The default reproduces the classic Tukey rule;
        larger values keep more of the tails.

    Returns
    -------
    pd.DataFrame
        The subset of ``df`` inside the fences, with the original index labels.
        Rows where ``col`` is NaN are dropped, because a NaN never compares as
        being inside the range.
    """
    q1, q3 = df[col].quantile(0.25), df[col].quantile(0.75)
    spread = k * (q3 - q1)
    # between() is inclusive on both ends by default, matching >= / <=.
    return df[df[col].between(q1 - spread, q3 + spread)]

# Trim price outliers first, then size outliers; the size fences are computed
# on the frame that has already been filtered by price.
# NOTE(review): the fences are computed over the whole frame. If the data mixes
# instruments with very different price scales, this may drop entire
# instruments — confirm that is intended.
trader_df = (
    trader_df
    .pipe(remove_outliers_iqr, "execution price")
    .pipe(remove_outliers_iqr, "size")
)

print("After outlier removal:", trader_df.shape)


In [None]:
# Keep only rows with leverage in (0, 125]; skipped entirely when the dataset
# has no "leverage" column.
# NOTE(review): 125 is presumably the venue's maximum leverage — confirm, and
# consider moving it to a named constant.
if "leverage" in trader_df.columns:
    trader_df = trader_df.loc[
        lambda t: t["leverage"].gt(0) & t["leverage"].le(125)
    ]


In [None]:
# Drop rows without a sentiment label, then normalise the label text
# (trim surrounding whitespace, Title Case) so variants of the same label
# collapse into one category.
# The column is rebuilt with .assign() rather than item assignment: assigning
# into the frame returned by dropna() can emit SettingWithCopyWarning on pandas
# versions without copy-on-write.
sentiment_df = (
    sentiment_df
    .dropna(subset=["Classification"])
    .assign(Classification=lambda s: s["Classification"].str.strip().str.title())
)

print(sentiment_df["Classification"].value_counts())


In [None]:
# Persist the cleaned frames as CSV.
processed_dir = Path("../data/processed")
# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing (no-op when it is already there).
processed_dir.mkdir(parents=True, exist_ok=True)

trader_df.to_csv(processed_dir / "cleaned_trader_data.csv", index=False)
sentiment_df.to_csv(processed_dir / "cleaned_sentiment_data.csv", index=False)

# The original literal was a mis-decoded check-mark emoji (UTF-8 bytes read as cp1252).
print("✅ Clean datasets saved.")
