# 01 | EDA & Feature Engineering
Credit Card Fraud Detection

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the transactions table from parquet using the fastparquet engine.
raw_path = "../data/raw/engineered_transactions.parquet"
df = pd.read_parquet(raw_path, engine="fastparquet")

# Quick sanity look: (rows, columns), then the first few records.
print(df.shape)
df.head()

In [None]:
# Fail fast if the target label is absent; the later cells all read it.
# An explicit raise is used instead of `assert` because assertions are
# stripped when Python runs with -O, which would silently skip this check.
if "is_fraud" not in df.columns:
    raise KeyError("❌ is_fraud column missing!")

print("✅ is_fraud column found")
# Number of rows per label value.
print(df["is_fraud"].value_counts())

In [None]:
# Mean of the label column; this equals the fraud share if labels are 0/1
# (assumed from the == 0 / == 1 filters used elsewhere — confirm).
fraud_ratio = df["is_fraud"].mean()
print(f"Fraud ratio: {fraud_ratio:.4f}")

# Bar chart of row counts per label value.
count_ax = sns.countplot(data=df, x="is_fraud")
count_ax.set_title("Fraud vs Non-Fraud Count")
plt.show()

In [None]:
plt.figure(figsize=(8, 5))

# Overlay one semi-transparent histogram per class on the same axes.
# Each call receives only its own subset, so the 50 bins are computed
# separately for each class.
for label_value, legend_name in ((0, "Non-Fraud"), (1, "Fraud")):
    class_amounts = df.loc[df["is_fraud"] == label_value, "amount"]
    sns.histplot(class_amounts, bins=50, label=legend_name, alpha=0.6)

plt.legend()
plt.xlabel("Amount")
plt.title("Transaction Amount Distribution")
plt.show()

In [None]:
plt.figure(figsize=(8, 5))

# Overlay one semi-transparent histogram per class on the same axes.
# Each call receives only its own subset, so the 50 bins are computed
# separately for each class.
for label_value, legend_name in ((0, "Non-Fraud"), (1, "Fraud")):
    class_gaps = df.loc[df["is_fraud"] == label_value, "time_since_last_txn"]
    sns.histplot(class_gaps, bins=50, label=legend_name, alpha=0.6)

plt.legend()
# NOTE(review): the axis is labelled in minutes — confirm the column's unit.
plt.xlabel("Minutes")
plt.title("Time Since Last Transaction")
plt.show()

In [None]:
# Correlate the numeric columns pairwise, then keep only the is_fraud
# column, ordered from most positive to most negative.
pairwise_corr = df.corr(numeric_only=True)
corr = pairwise_corr["is_fraud"].sort_values(ascending=False)

# Top ten entries; this list includes is_fraud's correlation with itself.
corr.head(10)

In [None]:
plt.figure(figsize=(6, 8))

# heatmap takes 2-D input, so the correlation Series is turned into a
# one-column frame; each cell is annotated with its value.
corr_column = corr.to_frame()
sns.heatmap(corr_column, cmap="coolwarm", cbar=False, annot=True)

plt.title("Feature Correlation with Fraud")
plt.show()

In [None]:
# Target vector: the fraud label.
y = df["is_fraud"]

# Feature matrix: every column except the label and the raw timestamp,
# which is deliberately never fed to the model as-is.
non_feature_columns = ["is_fraud", "transaction_time"]
X = df.drop(columns=non_feature_columns)

In [None]:
# Cell-local import: train_test_split is not imported anywhere else in this
# notebook, so without this line the call below raises NameError.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. stratify=y keeps the class
# proportions of y in both splits, and the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
from pathlib import Path

# to_parquet does not create missing parent directories, so make sure the
# output folder exists before writing (a no-op when it is already there).
Path("../data/processed").mkdir(parents=True, exist_ok=True)

# Feature splits; index=False leaves the row index out of the files.
X_train.to_parquet("../data/processed/X_train.parquet", index=False)
X_test.to_parquet("../data/processed/X_test.parquet", index=False)

# Label splits, each wrapped in a one-column frame named is_fraud.
y_train.to_frame("is_fraud").to_parquet("../data/processed/y_train.parquet", index=False)
y_test.to_frame("is_fraud").to_parquet("../data/processed/y_test.parquet", index=False)

print("Train/test artifacts saved successfully")