# 01b — Preprocessing for CatBoost (Human‑Interpretable + Categorical‑Friendly)

This notebook is an **add-on** to your existing preprocessing.

It:
- Loads IEEE‑CIS `train_transaction` + `train_identity` and merges them
- Reuses your **human‑interpretable feature engineering** (time, velocity, ratios, change flags)
- Keeps **raw categorical fields** (email domains, device/browser/OS strings, address fields) so CatBoost can model them directly
- Adds a few extra interpretable features: `log_amt`, `amt_decimal`, missingness indicators, and parsed device/email families
- Creates the same **80/20 chronological split** as notebook 01
- Saves:
  - `X_train_cat.csv`, `X_test_cat.csv`
  - `cat_feature_cols.json` (list of categorical columns)


In [1]:

# --- (Optional) Install deps (Colab-safe)
# Install only when pandas is genuinely not importable. Catching ImportError
# (which includes ModuleNotFoundError) instead of Exception keeps unrelated
# failures visible instead of answering them with a pip install.
try:
    import pandas  # noqa
except ImportError:
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
                           "pandas", "numpy", "scikit-learn", "imbalanced-learn", "joblib"])

import os, sys, json, random, warnings
import numpy as np
import pandas as pd
from collections import Counter

warnings.filterwarnings("ignore")

# Reproducibility: seed the stdlib RNG and NumPy's legacy global RNG.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Detect Colab + mount Drive, and pick the data directory accordingly.
# Adjust the paths below if needed.
IN_COLAB = "google.colab" in sys.modules
if IN_COLAB:
    from google.colab import drive  # type: ignore
    drive.mount("/content/drive")
    datapath = "/content/drive/MyDrive/RThesis/"
else:
    datapath = "./"

# Create the directory if it does not exist yet, then echo the choice.
os.makedirs(datapath, exist_ok=True)
print("datapath =", datapath)


Mounted at /content/drive
datapath = /content/drive/MyDrive/RThesis/


In [2]:

# --- Load + merge IEEE-CIS (train)
# NOTE(review): the file names carry a " (1)" suffix (looks like a re-download
# artifact) — confirm they match the files actually present in `datapath`.
trans_path = os.path.join(datapath, "train_transaction (1).csv")
iden_path  = os.path.join(datapath, "train_identity (1).csv")

assert os.path.exists(trans_path), f"Missing file: {trans_path}"
assert os.path.exists(iden_path),  f"Missing file: {iden_path}"

trans = pd.read_csv(trans_path)
iden  = pd.read_csv(iden_path)

# Left join keeps every transaction; identity columns are NaN where no identity
# row exists. `validate` makes pandas raise if TransactionID is duplicated on
# either side, instead of silently multiplying rows.
data = trans.merge(iden, how="left", on="TransactionID", validate="one_to_one")
assert len(data) == len(trans), "Merge changed the number of transactions"
print("Merged shape:", data.shape)
print("Fraud rate (overall):", data["isFraud"].mean())


Merged shape: (590540, 434)
Fraud rate (overall): 0.03499000914417313


In [None]:

# --- Preserve missingness BEFORE filling (useful signal)
# The counts are computed only if they are not already present: this cell fills
# `data` in place below, so recomputing them on a re-run would read the
# already-filled values and overwrite the real counts with zeros.
miss_cols_for_count = [c for c in ["DeviceInfo", "DeviceType", "P_emaildomain", "R_emaildomain", "addr1", "addr2", "id_30", "id_31", "id_33"] if c in data.columns]
if "missing_count_raw" not in data.columns:
    if miss_cols_for_count:
        # Number (and share) of the selected columns that are NaN in each row.
        data["missing_count_raw"] = data[miss_cols_for_count].isna().sum(axis=1).astype("int16")
        data["missing_ratio_raw"] = data["missing_count_raw"] / max(1, len(miss_cols_for_count))
    else:
        data["missing_count_raw"] = 0
        data["missing_ratio_raw"] = 0.0

# --- Basic cleaning: fill missing values
# Numeric -> median; Categorical -> "missing"
# The medians are taken over every row currently in `data`.
num_cols = data.select_dtypes(include=[np.number]).columns
data[num_cols] = data[num_cols].fillna(data[num_cols].median())

cat_cols_raw = data.select_dtypes(include=["object"]).columns
data[cat_cols_raw] = data[cat_cols_raw].fillna("missing")

print("Nulls after fill (top 10):")
print(data.isna().sum().sort_values(ascending=False).head(10))


In [None]:

# --- Helper: causal rolling count within a lookback window (seconds)
def rolling_count_seconds(df: pd.DataFrame, entity_col: str, time_col: str, window_sec: int) -> np.ndarray:
    """
    For each row, count rows of the same entity with time in [time - window_sec, time].

    Rows are ordered by ``time_col`` within each entity; a row counts itself and
    the rows that precede it in that order, so the result is at least 1 for
    every grouped row. Rows whose entity key is NaN are skipped by ``groupby``
    and keep a count of 0.

    Parameters
    ----------
    df : frame containing ``entity_col`` and ``time_col``; any index is accepted.
    entity_col : column whose values define the groups.
    time_col : numeric time column, in the same unit as ``window_sec``.
    window_sec : length of the lookback window (inclusive).

    Returns
    -------
    Integer array of length ``len(df)``, aligned to the *row positions* of
    ``df`` (not to its index labels). The row order of ``df`` is not changed.
    """
    # Work on a two-column copy with a fresh 0..n-1 index so that the index
    # values seen inside each group are row positions. Indexing the output by
    # the caller's index labels would fail or misalign on a non-default index.
    work = df[[entity_col, time_col]].reset_index(drop=True)
    counts = np.zeros(len(work), dtype=int)

    for _, group in work.groupby(entity_col, sort=False):
        # Stable sort so rows with equal timestamps keep their original order.
        ordered = group.sort_values(time_col, kind="mergesort")
        times = ordered[time_col].to_numpy()
        positions = ordered.index.to_numpy()
        # First sorted position whose time is >= time - window_sec; everything
        # from there up to the current row lies inside the window.
        window_start = np.searchsorted(times, times - window_sec, side="left")
        counts[positions] = np.arange(len(times)) - window_start + 1
    return counts


In [None]:

# --- Interpretable feature engineering (extends your 01 notebook)
# Provider names (the part of an email domain before the first dot) that are
# flagged as free-mail providers by `is_free_email`.
FREE_EMAIL_PROVIDERS = {
    "gmail",
    "yahoo",
    "hotmail",
    "outlook",
    "icloud",
    "aol",
    "live",
    "msn",
    "protonmail",
}

def _safe_split_left(x: str, sep: str = " ") -> str:
    if not isinstance(x, str) or x == "missing":
        return "missing"
    x = x.strip()
    if not x:
        return "missing"
    return x.split(sep)[0].strip().lower()

def _email_provider(domain: str) -> str:
    if not isinstance(domain, str) or domain == "missing":
        return "missing"
    return domain.split(".")[0].lower() if "." in domain else domain.lower()

def _email_tld(domain: str) -> str:
    if not isinstance(domain, str) or domain == "missing":
        return "missing"
    parts = domain.split(".")
    return parts[-1].lower() if len(parts) >= 2 else "missing"

def _rolling_unique_count(df: pd.DataFrame, entity_col: str, value_col: str,
                          time_col: str, window_sec: int) -> np.ndarray:
    """Count distinct ``value_col`` values per ``entity_col`` in a trailing window.

    For each row, rows of the same entity are ordered by ``time_col`` and the
    number of distinct ``value_col`` values among the rows with time in
    ``[time - window_sec, time]`` (up to and including the row itself) is
    recorded. The returned array is aligned to the row positions of ``df``.
    Rows whose entity key is NaN are skipped by ``groupby`` and keep 0.
    """
    # Fresh 0..n-1 index so group index values are row positions.
    work = pd.DataFrame({
        "entity": df[entity_col].to_numpy(),
        "value": df[value_col].to_numpy(),
        "time": df[time_col].to_numpy(),
    })
    result = np.zeros(len(work), dtype=int)
    for _, group in work.groupby("entity"):
        ordered = group.sort_values("time", kind="mergesort")
        times = ordered["time"].to_numpy()
        values = ordered["value"].to_numpy()
        positions = ordered.index.to_numpy()
        in_window = Counter()  # value -> occurrences currently inside the window
        left = 0
        for right in range(len(times)):
            in_window[values[right]] += 1
            # Evict rows that fell out of the lookback window.
            while times[right] - times[left] > window_sec:
                expired = values[left]
                in_window[expired] -= 1
                if in_window[expired] == 0:
                    del in_window[expired]
                left += 1
            # Zero-count keys are deleted above, so len() is the distinct count.
            result[positions[right]] = len(in_window)
    return result


def add_features_catboost(df: pd.DataFrame) -> pd.DataFrame:
    """Add interpretable engineered features to a copy of ``df``.

    Every feature group is guarded by a check for its source columns, so a
    frame lacking some of them still works; several groups then fall back to a
    constant column (see the ``else`` branches).

    Added columns (when their sources exist):
      * time: ``Transaction_day``, ``Transaction_hour``, ``is_night_txn`` (hours 0-6)
      * amount: ``log_amt``, ``amt_decimal``, ``is_round_amt``
      * missingness: ``missing_count``, ``missing_ratio`` (copied from ``*_raw``)
      * identity / parsed categoricals: ``has_identity``, ``os_family``,
        ``browser_family``, ``email_provider``, ``email_tld``, ``is_free_email``,
        ``device_brand``
      * per-card aggregates: ``trans_per_card_day``, ``avg_amt_per_card[_y]``,
        ``amt_std_per_card[_y]``, ``freq_ratio_card_amt``, ``amt_deviation_card``
      * velocity: ``trans_last_{1h,6h,24h}_card``, ``time_since_last_txn_card``,
        ``devices_per_card_24h``, ``cards_per_device_24h``
      * change flags and ratios.

    Assumptions:
      * Text columns are expected to be filled with the literal ``"missing"``
        (``has_identity`` and the parsers compare against that string).
      * ``time_since_last_txn_card`` and the change flags use ``diff``/``shift``
        in row order, so they describe the *previous* transaction only when
        ``df`` is sorted by ``TransactionDT``.

    Returns a new frame with the same rows, row order and index as ``df``.

    NOTE(review): ``trans_per_card_day``, the per-card amount mean/std and the
    per-card mean time gap are computed over every row of ``df``, i.e. they
    also see rows later in time than the one being described. If ``df`` spans
    both the training and the evaluation period this is look-ahead
    information — confirm this is intended.
    """
    out = df.copy()
    day_sec = 24 * 60 * 60

    # Time features: TransactionDT is treated as a number of seconds.
    if "TransactionDT" in out.columns:
        day  = np.floor(out["TransactionDT"] / day_sec).astype("int64")
        hour = np.floor((out["TransactionDT"] % day_sec) / 3600).astype("int64")
        out["Transaction_day"]  = day
        out["Transaction_hour"] = hour
        out["is_night_txn"]      = out["Transaction_hour"].isin(range(0, 7)).astype("int8")

    # Amount transforms
    if "TransactionAmt" in out.columns:
        out["log_amt"] = np.log1p(out["TransactionAmt"])
        out["amt_decimal"] = (out["TransactionAmt"] - np.floor(out["TransactionAmt"])).round(6)
        out["is_round_amt"] = (out["amt_decimal"] < 1e-6).astype("int8")

    # Missingness: computed BEFORE filling and stored as *_raw
    if "missing_count_raw" in out.columns:
        out["missing_count"] = out["missing_count_raw"].astype("int16")
        out["missing_ratio"] = out["missing_ratio_raw"].astype(float)
    else:
        out["missing_count"] = 0
        out["missing_ratio"] = 0.0

    # "Has identity" flag (interpretable): either device field is not "missing".
    if "DeviceInfo" in out.columns and "DeviceType" in out.columns:
        out["has_identity"] = ((out["DeviceInfo"] != "missing") | (out["DeviceType"] != "missing")).astype("int8")
    else:
        out["has_identity"] = 0

    # Parsed categorical families: first space-separated token, lower-cased.
    out["os_family"] = out["id_30"].apply(_safe_split_left) if "id_30" in out.columns else "missing"
    out["browser_family"] = out["id_31"].apply(_safe_split_left) if "id_31" in out.columns else "missing"

    if "P_emaildomain" in out.columns:
        out["email_provider"] = out["P_emaildomain"].apply(_email_provider)
        out["email_tld"]      = out["P_emaildomain"].apply(_email_tld)
        out["is_free_email"]  = out["email_provider"].isin(FREE_EMAIL_PROVIDERS).astype("int8")
    else:
        out["email_provider"] = "missing"
        out["email_tld"]      = "missing"
        out["is_free_email"]  = 0

    out["device_brand"] = out["DeviceInfo"].apply(_safe_split_left) if "DeviceInfo" in out.columns else "missing"

    # Per-card/day transaction count (counts all rows of that card on that day).
    if {"card1", "Transaction_day", "TransactionID"}.issubset(out.columns):
        out["trans_per_card_day"] = out.groupby(["card1", "Transaction_day"])["TransactionID"].transform("count").astype("int32")
    else:
        out["trans_per_card_day"] = 1

    # Per-card amount statistics + deviation.
    # `transform` broadcasts the per-card statistic back to each row, so no
    # full-frame merge is needed and the index of `out` is left untouched.
    if {"card1", "TransactionAmt"}.issubset(out.columns):
        amt_by_card = out.groupby("card1")["TransactionAmt"]
        card_mean = amt_by_card.transform("mean")
        card_std = amt_by_card.transform("std")
        out["avg_amt_per_card"] = card_mean
        out["amt_std_per_card"] = card_std
        out["avg_amt_per_card_y"] = out["avg_amt_per_card"]
        # The std is NaN for cards with a single transaction; fill those with
        # the row-wise mean std, falling back to 1.0 when no std is defined.
        std_mean = float(out["amt_std_per_card"].mean())
        if not np.isfinite(std_mean):
            std_mean = 1.0
        out["amt_std_per_card_y"] = out["amt_std_per_card"].fillna(std_mean)

        out["freq_ratio_card_amt"] = out["TransactionAmt"] / (out["avg_amt_per_card_y"] + 1e-5)
        out["amt_deviation_card"] = (out["TransactionAmt"] - out["avg_amt_per_card_y"]).abs() / (out["amt_std_per_card_y"] + 1e-5)

    # Rolling counts and time gap
    if {"card1", "TransactionDT"}.issubset(out.columns):
        # Positional (0..n-1) two-column view: the helper's output is then
        # aligned to row positions whatever the index of `out` is.
        card_times = out[["card1", "TransactionDT"]].reset_index(drop=True)
        rolling_windows = (
            ("trans_last_1h_card", 3600),
            ("trans_last_6h_card", 21600),
            ("trans_last_24h_card", 86400),
        )
        for col_name, window in rolling_windows:
            out[col_name] = rolling_count_seconds(card_times, "card1", "TransactionDT", window).astype("int32")
        # A card's first row has no predecessor and gets 0.
        out["time_since_last_txn_card"] = out.groupby("card1")["TransactionDT"].diff().fillna(0).astype("int64")

    # Device/card uniqueness windows (24h)
    if {"DeviceInfo", "card1", "TransactionDT"}.issubset(out.columns):
        out["devices_per_card_24h"] = _rolling_unique_count(out, "card1", "DeviceInfo", "TransactionDT", 86400)
        out["cards_per_device_24h"] = _rolling_unique_count(out, "DeviceInfo", "card1", "TransactionDT", 86400)

    # Change flags: 1 when the value differs from the card's previous row.
    # A card's first row has no previous value (shift() yields NaN) and is not
    # counted as a change; comparing NaN with `!=` alone would mark every first
    # transaction as changed.
    change_flag_sources = {
        "DeviceInfo": "device_change_flag",
        "id_31": "browser_change_flag",
        "id_30": "os_change_flag",
        "id_33": "screen_resolution_change_flag",
    }
    if "card1" in out.columns:
        for source_col, flag_col in change_flag_sources.items():
            if source_col not in out.columns:
                continue
            previous = out.groupby("card1")[source_col].shift()
            out[flag_col] = (previous.notna() & (previous != out[source_col])).astype("int32")

    # Ratios
    if {"trans_per_card_day", "devices_per_card_24h"}.issubset(out.columns):
        out["card_device_use_ratio"] = out["trans_per_card_day"] / (out["devices_per_card_24h"] + 1)
    if {"card1", "time_since_last_txn_card"}.issubset(out.columns):
        mean_time = out.groupby("card1")["time_since_last_txn_card"].transform("mean")
        out["time_diff_ratio_to_card_mean"] = out["time_since_last_txn_card"] / (mean_time + 1e-5)
    if {"dist1", "dist2"}.issubset(out.columns):
        # dist2 == 0 is mapped to NaN first, so those rows get a NaN ratio.
        out["dist_ratio"] = out["dist1"] / (out["dist2"].replace(0, np.nan) + 1e-5)

    return out


In [None]:

# --- Sort chronologically + engineer features on full stream
# TransactionID is a secondary sort key so that rows sharing a TransactionDT get
# a deterministic order: a single-key sort uses a non-stable algorithm by
# default, and the shift/diff-based features and the later chronological split
# depend on row order.
data = data.sort_values(["TransactionDT", "TransactionID"]).reset_index(drop=True)
data_feat = add_features_catboost(data)
print("After feature engineering:", data_feat.shape)


In [None]:

# --- Build CatBoost feature matrix
# Selects the categorical and numeric feature columns from `data_feat`, in a
# fixed order (categoricals first), plus the label `y` and the row keys.

# Columns handed to CatBoost as categorical (cast to string below).
cat_cols = [
    "ProductCD", "card4", "card6",
    "addr1", "addr2",
    "P_emaildomain", "R_emaildomain",
    "DeviceType", "DeviceInfo",
    "id_30", "id_31", "id_33",
    "os_family", "browser_family", "email_provider", "email_tld", "device_brand",
]

# Columns kept numeric.
num_cols = [
    "TransactionAmt", "log_amt", "amt_decimal", "is_round_amt",
    "card1", "card2", "card3", "card5",
    "dist1", "dist2",
    "D1", "D2", "D3", "D4",
    "Transaction_day", "Transaction_hour", "is_night_txn",
    "trans_per_card_day",
    "avg_amt_per_card_y", "amt_std_per_card_y",
    "freq_ratio_card_amt", "amt_deviation_card",
    "trans_last_1h_card", "trans_last_6h_card", "trans_last_24h_card",
    "time_since_last_txn_card",
    "devices_per_card_24h", "cards_per_device_24h",
    "device_change_flag", "browser_change_flag", "os_change_flag",
    "screen_resolution_change_flag",
    "card_device_use_ratio", "time_diff_ratio_to_card_mean",
    "dist_ratio",
    "missing_count", "missing_ratio", "has_identity", "is_free_email",
]

# Any expected column that is absent is created as a constant so the feature
# list stays fixed; report which ones, since a constant column is otherwise an
# easy-to-miss symptom of a typo or a skipped feature-engineering branch.
defaulted_cols = []
for c in cat_cols:
    if c not in data_feat.columns:
        data_feat[c] = "missing"
        defaulted_cols.append(c)
for c in num_cols:
    if c not in data_feat.columns:
        data_feat[c] = 0
        defaulted_cols.append(c)
if defaulted_cols:
    print("Columns not found and filled with a constant:", defaulted_cols)

feature_cols = cat_cols + num_cols

y = data_feat["isFraud"].astype(int).reset_index(drop=True)
keys = data_feat[["TransactionID", "TransactionDT"]].copy().reset_index(drop=True)

X = data_feat[feature_cols].copy().reset_index(drop=True)

# Force categorical to string.
# NaN must be replaced BEFORE the cast: astype(str) turns NaN into the literal
# string "nan", after which a fillna would have nothing left to fill.
for c in cat_cols:
    as_object = X[c].astype(object)
    X[c] = as_object.where(as_object.notna(), "missing").astype(str)

print("X shape:", X.shape, "| y mean:", y.mean())
print("Categorical cols:", len(cat_cols), "| Numeric cols:", len(num_cols))


In [None]:

# --- Chronological 80/20 split (same as 01)
# The first TRAIN_FRAC of the rows (by position) become the training part and
# the rest the test part; X, y and keys are cut at the same position.
TRAIN_FRAC = 0.80
n = len(X)
split_idx = int(n * TRAIN_FRAC)

train_rows = slice(None, split_idx)
test_rows = slice(split_idx, None)

# Training part
X_train_cat = X.iloc[train_rows].reset_index(drop=True)
y_train = y.iloc[train_rows].reset_index(drop=True)
train_keys = keys.iloc[train_rows].reset_index(drop=True)
train_keys.insert(0, "row_id", np.arange(len(train_keys)))

# Test part
X_test_cat = X.iloc[test_rows].reset_index(drop=True)
y_test = y.iloc[test_rows].reset_index(drop=True)
test_keys = keys.iloc[test_rows].reset_index(drop=True)
test_keys.insert(0, "row_id", np.arange(len(test_keys)))

# Guard: no training row may be later in time than any test row.
latest_train_dt = train_keys["TransactionDT"].max()
earliest_test_dt = test_keys["TransactionDT"].min()
assert latest_train_dt <= earliest_test_dt, "Time split violated!"

print("Train:", X_train_cat.shape, "Test:", X_test_cat.shape)
print("Train fraud%:", y_train.mean()*100, "Test fraud%:", y_test.mean()*100)


In [None]:

# --- Save CatBoost-ready datasets
# Writes the two feature matrices as CSV (without the index) and the list of
# categorical column names as JSON, all under `datapath`.
# NOTE(review): y_train / y_test and train_keys / test_keys are not written
# here — confirm the labels are available to the training notebook elsewhere.
X_train_path = os.path.join(datapath, "X_train_cat.csv")
X_test_path  = os.path.join(datapath, "X_test_cat.csv")
cat_cols_path = os.path.join(datapath, "cat_feature_cols.json")

X_train_cat.to_csv(X_train_path, index=False)
X_test_cat.to_csv(X_test_path, index=False)

with open(cat_cols_path, "w") as handle:
    handle.write(json.dumps(cat_cols, indent=2))

print("Saved:")
for saved_path in (X_train_path, X_test_path, cat_cols_path):
    print(" -", saved_path)
