In [1]:
import os
import pandas as pd
from tqdm import tqdm

#### 5.1. Load Train, Validation, and Test Sets

We load the previously prepared datasets from Parquet files.
These sets have already been cleaned, split, and balanced, and are ready for feature engineering.

In [2]:
# Read the three prepared splits (train, test, validation) produced by the
# previous stage; the generator is consumed in order by the tuple unpacking.
df_train, df_test, df_val = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "./data/4/df_train.parquet",
        "./data/4/df_test.parquet",
        "./data/4/df_val.parquet",
    )
)

#### 5.2. Feature Engineering for Transactions

The `preprocess_transactions` function generates new features to help the model detect fraud.
Features are designed to capture important patterns and behaviors in financial transactions.

Types of features created:
- **Transaction amount features:** ratios of transaction amount to sender/recipient balances
- **Time-based features:** hour, day, weekend/weekday, night, time since last transaction
- **Sender/recipient behavior features:** transaction counts, totals, maximum amounts, heavy recipients
- **Balance consistency checks:** errors between expected and observed balances
- **Transaction type features:** encoded type, type interactions with amount or fraud flags
- **Rolling statistics:** moving averages, standard deviations, and maximums over recent transactions

These features enrich the dataset with behavioral and temporal patterns that are useful for fraud detection.

A progress bar is used to monitor feature calculation.
It shows the current feature being calculated and overall progress.
This is helpful for large datasets to keep track of long-running preprocessing.

Original identifier columns (`nameOrig`, `nameDest`) are dropped to avoid overfitting.
Transaction type column (`type`) is one-hot encoded to convert categorical data into numeric format.
Any remaining non-numeric columns are dropped; the numeric columns are cast to float and clipped to the range [-1e10, 1e10] to avoid extreme values.

In [3]:
def preprocess_transactions(df, type_fraud_rate=None):
    """Build the engineered fraud-detection feature matrix for one dataset.

    The input is sorted by sender (``nameOrig``) and ``step`` on a copy, so the
    caller's frame is not modified and the returned rows follow that sorted
    order with a fresh 0..n-1 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns read below: ``step``, ``type``, ``amount``,
        ``nameOrig``, ``oldbalanceOrg``, ``newbalanceOrig``, ``nameDest``,
        ``oldbalanceDest``, ``newbalanceDest``, ``isFraud``, ``isFlaggedFraud``.
    type_fraud_rate : mapping or pandas.Series, optional
        Fraud rate per transaction type used for ``type_encoded``. When omitted
        it is computed from ``df["isFraud"]`` itself (the historical behaviour).
        NOTE: that default is a target encoding of the very frame being
        transformed; for validation/test data pass the rates computed on the
        training split to avoid label leakage. Types missing from the mapping
        are encoded as NaN.

    Returns
    -------
    pandas.DataFrame
        All numeric input columns plus the engineered features and one-hot
        ``type_*`` columns, cast to float and clipped to [-1e10, 1e10].
        ``nameOrig``/``nameDest`` are removed. ``mean_time_between_tx`` and
        ``time_gap_dev`` are NaN for senders with a single transaction.
    """
    df = df.sort_values(["nameOrig", "step"]).reset_index(drop=True)

    # Only used to size the progress bar; keep in sync with the add() calls.
    feature_names = [
        "delta_sender", "delta_recipient", "abs_delta_sender", "abs_delta_recipient", "delta_net",
        "amount_over_old", "amount_over_new", "amount_over_total", "amount_over_net_balance",
        "type_encoded", "type_times_amount", "type_flag_amount",
        "high_amount_flag", "large_tx", "hour", "day", "is_weekend", "is_night", "sender_receiver_same",
        "amount_over_mean_sender", "sender_tx_count", "sender_total_amount", "sender_max_amount", "amount_over_sender_max",
        "time_since_last_tx", "time_until_next_tx", "mean_time_between_tx", "time_gap_dev",
        "recipient_tx_count", "recipient_total_received", "recipient_heavy",
        "balance_sender_error", "abs_balance_sender_error", "balance_recipient_error", "abs_balance_recipient_error", "balance_error_total",
        "is_cash", "is_transfer", "is_logical_inconsistent",
        "rolling_mean_3", "rolling_std_3", "rolling_max_3",
    ]

    # The context manager guarantees the bar is closed even if a feature raises.
    with tqdm(total=len(feature_names), desc="Calculating features", ncols=120) as pbar:

        def add(col_name, values):
            """Store one engineered column and advance the progress bar."""
            df[col_name] = values
            pbar.set_postfix_str(f"Current: {col_name}")
            pbar.update(1)

        # --- Balance movements -------------------------------------------
        add("delta_sender", df["oldbalanceOrg"] - df["newbalanceOrig"])
        add("delta_recipient", df["newbalanceDest"] - df["oldbalanceDest"])
        add("abs_delta_sender", df["delta_sender"].abs())
        add("abs_delta_recipient", df["delta_recipient"].abs())
        # Same value as (newOrig - oldOrg) + (newDest - oldDest).
        add("delta_net", df["delta_recipient"] - df["delta_sender"])

        # --- Amount relative to balances (+1 avoids division by zero) -----
        add("amount_over_old", df["amount"] / (df["oldbalanceOrg"] + 1))
        add("amount_over_new", df["amount"] / (df["newbalanceOrig"] + 1))
        add("amount_over_total", df["amount"] / (df["oldbalanceOrg"] + df["oldbalanceDest"] + 1))
        add("amount_over_net_balance", df["amount"] / (df["newbalanceOrig"] + df["newbalanceDest"] + 1))

        # --- Transaction type encoding -----------------------------------
        if type_fraud_rate is None:
            # Historical default: fraud rate per type measured on this frame.
            type_fraud_rate = df.groupby("type")["isFraud"].mean()
        add("type_encoded", df["type"].map(type_fraud_rate).astype(float))
        add("type_times_amount", df["type_encoded"] * df["amount"])
        add("type_flag_amount", df["type_encoded"] * df["isFlaggedFraud"] * df["amount"])

        add("high_amount_flag", df["isFlaggedFraud"] * df["amount"])
        add("large_tx", (df["amount"] > 0.5 * df["oldbalanceOrg"]).astype(int))

        # --- Calendar features derived from the step counter ---------------
        # `step` is treated as an hour index: 24 steps per day, 7 days per week.
        add("hour", df["step"] % 24)
        add("day", df["step"] // 24)
        add("is_weekend", (df["day"] % 7).isin([5, 6]).astype(int))
        add("is_night", ((df["hour"] >= 22) | (df["hour"] < 6)).astype(int))

        add("sender_receiver_same", (df["nameOrig"] == df["nameDest"]).astype(int))

        # --- Sender behaviour --------------------------------------------
        grp_sender = df.groupby("nameOrig")["amount"]
        sender_max = grp_sender.transform("max")
        add("amount_over_mean_sender", df["amount"] / (grp_sender.transform("mean") + 1))
        add("sender_tx_count", grp_sender.transform("count"))
        add("sender_total_amount", grp_sender.transform("sum"))
        add("sender_max_amount", sender_max)
        add("amount_over_sender_max", df["amount"] / (sender_max + 1))

        # Time gaps must be measured on `step`, not on `amount`.
        step_by_sender = df.groupby("nameOrig")["step"]
        gap_to_previous = step_by_sender.diff()
        add("time_since_last_tx", gap_to_previous.fillna(-1))
        add("time_until_next_tx", step_by_sender.diff(-1).abs().fillna(-1))
        add(
            "mean_time_between_tx",
            gap_to_previous.clip(lower=0).groupby(df["nameOrig"]).transform("mean"),
        )
        add("time_gap_dev", df["time_since_last_tx"] - df["mean_time_between_tx"])

        # --- Recipient behaviour -----------------------------------------
        grp_rec = df.groupby("nameDest")["amount"]
        add("recipient_tx_count", grp_rec.transform("count"))
        add("recipient_total_received", grp_rec.transform("sum"))
        add("recipient_heavy", (df["recipient_tx_count"] > df["recipient_tx_count"].median()).astype(int))

        # --- Balance consistency -----------------------------------------
        add("balance_sender_error", (df["oldbalanceOrg"] - df["amount"]) - df["newbalanceOrig"])
        add("abs_balance_sender_error", df["balance_sender_error"].abs())
        add("balance_recipient_error", (df["oldbalanceDest"] + df["amount"]) - df["newbalanceDest"])
        add("abs_balance_recipient_error", df["balance_recipient_error"].abs())
        add("balance_error_total", df["abs_balance_sender_error"] + df["abs_balance_recipient_error"])

        # --- Type flags and sanity checks --------------------------------
        add("is_cash", df["type"].isin(["CASH_OUT", "CASH_IN"]).astype(int))
        add("is_transfer", df["type"].isin(["TRANSFER"]).astype(int))
        has_amount = df["amount"] > 0
        inconsistent = (
            ((df["oldbalanceOrg"] == 0) & has_amount)
            | (df["newbalanceOrig"] < 0)
            | ((df["oldbalanceDest"] == 0) & has_amount)
            | (df["oldbalanceOrg"] - df["amount"] != df["newbalanceOrig"])
        )
        # Cast to int: a bool column would be discarded by the numeric
        # select_dtypes filter at the end of this function.
        add("is_logical_inconsistent", inconsistent.astype(int))

        # --- Rolling statistics over each sender's recent amounts ---------
        window = 3
        sender_rolling = grp_sender.rolling(window, min_periods=1)

        def per_row(rolled):
            """Drop the group level so the result aligns with df's row index."""
            return rolled.reset_index(level=0, drop=True)

        add("rolling_mean_3", per_row(sender_rolling.mean()))
        # std of a single observation is NaN; report 0 spread instead.
        add("rolling_std_3", per_row(sender_rolling.std()).fillna(0))
        add("rolling_max_3", per_row(sender_rolling.max()))

    # Identifiers are removed; `type` becomes one-hot columns. dtype=float keeps
    # the dummies numeric (pandas >= 2.0 would otherwise emit bool columns that
    # the numeric filter below throws away).
    df = df.drop(columns=["nameDest", "nameOrig"])
    df = pd.get_dummies(df, columns=["type"], drop_first=False, dtype=float)
    df = df.select_dtypes(include="number").astype(float)

    return df.clip(-1e10, 1e10)

#### 5.3. Apply Feature Engineering

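We apply the preprocessing function to the train, validation, and test datasets.
Each split is processed independently with the same function, so all three receive the same engineered features (the one-hot `type` columns match only if every transaction type occurs in each split) and are ready for modeling.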

In [4]:
# Run the feature engineering on each split in turn (train, then test, then
# validation). Every name is rebound to its engineered frame as soon as that
# split is done, so the raw version of a split is released before the next
# one is processed.
# NOTE(review): the raw frames are overwritten, so this cell cannot be re-run
# without first re-running the loading cell.
df_train = df_train.pipe(preprocess_transactions)
df_test = df_test.pipe(preprocess_transactions)
df_val = df_val.pipe(preprocess_transactions)

Calculating features: 100%|█████████████████████████████████████| 42/42 [00:52<00:00,  1.25s/it, Current: rolling_max_3]
Calculating features: 100%|█████████████████████████████████████| 42/42 [04:41<00:00,  6.71s/it, Current: rolling_max_3]
Calculating features: 100%|█████████████████████████████████████| 42/42 [06:22<00:00,  9.11s/it, Current: rolling_max_3]


#### 5.4. Save Preprocessed Datasets

The newly engineered datasets are saved as Parquet files for future use:
- Preserves all calculated features
- Ensures reproducibility
- Ready for model training and evaluation

In [5]:
# Make sure the stage-5 output folder exists, then persist each engineered
# split (train, validation, test) as Parquet without writing the row index.
os.makedirs("./data/5/", exist_ok=True)

for engineered_frame, parquet_target in (
    (df_train, "./data/5/df_train.parquet"),
    (df_val, "./data/5/df_val.parquet"),
    (df_test, "./data/5/df_test.parquet"),
):
    engineered_frame.to_parquet(parquet_target, index=False)