In [1]:
import os
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# Load the "train" split of the dataset as a DataFrame and let pandas infer
# dtypes via convert_dtypes().
data = (
    load_dataset("CiferAI/Cifer-Fraud-Detection-Dataset-AF")["train"]
    .to_pandas()
    .convert_dtypes()
)

# Separate the target column from the feature columns.
y = data["isFraud"]
X = data.drop(columns="isFraud")

# Hold out 0.1% of the rows (stratified on the label) ...
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.001,
    stratify=y,
    random_state=42,
)
# ... and divide that hold-out evenly into validation and test parts.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

# Random under-sampling is applied to the training portion only; the
# validation and test parts keep the class ratio produced by the split.
undersampler = RandomUnderSampler(sampling_strategy="auto", random_state=42)
X_train, y_train = undersampler.fit_resample(X_train, y_train)

# Put the label back next to the features: the preprocessing step reads
# "isFraud" from the same frame as the other columns.
for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test)):
    features["isFraud"] = labels

In [3]:
def preprocess_transactions(df):
    """Turn raw transaction rows into an all-float feature table.

    The input frame is not modified; a sorted copy is built and returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``step``, ``type``, ``amount``,
        ``nameOrig``, ``oldbalanceOrg``, ``newbalanceOrig``, ``nameDest``,
        ``oldbalanceDest``, ``newbalanceDest``, ``isFraud`` and
        ``isFlaggedFraud``.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by (``nameOrig``, ``step``) with a fresh index. Only
        numeric columns are kept (``nameOrig``/``nameDest`` are dropped and
        ``type`` is one-hot encoded), cast to float and clipped to
        ``[-1e10, 1e10]``. ``isFraud`` is kept among the columns.

    Notes
    -----
    * Every aggregate (per-sender / per-recipient statistics, the median
      used for ``recipient_heavy``, the mean ``isFraud`` per ``type``) is
      computed from the frame passed in, not from a reference frame.
    * NOTE(review): ``type_encoded`` is derived from ``isFraud`` of the same
      frame, so calling this on validation/test data feeds their labels
      into a feature. Confirm whether that is acceptable downstream.
    * ``mean_time_between_tx`` (and therefore ``time_gap_dev``) is NaN for a
      sender with a single transaction in the frame.
    """
    df = df.sort_values(["nameOrig", "step"]).reset_index(drop=True)

    # Group-by handles reused by several features below. They only depend on
    # columns that are never modified in this function.
    sender_amount = df.groupby("nameOrig")["amount"]
    sender_step = df.groupby("nameOrig")["step"]
    recipient_amount = df.groupby("nameDest")["amount"]

    # --- balance movements and amount ratios (+1 avoids division by zero) ---
    df["delta_sender"] = df["oldbalanceOrg"] - df["newbalanceOrig"]
    df["delta_recipient"] = df["newbalanceDest"] - df["oldbalanceDest"]
    df["abs_delta_sender"] = df["delta_sender"].abs()
    df["abs_delta_recipient"] = df["delta_recipient"].abs()
    df["amount_over_old"] = df["amount"] / (df["oldbalanceOrg"] + 1)
    df["amount_over_new"] = df["amount"] / (df["newbalanceOrig"] + 1)
    df["amount_over_total"] = df["amount"] / (df["oldbalanceOrg"] + df["oldbalanceDest"] + 1)
    df["delta_net"] = (df["newbalanceOrig"] - df["oldbalanceOrg"]) + (df["newbalanceDest"] - df["oldbalanceDest"])

    # --- transaction type: mean isFraud per type within this frame ---
    df["type_encoded"] = df["type"].map(df.groupby("type")["isFraud"].mean())
    df["type_times_amount"] = df["type_encoded"] * df["amount"]
    df["high_amount_flag"] = df["isFlaggedFraud"] * df["amount"]
    df["large_tx"] = (df["amount"] > 0.5 * df["oldbalanceOrg"]).astype(int)

    # --- calendar features derived from step (treated as an hour counter) ---
    df["hour"] = df["step"] % 24
    df["day"] = df["step"] // 24
    df["is_weekend"] = (df["day"] % 7).isin([5, 6]).astype(int)
    # The night window wraps past midnight, so it cannot be expressed with
    # Series.between(22, 6): that tests 22 <= hour <= 6 and is never true.
    df["is_night"] = ((df["hour"] >= 22) | (df["hour"] <= 6)).astype(int)

    # --- sender / recipient aggregates ---
    df["sender_receiver_same"] = (df["nameOrig"] == df["nameDest"]).astype(int)
    df["amount_over_mean_sender"] = df["amount"] / (sender_amount.transform("mean") + 1)
    df["type_flag_amount"] = df["type_encoded"] * df["isFlaggedFraud"] * df["amount"]
    df["delta_sender_over_recipient"] = df["delta_sender"] / (df["delta_recipient"] + 1)
    df["sender_tx_count"] = sender_amount.transform("count")
    df["sender_total_amount"] = sender_amount.transform("sum")
    df["sender_max_amount"] = sender_amount.transform("max")
    df["amount_over_sender_max"] = df["amount"] / (df["sender_max_amount"] + 1)
    df["recipient_tx_count"] = recipient_amount.transform("count")
    df["recipient_total_received"] = recipient_amount.transform("sum")
    df["recipient_heavy"] = (df["recipient_tx_count"] > df["recipient_tx_count"].median()).astype(int)

    # --- bookkeeping errors: expected balance after the transfer vs. recorded ---
    df["balance_sender_error"] = (df["oldbalanceOrg"] - df["amount"]) - df["newbalanceOrig"]
    df["abs_balance_sender_error"] = df["balance_sender_error"].abs()
    df["balance_recipient_error"] = (df["oldbalanceDest"] + df["amount"]) - df["newbalanceDest"]
    df["abs_balance_recipient_error"] = df["balance_recipient_error"].abs()
    df["balance_error_total"] = df["abs_balance_sender_error"] + df["abs_balance_recipient_error"]
    df["amount_over_net_balance"] = df["amount"] / (df["newbalanceOrig"] + df["newbalanceDest"] + 1)
    df["is_cash"] = df["type"].isin(["CASH_OUT", "CASH_IN"]).astype(int)
    df["is_transfer"] = df["type"].isin(["TRANSFER"]).astype(int)

    # --- timing between consecutive transactions of the same sender ---
    # -1 marks "no previous / no next transaction in this frame".
    df["time_since_last_tx"] = sender_step.diff().fillna(-1)
    df["time_until_next_tx"] = sender_step.diff(-1).abs().fillna(-1)
    # The -1 sentinels are excluded from the mean gap.
    df["mean_time_between_tx"] = df.groupby("nameOrig")["time_since_last_tx"].transform(lambda x: x[x >= 0].mean())
    df["time_gap_dev"] = df["time_since_last_tx"] - df["mean_time_between_tx"]

    # Cast to int like the other flags: a boolean column would be removed by
    # the numeric-only selection at the end and the feature would be lost.
    df["is_logical_inconsistent"] = (
        ((df["oldbalanceOrg"] == 0) & (df["amount"] > 0))
        | (df["newbalanceOrig"] < 0)
        | ((df["oldbalanceDest"] == 0) & (df["amount"] > 0))
        | (df["oldbalanceOrg"] - df["amount"] != df["newbalanceOrig"])
    ).astype(int)

    # --- rolling statistics over each sender's last `window` amounts ---
    window = 3
    df["rolling_mean_3"] = sender_amount.transform(lambda x: x.rolling(window, min_periods=1).mean())
    # std of a single observation is NaN; report it as 0 instead.
    df["rolling_std_3"] = sender_amount.transform(lambda x: x.rolling(window, min_periods=1).std().fillna(0))
    df["rolling_max_3"] = sender_amount.transform(lambda x: x.rolling(window, min_periods=1).max())

    df = df.drop(columns=["nameDest", "nameOrig"])
    # dtype=int keeps the one-hot columns numeric; get_dummies may otherwise
    # produce boolean columns (the default in recent pandas), which the
    # numeric-only selection below would discard.
    df = pd.get_dummies(df, columns=["type"], drop_first=False, dtype=int)
    df = df.select_dtypes(include=["number"]).astype(float)
    return df.clip(-1e10, 1e10)

# Each split is preprocessed on its own, in train -> val -> test order, so
# every per-frame aggregate is computed from that split's rows only.
df_train, df_val, df_test = (
    preprocess_transactions(split) for split in (X_train, X_val, X_test)
)

In [4]:
# Make sure the output directory exists, then write one parquet file per
# split (the DataFrame index is not stored).
os.makedirs("data", exist_ok=True)

for frame, destination in (
    (df_train, "./data/df_train_preprocessed.parquet"),
    (df_val, "./data/df_val_preprocessed.parquet"),
    (df_test, "./data/df_test_preprocessed.parquet"),
):
    frame.to_parquet(destination, index=False)