# Fraud Detection ML Pipeline
Target: `is_fraud`

In [None]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    auc,
    average_precision_score,
    classification_report,
    precision_recall_curve,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from xgboost import XGBClassifier

In [None]:
# Load data
# Read the labelled training set and separate the `is_fraud` target from the
# predictor columns.
df = pd.read_csv("train.csv")

y = df["is_fraud"]
X = df.drop(columns=["is_fraud"])

# Group the predictor columns by dtype so each group can be preprocessed
# differently. "number" matches every numeric width (int32, float32, ...)
# rather than only int64/float64, and bool columns are routed to the
# categorical group. A column that ends up in neither list is silently
# discarded by a ColumnTransformer built from these lists, because unlisted
# columns are dropped by default.
num_features = X.select_dtypes(include="number").columns
cat_features = X.select_dtypes(include=["object", "category", "bool"]).columns

In [None]:
# Train-test split
# Hold out 20% of the rows for evaluation. Stratifying on the target keeps the
# class proportions approximately equal in both partitions, and the fixed seed
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Preprocessing
# Column-wise feature preparation:
#   "num" - standardise the numeric columns with StandardScaler
#   "cat" - one-hot encode the categorical columns; handle_unknown="ignore"
#           encodes a category unseen during fit as all zeros instead of
#           raising at transform time
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), num_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
    ]
)

In [None]:
# Logistic Regression baseline
# Linear baseline: the preprocessing step followed by a logistic regression.
# class_weight="balanced" reweights samples inversely to class frequency so
# that the less frequent class is not swamped by the more frequent one.
# n_jobs is deliberately not passed: it only parallelises across classes, so
# it does nothing for a two-class target.
# NOTE(review): newer scikit-learn releases appear to deprecate the argument;
# confirm against the installed version.
log_reg = Pipeline(steps=[
    ("prep", preprocessor),
    ("model", LogisticRegression(
        max_iter=1000,
        class_weight="balanced",
    ))
])

log_reg.fit(X_train, y_train)

# Column 1 of predict_proba holds the probability of class 1.
y_prob_lr = log_reg.predict_proba(X_test)[:, 1]
print("Logistic Regression ROC-AUC:", roc_auc_score(y_test, y_prob_lr))
# The report is based on the model's own hard predictions.
print(classification_report(y_test, log_reg.predict(X_test)))

In [None]:
# Class imbalance ratio
# Count the training labels per class (index 0 = negatives, index 1 =
# positives) and express the imbalance as negatives per positive.
label_counts = np.bincount(y_train)
neg, pos = label_counts
scale_pos_weight = neg / pos
# Bare expression so the notebook cell displays the ratio.
scale_pos_weight

In [None]:
# XGBoost model
# Gradient-boosted trees on the same preprocessing recipe.
# clone() gives this pipeline its own unfitted copy of the preprocessor.
# Without caching, Pipeline fits its steps in place, so passing the shared
# `preprocessor` object directly would refit the very instance held by the
# logistic-regression pipeline whenever this pipeline is fitted.
xgb = Pipeline(steps=[
    ("prep", clone(preprocessor)),
    ("model", XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        # Up-weight the positive class by the negatives-per-positive ratio.
        scale_pos_weight=scale_pos_weight,
        eval_metric="logloss",
        random_state=42
    ))
])

xgb.fit(X_train, y_train)

# Column 1 of predict_proba holds the probability of class 1.
y_prob_xgb = xgb.predict_proba(X_test)[:, 1]
print("XGBoost ROC-AUC:", roc_auc_score(y_test, y_prob_xgb))
# The report is based on the model's own hard predictions.
print(classification_report(y_test, xgb.predict(X_test)))

In [None]:
# PR-AUC
# Precision-recall summary of the XGBoost scores on the held-out split.
precision, recall, _ = precision_recall_curve(y_test, y_prob_xgb)
# Trapezoidal area under the PR curve. It interpolates linearly between
# operating points, which can make the figure optimistic.
print("PR-AUC:", auc(recall, precision))
# Average precision summarises the same curve as a step function without
# interpolation, so it is reported alongside the trapezoidal value.
print("Average precision:", average_precision_score(y_test, y_prob_xgb))

In [None]:
# Threshold tuning
# Re-label the XGBoost probabilities with a custom cut-off: a row is flagged
# as class 1 when its predicted probability is at least `threshold`.
# NOTE(review): the cut-off is evaluated on the test split here; if it is
# ever tuned rather than fixed, tune it on a separate validation split.
threshold = 0.25
y_pred_custom = np.greater_equal(y_prob_xgb, threshold).astype(int)
print("XGBoost @ threshold =", threshold)
print(classification_report(y_test, y_pred_custom))