In [50]:
import pandas as 
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import (
    StratifiedKFold,
    GridSearchCV,
    cross_val_predict
)

from sklearn.metrics import make_scorer, average_precision_score

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

import joblib
import os

pd.set_option("display.float_format", lambda x: f"{x:,.4f}")


In [51]:
data_dir = "C://Users//aditi//OneDrive//Desktop//fraud_detection//processed"

X_train = pd.read_csv(f"{data_dir}/X_train.csv")
X_test  = pd.read_csv(f"{data_dir}/X_test.csv")

y_train = pd.read_csv(f"{data_dir}/y_train.csv").values.ravel()
y_test  = pd.read_csv(f"{data_dir}/y_test.csv").values.ravel()

X_train.shape, X_test.shape


((400, 17), (100, 17))

In [52]:
pd.Series(y_train).value_counts()

0    378
1     22
Name: count, dtype: int64

In [53]:

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

pr_auc_scorer = make_scorer(
    average_precision_score,
    needs_proba=True
)
print()





## Logistic Regression

In [54]:
lr_pipeline = Pipeline(steps=[
    ("smote", SMOTE(random_state=42, k_neighbors=3)),
    ("model", LogisticRegression(
        max_iter=1000,
        solver="lbfgs"
    ))
])

lr_param_grid = {
    "model__C": [0.01, 0.1, 1.0, 10.0]
}

lr_grid = GridSearchCV(
    lr_pipeline,
    lr_param_grid,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1
)

lr_grid.fit(X_train, y_train)
lr_best = lr_grid.best_estimator_


In [55]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import average_precision_score

lr_cv_probs = cross_val_predict(
    lr_best,
    X_train,
    y_train,
    cv=cv,
    method="predict_proba",
    n_jobs=-1
)[:, 1]

lr_pr_auc = average_precision_score(y_train, lr_cv_probs)
lr_pr_auc


0.10570110934583325

## Random Forest

In [56]:
rf_pipeline = Pipeline(steps=[
    ("smote", SMOTE(random_state=42, k_neighbors=3)),
    ("model", RandomForestClassifier(
        random_state=42,
        n_jobs=-1
    ))
])

rf_param_grid = {
    "model__n_estimators": [200],
    "model__max_depth": [None, 10, 20],
    "model__min_samples_split": [2, 5]
}

rf_grid = GridSearchCV(
    rf_pipeline,
    rf_param_grid,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1
)

rf_grid.fit(X_train, y_train)
rf_best = rf_grid.best_estimator_


In [57]:
rf_cv_probs = cross_val_predict(
    rf_best,
    X_train,
    y_train,
    cv=cv,
    method="predict_proba",
    n_jobs=-1
)[:, 1]

rf_pr_auc = average_precision_score(y_train, rf_cv_probs)
rf_pr_auc


0.05013805144403976

## XGBoost

In [58]:
xgb_pipeline = Pipeline(steps=[
    ("smote", SMOTE(random_state=42, k_neighbors=3)),
    ("model", XGBClassifier(
        objective="binary:logistic",
        eval_metric="auc",
        random_state=42,
        n_jobs=-1
    ))
])

xgb_param_grid = {
    "model__n_estimators": [300],
    "model__max_depth": [4, 6],
    "model__learning_rate": [0.05, 0.1],
    "model__subsample": [0.8],
    "model__colsample_bytree": [0.8]
}

xgb_grid = GridSearchCV(
    xgb_pipeline,
    xgb_param_grid,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1
)

xgb_grid.fit(X_train, y_train)
xgb_best = xgb_grid.best_estimator_


In [59]:
xgb_cv_probs = cross_val_predict(
    xgb_best,
    X_train,
    y_train,
    cv=cv,
    method="predict_proba",
    n_jobs=-1
)[:, 1]

xgb_pr_auc = average_precision_score(y_train, xgb_cv_probs)
xgb_pr_auc


0.05315191192951221

In [60]:
def extract_metrics(grid, model_name):
    idx = grid.best_index_
    return {
        "Model": model_name,
        "CV ROC-AUC Mean": grid.cv_results_["mean_test_score"][idx],
        "CV ROC-AUC Std": grid.cv_results_["std_test_score"][idx],
    }


In [61]:
def compute_pr_auc(model, X, y, cv):
    probs = cross_val_predict(
        model,
        X,
        y,
        cv=cv,
        method="predict_proba",
        n_jobs=-1
    )[:, 1]
    return average_precision_score(y, probs)

In [62]:
cv_results = pd.DataFrame([
    {
        **extract_metrics(lr_grid, "Logistic Regression"),
        "CV PR-AUC": compute_pr_auc(lr_best, X_train, y_train, cv)
    },
    {
        **extract_metrics(rf_grid, "Random Forest"),
        "CV PR-AUC": compute_pr_auc(rf_best, X_train, y_train, cv)
    },
    {
        **extract_metrics(xgb_grid, "XGBoost"),
        "CV PR-AUC": compute_pr_auc(xgb_best, X_train, y_train, cv)
    }
])

cv_results


Unnamed: 0,Model,CV ROC-AUC Mean,CV ROC-AUC Std,CV PR-AUC
0,Logistic Regression,0.3115,0.0488,0.1057
1,Random Forest,0.4053,0.0765,0.0501
2,XGBoost,0.4561,0.0716,0.0532


In [63]:
model_dir = "C:/Users/aditi/OneDrive/Desktop/fraud_detection/models"
os.makedirs(model_dir, exist_ok=True)

joblib.dump(lr_best, f"{model_dir}/logistic_model.pkl")
joblib.dump(rf_best, f"{model_dir}/random_forest_model.pkl")
joblib.dump(xgb_best, f"{model_dir}/xgboost_fraud_model.pkl")


['C:/Users/aditi/OneDrive/Desktop/fraud_detection/models/xgboost_fraud_model.pkl']