In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

Read the datset

In [None]:
df = pd.read_csv("../data/fraud_data.csv")

In [3]:
count_class0, count_class1 = df.isFraud.value_counts()
print(f"Class 0 count = {count_class0}, class 1 count = {count_class1}") 

Class 0 count = 6354407, class 1 count = 8213


Fraud Transactions are very less as compared to legit transactions

Creating separate dataframes for transactions with class in "isFraud" column = 0 and 1. Then over sampling from the minority class dataframe(in this case, the dataframe with class = 1) to make the class balanced.

In [4]:
df_class0 = df[df["isFraud"]==0]
df_class1 = df[df["isFraud"]==1]
df_class1_sampled = df_class1.sample(count_class0, replace = True)
print(df_class1_sampled.shape)

(6354407, 11)


Concatenating the new sampled dataframe with class = 1 and dataframe with class = 0

In [5]:
df_data_sampled = pd.concat([df_class0,df_class1_sampled], axis = 0)
print(df_data_sampled.isFraud.value_counts())

isFraud
0    6354407
1    6354407
Name: count, dtype: int64


now our synthetic data is ready for splitting into test and train set

In [6]:
# Define features and target
X = df_data_sampled.drop("isFraud", axis=1)
y = df_data_sampled["isFraud"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

Defining the column types

In [7]:
# Column definitions
numeric = ["amount", "oldbalanceOrg", "newbalanceOrig", "newbalanceDest", "oldbalanceDest"]
categorical = ["type"]

Preprocessing data for scaling and handling the categorical columns

In [8]:
# Preprocessing
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric),
    ("cat", OneHotEncoder(drop="first"), categorical)
])

Defining the classifier and making pipeline

In [9]:
#XGBoost classifier
xgb_clf = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

# Pipeline
pipeline = ImbPipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", xgb_clf)
])

Training the model

In [10]:
# Train
pipeline.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Evaluating the mmodel based on the predictions made on test set

In [11]:
# Predict
y_pred = pipeline.predict(X_test)

# Evaluate
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Confusion Matrix:
 [[1263406    7476]
 [  10624 1260257]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99   1270882
           1       0.99      0.99      0.99   1270881

    accuracy                           0.99   2541763
   macro avg       0.99      0.99      0.99   2541763
weighted avg       0.99      0.99      0.99   2541763



**Overfitting when we have balanced repeated classes**

In [12]:
import numpy as np
from sklearn.metrics import precision_recall_curve, f1_score

# Predict probabilities for the positive class
y_proba = pipeline.predict_proba(X_test)[:, 1]

# Find precision, recall, thresholds
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)

# Compute F1 for each threshold
f1 = 2 * (precision * recall) / (precision + recall + 1e-10)

# Best threshold
best_idx = np.argmax(f1)
best_threshold = thresholds[best_idx]
print(f"Best Threshold: {best_threshold:.4f}, F1 Score: {f1[best_idx]:.4f}")

# Apply best threshold
y_pred_opt = (y_proba >= best_threshold).astype(int)

# Evaluate
from sklearn.metrics import classification_report, confusion_matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_opt))
print("\nClassification Report:\n", classification_report(y_test, y_pred_opt))


Best Threshold: 0.5990, F1 Score: 0.9930
Confusion Matrix:
 [[1263979    6903]
 [  10772 1260109]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99   1270882
           1       0.99      0.99      0.99   1270881

    accuracy                           0.99   2541763
   macro avg       0.99      0.99      0.99   2541763
weighted avg       0.99      0.99      0.99   2541763



In [None]:
import joblib

# Save the model
joblib.dump(pipeline, '../models/XGBoost_Oversampling_minority.pkl')

joblib.dump((X_test, y_test), '../test_data/XGBoost_Oversampling_minority_Test.pkl')

['D:\\IEOR\\3rd Sem\\Fraud detection\\Test\\XGBoost_Oversampling_minority_Test.pkl']