In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix

Read the datset

In [None]:
# Read the dataset
df = pd.read_csv("../data/fraud_data.csv")


In [3]:
count_class0, count_class1 = df.isFraud.value_counts()
print(f"Class 0 count = {count_class0}, class 1 count = {count_class1}") 

Class 0 count = 6354407, class 1 count = 8213


Fraud Transactions are very less as compared to legit transactions

Creating separate dataframes for transactions with class in "isFraud" column = 0 and 1. Then over sampling from the minority class dataframe(in this case, the dataframe with class = 1) to make the class balanced.

In [4]:
df_class0 = df[df["isFraud"]==0]
df_class1 = df[df["isFraud"]==1]
df_class1_sampled = df_class1.sample(count_class0, replace = True)
print(df_class1_sampled.shape)

(6354407, 11)


Concatenating the new sampled dataframe with class = 1 and dataframe with class = 0

In [5]:
df_data_sampled = pd.concat([df_class0,df_class1_sampled], axis = 0)
print(df_data_sampled.isFraud.value_counts())

isFraud
0    6354407
1    6354407
Name: count, dtype: int64


now our synthetic data is ready for splitting into test and train set

In [6]:
# Define features and target
X = df_data_sampled.drop("isFraud", axis=1)
y = df_data_sampled["isFraud"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

Defining the column types

In [7]:
# Column definitions
numeric = ["amount", "oldbalanceOrg", "newbalanceOrig", "newbalanceDest", "oldbalanceDest"]
categorical = ["type"]

Preprocessing data for scaling and handling the categorical columns

In [8]:
# Preprocessing
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric),
    ("cat", OneHotEncoder(drop="first"), categorical)
])

Defining the classifier and making pipeline

In [9]:
# LightGBM Classifier
lgbm_clf = LGBMClassifier(
    random_state=42,
    class_weight=None,  
    n_jobs=-1
)

# Pipeline
pipeline = ImbPipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", lgbm_clf)
])

Training the model

In [10]:
# Train
pipeline.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 5083526, number of negative: 5083525
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063425 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1283
[LightGBM] [Info] Number of data points in the train set: 10167051, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


Evaluating the mmodel based on the predictions made on test set

In [11]:
# Predict
y_pred = pipeline.predict(X_test)

# Evaluate
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Confusion Matrix:
 [[1265996    4886]
 [      0 1270881]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270882
           1       1.00      1.00      1.00   1270881

    accuracy                           1.00   2541763
   macro avg       1.00      1.00      1.00   2541763
weighted avg       1.00      1.00      1.00   2541763



**Overfitting when we have balanced repeated classes**

In [None]:
import joblib

# Save the model
joblib.dump(pipeline, '../models/LightGBM_Oversampling_minority.pkl')

joblib.dump((X_test, y_test), '../test_data/LightGBM_Oversampling_minority_Test.pkl')

['D:\\IEOR\\3rd Sem\\Fraud detection\\Test\\LightGBM_Oversampling_minority_Test.pkl']