In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [26]:
# Read creditcard.csv into a DataFrame; every later cell works from `df`.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook will not run on another machine until it is made relative/configurable.
csv_path = r'C:\Users\abi3c\Desktop\creditcard.csv'
df = pd.read_csv(csv_path)

In [27]:
# Hold out 20% of the rows as a test set. 'Class' is the target column; all
# remaining columns are used as features. The split is seeded (random_state=42)
# but not stratified on 'Class'.
features = df.drop('Class', axis=1)
target = df['Class']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [28]:
# Train a Gradient Boosting Machine (GBM) model
# Hyper-parameters applied to the XGBoost classifier via set_params().
param = {
    'objective': 'binary:logistic',
    'learning_rate': 0.19,
    'n_estimators': 60,
    'max_depth': 6,
    'min_child_weight': 3.52,
    'gamma': 9.75,
    'subsample': 0.45,
    'colsample_bytree': 0.64,
    'seed': 24,
}

# NOTE(review): the constructor receives random_state=123 while `param` also
# sets 'seed': 24. Presumably both control the same RNG seed; confirm which
# one is intended and drop the other.
XGBC = XGBClassifier(n_jobs=4, random_state=123).set_params(**param)
model = XGBC.fit(X_train, y_train)
preds = model.predict(X_test)

# confusion_matrix expects (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. The arguments were previously swapped, which
# displayed the transposed matrix.
confusion_matrix(y_test, preds)

array([[56862,     2],
       [   22,    76]], dtype=int64)

In [29]:
# Make predictions on the test set
# Uses the XGBC classifier fitted in the training cell; the resulting
# `y_pred` is what the evaluation cell compares against `y_test`.
y_pred = XGBC.predict(X_test)

In [30]:
# Evaluate the performance of the model: compute each metric from the true
# labels and the predictions first, then print them in a fixed order.
# NOTE(review): roc_auc_score is given the hard class predictions (y_pred),
# not probability scores, so the value reflects a single operating point.
# Consider passing XGBC.predict_proba(X_test)[:, 1] instead.
conf_mat = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_pred)

print('Confusion Matrix:\n', conf_mat)
print('Classification Report:\n', report)
print('ROC AUC Score:', auc_score)

Confusion Matrix:
 [[56862     2]
 [   22    76]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.97      0.78      0.86        98

    accuracy                           1.00     56962
   macro avg       0.99      0.89      0.93     56962
weighted avg       1.00      1.00      1.00     56962

ROC AUC Score: 0.8877375162220206


## Let's try a balanced dataset

In [9]:
# Partition the rows by label: Class == 0 goes to `non_fraud`,
# Class == 1 goes to `fraud`.
is_legit = df['Class'].eq(0)
is_fraud = df['Class'].eq(1)
non_fraud = df.loc[is_legit]
fraud = df.loc[is_fraud]

In [10]:
# Undersample the non-fraud rows down to 508 so the two classes are of
# comparable size. random_state pins the draw: without it every run picks a
# different subset, so the balanced dataset and all metrics computed from it
# would change between runs.
legit = non_fraud.sample(n=508, random_state=42)

In [11]:
# Stack the sampled non-fraud rows and all fraud rows row-wise into one frame.
balanced_parts = [legit, fraud]
ndf = pd.concat(balanced_parts, axis=0)

In [15]:
# Separate the balanced frame into the feature matrix (everything except the
# label column) and the label vector.
target_col = 'Class'
X = ndf.drop(columns=[target_col])
y = ndf[target_col]

In [16]:
# 80/20 train/test split of the balanced data, stratified on the label so
# both splits keep the same class proportions; seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

In [22]:
# Train a Gradient Boosting Machine (GBM) model
# Hyper-parameters applied to the XGBoost classifier via set_params().
param = {
    'objective': 'binary:logistic',
    'learning_rate': 0.19,
    'n_estimators': 60,
    'max_depth': 6,
    'min_child_weight': 3.52,
    'gamma': 9.75,
    'subsample': 0.45,
    'colsample_bytree': 0.64,
    'seed': 24,
}

# NOTE(review): the constructor receives random_state=123 while `param` also
# sets 'seed': 24. Presumably both control the same RNG seed; confirm which
# one is intended and drop the other.
XGBC = XGBClassifier(n_jobs=4, random_state=123).set_params(**param)
model = XGBC.fit(X_train, y_train)
preds = model.predict(X_test)

# confusion_matrix expects (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. The arguments were previously swapped, which
# displayed the transposed matrix.
confusion_matrix(y_test, preds)

array([[101,   1],
       [ 15,  83]], dtype=int64)

In [23]:
# Make predictions on the test set
# Uses the most recently fitted XGBC classifier; the resulting `y_pred` is
# what the evaluation cell compares against the true test labels.
y_pred = XGBC.predict(X_test)

In [24]:
# Evaluate the performance of the model on the balanced test split.
# The true labels live in `y_test` (lower-case), as produced by
# train_test_split; the previous `Y_test` is never defined and raises
# NameError on a fresh kernel.
# NOTE(review): roc_auc_score is given the hard class predictions (y_pred),
# not probability scores, so the value reflects a single operating point.
# Consider passing XGBC.predict_proba(X_test)[:, 1] instead.
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))
print('Classification Report:\n', classification_report(y_test, y_pred))
print('ROC AUC Score:', roc_auc_score(y_test, y_pred))

Confusion Matrix:
 [[101   1]
 [ 15  83]]
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.99      0.93       102
           1       0.99      0.85      0.91        98

    accuracy                           0.92       200
   macro avg       0.93      0.92      0.92       200
weighted avg       0.93      0.92      0.92       200

ROC AUC Score: 0.9185674269707885
