In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
%matplotlib inline

In [2]:
train = pd.read_csv('training_data')
X_train = train.drop('output', axis = 1)
y_train = train['output']
test = pd.read_csv('testing_data')
X_test = test.drop('output', axis = 1)
y_test = test['output']

In [4]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
undersample = RandomUnderSampler(sampling_strategy='majority')
X_under_train, y_under_train = undersample.fit_resample(X_train, y_train)
print("Undersampled class distribution:", Counter(y_under_train))

Undersampled class distribution: Counter({0: 7506, 1: 7506})


In [7]:
import xgboost as xgb

In [17]:
param = {'objective': 'binary:logistic',
         'tree_method' : 'hist',
        }

In [22]:
model = xgb.XGBClassifier(n_estimators=200, objective='binary:logistic', tree_method='hist', eta=0.1, max_depth=4)

In [24]:
model.fit(X_train, y_train)

In [26]:
pred = model.predict(X_test)

In [30]:
from sklearn.metrics import confusion_matrix, classification_report

In [32]:
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

[[553432    142]
 [   596   1549]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.92      0.72      0.81      2145

    accuracy                           1.00    555719
   macro avg       0.96      0.86      0.90    555719
weighted avg       1.00      1.00      1.00    555719



In [34]:
from sklearn.metrics import roc_auc_score

In [38]:
print(roc_auc_score(y_test,pred))

0.8609440036035215


In [42]:
X_under_test, y_under_test = undersample.fit_resample(X_test, y_test)
print("Undersampled class distribution:", Counter(y_under_test))
prediction_model = model.predict(X_under_test)
print(confusion_matrix(y_under_test,prediction_model))
print(classification_report(y_under_test,prediction_model))
print(roc_auc_score(y_under_test,prediction_model))

Undersampled class distribution: Counter({0: 2145, 1: 2145})
[[2143    2]
 [ 596 1549]]
              precision    recall  f1-score   support

           0       0.78      1.00      0.88      2145
           1       1.00      0.72      0.84      2145

    accuracy                           0.86      4290
   macro avg       0.89      0.86      0.86      4290
weighted avg       0.89      0.86      0.86      4290

0.8606060606060607


### Training the model on balanced data

In [45]:
model1 = xgb.XGBClassifier(n_estimators=200, objective='binary:logistic', tree_method='hist', eta=0.1, max_depth=4)

In [47]:
model1.fit(X_under_train, y_under_train)
pred1 = model1.predict(X_test)

In [49]:
print(confusion_matrix(y_test,pred1))
print(classification_report(y_test,pred1))
print(roc_auc_score(y_test,pred1))

[[543070  10504]
 [    83   2062]]
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    553574
           1       0.16      0.96      0.28      2145

    accuracy                           0.98    555719
   macro avg       0.58      0.97      0.64    555719
weighted avg       1.00      0.98      0.99    555719

0.971165240852401


### Using Label Encoded data instead

In [73]:
train1 = pd.read_csv('training_data_label_encoded')
X_train1 = train1.drop('is_fraud', axis = 1)
y_train1= train1['is_fraud']
test1 = pd.read_csv('testing_data_label_encoded')
X_test1 = test1.drop('is_fraud', axis = 1)
y_test1 = test1['is_fraud']

In [74]:
model3 = xgb.XGBClassifier(n_estimators=200, objective='binary:logistic', tree_method='hist', eta=0.1, max_depth=4)

In [77]:
model3.fit(X_train1, y_train1)
pred3 = model3.predict(X_test1)

In [79]:
print(confusion_matrix(y_test1,pred3))
print(classification_report(y_test1,pred3))
print(roc_auc_score(y_test1,pred3))

[[553451    123]
 [   597   1548]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.93      0.72      0.81      2145

    accuracy                           1.00    555719
   macro avg       0.96      0.86      0.91    555719
weighted avg       1.00      1.00      1.00    555719

0.8607280645810274


### Both undersampling and Label-Encoding

In [91]:
undersample = RandomUnderSampler(sampling_strategy='majority')
X_under_train1, y_under_train1 = undersample.fit_resample(X_train1, y_train1)
print("Undersampled class distribution:", Counter(y_under_train1))

Undersampled class distribution: Counter({0: 7506, 1: 7506})


In [83]:
model2 = xgb.XGBClassifier(n_estimators=200, objective='binary:logistic', tree_method='hist', eta=0.1, max_depth=4)

In [85]:
model2.fit(X_under_train1, y_under_train1)
pred2 = model2.predict(X_test1)

In [89]:
print(confusion_matrix(y_test1,pred2))
print(classification_report(y_test1,pred2))
print(roc_auc_score(y_test1,pred2))

[[542174  11400]
 [    96   2049]]
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    553574
           1       0.15      0.96      0.26      2145

    accuracy                           0.98    555719
   macro avg       0.58      0.97      0.63    555719
weighted avg       1.00      0.98      0.99    555719

0.9673256512587839


In [10]:
from sklearn import preprocessing
import numpy as np
X_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],
                    [ 0.,  1., -1.]])
scaler = preprocessing.StandardScaler().fit(X_train)
scaler.mean_


array([1.        , 0.        , 0.33333333])

In [12]:
scaler.scale_

array([0.81649658, 0.81649658, 1.24721913])

In [14]:
X_train

array([[ 1., -1.,  2.],
       [ 2.,  0.,  0.],
       [ 0.,  1., -1.]])

In [16]:
X_scaled = scaler.transform(X_train)
X_scaled

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])