In [None]:
# Import package
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Directory and file name of the PaySim CSV, relative to this notebook
path = '../Data/'
file = 'standardized_paysim.csv'

# Load the full dataset into a DataFrame
paysim = pd.read_csv(f'{path}{file}')

In [None]:
# Preview 10 random rows; the fixed seed keeps the preview identical across runs
paysim.sample(10, random_state=42)

In [None]:
# Summary statistics of the dataset's columns
paysim.describe()

# Random Over-/Under-Sampling:

In [None]:
# Separate class
class_0 = paysim[paysim['isFraud'] == 0]
class_1 = paysim[paysim['isFraud'] == 1]

# class count
# Taken from the filtered frames so each count is tied to its label.
# Unpacking value_counts() binds the values by frequency rank (most frequent
# first), which would silently swap the two counts if class 1 ever
# outnumbered class 0.
class_count_0, class_count_1 = len(class_0), len(class_1)

# print the shape of the class
print('class 0:', class_0.shape)
print('class 1:', class_1.shape)

In [None]:
# Random Under-Sampling
# Draw class_count_1 rows from class 0 (without replacement).
# random_state pins the draw so the same subset is selected on every run.
class_0_under = class_0.sample(class_count_1, random_state=42)

# Stack the reduced class 0 on top of the untouched class 1
test_under = pd.concat([class_0_under, class_1], axis=0)

print("total class of 1 and 0: ",test_under['isFraud'].value_counts())

In [None]:
# Random Over-Sampling
# Draw class_count_0 rows from class 1 with replacement (rows are repeated).
# random_state pins the draw so the same rows are selected on every run.
class_1_over = class_1.sample(class_count_0, replace=True, random_state=42)

# Stack the enlarged class 1 on top of the untouched class 0
test_over = pd.concat([class_1_over, class_0], axis=0)

print("total class of 1 and 0: ",test_over['isFraud'].value_counts())

# Balance data with the imbalanced-learn Python module

In [None]:
# 
import imblearn
from timeit import default_timer as timer 

# Random under-sampling with imblearn
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Label vector, and feature matrix made of every column except the label
y = paysim['isFraud']
X = paysim.loc[:, paysim.columns != 'isFraud']

start = timer()

# Seeded random under-sampler that draws its samples with replacement
rus = RandomUnderSampler(replacement=True, random_state=42)

# Resample features and labels together
x_rus, y_rus = rus.fit_resample(X, y)

# Row counts before and after resampling
print('original dataset shape:', y.shape[0])
print('Resample dataset shape', y_rus.shape[0])
print('\n')
end = timer()
print(f'{end - start} seconds elapsed.')

# Random over-sampling with imblearn

In [None]:
# import library
from imblearn.over_sampling import RandomOverSampler

start = timer()

# Seeded random over-sampler
ros = RandomOverSampler(random_state=42)

# Resample features and labels together
x_ros, y_ros = ros.fit_resample(X, y)

# Row counts before and after resampling
print('Original dataset shape', y.shape[0])
print('Resample dataset shape', y_ros.shape[0])
print('\n')
end = timer()
print(f'{end - start} seconds elapsed.')

# Under-sampling: Tomek links

In [None]:
# import library
from imblearn.under_sampling import TomekLinks

start = timer()

# Build the Tomek-links cleaner that this cell imports and that the x_tl/y_tl
# names refer to (a RandomUnderSampler here would just produce another random
# under-sample). Tomek-link cleaning only removes majority-class samples that
# are part of a Tomek link, so the result is generally NOT class-balanced.
# NOTE(review): this runs a nearest-neighbour search over the whole dataset
# and can be slow on large inputs; TomekLinks accepts n_jobs if needed.
tl = TomekLinks(sampling_strategy='majority')

# fit predictor and target variable
x_tl, y_tl = tl.fit_resample(X, y)

print('Original dataset shape', len(y))
print('Resample dataset shape', len(y_tl))
print('\n')
end = timer()
print(f'{(end - start)} seconds elapsed.')

In [None]:
# import library
from imblearn.under_sampling import TomekLinks

start = timer()

# Use the TomekLinks cleaner imported above. RandomOverSampler does not accept
# sampling_strategy='majority' (an over-sampler cannot target the majority
# class), so constructing it that way made fit_resample raise and left
# x_tl/y_tl without Tomek-link output.
# Tomek-link cleaning only removes majority-class samples that are part of a
# Tomek link, so the result is generally NOT class-balanced.
tl = TomekLinks(sampling_strategy='majority')

# fit predictor and target variable
x_tl, y_tl = tl.fit_resample(X, y)

print('Original dataset shape', len(y))
print('Resample dataset shape', len(y_tl))
print('\n')
end = timer()
print(f'{(end - start)} seconds elapsed.')

# Synthetic Minority Oversampling Technique (SMOTE)

In [None]:
# import library
from imblearn.over_sampling import SMOTE

start = timer()

# SMOTE generates its synthetic minority samples at random; seed it so that
# x_smote/y_smote are identical on every run.
smote = SMOTE(random_state=42)

# fit predictor and target variable
x_smote, y_smote = smote.fit_resample(X, y)

print('Original dataset shape', len(y))
print('Resample dataset shape', len(y_smote))
print('\n')
end = timer()
print(f'{(end - start)} seconds elapsed.')

# NearMiss

In [None]:
from imblearn.under_sampling import NearMiss

start = timer()

# NearMiss under-sampler with its default settings
nm = NearMiss()

# Resample features and labels together
x_nm, y_nm = nm.fit_resample(X, y)

# Row counts before and after resampling
print('Original dataset shape:', y.shape[0])
print('Resample dataset shape:', y_nm.shape[0])
print('\n')
end = timer()
print(f'{end - start} seconds elapsed.')

# Train/Test Data:

In [None]:
from sklearn.model_selection import train_test_split

start = timer()


def _split(features, target):
    """Hold out 20% of the rows as a test set, preserving the class ratio.

    Returns (X_train, X_test, y_train, y_test) exactly as train_test_split does.
    stratify=target keeps the same fraud/non-fraud proportion in both parts,
    which matters most for the datasets that are still imbalanced.
    """
    return train_test_split(features, target, test_size=0.20, random_state=42,
                            stratify=target)


# NOTE(review): every resampled dataset below is split AFTER resampling. For
# the over-sampled sets, copies of (or synthetic points built from) the same
# original row can land in both train and test, so scores on those test sets
# will be optimistic. Consider splitting first and resampling only the
# training part.

# Random Under-Sampling
# The column mask is built from the frame being sliced, not from paysim
x_u = test_under.loc[:, test_under.columns != 'isFraud']
y_u = test_under.isFraud

X_train_u, X_test_u, y_train_u, y_test_u = _split(x_u, y_u)

# Random Over-Sampling
x_o = test_over.loc[:, test_over.columns != 'isFraud']
y_o = test_over.isFraud

X_train_o, X_test_o, y_train_o, y_test_o = _split(x_o, y_o)

# Random under-sampling with imblearn
X_train_rus, X_test_rus, y_train_rus, y_test_rus = _split(x_rus, y_rus)

# Random over-sampling with imblearn
X_train_ros, X_test_ros, y_train_ros, y_test_ros = _split(x_ros, y_ros)

# Under-sampling: Tomek links
X_train_tl, X_test_tl, y_train_tl, y_test_tl = _split(x_tl, y_tl)

# Synthetic Minority Oversampling Technique (SMOTE)
X_train_smote, X_test_smote, y_train_smote, y_test_smote = _split(x_smote, y_smote)

# NearMiss
X_train_nm, X_test_nm, y_train_nm, y_test_nm = _split(x_nm, y_nm)

# Orig Dataset
X_train, X_test, y_train, y_test = _split(X, y)

end = timer()
print(f'{(end - start)} seconds elapsed.')

# Machine Learning

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
# classification_report(y_test, y_pred)
# confusion_matrix(y_test, y_pred)

In [None]:
# Sanity check of the NearMiss output: feature shape, label shape, rows per class
x_nm.shape, y_nm.shape, y_nm.value_counts() 

In [None]:
# Extreme Gradient Boosting with XGBoost
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Base classifier. The n_estimators and learning_rate set here are overridden
# by the grid values below while the search runs.
xgb_cl = XGBClassifier(
    objective='binary:logistic',
    n_estimators=10,
    seed=42,
    eval_metric="logloss",
    learning_rate=0.3,
    use_label_encoder=False,
)

# Search space: 8 tree depths x 4 ensemble sizes x 3 learning rates
parameters = {
    'max_depth': range(2, 10),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05],
}

# Exhaustive 10-fold cross-validated search, scored by ROC AUC, 10 parallel jobs
grid_search = GridSearchCV(
    xgb_cl,
    parameters,
    scoring='roc_auc',
    n_jobs=10,
    cv=10,
    verbose=True,
)

In [None]:
%timeit grid_search.fit(X_train_nm, y_train_nm)

In [None]:
# Estimator with the best-scoring parameter combination from the most recent fit
grid_search.best_estimator_

In [None]:
# Predict on the held-out NearMiss split and print per-class precision/recall/F1
y_pred = grid_search.predict(X_test_nm)
print(classification_report(y_test_nm, y_pred))

In [None]:
# Confusion matrix for the NearMiss test split (rows: true class, columns: predicted class)
confusion_matrix(y_test_nm, y_pred)

In [None]:
# Random Under-Sampling
%timeit grid_search.fit(X_train_u, y_train_u)

In [None]:
# Predict on the held-out random-under-sampling split and print per-class precision/recall/F1
y_pred = grid_search.predict(X_test_u)
print(classification_report(y_test_u, y_pred))

In [None]:
# Confusion matrix for the random-under-sampling test split (rows: true class, columns: predicted class)
confusion_matrix(y_test_u, y_pred)

In [None]:
# Estimator with the best-scoring parameter combination from the most recent fit
grid_search.best_estimator_

In [None]:
# Rows per class in the train and test parts of the `_tl` split
y_train_tl.value_counts(), y_test_tl.value_counts()

In [None]:
# Random Under-Sampling
%timeit grid_search.fit(X_train_tl, y_train_tl)

In [None]:
# Predict on the held-out `_tl` split and print per-class precision/recall/F1
y_pred = grid_search.predict(X_test_tl)
print(classification_report(y_test_tl, y_pred))

In [None]:
# Confusion matrix for the `_tl` test split (rows: true class, columns: predicted class)
confusion_matrix(y_test_tl, y_pred)

In [None]:
# Estimator with the best-scoring parameter combination from the most recent fit
grid_search.best_estimator_