In [1]:
## Import Essential Data handling libraries
import pandas as pd
import numpy as np

from math import floor
from time import perf_counter

import matplotlib.pyplot as plt,seaborn as sns
import matplotlib as mpl
%matplotlib inline


from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import warnings
warnings.filterwarnings("ignore")

  import pandas.util.testing as tm


In [2]:
### Importing Machine Learning Libraries and functions
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import model_selection

from sklearn.metrics import precision_recall_curve, auc, f1_score
from sklearn.metrics import make_scorer


from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

from xgboost import XGBClassifier

import lightgbm as lgb

from imblearn.over_sampling import SMOTE

from sklearn.feature_selection import RFECV

In [3]:
from functools import partial
from skopt import space
from skopt import gp_minimize

from hyperopt import hp, fmin, tpe, Trials
from hyperopt.pyll.base import scope


import optuna

#### Loading Data

In [4]:
df_raw = pd.read_csv("..\data\creditcard.csv")

In [5]:
df_raw.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.36,-0.073,2.536,1.378,-0.338,0.462,0.24,0.099,0.364,0.091,-0.552,-0.618,-0.991,-0.311,1.468,-0.47,0.208,0.026,0.404,0.251,-0.018,0.278,-0.11,0.067,0.129,-0.189,0.134,-0.021,149.62,0
1,0.0,1.192,0.266,0.166,0.448,0.06,-0.082,-0.079,0.085,-0.255,-0.167,1.613,1.065,0.489,-0.144,0.636,0.464,-0.115,-0.183,-0.146,-0.069,-0.226,-0.639,0.101,-0.34,0.167,0.126,-0.009,0.015,2.69,0
2,1.0,-1.358,-1.34,1.773,0.38,-0.503,1.8,0.791,0.248,-1.515,0.208,0.625,0.066,0.717,-0.166,2.346,-2.89,1.11,-0.121,-2.262,0.525,0.248,0.772,0.909,-0.689,-0.328,-0.139,-0.055,-0.06,378.66,0
3,1.0,-0.966,-0.185,1.793,-0.863,-0.01,1.247,0.238,0.377,-1.387,-0.055,-0.226,0.178,0.508,-0.288,-0.631,-1.06,-0.684,1.966,-1.233,-0.208,-0.108,0.005,-0.19,-1.176,0.647,-0.222,0.063,0.061,123.5,0
4,2.0,-1.158,0.878,1.549,0.403,-0.407,0.096,0.593,-0.271,0.818,0.753,-0.823,0.538,1.346,-1.12,0.175,-0.451,-0.237,-0.038,0.803,0.409,-0.009,0.798,-0.137,0.141,-0.206,0.502,0.219,0.215,69.99,0


In [6]:
df_raw.drop(["Time"], axis = 1, inplace  = True)

##### Checking Class Distribution

In [7]:
print(df_raw.Class.value_counts())

print("\n","% distribution\n",df_raw.Class.value_counts(True)*100)

0    284315
1       492
Name: Class, dtype: int64

 % distribution
 0   99.827
1    0.173
Name: Class, dtype: float64


In [8]:
df_raw.shape

(284807, 30)

#### Scoring function

In [9]:
def AUPRC(y_true, y_pred_proba) :
    precision, recall, thresholds = precision_recall_curve(y_true = y_true, probas_pred= y_pred_proba)
    auprc = auc(recall, precision)
    return(np.round(auprc, 4))

#### Data Transformation
* 1. Quantile Transformation (normal distribution)
* 2. standard scaling the data

In [10]:
quantile_transformer = QuantileTransformer(output_distribution="normal")
standard_scaler = StandardScaler()

In [11]:
df_transformed = df_raw.copy()

for i in range(df_transformed.iloc[:,:-1].shape[1]):
    qt_transformed_var = quantile_transformer.fit_transform(df_transformed.iloc[:,i].values.reshape(-1,1))[:,0]
    qt_transformed_std_scaled = standard_scaler.fit_transform(qt_transformed_var.reshape(-1,1))[:,0]
    
    df_transformed.iloc[:,i] = qt_transformed_std_scaled

In [12]:
df_transformed.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-0.997,-0.156,2.128,1.195,-0.3,0.723,0.291,0.198,0.441,0.305,-0.466,-0.891,-1.0,-0.522,1.784,-0.68,0.397,0.041,0.601,0.967,0.035,0.353,-0.458,0.075,0.195,-0.308,0.839,-0.319,1.028,0
1,0.459,0.221,-0.009,0.417,0.123,0.217,-0.16,0.163,-0.257,-0.113,1.651,1.359,0.508,-0.287,0.663,0.583,-0.073,-0.25,-0.225,-0.027,-0.668,-0.837,0.497,-0.647,0.272,0.387,-0.095,0.056,-0.834,0
2,-0.996,-1.269,1.377,0.345,-0.478,1.372,0.962,0.532,-1.455,0.452,0.562,-0.096,0.73,-0.318,2.821,-2.685,1.493,-0.164,-2.519,1.377,0.883,1.04,2.015,-1.119,-0.701,-0.19,-0.536,-0.766,1.591,0
3,-0.715,-0.284,1.395,-0.688,0.05,1.182,0.287,0.76,-1.36,0.062,-0.174,0.052,0.528,-0.49,-0.73,-1.282,-1.048,2.213,-1.518,-0.655,-0.263,-0.0,-0.795,-1.669,1.394,-0.388,0.525,0.575,0.922,0
4,-0.858,0.754,1.157,0.367,-0.373,0.401,0.705,-0.857,0.869,0.953,-0.739,0.552,1.386,-1.387,0.14,-0.657,-0.257,-0.046,1.102,1.233,0.063,1.085,-0.575,0.221,-0.409,1.025,1.071,1.452,0.596,0


In [13]:
#selected vars (manual feature selection)
all_vars = list(set(list(df_transformed.columns)) - set(["V8","V13","V15", "V19", "V20", "V22", "V23","V24", "V25", "V26", "Amount"]))
 
df = df_transformed[all_vars]

In [14]:
df.head()

Unnamed: 0,V4,V10,V9,V21,V3,V5,V7,V28,Class,V6,V16,V11,V18,V14,V27,V2,V17,V12,V1
0,1.195,0.305,0.441,0.035,2.128,-0.3,0.291,-0.319,0,0.723,-0.68,-0.466,0.041,-0.522,0.839,-0.156,0.397,-0.891,-0.997
1,0.417,-0.113,-0.257,-0.668,-0.009,0.123,-0.16,0.056,0,0.217,0.583,1.651,-0.25,-0.287,-0.095,0.221,-0.073,1.359,0.459
2,0.345,0.452,-1.455,0.883,1.377,-0.478,0.962,-0.766,0,1.372,-2.685,0.562,-0.164,-0.318,-0.536,-1.269,1.493,-0.096,-0.996
3,-0.688,0.062,-1.36,-0.263,1.395,0.05,0.287,0.575,0,1.182,-1.282,-0.174,2.213,-0.49,0.525,-0.284,-1.048,0.052,-0.715
4,0.367,0.953,0.869,0.063,1.157,-0.373,0.705,1.452,0,0.401,-0.657,-0.739,-0.046,-1.387,1.071,0.754,-0.257,0.552,-0.858


#### HyperParameter Optimization

In [15]:
X = df.drop("Class", axis = 1).values

y = df.Class.values





classifier = ExtraTreesClassifier(n_jobs = -1) # utilizing all the cores
# classifier.get_params().keys()

param_grid = {"learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
              "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
              "min_child_weight" : [ 1, 3, 5, 7 ],
              'subsample'        : [0.6, 0.8, 1.0],
              "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
              "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]}

model = model_selection.RandomizedSearchCV(estimator = classifier,
                                    param_distributions = param_grid,
                                    n_iter = 50,
                                    scoring = "f1", 
                                    verbose = 1,
                                    cv = 5,
                                    n_jobs = -1)

model.fit(X,y)

print(model.best_score_)
print(model.best_estimator_.get_params())
print(model.best_estimator_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 59.9min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 76.8min finished


0.8214759580793967
{'bootstrap': False, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'entropy', 'max_depth': 9, 'max_features': 7, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 3, 'min_samples_split': 4, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 150, 'n_jobs': -1, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
ExtraTreesClassifier(criterion='entropy', max_depth=9, max_features=7,
                     min_samples_leaf=3, min_samples_split=4, n_estimators=150,
                     n_jobs=-1)


In [17]:
# best estimator from random grid search CV
model = ExtraTreesClassifier(criterion='entropy', max_depth=9, max_features=7,
                     min_samples_leaf=3, min_samples_split=4, n_estimators=150,
                     n_jobs=-1)

def create_stratified_folds(data, k_fold_num, target_variable):
    data["kfold"] = -1 # we create a new column called kfold and fill it with -1
    data = data.sample(frac=1).reset_index(drop=True) # the next step is to randomize the rows of the data

    y = data[target_variable].values

    kf = model_selection.StratifiedKFold(n_splits= k_fold_num)

    for fold, (trn_, partitioned_idx_) in enumerate(kf.split(X=data, y = y)):
        data.loc[partitioned_idx_, 'kfold'] = fold
    return(data)

# Creating stratified k-fold within the data
k_fold_num = 5

df = create_stratified_folds(df, k_fold_num = 5, target_variable = "Class")

features = [f for f in df.columns if f not in ["kfold","Class"]]

cv_auprc = []
f1_scores = []
for fold in range(k_fold_num):
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    x_train = df_train[features].values
    x_valid = df_valid[features].values

    model.fit(x_train, df_train.Class.values)

    valid_preds = model.predict_proba(x_valid)[:, 1]
    valid_preds_binary = model.predict(x_valid)

    auprc = AUPRC(df_valid.Class.values, valid_preds)
    f1 = f1_score(df_valid.Class.values, valid_preds_binary)

    cv_auprc.append(auprc)
    f1_scores.append(np.round(f1,4))

print("Average Area Under Precision Recall Curve = ", np.round(np.mean(cv_auprc),2))
print("Average F1 score = ", np.round(np.mean(f1_scores),2))

Average Area Under Precision Recall Curve =  0.84
Average F1 score =  0.84


In [18]:
cv_auprc

[0.8369, 0.8604, 0.8181, 0.8843, 0.8042]

In [19]:
f1_scores

[0.8324, 0.841, 0.8152, 0.8778, 0.8114]