In [1]:
'''Main'''
import numpy as np
import pandas as pd

'''Data Prep'''
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report

# Timer
import time as time

In [2]:
# Load the credit-card transactions from CSV into a DataFrame and display
# summary statistics (count, mean, std, min, quartiles, max) for its columns
data = pd.read_csv('credit_card.csv')
data.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,3.91956e-15,5.688174e-16,-8.769071e-15,2.782312e-15,-1.552563e-15,2.010663e-15,-1.694249e-15,-1.927028e-16,-3.137024e-15,...,1.537294e-16,7.959909e-16,5.36759e-16,4.458112e-15,1.453003e-15,1.699104e-15,-3.660161e-16,-1.206049e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [3]:
# Split the loaded frame into the feature matrix X (everything except the
# label column) and the label vector Y (the 'Class' column)
dataX = data.drop(columns=['Class'])  # drop() already returns a new frame
dataY = data['Class'].copy()

In [4]:
# We will create three data frames (raw, standardized, min-max scaled) and check the effect of scaling on tree-based models (decision tree, XGBoost, LightGBM)
# Standardize X
from sklearn import preprocessing as pp
from sklearn.preprocessing import MinMaxScaler

# Case 1: standardization
# Step 1: choose the columns to scale -- every feature except 'Time'
featuresToScale = dataX.columns.drop('Time')
# Step 2: instantiate the scaler
sX = pp.StandardScaler(copy=True)
# Step 3: work on a copy so the raw features in dataX stay untouched
dataX1 = dataX.copy()
# Step 4: fit the scaler on the raw columns and write the scaled values into dataX1
dataX1.loc[:, featuresToScale] = sX.fit_transform(dataX[featuresToScale])
dataX1.describe()
# every scaled column now has mean ~0 and std ~1 ('Time' is left unscaled)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,-8.157366e-16,3.154853e-17,-4.409878e-15,-6.734811e-16,-2.874435e-16,4.168992e-16,-8.767997e-16,-2.423604e-16,3.078727e-16,...,2.75487e-16,1.6850770000000002e-17,1.478472e-15,-6.797197e-16,1.234659e-16,-7.659279e-16,3.247603e-16,-2.953495e-18,5.401572000000001e-17,3.202236e-16
std,47488.145955,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,...,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002
min,0.0,-28.79855,-44.03529,-31.87173,-4.013919,-82.4081,-19.63606,-35.2094,-61.30252,-12.22802,...,-70.69146,-47.41907,-15.06565,-71.75446,-4.683638,-19.75033,-5.401098,-55.9066,-46.74612,-0.3532294
25%,54201.5,-0.4698918,-0.3624707,-0.5872142,-0.5993788,-0.5010686,-0.5766822,-0.447886,-0.1746805,-0.5853631,...,-0.2746334,-0.3109433,-0.7473476,-0.2591784,-0.5854676,-0.6084001,-0.6780717,-0.1755053,-0.160444,-0.3308401
50%,84692.0,0.009245351,0.03965683,0.1186124,-0.01401724,-0.03936682,-0.2058046,0.03241723,0.01871982,-0.04681169,...,-0.08104705,-0.04009429,0.009345377,-0.0179242,0.06765678,0.0318324,-0.1081217,0.003325174,0.03406368,-0.2652715
75%,139320.5,0.6716939,0.4867202,0.6774569,0.5250082,0.4433465,0.2991625,0.4611107,0.2740785,0.5435305,...,0.1725733,0.2537392,0.728336,0.2364319,0.7257153,0.6728006,0.4996663,0.2255648,0.2371526,-0.04471707
max,172792.0,1.253351,13.35775,6.187993,11.91874,25.21413,55.02015,97.47824,16.75153,14.19494,...,51.13464,37.03471,14.47304,36.07668,7.569684,14.42532,7.293975,78.3194,102.5434,102.3622


In [5]:
# Case 2: Min-Max Scaling
from sklearn.preprocessing import MinMaxScaler

# Case 2: min-max scaling
# Step 1: choose the columns to scale -- every feature except 'Time'
featuresToScale = dataX.columns.drop('Time')
# Step 2: instantiate the scaler
mX = pp.MinMaxScaler(copy=True)
# Step 3: work on a copy so the raw features in dataX stay untouched
dataX2 = dataX.copy()
# Step 4: fit the scaler on the raw columns and write the scaled values into dataX2
dataX2.loc[:, featuresToScale] = mX.fit_transform(dataX[featuresToScale])
dataX2.describe()
# every scaled column now has min=0 and max=1 ('Time' is left unscaled)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,0.958294,0.767258,0.837414,0.25193,0.765716,0.26302,0.265356,0.785385,0.46278,...,0.580265,0.56148,0.510031,0.665434,0.382234,0.577907,0.425448,0.416511,0.313124,0.003439
std,47488.145955,0.033276,0.017424,0.026275,0.062764,0.009292,0.013395,0.007537,0.012812,0.037846,...,0.008208,0.011841,0.033854,0.009274,0.081611,0.029261,0.078771,0.00745,0.006698,0.009736
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,54201.5,0.942658,0.760943,0.821985,0.214311,0.76106,0.255295,0.26198,0.783148,0.440626,...,0.578011,0.557798,0.48473,0.66303,0.334454,0.560104,0.372036,0.415203,0.312049,0.000218
50%,84692.0,0.958601,0.767949,0.84053,0.25105,0.765351,0.260263,0.2656,0.785625,0.461008,...,0.5796,0.561005,0.510347,0.665267,0.387756,0.578838,0.416932,0.416536,0.313352,0.000856
75%,139320.5,0.980645,0.775739,0.855213,0.284882,0.769836,0.267027,0.268831,0.788897,0.48335,...,0.581682,0.564484,0.534688,0.667626,0.44146,0.597593,0.464807,0.418191,0.314712,0.003004
max,172792.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
# Now we use a reusable function to do classification.
# It runs stratified K-fold cross-validation and, in each fold, obtains both
# the predicted class and the predicted probability for the held-out rows.
# From these it builds the confusion matrix and the common classification metrics.
def build_model(data, features, target, classifier, oversample=None):
    '''
    Cross-validate a classifier with stratified 5-fold splitting and
    summarise the pooled out-of-fold predictions.

    Inputs:
        data (pd.DataFrame): Dataframe containing features and target columns for model building.
        features (list): List of features to use in model building.
        target (str): Name of target / label column.
        classifier (sklearn classifier Object): Model to build. It must provide
            fit, predict and predict_proba. The very same instance is refit on
            every fold, so on return it is left fitted on the last fold's
            training data.
        oversample (optional): Object exposing fit_resample(X, y). When given
            (truthy), it is applied to the training part of each fold only;
            the held-out part is never resampled. Defaults to None.

    Outputs:
        cm (pd.DataFrame): 2x2 confusion matrix over all held-out rows.
        results (dict): Keys 'accuracy', 'recall', 'precision', 'f1', 'auc'.
            Recall, precision and F1 are read from the classification report
            entry '1', i.e. the positive class is expected to be labelled 1.
    '''

    # Stratified 5-fold split without shuffling, so fold membership is deterministic
    kf = StratifiedKFold(n_splits=5, shuffle=False)

    # Select the feature and label columns once instead of once per fold
    X_all = data[features]
    y_all = data[target]

    # Held-out predictions, pooled across all folds
    y_pred_all = []
    y_true_all = []
    y_proba_all = []

    for train_index, test_index in kf.split(X_all, y_all):
        # positional split of the current fold
        X_train, y_train = X_all.iloc[train_index], y_all.iloc[train_index]
        X_test, y_test = X_all.iloc[test_index], y_all.iloc[test_index]

        # handle class imbalance on the training part only
        if oversample:
            X_train, y_train = oversample.fit_resample(X_train, y_train)

        # refit on this fold, then score the held-out rows
        classifier.fit(X_train, y_train)
        y_pred_all.extend(classifier.predict(X_test))
        y_true_all.extend(y_test)
        # second column of predict_proba is used as the positive-class score
        y_proba_all.extend(classifier.predict_proba(X_test)[:, 1])

    # confusion matrix over the pooled held-out predictions
    cm = pd.DataFrame(
        confusion_matrix(y_true=y_true_all, y_pred=y_pred_all),
        index=['Actual: 0', 'Actual: 1'],
        columns=['Pred: 0', 'Pred: 1'],
    )

    # per-class metrics; the positive class is looked up under the key '1'
    report = classification_report(y_true=y_true_all, y_pred=y_pred_all, output_dict=True)
    results = {
        'accuracy': report['accuracy'],
        'recall': report['1']['recall'],
        'precision': report['1']['precision'],
        'f1': report['1']['f1-score'],
    }

    # area under the ROC curve computed from the pooled positive-class scores
    fpr, tpr, _ = roc_curve(y_true=y_true_all, y_score=y_proba_all)
    results['auc'] = auc(fpr, tpr)

    return cm, results

In [7]:
# a simple example of how to use this self-defined function
from sklearn.tree import DecisionTreeClassifier
start_time = time.time()

# decision tree with a fixed seed so repeated runs give the same tree
dt = DecisionTreeClassifier(random_state=1)

# modelling frame = all raw feature columns plus the label column
features = dataX.columns.tolist()
target = 'Class'
train = pd.concat((dataX, dataY), axis='columns')

# cross-validate the decision tree on the raw (unscaled) features
cm_dt, results_dt = build_model(train, features, target, dt)

print("Confusion Matrix: \n\n", cm_dt)
print("\n")
print(f"F1 Score: {results_dt['f1']}")
print("--- %s seconds ---" % (time.time() - start_time))

Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0   227529    56786
Actual: 1      136      356


F1 Score: 0.01235381892632821
--- 73.89886808395386 seconds ---


In [8]:
start_time = time.time()
# Same experiment as on the raw data; the only change is that the features
# come from dataX1 (standardized) instead of dataX
train1 = pd.concat((dataX1, dataY), axis='columns')

# cross-validate the decision tree on the standardized features
cm_dt1, results_dt1 = build_model(train1, features, target, dt)

print("Confusion Matrix: \n\n", cm_dt1)
print("\n")
print(f"F1 Score: {results_dt1['f1']}")
print("--- %s seconds ---" % (time.time() - start_time))

Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0   227529    56786
Actual: 1      136      356


F1 Score: 0.01235381892632821
--- 73.73727607727051 seconds ---


In [9]:
start_time = time.time()
# Same experiment again; the features now come from dataX2 (min-max scaled)
train2 = pd.concat((dataX2, dataY), axis='columns')

# cross-validate the decision tree on the min-max scaled features
cm_dt2, results_dt2 = build_model(train2, features, target, dt)

print("Confusion Matrix: \n\n", cm_dt2)
print("\n")
print(f"F1 Score: {results_dt2['f1']}")
print("--- %s seconds ---" % (time.time() - start_time))

Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0   227391    56924
Actual: 1      135      357


F1 Score: 0.012358714278296089
--- 70.00598645210266 seconds ---


In [10]:
# We can easily switch to other classification algorithms now
# XGBoost
from xgboost import XGBClassifier
start_time = time.time()
# XGBoost classifier, mostly default values (fixed seed, 'error' as eval metric)
xgb = XGBClassifier(random_state=1, use_label_encoder=False, eval_metric='error')

# cross-validate on the raw, standardized and min-max scaled data, in that order
cm_xgb, results_xgb = build_model(train, features, target, xgb)
cm_xgb1, results_xgb1 = build_model(train1, features, target, xgb)
cm_xgb2, results_xgb2 = build_model(train2, features, target, xgb)

# report the three runs in the same order they were built
for _cm, _res in ((cm_xgb, results_xgb), (cm_xgb1, results_xgb1), (cm_xgb2, results_xgb2)):
    print("Confusion Matrix: \n\n", _cm)
    print("\n")
    print(f"F1 Score: {_res['f1']}")
print("--- %s seconds ---" % (time.time() - start_time))
# The F1 scores printed here are clearly higher than those of the decision tree

Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0   282678     1637
Actual: 1      106      386


F1 Score: 0.30695825049701786
Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0   282678     1637
Actual: 1      106      386


F1 Score: 0.30695825049701786
Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0   282549     1766
Actual: 1      109      383


F1 Score: 0.2900416508898145
--- 363.9331612586975 seconds ---


In [11]:
# LightGBM: the three data variants (raw, standardized, min-max) are run in one cell
start_time = time.time()
import lightgbm as lgb

# LightGBM classifier with every hyper-parameter left at its default
clf = lgb.LGBMClassifier()

# cross-validate on the raw, standardized and min-max scaled data, in that order
cm_lgb, results_lgb = build_model(train, features, target, clf)
cm_lgb1, results_lgb1 = build_model(train1, features, target, clf)
cm_lgb2, results_lgb2 = build_model(train2, features, target, clf)

# report the three runs in the same order they were built
for _cm, _res in ((cm_lgb, results_lgb), (cm_lgb1, results_lgb1), (cm_lgb2, results_lgb2)):
    print("Confusion Matrix: \n\n", _cm)
    print("\n")
    print(f"F1 Score: {_res['f1']}")
print("--- %s seconds ---" % (time.time() - start_time))

Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0   283026     1289
Actual: 1      269      223


F1 Score: 0.22255489021956093
Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0   280568     3747
Actual: 1      181      311


F1 Score: 0.1367032967032967
Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0   280376     3939
Actual: 1      140      352


F1 Score: 0.14718795734894416
--- 26.561797380447388 seconds ---
