# Experiment 05: Credit card Fraud

This experiment uses the data from the Kaggle dataset [Credit Card Fraud Detection](https://www.kaggle.com/dalpozz/creditcardfraud). The dataset is made up of a number of variables which are a result of PCA transformation.

In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import roc_auc_score,accuracy_score, precision_score, recall_score, f1_score
import json
import seaborn
import matplotlib.pyplot as plt

from experiments.libs.loaders import load_fraud
from experiments.libs.timer import Timer
% matplotlib inline
% load_ext autoreload
% autoreload 2



In [2]:
random_seed = 42

In [3]:
df = load_fraud()

In [4]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [5]:
pipeline_steps = [('scale', StandardScaler())]
continuous_pipeline = Pipeline(steps=pipeline_steps)

In [6]:
featurisers = [('continuous', continuous_pipeline)]

In [7]:
xgb_clf_pipeline = Pipeline(steps=[('features', FeatureUnion(featurisers)),
                                          ('clf', XGBClassifier(max_depth=8, 
                                                                learning_rate=0.1, 
                                                                n_estimators=100))])

In [8]:
lgbm_clf_pipeline = Pipeline(steps=[('features', FeatureUnion(featurisers)),
                                          ('clf', LGBMClassifier(num_leaves=8, 
                                                                learning_rate=0.1,
                                                                n_estimators=100))])

In [9]:
metrics_dict = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'AUC': roc_auc_score,
    'F1': f1_score,
}

def classification_metrics(metrics, y_true, y_pred):
    return {metric_name:metric(y_true, y_pred) for metric_name, metric in metrics.items()}

In [10]:
X = df[[col for col in df.columns if col.startswith('V')]].values
y = df['Class'].values

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=random_seed, test_size=0.3)

In [12]:
results_dict = dict()

### XGBoost

In [13]:
with Timer() as train_t:
    xgb_clf_pipeline.fit(X_train,y_train)
    
with Timer() as test_t:
    y_pred = xgb_clf_pipeline.predict(X_test)

In [14]:
results_dict['xgb']={
    'train_time': train_t.interval,
    'test_time': test_t.interval,
    'performance': classification_metrics(metrics_dict, 
                                          y_test, 
                                          y_pred) 
}

### LightGBM

In [15]:
with Timer() as train_t:
    lgbm_clf_pipeline.fit(X_train, y_train)
    
with Timer() as test_t:
    y_pred = lgbm_clf_pipeline.predict(X_test)

In [16]:
results_dict['lgbm']={
    'train_time': train_t.interval,
    'test_time': test_t.interval,
    'performance': classification_metrics(metrics_dict, 
                                          y_test, 
                                          y_pred) 
}

In [17]:
# Results
print(json.dumps(results_dict, indent=4, sort_keys=True))

{
    "lgbm": {
        "performance": {
            "AUC": 0.8749589659417316,
            "Accuracy": 0.9994850368081645,
            "F1": 0.8345864661654134,
            "Precision": 0.940677966101695,
            "Recall": 0.75
        },
        "test_time": 0.050938055850565434,
        "train_time": 0.9537332961335778
    },
    "xgb": {
        "performance": {
            "AUC": 0.8749531039334075,
            "Accuracy": 0.9994733330992591,
            "F1": 0.8314606741573033,
            "Precision": 0.9327731092436975,
            "Recall": 0.75
        },
        "test_time": 0.11142425402067602,
        "train_time": 7.827869581989944
    }
}
