# Experiment 05: Credit Card Fraud (GPU version)

This experiment uses the data from the Kaggle dataset [Credit Card Fraud Detection](https://www.kaggle.com/dalpozz/creditcardfraud). Most of the variables in the dataset (`V1`–`V28`) are the result of a PCA transformation; the binary `Class` column is the prediction target.

The details of the machine we used and the version of the libraries can be found in [experiment 01](01_airline.ipynb).

In [7]:
import json
import sys

import matplotlib.pyplot as plt
import pkg_resources
from libs.loaders import load_fraud
from libs.timer import Timer
from libs.metrics import classification_metrics_binary, classification_metrics_binary_prob, binarize_prediction
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split


def _installed_version(package_name):
    """Return the version string of the installed distribution ``package_name``."""
    return pkg_resources.get_distribution(package_name).version


# Record the interpreter and library versions alongside the results.
print("System version: {}".format(sys.version))
print("XGBoost version: {}".format(_installed_version('xgboost')))
print("LightGBM version: {}".format(_installed_version('lightgbm')))


System version: 3.5.2 |Anaconda custom (64-bit)| (default, Jul  2 2016, 17:53:06) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
XGBoost version: 0.6
LightGBM version: 0.2


In [2]:
# Seed passed to train_test_split below so the hold-out split is reproducible.
random_seed = 42

In [3]:
# Load the fraud dataset through the project's loader helper.
df = load_fraud()

INFO:libs.loaders:MOUNT_POINT not found in environment. Defaulting to /fileshare


In [4]:
# Print the frame's dimensions, then show its first rows as the cell output.
print(df.shape)
df.head()

(284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [5]:
# Features are the columns whose name starts with 'V'; the target is the
# 'Class' column.
pca_columns = [name for name in df.columns if name.startswith('V')]
X = df[pca_columns].values
y = df['Class'].values

In [9]:
# Stratified hold-out split (30% test), seeded with random_seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=random_seed,
)

# Report each piece's shape: train features/labels, then test features/labels.
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

(199364, 28)
(199364,)
(85443, 28)
(85443,)


In [10]:
# Wrap each split in an XGBoost DMatrix (features plus labels), train first.
dtrain, dtest = (
    xgb.DMatrix(data=features, label=labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

In [11]:
# LightGBM datasets for both splits; the test set references the training set,
# and both keep their raw data (free_raw_data=False).
lgb_train = lgb.Dataset(
    data=X_train,
    label=y_train,
    free_raw_data=False,
)
lgb_test = lgb.Dataset(
    data=X_test,
    label=y_test,
    reference=lgb_train,
    free_raw_data=False,
)

### XGBoost

In [72]:
# Collects timings and metrics for each model, keyed by a short model name.
results_dict = {}

# Number of boosting rounds passed to every model trained below.
num_rounds = 100

In [73]:
# Parameters for the XGBoost run that grows trees with the GPU updater.
# Fix: the L2 regularisation key was misspelled as 'reg_lamda', so XGBoost never
# received it; it is now 'reg_lambda'.
params = {
    'max_depth': 3,
    'objective': 'binary:logistic',
    'min_child_weight': 1,
    'eta': 0.1,
    'colsample_bytree': 1,
    'scale_pos_weight': 2,
    'gamma': 0.1,
    'reg_lambda': 1,
    'subsample': 1,
    'tree_method': 'exact',
    'updater': 'grow_gpu',
}


In [74]:
# Train and score inside separate Timer contexts so that training time and
# prediction time are measured independently (both `.interval` values are
# stored in results_dict afterwards).
with Timer() as t_train:
    xgb_clf_pipeline = xgb.train(params, dtrain, num_boost_round=num_rounds)
    
with Timer() as t_test:
    # Predictions for the held-out DMatrix; they are binarized in the next cell.
    y_prob_xgb = xgb_clf_pipeline.predict(dtest)

In [75]:
# Convert the model's predictions into hard labels with the project helper
# (presumably a fixed probability threshold — confirm in libs.metrics).
y_pred_xgb = binarize_prediction(y_prob_xgb)

In [76]:
# Compute metrics from the hard labels and from the raw predictions, then merge
# the second dict into the first so report_xgb holds all of them.
report_xgb = classification_metrics_binary(y_test, y_pred_xgb)
report2_xgb = classification_metrics_binary_prob(y_test, y_prob_xgb)
report_xgb.update(report2_xgb)

In [77]:
# Store the timings and metrics of the XGBoost run under the 'xgb' key.
xgb_entry = {
    'train_time': t_train.interval,
    'test_time': t_test.interval,
    'performance': report_xgb,
}
results_dict['xgb'] = xgb_entry

In [78]:
# Drop the reference to the trained booster before fitting the next model
# (presumably to release its memory).
del xgb_clf_pipeline

Now let's try XGBoost with the histogram-based tree method (`tree_method='hist'`).

In [79]:
# Parameters for the histogram-based XGBoost run. Trees are grown leaf-wise
# ('lossguide') and capped by 'max_leaves'; 'max_depth' is 0, which the XGBoost
# docs describe as "no depth limit" for this grow policy.
# Fix: the L2 regularisation key was misspelled as 'reg_lamda', so XGBoost never
# received it; it is now 'reg_lambda'.
params = {
    'max_depth': 0,
    'objective': 'binary:logistic',
    'min_child_weight': 1,
    'eta': 0.1,
    'colsample_bytree': 0.80,
    'scale_pos_weight': 2,
    'gamma': 0.1,
    'reg_lambda': 1,
    'subsample': 1,
    'tree_method': 'hist',
    'max_leaves': 2**3,
    'grow_policy': 'lossguide',
}


In [80]:
# Same measurement as for the first XGBoost run: training and prediction are
# timed in separate Timer contexts. t_train and t_test are rebound here.
with Timer() as t_train:
    xgb_hist_clf_pipeline = xgb.train(params, dtrain, num_boost_round=num_rounds)
    
with Timer() as t_test:
    # Predictions for the held-out DMatrix; they are binarized in the next cell.
    y_prob_xgb_hist = xgb_hist_clf_pipeline.predict(dtest)

In [81]:
# Convert the model's predictions into hard labels with the project helper
# (presumably a fixed probability threshold — confirm in libs.metrics).
y_pred_xgb_hist = binarize_prediction(y_prob_xgb_hist)

In [82]:
# Compute metrics from the hard labels and from the raw predictions, then merge
# the second dict into the first so report_xgb_hist holds all of them.
report_xgb_hist = classification_metrics_binary(y_test, y_pred_xgb_hist)
report2_xgb_hist = classification_metrics_binary_prob(y_test, y_prob_xgb_hist)
report_xgb_hist.update(report2_xgb_hist)

In [83]:
# Store the timings and metrics of the histogram run under the 'xgb_hist' key.
xgb_hist_entry = {
    'train_time': t_train.interval,
    'test_time': t_test.interval,
    'performance': report_xgb_hist,
}
results_dict['xgb_hist'] = xgb_hist_entry

In [84]:
# Drop the reference to the trained booster before fitting the next model
# (presumably to release its memory).
del xgb_hist_clf_pipeline

### LightGBM

In [85]:
# LightGBM settings; leaf count, learning rate and positive-class weight mirror
# the values used for the XGBoost histogram run above.
params = dict(
    num_leaves=2**3,
    learning_rate=0.1,
    scale_pos_weight=2,
    min_split_gain=0.1,
    min_child_weight=1,
    reg_lambda=1,
    subsample=1,
    objective='binary',
    task='train',
)

In [86]:
# Training and prediction are timed in separate Timer contexts, as for the
# XGBoost runs. t_train and t_test are rebound here.
with Timer() as t_train:
    lgbm_clf_pipeline = lgb.train(params, lgb_train, num_boost_round=num_rounds)
    
with Timer() as t_test:
    # Prediction is run on the raw X_test array, not on the lgb_test Dataset.
    y_prob_lgbm = lgbm_clf_pipeline.predict(X_test)

In [87]:
# Convert the model's predictions into hard labels with the project helper
# (presumably a fixed probability threshold — confirm in libs.metrics).
y_pred_lgbm = binarize_prediction(y_prob_lgbm)

In [88]:
# Compute metrics from the hard labels and from the raw predictions, then merge
# the second dict into the first so report_lgbm holds all of them.
report_lgbm = classification_metrics_binary(y_test, y_pred_lgbm)
report2_lgbm = classification_metrics_binary_prob(y_test, y_prob_lgbm)
report_lgbm.update(report2_lgbm)

In [89]:
# Store the timings and metrics of the LightGBM run under the 'lgbm' key.
lgbm_entry = {
    'train_time': t_train.interval,
    'test_time': t_test.interval,
    'performance': report_lgbm,
}
results_dict['lgbm'] = lgbm_entry

In [90]:
# Drop the reference to the trained booster now that its predictions are stored
# (presumably to release its memory).
del lgbm_clf_pipeline

Finally, we show the results.

In [91]:
# Pretty-print every model's timings and metrics as key-sorted, indented JSON.
results_json = json.dumps(results_dict, sort_keys=True, indent=4)
print(results_json)

{
    "lgbm": {
        "performance": {
            "AUC": 0.96572630283135,
            "Accuracy": 0.9993914071369217,
            "F1": 0.8115942028985507,
            "Precision": 0.875,
            "Recall": 0.7567567567567568
        },
        "test_time": 0.04103280100025586,
        "train_time": 0.2938522020012897
    },
    "xgb": {
        "performance": {
            "AUC": 0.9699805365480376,
            "Accuracy": 0.9994148145547324,
            "F1": 0.8214285714285714,
            "Precision": 0.8712121212121212,
            "Recall": 0.777027027027027
        },
        "test_time": 0.03786925599706592,
        "train_time": 5.8025254829990445
    },
    "xgb_hist": {
        "performance": {
            "AUC": 0.9692088902901377,
            "Accuracy": 0.9994148145547324,
            "F1": 0.8148148148148148,
            "Precision": 0.9016393442622951,
            "Recall": 0.7432432432432432
        },
        "test_time": 0.00477967400001944,
        "train_tim