### I did pretty well using Stochastic Gradient Descent.  I chose this according to Scikit Learn's algorithm flowchart.  However, ultimately XGBoost beat it, and pretty handily.

In [77]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from xgboost import XGBClassifier

%matplotlib inline

In [2]:
df = pd.read_csv("CreditCard.csv")
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
# Looks like no missing data at all
np.where(df.isna())

(array([], dtype=int64), array([], dtype=int64))

In [4]:
y = df['Class']
X = df.drop('Class', 1)

In [5]:
fraud = df.loc[df['Class'] == 1, 'Time'].count()
print("There are only", fraud, "occurrences of fraud in the whole data set.")

There are only 492 occurrences of fraud in the whole data set.


## 42 may be the answer to the meaning of life, but I'm no conformist

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=43)
clf = SGDClassifier(max_iter=1000, tol=1e-3)
clf.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=1000,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=0.001,
       validation_fraction=0.1, verbose=0, warm_start=False)

### Ouch, that's what I expected.  Need to tune it.

In [7]:
# Just a raw run
predictions = clf.predict(X_test)
print(confusion_matrix(y_test, predictions))
print('\n', classification_report(y_test, predictions))

[[85293     0]
 [  150     0]]

               precision    recall  f1-score   support

           0       1.00      1.00      1.00     85293
           1       0.00      0.00      0.00       150

   micro avg       1.00      1.00      1.00     85443
   macro avg       0.50      0.50      0.50     85443
weighted avg       1.00      1.00      1.00     85443



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


### With only 492 occurrences in the whole set, I need to give the model more to train on: time to oversample

In [40]:
# Make a more even split to train on
sample = pd.concat([df.loc[df['Class'] == 1, :].sample(300),
                    df.loc[df['Class'] == 0, :].sample(30000)])

In [41]:
y_new = sample['Class']
X_new = sample.drop('Class', 1)

In [42]:
clf = SGDClassifier(max_iter=1000, class_weight={0:0.1, 1:6}, penalty='l1', loss='log')
clf.fit(X_new, y_new)

SGDClassifier(alpha=0.0001, average=False, class_weight={0: 0.1, 1: 6},
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l1',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

### Not too bad, though it took some tweaking

In [43]:
predictions = clf.predict(X_test)
print(confusion_matrix(y_test, predictions))
print('\n', classification_report(y_test, predictions))

[[84401   892]
 [   36   114]]

               precision    recall  f1-score   support

           0       1.00      0.99      0.99     85293
           1       0.11      0.76      0.20       150

   micro avg       0.99      0.99      0.99     85443
   macro avg       0.56      0.87      0.60     85443
weighted avg       1.00      0.99      0.99     85443



In [45]:
print('Coefficients')
print(clf.coef_)
print(clf.intercept_)
predictions = clf.predict(X_test)

print('\n Accuracy by admission status')
print(pd.crosstab(predictions, y_test))

print('\n Percentage accuracy')
print(clf.score(X_test, y_test))

Coefficients
[[-4.40864760e+00 -1.64577601e+04  1.35129356e+04 -2.48291530e+04
   1.59614005e+04 -1.06147057e+04 -4.97862522e+03 -1.97080831e+04
   1.36797597e+03 -8.90426422e+03 -2.01378606e+04  1.33300296e+04
  -2.23003791e+04 -3.49686304e+02 -2.50831897e+04 -3.93704819e+02
  -1.45515716e+04 -2.33874414e+04 -7.78148775e+03  2.23677617e+03
   1.15116867e+03  2.37437026e+03  9.69559851e+01  8.11413101e+01
  -4.86165415e+02  1.85152055e+02  6.93803067e+01  2.81755950e+02
   3.81993560e+02  1.12710876e+02]]
[280.33892483]

 Accuracy by admission status
Class      0    1
row_0            
0      84401   36
1        892  114

 Percentage accuracy
0.9891389581358332


In [37]:
xgb = XGBClassifier()
xgb.fit(X_new, y_new)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

### However, XGBoost without tuning does significantly better (I'm looking at Type II errors)

In [38]:
y_pred = xgb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 99.80%


In [39]:
print(confusion_matrix(y_test, y_pred))
print('\n', classification_report(y_test, y_pred))

[[85136   157]
 [   10   140]]

               precision    recall  f1-score   support

           0       1.00      1.00      1.00     85293
           1       0.47      0.93      0.63       150

   micro avg       1.00      1.00      1.00     85443
   macro avg       0.74      0.97      0.81     85443
weighted avg       1.00      1.00      1.00     85443



In [109]:
sample = pd.concat([df.loc[df['Class'] == 1, :].sample(300),
                    df.loc[df['Class'] == 0, :].sample(3000)])
y_new = sample['Class']
X_new = sample.drop('Class', 1)

### Some tuning, and notice above that I've resampled at a 10:1 ratio, rather than 100:1, which worked best with the SGD algorithm.  I can go even further, but the Type I errors go into the thousands then, even though I got down to only 3 misclassified frauds.

In [110]:
xgb = XGBClassifier(max_depth=6, n_estimators=200, colsample_bylevel=0.2)
xgb.fit(X_new, y_new)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.2,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=6, min_child_weight=1, missing=None, n_estimators=200,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

### It would be interesting to keep turning the knobs here; minimizing Type II errors.  

In [111]:
y_pred = xgb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 99.82%


In [112]:
print(confusion_matrix(y_test, y_pred))
print('\n', classification_report(y_test, y_pred))

[[85146   147]
 [   10   140]]

               precision    recall  f1-score   support

           0       1.00      1.00      1.00     85293
           1       0.49      0.93      0.64       150

   micro avg       1.00      1.00      1.00     85443
   macro avg       0.74      0.97      0.82     85443
weighted avg       1.00      1.00      1.00     85443



In [78]:
cross = cross_val_score(xgb, X_train, y_train, cv=5)
print(cross)
print("Cross Validation Mean: ", cross.mean())

[0.99947334 0.99962382 0.99944823 0.99967396 0.99969904]
Cross Validation Mean:  0.999583676794181
