In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
# Columns kept for the experiment: six of the V* features plus the
# ground-truth label column 'Class'.
features = ['V10','V14','V4','V12', 'V11', 'V17', 'Class']

# Read the CSV, discard 'Time', narrow to the selected columns and take an
# independent deep copy so later cells never touch the frame returned by
# read_csv.
df = (
    pd.read_csv('creditcard.csv')
    .drop(columns=['Time'])
    .loc[:, features]
    .copy(deep=True)
)

df

Unnamed: 0,V10,V14,V4,V12,V11,V17,Class
0,0.090794,-0.311169,1.378155,-0.617801,-0.551600,0.207971,0
1,-0.166974,-0.143772,0.448154,1.065235,1.612727,-0.114805,0
2,0.207643,-0.165946,0.379780,0.066084,0.624501,1.109969,0
3,-0.054952,-0.287924,-0.863291,0.178228,-0.226487,-0.684093,0
4,0.753074,-1.119670,0.403034,0.538196,-0.822843,-0.237033,0
...,...,...,...,...,...,...,...
284802,4.356170,4.626942,-2.066656,2.711941,-1.593105,1.991691,0
284803,-0.975926,-0.675143,-0.738589,0.915802,-0.150189,-0.025693,0
284804,-0.484782,-0.510602,-0.557828,0.063119,0.411614,0.313502,0
284805,-0.399126,0.449624,0.689799,-0.962886,-1.933849,0.509928,0


In [139]:
# Feature matrix handed to every detector: all columns except the label.
X = df.drop(columns=["Class"])
X

Unnamed: 0,V10,V14,V4,V12,V11,V17
0,0.090794,-0.311169,1.378155,-0.617801,-0.551600,0.207971
1,-0.166974,-0.143772,0.448154,1.065235,1.612727,-0.114805
2,0.207643,-0.165946,0.379780,0.066084,0.624501,1.109969
3,-0.054952,-0.287924,-0.863291,0.178228,-0.226487,-0.684093
4,0.753074,-1.119670,0.403034,0.538196,-0.822843,-0.237033
...,...,...,...,...,...,...
284802,4.356170,4.626942,-2.066656,2.711941,-1.593105,1.991691
284803,-0.975926,-0.675143,-0.738589,0.915802,-0.150189,-0.025693
284804,-0.484782,-0.510602,-0.557828,0.063119,0.411614,0.313502
284805,-0.399126,0.449624,0.689799,-0.962886,-1.933849,0.509928


# EllipticEnvelope

In [75]:
from sklearn.covariance import EllipticEnvelope

# Fit the detector on the full feature matrix and score the same rows.
# Rows predicted as -1 are the ones mapped to the fraud label downstream.
# NOTE(review): contamination=0.05 asks for roughly 5% of rows to be flagged,
# while the recorded report shows 492 fraud rows out of 284807 (~0.17%), so
# precision is capped at about 3.5% with this setting.
model = EllipticEnvelope(contamination=0.05, random_state=0)
model.fit(X)
pred = model.predict(X)

In [16]:
def evaluate(y_true, y_pred):
    """Print a classification report and a labelled confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, 0 for non-fraud and 1 for fraud.
    y_pred : array-like
        Predicted labels using the same 0/1 encoding.

    Returns
    -------
    None
        Both summaries are written to stdout.
    """
    # Pin the label set and its order. Without it scikit-learn infers the
    # labels from the data, so an input where only one class occurs yields a
    # 1x1 matrix that does not fit the fixed 2x2 frame (and two target names)
    # used below.
    labels = [0, 1]
    print(classification_report(y_true, y_pred, labels=labels,
                                target_names=['non‑fraud','fraud']))
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    # Rows are the true classes, columns the predicted classes.
    cm_df_out = pd.DataFrame(cm,
                         index=['true_non‑fraud','true_fraud'],
                         columns=['pred_non‑fraud','pred_fraud'])
    print(cm_df_out)

In [77]:
# Ground truth comes straight from the label column; detector flags of -1
# become the fraud label 1 and every other value becomes 0.
y_true = df['Class']
y_pred = [int(flag == -1) for flag in pred]
evaluate(y_true, y_pred)

              precision    recall  f1-score   support

   non‑fraud       1.00      0.95      0.98    284315
       fraud       0.03      0.88      0.06       492

    accuracy                           0.95    284807
   macro avg       0.52      0.91      0.52    284807
weighted avg       1.00      0.95      0.97    284807

                pred_non‑fraud  pred_fraud
true_non‑fraud          270505       13810
true_fraud                  61         431


# IsolationForest

In [103]:
from sklearn.ensemble import IsolationForest

# Fit the forest on the full feature matrix and score the same rows.
# Rows predicted as -1 are the ones mapped to the fraud label downstream.
# NOTE(review): contamination=0.05 asks for roughly 5% of rows to be flagged,
# while the recorded report shows 492 fraud rows out of 284807 (~0.17%), so
# precision is capped at about 3.5% with this setting.
model = IsolationForest(contamination=0.05, random_state=0)
model.fit(X)
pred = model.predict(X)

In [104]:
# Ground truth comes straight from the label column; detector flags of -1
# become the fraud label 1 and every other value becomes 0.
y_true = df['Class']
y_pred = [int(flag == -1) for flag in pred]
evaluate(y_true, y_pred)

              precision    recall  f1-score   support

   non‑fraud       1.00      0.95      0.98    284315
       fraud       0.03      0.90      0.06       492

    accuracy                           0.95    284807
   macro avg       0.52      0.92      0.52    284807
weighted avg       1.00      0.95      0.97    284807

                pred_non‑fraud  pred_fraud
true_non‑fraud          270516       13799
true_fraud                  50         442


# LOF

In [62]:
from sklearn.neighbors import LocalOutlierFactor

# Build the detector first, then fit and label the same rows in one call.
# Rows labelled -1 are the ones mapped to the fraud label downstream.
# NOTE(review): contamination=0.05 flags roughly 5% of rows, far more than the
# ~0.17% fraud share shown in the recorded reports.
lof_detector = LocalOutlierFactor(n_neighbors=50, contamination=0.05)
pred = lof_detector.fit_predict(X)



In [64]:
# Ground truth comes straight from the label column; detector flags of -1
# become the fraud label 1 and every other value becomes 0.
y_true = df['Class']
y_pred = [int(flag == -1) for flag in pred]
evaluate(y_true, y_pred)

              precision    recall  f1-score   support

   non‑fraud       1.00      0.95      0.97    284315
       fraud       0.00      0.07      0.00       492

    accuracy                           0.95    284807
   macro avg       0.50      0.51      0.49    284807
weighted avg       1.00      0.95      0.97    284807

                pred_non‑fraud  pred_fraud
true_non‑fraud          270107       14208
true_fraud                 459          33


# DBSCAN

In [12]:
from sklearn.cluster import DBSCAN

# Cluster the full feature matrix; the next cell treats rows labelled -1 as
# outliers.
# NOTE(review): eps=1 is applied to the features as loaded, without any
# rescaling - confirm that a single radius suits all six columns.
dbscan_clusterer = DBSCAN(eps=1, min_samples=10)
pred = dbscan_clusterer.fit_predict(X)

In [18]:
y_true = df['Class']

# Positions of the rows whose cluster label is -1, kept as a plain list of
# ints.
outliers_indices = np.flatnonzero(pred == -1).tolist()

# Start from "all non-fraud" and switch the outlier positions to fraud.
y_pred = np.zeros(len(y_true), dtype=int)
y_pred[outliers_indices] = 1

evaluate(y_true, y_pred)

              precision    recall  f1-score   support

   non‑fraud       1.00      0.99      0.99    284315
       fraud       0.11      0.86      0.19       492

    accuracy                           0.99    284807
   macro avg       0.55      0.92      0.59    284807
weighted avg       1.00      0.99      0.99    284807

                pred_non‑fraud  pred_fraud
true_non‑fraud          280707        3608
true_fraud                  68         424


# OneClassSVM (linear, via SGDOneClassSVM)

In [141]:
from sklearn.linear_model import SGDOneClassSVM

# Build the detector first, then fit and label the same rows in one call.
# Rows labelled -1 are the ones mapped to the fraud label downstream.
# NOTE(review): the recorded run flags 32756 rows (~11.5%), well above
# nu=0.05. The features are fed in unscaled, which may matter for an
# SGD-fitted model - consider standardising them first (to be confirmed).
sgd_ocsvm = SGDOneClassSVM(nu=0.05, random_state=0)
pred = sgd_ocsvm.fit_predict(X)

In [143]:
# Ground truth comes straight from the label column; detector flags of -1
# become the fraud label 1 and every other value becomes 0.
y_true = df['Class']
y_pred = [int(flag == -1) for flag in pred]
evaluate(y_true, y_pred)

              precision    recall  f1-score   support

   non‑fraud       1.00      0.88      0.94    284315
       fraud       0.00      0.03      0.00       492

    accuracy                           0.88    284807
   macro avg       0.50      0.46      0.47    284807
weighted avg       1.00      0.88      0.94    284807

                pred_non‑fraud  pred_fraud
true_non‑fraud          251572       32743
true_fraud                 479          13


No matter which algorithm I use, the results are never nearly as good as in the example. Am I doing something wrong?