# CREDIT CARD FRAUD DETECTION

## Data Preparation

Due to the imbalance in the data set, we'll be applying some resampling techniques to the data.

In [136]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression, SGDClassifier
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error, average_precision_score


In [95]:
data = pd.read_csv("fraud_data1.csv")

In [96]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    1000 non-null   float64
 1   V1      1000 non-null   float64
 2   V2      1000 non-null   float64
 3   V3      1000 non-null   float64
 4   V4      1000 non-null   float64
 5   V5      1000 non-null   float64
 6   V6      1000 non-null   float64
 7   V7      1000 non-null   float64
 8   V8      1000 non-null   float64
 9   V9      1000 non-null   float64
 10  V10     1000 non-null   float64
 11  V11     1000 non-null   float64
 12  V12     1000 non-null   float64
 13  V13     1000 non-null   float64
 14  V14     1000 non-null   float64
 15  V15     1000 non-null   float64
 16  V16     1000 non-null   float64
 17  V17     1000 non-null   float64
 18  V18     1000 non-null   float64
 19  V19     1000 non-null   float64
 20  V20     1000 non-null   float64
 21  V21     1000 non-null   float64
 22  V

In [97]:
data.Class.value_counts()

0    508
1    492
Name: Class, dtype: int64

In [98]:
data.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,39818.913,-2.42553,1.878615,-3.019279,2.424717,-1.536179,-0.562552,-2.686945,0.258071,-1.278388,...,0.347925,-0.046017,-0.036092,-0.046412,0.087234,0.028997,0.093733,0.027819,95.07119,0.492
std,52427.144542,5.37538,3.574154,6.404924,3.03325,4.175311,1.796103,5.826726,4.811246,2.257019,...,2.765479,1.138552,1.139956,0.584673,0.637211,0.467639,0.996268,0.439521,238.416015,0.500186
min,0.0,-30.55238,-12.114213,-31.103685,-4.515824,-22.105532,-6.406267,-43.557242,-41.044261,-13.434066,...,-22.797604,-8.887017,-19.254328,-2.028024,-4.781606,-1.243924,-7.263482,-2.733887,0.0,0.0
25%,170.5,-2.732232,0.083608,-5.049001,0.335999,-1.646219,-1.433637,-3.011468,-0.165758,-2.173742,...,-0.173785,-0.533915,-0.219105,-0.414797,-0.216776,-0.294351,-0.03931,-0.049541,1.79,0.0
50%,368.0,-0.773294,0.936007,-0.273562,1.364654,-0.380261,-0.50744,-0.418858,0.151564,-0.566636,...,0.08312,-0.031383,-0.050372,0.029098,0.127998,-0.009778,0.062689,0.037173,15.22,0.0
75%,73595.75,0.913013,2.710365,0.955399,4.106961,0.423451,0.241343,0.309507,0.7776,0.118491,...,0.683621,0.440345,0.140485,0.392575,0.455121,0.308815,0.429253,0.209627,89.55,1.0
max,170348.0,2.132386,22.057729,3.772857,12.114672,11.095089,6.474115,5.802537,20.007208,5.436633,...,27.202839,8.361985,5.46623,1.215279,2.208209,3.065576,3.052358,1.779364,3828.04,1.0


## Preprocessing

In [115]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data.drop("Class", axis = 1))

In [116]:
scaled_data = pd.DataFrame(scaled_data, columns = data.drop("Class", axis = 1).columns)

In [118]:
target = data.Class

In [103]:
xtrain, xtest, ytrain, ytest = train_test_split(scaled_data, target, test_size = 0.33, random_state = 43, stratify = target)


In [104]:
ytrain.value_counts()

0    340
1    330
Name: Class, dtype: int64

In [105]:
ytest.value_counts()

0    168
1    162
Name: Class, dtype: int64

In [106]:
clf = SVC()

In [107]:
clf.fit(xtrain, ytrain)

SVC()

In [108]:
preds = clf.predict(xtest)

In [148]:
def evalscore(actual, predictions):
    temp = [mean_squared_error(actual, predictions), roc_auc_score(actual, predictions), f1_score(actual, predictions), average_precision_score(actual, predictions)]
    print("Mean Squared Error: {:.5f}\nArea Under Curve: {:.5f}\nF1 Score: {:.5f}\nAverage Precision Score: {:.5f}\n".format(temp[0], temp[1], temp[2], temp[3]))
    

In [149]:
evalscore(ytest, preds)

Mean Squared Error: 0.00909
Area Under Curve: 0.99074
F1 Score: 0.99065
Average Precision Score: 0.99057



In [111]:
test = pd.read_csv("fraud_data2.csv")
scaled_test = scaler.fit_transform(test.drop("Class", axis = 1))
scaled_test = pd.DataFrame(scaled_test, columns = test.drop("Class", axis= 1).columns)

In [112]:
y_test = test.Class

In [113]:
preds2 = clf.predict(scaled_test)

In [150]:
evalscore(y_test, preds2)


Mean Squared Error: 0.01267
Area Under Curve: 0.99058
F1 Score: 0.98106
Average Precision Score: 0.96282



In [197]:
def compareModels(models, xtrain, ytrain, xtest, ytest):
    trained = []
    for model in models:
        model.fit(xtrain, ytrain)
        preds = model.predict(xtest)
        print("\n", str(model).rstrip("()"), "\n")
        evalscore(ytest, preds)
        trained.append(model)
    return trained

def evalModels(models, x, actual):
    for model in models:
        preds = model.predict(x)
        print("\n", str(model).rstrip("()"), "\n")
        evalscore(actual, preds)


In [126]:
svm = SVC()
logr = LogisticRegression()
kmeans = KMeans(n_clusters = 2)
sgd = SGDClassifier()


In [133]:
models = [svm, logr, kmeans, sgd ]

In [173]:
temp = compareModels(models, scaled_test, y_test, xtrain, ytrain)



 SVC 

Mean Squared Error: 0.02537
Area Under Curve: 0.97424
F1 Score: 0.97356
Average Precision Score: 0.97386


 LogisticRegression 

Mean Squared Error: 0.03582
Area Under Curve: 0.96364
F1 Score: 0.96226
Average Precision Score: 0.96309


 KMeans(n_clusters=2 

Mean Squared Error: 0.65224
Area Under Curve: 0.35303
F1 Score: 0.51606
Average Precision Score: 0.43188


 SGDClassifier 

Mean Squared Error: 0.01194
Area Under Curve: 0.98788
F1 Score: 0.98773
Average Precision Score: 0.98770



In [174]:
svc, lr, km, sgd = temp
print(svc, lr, sgd)

SVC() LogisticRegression() SGDClassifier()


In [194]:
ntest = pd.read_csv("testdata/fraudtest.csv")
cols = ntest.drop(["Unnamed: 0", "Class"], axis = 1).columns
scaled_ntest = scaler.fit_transform(ntest.drop(["Unnamed: 0", "Class"], axis = 1))
scaled_ntest = pd.DataFrame(scaled_ntest, columns = cols)
scaled_ntest


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,0.097544,0.330885,0.869815,-0.991136,0.788589,0.674193,-0.945969,0.300327,0.173699,-0.201607,...,-0.073034,-0.101944,0.150782,0.365255,-0.297651,-1.446171,-0.975531,0.228472,-0.290103,-0.334354
1,-0.012553,-0.541962,0.446129,0.665878,-0.390727,0.054880,-0.456543,0.532896,-0.276545,1.812365,...,0.960090,-0.445073,0.147465,-0.093414,1.126018,0.302651,0.276822,0.662645,-0.492395,-0.346429
2,-0.207715,-0.438238,-0.662879,0.431065,-0.351270,1.157823,0.397381,1.269577,-1.119199,1.316189,...,-0.656761,-1.024993,-1.773301,0.407985,-2.354777,-1.629474,0.595494,-4.255361,-4.384485,0.290724
3,0.084872,-0.231758,0.242346,0.379351,-0.710405,-0.229419,-0.520752,0.050520,0.312440,0.314399,...,-0.259240,0.080849,0.639259,-0.021185,0.492755,-1.208728,1.308579,0.229929,0.322022,-0.263106
4,-0.379455,-0.092861,-0.345108,0.272756,-1.122493,-0.481363,0.677328,0.268026,0.105401,-1.223307,...,-1.003010,-0.284939,-0.635338,0.257628,-1.468955,-2.175284,1.270625,-0.101387,0.217360,0.545032
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.128600,-1.450106,-0.453471,0.121285,-0.550825,0.336425,-0.835406,0.303339,-0.063197,1.702317,...,-1.546883,-0.716048,0.631198,1.917528,0.725273,0.487026,1.493548,1.780387,-1.055876,-0.218412
9996,-0.337806,-0.078975,0.207868,0.211597,-1.480085,0.116546,-1.020998,0.600402,-0.106001,0.150309,...,0.106537,0.083758,1.084476,-0.387350,0.843487,-0.100425,-1.726229,0.647403,0.107712,-0.341254
9997,-0.006477,-0.097830,-0.198360,1.123052,0.052340,-0.156541,0.749903,-0.107014,0.215828,1.449798,...,-0.360568,0.019053,1.371714,-0.603617,-0.084031,-0.168313,-0.863545,-0.004400,-0.366925,-0.340313
9998,-0.312515,-0.013786,0.481243,0.067979,0.048074,0.414229,-0.817058,0.513208,-0.056438,-1.204617,...,0.048355,0.064307,0.416358,-0.267775,0.752856,-0.813325,0.430712,-0.089268,0.558639,-0.342508


In [195]:
clfs = [svc, lr, km, sgd]

In [198]:
evalModels(clfs, scaled_ntest, ntest.Class)



 SVC 

Mean Squared Error: 0.49980
Area Under Curve: 0.73778
F1 Score: 0.15830
Average Precision Score: 0.08595


 LogisticRegression 

Mean Squared Error: 0.47870
Area Under Curve: 0.74885
F1 Score: 0.16413
Average Precision Score: 0.08940


 KMeans(n_clusters=2 

Mean Squared Error: 0.99060
Area Under Curve: 0.09798
F1 Score: 0.01824
Average Precision Score: 0.03967


 SGDClassifier 

Mean Squared Error: 0.60480
Area Under Curve: 0.68269
F1 Score: 0.13452
Average Precision Score: 0.07211



In [202]:
evalModels(clfs, scaled_test, test.Class)
           


 SVC 

Mean Squared Error: 0.00067
Area Under Curve: 0.99898
F1 Score: 0.99898
Average Precision Score: 0.99863


 LogisticRegression 

Mean Squared Error: 0.00200
Area Under Curve: 0.99695
F1 Score: 0.99694
Average Precision Score: 0.99590


 KMeans(n_clusters=2 

Mean Squared Error: 0.80667
Area Under Curve: 0.29472
F1 Score: 0.32402
Average Precision Score: 0.26636


 SGDClassifier 

Mean Squared Error: 0.00200
Area Under Curve: 0.99851
F1 Score: 0.99696
Average Precision Score: 0.99394

