Credit Card Fraud Detection using K Means Clustering and K-Nearest Neighbours

In [27]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report, f1_score

In [3]:
dataset = pd.read_csv('creditcard.csv')

In [5]:
dataset.drop(['Time'], axis=1, inplace=True)

In [6]:
X = dataset.drop(['Class'], axis=1).values
y = dataset['Class'].values[:,None]

In [7]:
X.shape, y.shape

((284807, 29), (284807, 1))

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9)

In [9]:
X_train.shape, y_train.shape

((256326, 29), (256326, 1))

In [10]:
X_train=normalize(X_train)
X_test=normalize(X_test)

In [11]:
kmeans = KMeans(n_clusters=2, random_state=0, algorithm="full", max_iter=10000)
kmeans.fit(X_train)
kmeans_predicted_cluster_for_train=kmeans.predict(X_train)

In [13]:
pd.DataFrame(kmeans_predicted_cluster_for_train).nunique()

0    2
dtype: int64

In [14]:
kmeans_predicted_cluster_for_train.shape

(256326,)

In [15]:
tn, fp, fn, tp = confusion_matrix(y_train, kmeans_predicted_cluster_for_train).ravel()
reversed_cluster=False
if tn+tp<fn+fp:
    reversed_cluster=True
kmeans_predicted_cluster_for_test = kmeans.predict(X_test)
if reversed_cluster:
    kmeans_predicted_cluster_for_test = 1 - kmeans_predicted_cluster_for_test


In [16]:
tn, fp, fn, tp = confusion_matrix(y_test, kmeans_predicted_cluster_for_test).ravel()
print(tn, fp)
print(fn, tp)

22612 5822
26 21


In [17]:
print(classification_report(y_test, kmeans_predicted_cluster_for_test))

              precision    recall  f1-score   support

           0       1.00      0.80      0.89     28434
           1       0.00      0.45      0.01        47

    accuracy                           0.79     28481
   macro avg       0.50      0.62      0.45     28481
weighted avg       1.00      0.79      0.88     28481



In [19]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train.ravel())

KNeighborsClassifier()

In [20]:
knn_y_predicted = knn.predict(X_test)

In [23]:
X_test.shape, knn_y_predicted.shape

((28481, 29), (28481,))

In [25]:
tn, fp, fn, tp = confusion_matrix(y_test, knn_y_predicted).ravel()
print(tn, fp)
print(fn, tp)

28432 2
16 31


In [26]:
print(classification_report(y_test, knn_y_predicted))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28434
           1       0.94      0.66      0.78        47

    accuracy                           1.00     28481
   macro avg       0.97      0.83      0.89     28481
weighted avg       1.00      1.00      1.00     28481

