In [5]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

import pickle

# Load the transactions table from the parent directory, using the
# 'transactionId' column as the row index.
transactions = pd.read_csv('../transactions.csv', index_col='transactionId')

In [None]:
def m_accuracy(model, X, y, test_size=0.05, random_state=None):
    '''
    Fit `model` on a random training split of (X, y) and score it on the
    held-out remainder.

    The model is fitted in place: the object passed in by the caller is left
    trained on the training portion of the split drawn by this call.

    model: estimator exposing fit(X, y) and score(X, y)
    X: feature data
    y: target labels aligned with X
    test_size: share of the data held out for scoring, forwarded to
        train_test_split (default 0.05, the value that used to be hard-coded)
    random_state: seed forwarded to train_test_split; None (the default)
        draws a different split on every call

    returns: the value of model.score on the held-out split (the mean
        accuracy for scikit-learn classifiers)
    '''

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)

def m_accuracy_mv(model, X, y, N:int):
    '''
    Run the m_accuracy evaluation N times and summarise the resulting scores.

    returns: the tuple (mean, standard deviation) of the N scores
    '''

    scores = []
    for _ in range(N):
        # Each call draws its own train/test split and refits the model.
        scores.append(m_accuracy(model, X, y))

    return (np.mean(scores), np.std(scores))

def norm2float(df:pd.DataFrame, cols:list):
    '''
    Convert decimal-comma text columns (e.g. "1,5") to float, in place.

    df: frame whose columns are converted; it is modified in place
    cols: names of the columns to convert

    Columns that are already numeric are only cast to float, so calling this
    function again on an already-converted frame (for example when a notebook
    cell is re-run) no longer raises.

    returns: the same DataFrame object that was passed in
    '''
    for col in cols:
        if pd.api.types.is_numeric_dtype(df[col]):
            # The .str accessor is not available on numeric columns, so there
            # is nothing to replace; just normalise the dtype.
            df[col] = df[col].astype(float)
        else:
            # regex=False: ',' is replaced literally, independent of the
            # pandas version's default for the regex flag.
            df[col] = df[col].str.replace(',', '.', regex=False).astype(float)

    return df

In [6]:
# Preview the first 10 rows of the loaded table.
transactions.head(10)

Unnamed: 0_level_0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
transactionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1,3,9839.64,11231010000.0,170136.0,160296.36,21979790000.0,0.0,0.0,0
1,1,3,1864.28,11666540000.0,21249.0,19384.72,22044280000.0,0.0,0.0,0
2,1,4,181.0,11305490000.0,181.0,0.0,1553264000.0,0.0,0.0,1
3,1,1,181.0,1840084000.0,181.0,0.0,138997000.0,21182.0,0.0,1
4,1,3,11668.14,12048540000.0,41554.0,29885.86,21230700000.0,0.0,0.0,0
5,1,3,7817.71,190045600.0,53860.0,46042.29,2573487000.0,0.0,0.0,0
6,1,3,7107.77,1154989000.0,183195.0,176087.23,2408069000.0,0.0,0.0,0
7,1,3,7861.64,11912850000.0,176087.23,168225.59,2633326000.0,0.0,0.0,0
8,1,3,4024.36,11265010000.0,2671.0,0.0,21176930000.0,0.0,0.0,0
9,1,2,5337.77,1712410000.0,41720.0,36382.23,1195601000.0,41898.0,40348.79,0


In [7]:
# Separate the 'isFraud' label from the remaining feature columns.
y = transactions['isFraud']
X = transactions.drop(columns=['isFraud'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Resample the training portion only with SMOTE; the test split is left as drawn.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [8]:
# Model training and testing
# For every n_neighbors in 2 .. N-1, fit one KNN on the original training split
# and one on the SMOTE-resampled training split, and score both on the same
# test split.
N = 10
records = []

for i in range(2, N):
    clf = KNeighborsClassifier(n_neighbors=i)
    smote_clf = KNeighborsClassifier(n_neighbors=i)

    clf.fit(X_train, y_train)
    smote_clf.fit(X_train_resampled, y_train_resampled)

    records.append({
        'n_neighbors': i,
        'accuracy': clf.score(X_test, y_test),
        'smote_accuracy': smote_clf.score(X_test, y_test),
    })

# Build the frame once from the collected records: each column keeps its own
# dtype, so n_neighbors stays an integer instead of being upcast to float by
# row-wise assignment of a mixed int/float list.
df = pd.DataFrame(records, columns=['n_neighbors', 'accuracy', 'smote_accuracy'])

df.head(15)

Unnamed: 0,n_neighbors,accuracy,smote_accuracy
0,2.0,0.992085,0.922809
1,3.0,0.991921,0.872914
2,4.0,0.992145,0.885149
3,5.0,0.992145,0.847721
4,6.0,0.992154,0.860263
5,7.0,0.992154,0.830443
6,8.0,0.992154,0.843243
7,9.0,0.992154,0.817608


In [None]:
# Test-split accuracy against n_neighbors for both training sets. The legend
# and axis labels let the figure be read without looking at the code.
plt.plot(df['n_neighbors'], df['accuracy'], color='blue', label='original training set')
plt.plot(df['n_neighbors'], df['smote_accuracy'], color='red', label='SMOTE-resampled training set')
plt.xlabel('n_neighbors')
plt.ylabel('accuracy on test split')
plt.title('KNN accuracy: original vs. SMOTE-resampled training data')
plt.legend()
plt.show()

In [None]:
# Fit the classifier that gets persisted, on the original (non-resampled)
# training split.
# NOTE(review): n_neighbors=4 is hard-coded; presumably picked from the
# comparison table above — confirm.
knn_clf = KNeighborsClassifier(n_neighbors=4)
knn_clf.fit(X_train, y_train)

# The context manager closes the file even if pickling raises; previously the
# handle returned by open() was never closed explicitly.
with open('./KNN.pkl', 'wb') as model_file:
    pickle.dump(knn_clf, model_file)