In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

import pickle

# Load the transactions table; the path is relative, so it is resolved
# against the working directory the notebook is run from.
transactions_csv = '../database/transactions.csv'
transactions = pd.read_csv(transactions_csv)

In [None]:
def m_accuracy(model, X, y, test_size=0.05, random_state=None):
    '''
    Fit ``model`` on a random train split of (X, y) and score it on the
    held-out remainder.

    model: estimator exposing ``fit(X, y)`` and ``score(X, y)``; it is
        fitted in place, so the object passed in is modified by this call
    X, y: features and labels, forwarded unchanged to ``train_test_split``
    test_size: share of the data held out for scoring (default 0.05)
    random_state: seed forwarded to ``train_test_split``; the default None
        draws a new random split on every call, a fixed int makes the split
        (and therefore the score) repeat across calls

    returns: the value of ``model.score`` on the held-out split
    '''

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)

def m_accuracy_mv(model, X, y, N:int):
    '''
    Run the m_accuracy evaluation N times on (X, y) and summarise the scores.

    returns: the tuple (mean, standard deviation) of the N collected scores,
             computed with np.mean and np.std
    '''

    scores = []
    for _ in range(N):
        # every repetition goes through m_accuracy again, which performs
        # its own train/test split
        scores.append(m_accuracy(model, X, y))

    mean_score = np.mean(scores)
    score_spread = np.std(scores)
    return (mean_score, score_spread)

def norm2float(df:pd.DataFrame, cols:list):
    '''
    Convert the given columns of ``df`` to float, in place.

    String columns are expected to use a comma as decimal separator
    (e.g. '12,5'); the comma is replaced by a dot before the cast.
    Columns that are already numeric are simply cast to float, so calling
    the function again on an already converted frame is harmless.

    df: frame to convert; it is modified in place
    cols: names of the columns to convert

    returns: the same ``df`` object, for convenient reassignment/chaining
    '''
    for col in cols:
        values = df[col]
        if pd.api.types.is_numeric_dtype(values):
            # no text to fix here; the .str accessor would raise on numbers
            df[col] = values.astype(float)
        else:
            # regex=False: the comma is a literal character, not a pattern
            df[col] = values.str.replace(',', '.', regex=False).astype(float)

    return df

In [None]:
# Data Normalization
# Integer code assigned to each transaction type. An explicit dict + map
# yields an integer column directly instead of relying on replace() to
# silently downcast an object column (deprecated in recent pandas).
type_codes = {'CASH_IN': 0, 'CASH_OUT': 1, 'DEBIT': 2, 'PAYMENT': 3, 'TRANSFER': 4}

# Fail early, before anything is modified, if the column holds a value with
# no code (this also triggers when the cell is re-run on encoded data).
unmapped_types = set(transactions['type'].dropna().unique()) - set(type_codes)
assert not unmapped_types, f'unmapped transaction types: {unmapped_types}'

# Build the encoded frame in one chain so `transactions` is only rebound
# once both steps have succeeded; the identifier columns are dropped.
transactions = (
    transactions
    .assign(type=lambda frame: frame['type'].map(type_codes))
    .drop(columns=['transactionId', 'nameOrig', 'nameDest'])
)

# The monetary columns are turned into floats by norm2float
transactions = norm2float(transactions, ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest'])
transactions.head(10)

In [None]:
# Separate the features from the fraud label
X = transactions.drop(columns=['isFraud'])
y = transactions['isFraud']
smote = SMOTE(random_state=42)

# Split the data into training and testing sets.
# stratify=y keeps the proportion of fraud cases approximately equal in both
# splits, which matters because the label is imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Oversample the training split only; the test split is left as drawn
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# Model training and testing
# For every n_neighbors in [2, N), fit one k-NN on the original training split
# and one on the SMOTE-resampled training split, and score both on X_test.
N = 10
score_rows = []

for i in range(2, N):
    clf = KNeighborsClassifier(n_neighbors=i)
    smote_clf = KNeighborsClassifier(n_neighbors=i)

    clf.fit(X_train, y_train)
    smote_clf.fit(X_train_resampled, y_train_resampled)

    score_rows.append({
        'n_neighbors': i,
        'accuracy': clf.score(X_test, y_test),
        'smote_accuracy': smote_clf.score(X_test, y_test),
    })

# Build the frame once from the collected rows: n_neighbors stays an integer
# column and the frame is not re-allocated on every iteration.
df = pd.DataFrame(score_rows, columns=['n_neighbors', 'accuracy', 'smote_accuracy'])

df.head(15)

In [None]:
# Accuracy against n_neighbors for both training variants; labels and a
# legend are set so the figure can be read without looking at the code.
fig, ax = plt.subplots()
ax.plot(df['n_neighbors'], df['accuracy'], color='blue', label='accuracy')
ax.plot(df['n_neighbors'], df['smote_accuracy'], color='red', label='smote_accuracy')
ax.set_xlabel('n_neighbors')
ax.set_ylabel('accuracy')
ax.set_title('k-NN accuracy by number of neighbors')
ax.legend()
plt.show()

In [None]:
# Fit the classifier that gets persisted: k-NN with n_neighbors=4 on the
# original (non-resampled) training split.
knn_clf = KNeighborsClassifier(n_neighbors=4)
knn_clf.fit(X_train, y_train)

# The context manager closes the file handle once the model is written
with open('./KNN.pkl', 'wb') as model_file:
    pickle.dump(knn_clf, model_file)