In [9]:
# Summary

# load experimental data
# normalize experimental data
# Handle class imbalance using Smote
# use train test split, and KNeighborsClassifier to train the model
# load dummy dataset which is mine
# predict this dataset using the model that we defined
# check results

In [10]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

In [11]:
# Load the experimental dataset (with "Class" column)
df_exp = pd.read_csv('../data/creditcard.csv')

# Drop duplicates
df_exp = df_exp.drop_duplicates()

# Normalizing the "Amount" column
scaler = StandardScaler()
df_exp["Normalized_Amount"] = scaler.fit_transform(df_exp["Amount"].values.reshape(-1, 1))

# Drop the original "Amount" column (optional)
df_exp = df_exp.drop(["Amount"], axis=1)

In [12]:
# Fetching the target feature ("Class")
Y_exp = df_exp["Class"]

# Fetching the independent features (dropping "Class")
X_exp = df_exp.drop(["Class"], axis=1)

# Handle class imbalance using SMOTE
X_balance, Y_balance = SMOTE().fit_resample(X_exp, Y_exp)

# Split the experimental data into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X_balance, Y_balance, test_size=1/3, random_state=42)

# Initialize and train the KNN classifier
knn = KNeighborsClassifier(n_neighbors=5, algorithm="kd_tree", n_jobs=-1)
knn.fit(X_train, Y_train)
print('Model trained.')

# Parameters for KNeighborsClassifier

# "n_neighbors" parameter is the number of nearest neighbors to consider when predicting a new data point.
# 5 is a commonly used value that provides a balance between sensitivity to the data and reducing noise.
# It's often chosen as a starting point for KNN.
# Cross-validation for k Selection did not apply since the dataset is too big.

# "algorithm" parameter is for the selected algorithm as named. The "kd_tree" parameter is one of the most efficient algorithms
# for searching for nearest neighbors, especially in high-dimensional datasets.
# We can use some alternatives for the algorithm, like "brute" or "ball_tree", but "kd_tree" is often chosen
# because it provides a good balance of speed and accuracy, especially if the dataset has multiple features and isn't extremely large.

# "n_jobs" parameter controls the number of CPU cores to use for computation.
# We selected the "n_jobs" parameter as -1, which means using all available CPU cores to speed up the computation.

Model trained.


In [13]:
# Load your dummy dataset (without "Class" column)
df = pd.read_csv('../data/credit_card_fraud_data.csv')

# Drop duplicates
df = df.drop_duplicates()

# Normalize the "Amount" column using the same scaler
df["Normalized_Amount"] = scaler.transform(df["Amount"].values.reshape(-1, 1))

# Drop the "Amount" column as it's already normalized
df = df.drop(columns=['Amount'])

# Predict the "Class" for your dummy dataset using the trained KNN model
knn_predicted_class = knn.predict(df)

# Add the predicted "Class" as a new column to your dummy dataset
df['Predicted_Class'] = knn_predicted_class

# Display the first few rows to see the predicted results
print(df['Predicted_Class'].value_counts())

Predicted_Class
0    993
1      7
Name: count, dtype: int64


In [14]:
# Display only fraudulent transactions
fraudulent_transactions = df[df['Predicted_Class'] == 1]

# Display original values and predicted class
fraudulent_transactions.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Normalized_Amount,Predicted_Class
452,453.0,-1.704162,-0.453726,2.432202,0.022421,0.406636,0.171611,-1.170789,-1.002315,0.507704,...,2.735958,-1.594964,1.839736,-2.044991,1.607374,-1.621848,-0.105754,-1.009972,0.126964,1
453,454.0,0.059401,0.612338,-1.362311,-0.890434,-0.477306,-0.169297,-0.102583,-0.306116,1.115609,...,0.746976,-1.512821,-0.425203,-0.113005,-0.252147,-0.667364,-1.901829,-0.303158,2.554583,1
468,469.0,0.66114,-1.051579,0.455899,-0.661237,2.813533,0.992875,-1.156099,0.401932,1.351873,...,0.051998,0.679155,-0.548599,-0.942351,-0.463488,-0.513513,-0.828955,0.851514,3.042116,1
499,500.0,4.027357,-7.391163,-7.992751,-4.646769,-4.187876,-7.727413,11.277032,-24.940195,-10.807854,...,-1.320533,-11.662054,2.488555,-20.802108,1.191081,-5.933462,-8.771469,-2.165336,36.48992,1
605,606.0,-5.524287,-5.433947,-4.317835,-24.786339,1.97525,3.944426,5.589757,4.513293,-6.42053,...,15.852735,16.548414,-5.482316,0.239618,11.923256,-17.230375,-1.592555,-12.045658,36.534066,1


In [15]:
# Simulate true labels based on some condition (same as Decision Tree one)
threshold = 5.0  # Example threshold; adjust based on your logic
df['True_Class'] = (df['Normalized_Amount'] > threshold).astype(int)

# Evaluate the KNN model
accuracy = accuracy_score(df['True_Class'], knn_predicted_class)
precision = precision_score(df['True_Class'], knn_predicted_class)
recall = recall_score(df['True_Class'], knn_predicted_class)
f1 = f1_score(df['True_Class'], knn_predicted_class)

# Print evaluation metrics for KNN
print("KNN Evaluation Metrics:")
print("Accuracy: {:.5f}".format(accuracy))
print("Precision: {:.5f}".format(precision))
print("Recall: {:.5f}".format(recall))
print("F1-score: {:.5f}".format(f1))

KNN Evaluation Metrics:
Accuracy: 0.97000
Precision: 0.57143
Recall: 0.12903
F1-score: 0.21053
