In [1]:
"""
Imports
"""
import pandas as pd
import os
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn.semi_supervised import LabelPropagation
from imblearn.pipeline import Pipeline

In [2]:
def knn_gridsearch(data, labels, k_range, n_splits = 10):
    """
    knn_gridsearch()
     - Performs gridsearch for kNN on the given data.
    """

    # Create dataframe for storage
    knn_results = pd.DataFrame([], columns = ["param_n_neighbors", "mean_train_score", "mean_test_score"])

    # Cross validation settings
    kf = KFold(n_splits = n_splits, random_state = 1, shuffle=True)

    # Define the model
    knn_model = KNeighborsClassifier()
    parameters = {'n_neighbors': k_range}

    clf = GridSearchCV(knn_model, parameters, cv=kf, return_train_score=True, scoring=["f1", "accuracy"], refit=False)
    clf.fit(data, labels)

    df = pd.DataFrame(clf.cv_results_)
    knn_results = pd.concat([knn_results,df])
    knn_results = knn_results.sort_values(by=['mean_test_score'], ascending=False)

    return knn_results

def knn(train_data, train_labels, test_data, test_labels, k=3):
    """
    Performs kNN on the given data.
    """

    neighbours = KNeighborsClassifier(n_neighbors=k)
    neighbours.fit(train_data, train_labels)
    pred_labels = neighbours.predict(test_data)

    acc = accuracy_score(test_labels, pred_labels)
    f1 = f1_score(test_labels, pred_labels)

    return acc, f1

In [3]:
def labelPropagation(train_data, train_labels, test_data, test_labels, k):

    # Create and fit the model
    lp = LabelPropagation(kernel = "knn", n_neighbors=k)
    lp.fit(train_data, train_labels)

    # Predict on the test data
    pred_labels = lp.predict(test_data)
    acc = accuracy_score(test_labels, pred_labels)
    f1 = f1_score(test_labels, pred_labels)

    return acc, f1, lp.transduction_

In [4]:
"""
Data loading
"""
file_dir = os.getcwd()
data_path = os.path.join(file_dir, 'data/creditcard.csv')
raw_data = pd.read_csv(data_path)

In [10]:
"""
Preprocessing
"""

# Split labels and drop unnecessary columns
y = raw_data["Class"]
x = raw_data.drop(columns=["Time", "Amount", "Class"])

# Balance data using a combination of SMOTE and random undersampling
rus = RandomUnderSampler(random_state=0, sampling_strategy=1)
smote = SMOTE(random_state=0, sampling_strategy=0.1)
pipeline = Pipeline([("smote", smote), ("rus", rus)])
x_balanced, y_balanced = pipeline.fit_resample(x,y)

# Split data into 80% train / 20% test
x_train, x_test, y_train, y_test = train_test_split(x_balanced, y_balanced, test_size=0.2, random_state=0, stratify=y_balanced)

# Split train data into 30% labeled / 70% unlabeled
x_train_lab, x_train_unlab, y_train_lab, y_train_unlab = train_test_split(x_train, y_train, test_size=0.7, random_state=0, stratify=y_train)

# Create the data and labels for the semi-supervised learning set
x_train_lab_unlab = pd.concat([x_train_lab, x_train_unlab])
y_train_unlab = pd.Series([-1 for i in y_train_unlab])
y_train_lab_unlab = pd.concat([y_train_lab, y_train_unlab])

In [11]:
"""
Perform gridsearch for the k-NN baseline model
"""
k_range = [k for k in range (1,21,2)]
knn_results = knn_gridsearch(x_train_lab, y_train_lab, k_range, n_splits = 10)
keep_columns = ["param_n_neighbors", "mean_train_accuracy", "mean_test_accuracy", "mean_train_f1", "mean_test_f1"]
knn_results[keep_columns]

Unnamed: 0,param_n_neighbors,mean_train_accuracy,mean_test_accuracy,mean_train_f1,mean_test_f1
0,1,1.0,0.993478,1.0,0.993515
1,3,0.994276,0.989227,0.994308,0.989325
2,5,0.990286,0.98593,0.990377,0.986101
3,7,0.987835,0.984318,0.987975,0.984517
4,9,0.985751,0.982485,0.985937,0.982726
5,11,0.984187,0.98102,0.984412,0.981289
6,13,0.982754,0.979921,0.983015,0.98021
7,15,0.981305,0.978382,0.981597,0.978723
8,17,0.979839,0.977356,0.980166,0.977739
9,19,0.979318,0.97743,0.979651,0.977792


In [15]:
"""
Test the best k-NN model on the entire test set
"""
acc, f1 = knn(x_train_lab, y_train_lab, x_test, y_test, k=3)
print(f'Accuracy = {acc}')
print(f'f1 = {f1}')

Accuracy = 0.9890090565374132
f1 = 0.9891162385720506


In [18]:
"""
Find the optimal n_neighbors for labelPropagation
"""

lp_results = []

for k in range(1,21,2):
    acc, f1, labels = labelPropagation(x_train_lab_unlab, y_train_lab_unlab, x_test, y_test, k=k)
    lp_results.append([k,acc,f1])

print(pd.DataFrame(lp_results, columns=["k", "test_acc", "test_f1"]).sort_values(by="test_acc", ascending=False))

# Perform best labelPropagation method and save the predicted labels
acc, f1, model_labels = labelPropagation(x_train_lab_unlab, y_train_lab_unlab, x_test, y_test, k=5)

  probabilities /= normalizer
  probabilities /= normalizer
  probabilities /= normalizer
  probabilities /= normalizer


    k  test_acc   test_f1
3   7  0.988921  0.988999
4   9  0.988657  0.988776
5  11  0.986811  0.986972
6  13  0.986635  0.986801
2   5  0.985316  0.985329
7  15  0.985316  0.985517
8  17  0.984261  0.984495
9  19  0.983470  0.983729
1   3  0.928867  0.924230
0   1  0.657434  0.479979


  probabilities /= normalizer


In [21]:
"""
Train k-NN on the entire training set along with the lp model labels
"""

knn(x_train_lab_unlab, model_labels, x_test, y_test, k=3)

(0.9827662006506639, 0.9827434407466104)