In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import (f1_score, confusion_matrix, accuracy_score)
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import (KNeighborsClassifier, NeighborhoodComponentsAnalysis)
from sklearn.pipeline import (make_pipeline, Pipeline)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ShuffleSplit, learning_curve

In [None]:
def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Draw three diagnostic panels for ``estimator`` from ``learning_curve``.

    Panel 0 shows training and cross-validation score against the number of
    training examples, panel 1 shows fit time against the number of training
    examples, and panel 2 shows cross-validation score against fit time.
    Shaded bands are mean +/- one standard deviation over the CV splits.

    Parameters
    ----------
    estimator : object
        Estimator forwarded unchanged to ``learning_curve``.
    title : str
        Title of the first panel.
    X, y : array-like
        Data and targets forwarded unchanged to ``learning_curve``.
    axes : sequence of 3 axes, optional
        Axes to draw on. When None, a new 1x3 figure of size (20, 5) is
        created.
    ylim : tuple, optional
        ``(ymin, ymax)`` applied to the first panel only.
    cv, n_jobs, train_sizes
        Forwarded unchanged to ``learning_curve``.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, so callers can chain e.g.
        ``.savefig(...)`` or ``.show()``.
    """
    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    axes[0].set_title(title)
    if ylim is not None:
        axes[0].set_ylim(*ylim)
    axes[0].set_xlabel("Training examples")
    axes[0].set_ylabel("Score")

    # return_times=True adds fit and score times; score times are not plotted.
    train_sizes, train_scores, test_scores, fit_times, _ = \
        learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                       train_sizes=train_sizes,
                       return_times=True)
    # Each row is one training-set size, each column one CV split.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # Plot learning curve
    axes[0].grid()
    axes[0].fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
    axes[0].fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1,
                         color="g")
    axes[0].plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training score")
    axes[0].plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
    axes[0].legend(loc="best")

    # Plot n_samples vs fit_times
    axes[1].grid()
    axes[1].plot(train_sizes, fit_times_mean, 'o-')
    axes[1].fill_between(train_sizes, fit_times_mean - fit_times_std,
                         fit_times_mean + fit_times_std, alpha=0.1)
    axes[1].set_xlabel("Training examples")
    axes[1].set_ylabel("fit_times")
    axes[1].set_title("Scalability of the model")

    # Plot fit_time vs score.
    # Mean fit time is not guaranteed to grow with the training size, so the
    # points are ordered by fit time first; otherwise the line and its shaded
    # band can fold back on themselves.
    fit_time_order = np.argsort(fit_times_mean)
    fit_times_sorted = fit_times_mean[fit_time_order]
    scores_by_time_mean = test_scores_mean[fit_time_order]
    scores_by_time_std = test_scores_std[fit_time_order]
    axes[2].grid()
    axes[2].plot(fit_times_sorted, scores_by_time_mean, 'o-')
    axes[2].fill_between(fit_times_sorted,
                         scores_by_time_mean - scores_by_time_std,
                         scores_by_time_mean + scores_by_time_std, alpha=0.1)
    axes[2].set_xlabel("fit_times")
    axes[2].set_ylabel("Score")
    axes[2].set_title("Performance of the model")

    return plt

In [None]:
# Read the dataset from the CSV in the working directory and preview the
# first five rows as the cell output.
df = pd.read_csv('preprocess.csv')
df.head(n=5)

In [None]:
# Exclusive upper bound of the k sweep (k runs over 1 .. n-1) and the seed
# shared by the split and the seeded estimators.
n = 50
random_state = 42

# 'prob' is the target column; every other column is used as a feature.
X = df.drop(columns='prob')
y = df['prob']

In [None]:
# Hold out 30% of the rows as a test set. Stratifying on y keeps the class
# proportions the same in both parts; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=random_state,
    stratify=y,
)

In [None]:
# Sweep k for plain and distance-weighted KNN on three projections of the
# data (PCA, LDA, NCA), recording the micro-averaged F1 of every combination.
#
# Outputs used by later cells:
#   res   -- dict of score lists, one entry per k, keyed 'PCA'/'LDA'/'NCA'
#            (uniform weights) and 'WPCA'/'WLDA'/'WNCA' (distance weights)
#   best  -- highest F1 seen
#   bestn -- k that produced it
#   bestm -- projection that produced it, prefixed with "Weighted " when the
#            distance-weighted classifier won
#
# NOTE(review): k and the projection are selected on (X_test, y_test), so a
# score later reported on that same split is optimistically biased. Consider
# selecting with cross-validation on the training split instead.
best = 0
bestn = 1
bestm = ""
res = {}

pca = make_pipeline(StandardScaler(),
                    PCA(random_state=random_state))
lda = make_pipeline(StandardScaler(),
                    LinearDiscriminantAnalysis())
nca = make_pipeline(StandardScaler(),
                    NeighborhoodComponentsAnalysis(random_state=random_state))
# Make a list of the methods to be compared
dim_reduction_methods = [('PCA', pca), ('LDA', lda), ('NCA', nca)]

# The projections do not depend on k, so each one is fitted and applied once
# here instead of on every iteration of the sweep. The estimators are seeded,
# so this yields the same projections the per-k refits produced.
projected = {}
for name, model in dim_reduction_methods:
    model.fit(X_train, y_train)
    projected[name] = (model.transform(X_train), model.transform(X_test))

# k stays the outer loop so that ties are still resolved in favour of the
# smallest k, then the earliest method in dim_reduction_methods.
for n_neighbors in range(1, n):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    wknn = KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance')

    for name, _ in dim_reduction_methods:
        Z_train, Z_test = projected[name]

        knn.fit(Z_train, y_train)
        wknn.fit(Z_train, y_train)
        out_knn = knn.predict(Z_test)
        f1_knn = f1_score(y_test, out_knn, average='micro')
        out_wknn = wknn.predict(Z_test)
        f1_wknn = f1_score(y_test, out_wknn, average='micro')

        # Strict '>' keeps the first configuration that reaches a given score.
        if max(f1_knn, f1_wknn) > best:
            best = max(f1_knn, f1_wknn)
            bestn = n_neighbors
            bestm = name
            if f1_wknn > f1_knn:
                bestm = "Weighted " + bestm

        res.setdefault(name, []).append(f1_knn)
        res.setdefault('W' + name, []).append(f1_wknn)
print("The best value was found with ", bestm, " and parameter k = ", bestn, " with a f1 score of ", best)

In [None]:
# F1 score against k for every projection / weighting combination.
# Each entry is (key into res, matplotlib format string, legend label):
# solid lines are uniform-weight KNN, dashed lines are distance-weighted KNN.
curves = [
    ('PCA', 'r-', 'PCA'),
    ('LDA', 'g-', 'LDA'),
    ('NCA', 'b-', 'NCA'),
    ('WPCA', 'r--', 'Weighted PCA'),
    ('WLDA', 'g--', 'Weighted LDA'),
    ('WNCA', 'b--', 'Weighted NCA'),
]

xaxis = range(1, n)
for key, fmt, label in curves:
    plt.plot(xaxis, res[key], fmt, label=label)

plt.xlabel("K value")
plt.ylabel("F1 Score")
plt.grid(True)
plt.legend()
plt.savefig("../figures/knn.pdf")

In [None]:
# Rebuild the configuration that won the k sweep. bestm is the projection
# name ('PCA', 'LDA' or 'NCA'), prefixed with "Weighted " when the
# distance-weighted classifier scored higher. Using bestn with a hard-coded
# projection/weighting could evaluate a combination that never won.
weighted_prefix = "Weighted "
use_distance_weights = bestm.startswith(weighted_prefix)
reducer_name = bestm[len(weighted_prefix):] if use_distance_weights else bestm
reducer_factories = {
    'PCA': lambda: PCA(random_state=random_state),
    'LDA': lambda: LinearDiscriminantAnalysis(),
    'NCA': lambda: NeighborhoodComponentsAnalysis(random_state=random_state),
}
if reducer_name not in reducer_factories:
    # bestm is empty or unrecognised (e.g. no score in the sweep exceeded 0):
    # fall back to the distance-weighted PCA setup this cell used originally.
    reducer_name, use_distance_weights = 'PCA', True

knn = Pipeline(steps=[
    ('Scaler', StandardScaler()),
    (reducer_name, reducer_factories[reducer_name]()),
    ('KNN', KNeighborsClassifier(
        n_neighbors=bestn,
        weights='distance' if use_distance_weights else 'uniform')),
])

# Learning curve over the full data set with 200 random 70/30 splits.
title = "KNN training curve with our best parameters"
cv = ShuffleSplit(n_splits=200, test_size=0.3, random_state=0)
plot_learning_curve(knn, title, X, y, ylim=(-0.01, 1.01), cv=cv, n_jobs=-1)
plt.savefig("../figures/knn_training_curve.pdf")

# Refit on the training split only and score on the held-out test split.
knn.fit(X_train, y_train)
pred = knn.predict(X_test)
print("\nConfusion matrix on test set:\n", confusion_matrix(y_test, pred))
print("\nAccuracy on test set: ", accuracy_score(y_test, pred))
# NOTE(review): if y is single-label, micro-averaged F1 equals accuracy, so
# the two numbers below are expected to match -- confirm this is intended.
print("\nF1 score on test set: ", f1_score(y_test, pred, average='micro'))

