In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [None]:
#################################################
# Create data set
#################################################
# Load the wine data set as a (features, labels) pair
X, y = datasets.load_wine(return_X_y=True)

####################################################
# Split the data into a training and test set
# !! Don't touch the test set until the very end !!
####################################################
# NOTE: X and y are rebound here. From this line on they refer to the
# training portion only; the held-out 40 % lives in X_test / y_test.
X, X_test, y, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

###############################################################################
# A: Try a simple model without scaling the data (no hyperparameter optimization)
###############################################################################
knn = KNeighborsClassifier()  # Default n_neighbors=5
knn.fit(X, y)
y_pred = knn.predict(X_test)
# scikit-learn metrics expect the ground truth first: (y_true, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f"Default 5-NN accuracy on test set: {accuracy:.3f}")
# Done

#################################################
# B: Optimize hyperparameter k for k-NN classifier
# using 100 random train-validation set splits
#################################################
k_candidates = range(1, 10)  # k = 1, ..., 9
n_splits = 100               # random train-validation splits per candidate k

best_k = 0
best_accuracy = 0

for k in k_candidates:
    # Validation accuracy of this k on each random split
    accuracies = []
    for r in range(n_splits):
        # Train-Validation set split; the seed r makes split number r
        # identical for every k, so all candidates see the same data
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.4, random_state=r
        )
        # Standardize the data: fit the scaler on the training part only,
        # then apply the same transformation to the validation part
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_val_scaled = scaler.transform(X_val)
        # Train k-nearest neighbors classifier with k neighbors
        knn = KNeighborsClassifier(n_neighbors=k)
        y_pred = knn.fit(X_train_scaled, y_train).predict(X_val_scaled)
        # scikit-learn metrics expect the ground truth first: (y_true, y_pred)
        accuracy = accuracy_score(y_val, y_pred)
        accuracies.append(accuracy)

    # take the mean over all random splits
    accuracy = np.mean(accuracies)

    # Update the best k and best accuracy if the current k was better
    # (strict '>' keeps the smallest k in case of a tie)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_k = k

# NOTE: after the loops, X_train / y_train / scaler / knn still hold the
# objects of the very last split only; they are not the full training set.
print(f"Best k: {best_k}, Best accuracy during optimization: {best_accuracy:.3f}")

#################################################
# Train the best model on the entire training set
#################################################
# Use X / y (the full training set), not X_train / y_train: the latter are
# only the last random train-validation split left over from the search loop.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X)
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train_scaled, y)

##########################################################
# Use the test set to evaluate the best k on unseen data
##########################################################
# Important: Any transformation applied to the training data must also be applied to the test data
# This might be scaling, adding or removing features, etc.
X_test_scaled = scaler.transform(X_test)  # Using the scaler fitted in the previous step
y_pred_final = knn.predict(X_test_scaled)
# scikit-learn metrics expect the ground truth first: (y_true, y_pred)
accuracy = accuracy_score(y_test, y_pred_final)
print(f"Final accuracy of {best_k}-NN on the test set: {accuracy:.3f}")
# The final accuracy can be slightly lower than the accuracy during optimization,
# because the test set was not used during optimization
# Done
