In [1]:
import os
import numpy as np
import utils
import pickle
import copy

# Plotting utilitys
import matplotlib.pyplot as plt

# Imports for feature engeering
#from sklearn.decomposition import PCA
#from sklearn.manifold import TSNE

# Import machine learning librarys
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.metrics import accuracy_score, classification_report

# Import classifiers
from sklearn.neighbors import KNeighborsClassifier

# Set random state
np.random.seed(3)

In [2]:
# Dataset location
DATASETS = "./datasets"

MNIST = "mnist.hdf5"
USPS = "usps.hdf5"
ARDIS = "ardis.hdf5"

datasets = {"ARDIS": os.path.join(DATASETS, ARDIS), \
            "USPS": os.path.join(DATASETS, USPS), \
            "MNIST": os.path.join(DATASETS, MNIST)}

# Load data from hdf5 file and return dict
data = utils.load_data(datasets)

Loading ARDIS...
Loading USPS...
Loading MNIST...
Done.


In [3]:
def pre_process(X_train, X_test, name, min_max=True, resize=True):
    
    if resize:
        X_train = utils.resize_images(X_train, 16, 16)
        X_test = utils.resize_images(X_test, 16, 16)
        
    if min_max:
        if name == "USPS":
            X_train  = (X_train + 1.0) / 2.0
            X_test = (X_test + 1.0) / 2.0
        else:
            X_train  = X_train / 255.0
            X_test = X_test / 255.0

    return X_train, X_test

## Using randomized grid search to find best parameter for KNN

In [4]:
MODELS = "./models/randomized_search"
use_datasets = ["ARDIS", "USPS", "MNIST"]

kfold = KFold(n_splits=3)

grid_params = {"n_neighbors": [2, 4, 6, 8, 10, 12, 14, 16],
               "weights": ["uniform", "distance"],
               "algorithm": ["ball_tree", "kd_tree"],
               "leaf_size": [5, 10, 20, 25, 30, 35, 40, 50, 60]
              }

for dataset_name in use_datasets:
    print(f"Using dataset: {dataset_name}")
    
    # Select data
    X_train, X_test, y_train, y_test = copy.deepcopy(utils.select_dataset(data, dataset_name))
    
    # Resize and scale
    X_train, X_test = pre_process(X_train, X_test, dataset_name)
    
    # Alogrithem for optimization
    model = KNeighborsClassifier()
    
    search = RandomizedSearchCV(model,
                                grid_params,
                                n_iter=60,
                                cv=kfold,
                                scoring='accuracy',
                                verbose=10,
                                random_state=3,
                                n_jobs=-1)
    
    search.fit(X_train, y_train)
    
    # Predict on training- and testset
    pred_train = search.predict(X_train)
    pred_test = search.predict(X_test)
        
    # Calculate error rate
    train_error_rate = (1.0 - accuracy_score(pred_train, y_train)) * 100
    test_error_rate = (1.0 - accuracy_score(pred_test, y_test)) * 100
    
    # Print Results
    print(f"Best estimator: {search.best_estimator_}")
    print(f"Best params: {search.best_params_}")
    print(f"Best score: {search.best_score_}")
    print("KNN train error: {:.3f}%".format(train_error_rate))
    print("KNN test error: {:.3f}%".format(test_error_rate))
                   
    # Pickel model for later inspection                
    pickle_string = f"{dataset_name.lower()}-KNN-{train_error_rate:.3f}-{test_error_rate:.3f}-model.pickle"
    pickle_path = os.path.join(MODELS, pickle_string)
    pickle.dump(search, open(pickle_path, 'wb'))
    
    print("")

Using dataset: ARDIS
Fitting 3 folds for each of 60 candidates, totalling 180 fits
Best estimator: KNeighborsClassifier(algorithm='kd_tree', leaf_size=10, n_neighbors=2,
                     weights='distance')
Best params: {'weights': 'distance', 'n_neighbors': 2, 'leaf_size': 10, 'algorithm': 'kd_tree'}
Best score: 0.1686363636363636
KNN train error: 0.000%
KNN test error: 7.700%

Using dataset: USPS
Fitting 3 folds for each of 60 candidates, totalling 180 fits
Best estimator: KNeighborsClassifier(algorithm='ball_tree', n_neighbors=4, weights='distance')
Best params: {'weights': 'distance', 'n_neighbors': 4, 'leaf_size': 30, 'algorithm': 'ball_tree'}
Best score: 0.9625556723595939
KNN train error: 0.000%
KNN test error: 5.282%

Using dataset: MNIST
Fitting 3 folds for each of 60 candidates, totalling 180 fits
Best estimator: KNeighborsClassifier(algorithm='ball_tree', n_neighbors=4, weights='distance')
Best params: {'weights': 'distance', 'n_neighbors': 4, 'leaf_size': 30, 'algorithm