In [1]:
import os
import numpy as np
import utils
import copy

# Plotting utilities
import matplotlib.pyplot as plt

# Imports for feature engineering
#from sklearn.decomposition import PCA
#from sklearn.manifold import TSNE

# Import machine learning libraries
#from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold
from sklearn.metrics import accuracy_score, classification_report

# Import classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
#from sklearn.neural_network import MLPClassifier

# Pipeline building
#from sklearn.pipeline import Pipeline

# Seed NumPy's global (legacy) random number generator so that code drawing
# from np.random produces the same sequence on every Restart & Run All.
np.random.seed(3)

In [2]:
def pre_process(X_train, X_test, dataset_name, min_max=True, resize=True):
    """Optionally resize and rescale a train/test pair of image arrays.

    Parameters
    ----------
    X_train, X_test
        Image data of the training and the test split.
    dataset_name : str
        Name of the dataset. "USPS" selects the ``(x + 1.0) / 2.0`` rescaling;
        every other name selects division by 255.0.
    min_max : bool, default True
        Apply the dataset-level rescaling described above.
    resize : bool, default True
        First pass both splits through ``utils.resize_images(..., 16, 16)``.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` after the requested steps. When both flags are
        False the inputs are handed back untouched.
    """
    if resize:
        # Bring both splits to 16x16 (training split first, then test split).
        # NOTE(review): the original comment says this uses CV2 with a linear
        # function; the helper lives in utils and is not visible here.
        X_train, X_test = [
            utils.resize_images(split, 16, 16) for split in (X_train, X_test)
        ]

    if not min_max:
        return X_train, X_test

    # Normalize data between 0 and 1 on a dataset level.
    # Presumably USPS pixels are stored in [-1, 1] and the other datasets in
    # [0, 255] -- TODO confirm against the data files.
    if dataset_name == "USPS":
        def rescale(pixels):
            return (pixels + 1.0) / 2.0
    else:
        def rescale(pixels):
            return pixels / 255.0

    return rescale(X_train), rescale(X_test)

In [3]:
# Directory that holds the HDF5 dataset files
DATASETS = "./datasets"

# File name of each dataset inside DATASETS
MNIST = "mnist.hdf5"
USPS = "usps.hdf5"
ARDIS = "ardis.hdf5"

# Dataset name -> path of its file (key order kept: ARDIS, USPS, MNIST)
datasets = {
    label: os.path.join(DATASETS, filename)
    for label, filename in (("ARDIS", ARDIS), ("USPS", USPS), ("MNIST", MNIST))
}

# Hand the name -> path mapping to the project loader; per the original note
# it reads the hdf5 files and returns a dict.
data = utils.load_data(datasets)

Loading ARDIS...
Loading USPS...
Loading MNIST...
Done.


## Train models for a baseline without optimization

In [4]:
# Baseline: fit every classifier with its default hyper-parameters on each
# dataset and report the train/test error rate in percent.
for dataset_name in ["USPS", "ARDIS", "MNIST"]:
    print(f"Using dataset: {dataset_name}")

    # Select data; the deep copy keeps the arrays stored in `data` untouched
    # by anything done to the splits below.
    X_train, X_test, y_train, y_test = copy.deepcopy(utils.select_dataset(data, dataset_name))

    # Default pre-processing: resize to 16x16 and rescale per dataset
    X_train, X_test = pre_process(X_train, X_test, dataset_name)

    # Fresh, unfitted estimators for every dataset so that no fitted state
    # carries over from the previous iteration.
    models = [
        ("GNB", GaussianNB()),
        ("KNN", KNeighborsClassifier(n_jobs=-1)),
        ("LR", LogisticRegression(n_jobs=-1)),
        ("SVC", SVC()),
        ("TREE", DecisionTreeClassifier())
    ]

    for name, model in models:
        # Fit model
        model.fit(X_train, y_train)

        # Predict on training- and testset
        pred_train = model.predict(X_train)
        pred_test = model.predict(X_test)

        # Error rate in percent = (1 - accuracy) * 100.
        # accuracy_score expects (y_true, y_pred) in that order.
        error_rate_train = (1.0 - accuracy_score(y_train, pred_train)) * 100
        error_rate_test = (1.0 - accuracy_score(y_test, pred_test)) * 100

        # Print results (same text as before, f-string like the header line)
        print(f"{name} train: {error_rate_train:.3f}%, test: {error_rate_test:.3f}%")

    # Blank line between datasets
    print("")

Using dataset: USPS
GNB train: 24.126%, test: 28.052%
KNN train: 2.085%, test: 5.531%
LR train: 1.413%, test: 8.221%
SVC train: 0.631%, test: 5.282%
TREE train: 0.000%, test: 16.542%

Using dataset: ARDIS
GNB train: 56.939%, test: 61.800%
KNN train: 4.439%, test: 8.600%
LR train: 5.667%, test: 13.500%
SVC train: 1.061%, test: 4.100%
TREE train: 0.000%, test: 23.900%

Using dataset: MNIST
GNB train: 44.625%, test: 45.600%
KNN train: 1.648%, test: 2.940%
LR train: 7.455%, test: 7.570%
SVC train: 1.208%, test: 2.010%
TREE train: 0.000%, test: 11.040%

