In [None]:
from datasets.cifar_10 import read_data
import numpy as np
from utils import *

FIG_FOLDER = 'cifar'
SAVE_PLOTS = True  # when True, each figure is also handed to save_plots after being shown

# Load the CIFAR-10 splits; every sample is a serialized 32x32x3 image.
(x_train, y_train), (x_test, y_test) = (
    read_data.load_train_data(),
    read_data.load_test_data(),
)
label_names = np.array(read_data.load_label_names())

# Quick-iteration switch: keep only a short prefix of each split so the
# experiments below do not take an eternity. Set to False for the full run.
test = True
if test:
    x_train, y_train = x_train[:2000], y_train[:2000]
    x_test, y_test = x_test[:200], y_test[:200]

In [None]:
# Disabled experiment: the code below lives inside a bare string literal, so it
# is never executed. If re-enabled it would reshape every flattened sample to
# (3, 32, 32), move the channel axis last, collapse RGB to grayscale with the
# weights [0.299, 0.587, 0.114], and flatten each image again.
"""
def rgb_to_gray(A):
    '''
    RGB image to grayscale
    '''
    return (A @ [0.299, 0.587, 0.114]).astype(A.dtype)

x_train = rgb_to_gray(np.rollaxis((x_train).reshape((-1, 3, 32, 32)), 1, 4)).reshape((x_train.shape[0], -1))
x_test = rgb_to_gray(np.rollaxis((x_test).reshape((-1, 3, 32, 32)), 1, 4)).reshape((x_test.shape[0], -1))
x_train.shape
"""

In [None]:
# Disabled experiment (kept inside a bare string literal, so it never runs):
# subsample the training data in order to equalize class frequencies.
# Every class is cut down to the size of the rarest class by drawing that many
# indices per class without replacement; the selected indices are then sorted
# and used to filter both y_train and x_train.
# NOTE(review): the draw uses np.random.choice without a seed, so re-enabling
# this would give a different subset on every run.
'''
vals, freq = np.unique(y_train, return_counts = True)
wanted_freq = min(freq)
all_indexes = np.zeros((len(freq), wanted_freq), dtype=np.uint32)
for i, v in enumerate(vals):
    indexes = np.random.choice(np.where(y_train == v)[0], size=wanted_freq, replace=False)
    all_indexes[i, :] = indexes
all_indexes = np.sort(all_indexes.flatten())
y_train = y_train[all_indexes]
x_train = x_train[all_indexes, :]
'''

In [None]:
import matplotlib.pyplot as plt

# Bar chart of how many training samples belong to each class.
vals2, freq2 = np.unique(y_train, return_counts = True)
x_ = np.arange(len(freq2))
width = 0.45

fig, ax = plt.subplots(figsize=(15,12))
# A single series is drawn, so each bar takes the full 2*width slot.
# (A side-by-side "original vs. subsampled" variant would need `freq` from the
# disabled subsampling code and two bars of `width` each.)
ax.bar(x_, freq2, width*2, label='Training Set')
ax.set(xlabel='Class', ylabel='Number of samples', title='Training set balance')
ax.set_xticks(x_)
ax.set_xticklabels(label_names[vals2])  # class ids -> human-readable names
ax.legend()
plt.show()

if SAVE_PLOTS:
    save_plots(fig, 'training_freq', FIG_FOLDER)

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

# Preprocessing: scale to [0, 1], keep the principal components that explain
# 90% of the variance, then rescale the projected features to [0, 1] again.
# The pipeline is fitted on the training split only and applied to both splits.
# NOTE: x_train / x_test are overwritten in place, so running this cell twice
# would fit the pipeline on already-projected data.
pipe = Pipeline(steps=[
    ('scaler1', MinMaxScaler()),
    ('PCA', PCA(0.9)),
    ('scaler2', MinMaxScaler()),
])
pipe.fit(x_train)
x_train, x_test = pipe.transform(x_train), pipe.transform(x_test)

In [None]:
# Display the current shape of the training matrix (after the pipeline cell has
# replaced x_train with its transformed version).
x_train.shape

In [None]:
# One hyper-parameter grid per SVM kernel. The commented-out entries are wider
# variants of the grid directly above them.
params = [
          {'kernel': ('linear',), 'C': (0.1, 1, 10)},
          {'kernel': ('poly',), 'C': (0.1, 1, 10), 'degree': (2, 3), 'gamma': (0.1, 1)},
          # {'kernel': ('poly',), 'C': (0.1, 1, 10), 'degree': (2, 3, 4), 'gamma': (0.1, 1, 10)},
          {'kernel': ('rbf',), 'C': (1, 10, 50), 'gamma': (0.1, 1, 10)},
          # {'kernel': ('rbf',), 'C': (1, 10, 50), 'gamma': (0.01, 0.1, 1, 10, 100)},
          {'kernel': ('sigmoid',), 'C': (100, 1000, 10000), 'gamma': (0.0001, 0.001, 0.01)},
          # {'kernel': ('sigmoid',), 'C': (10, 100, 1000, 10000), 'gamma': (0.0001, 0.001, 0.01, 0.1)},
         ]

# Build one (still unfitted) grid search per kernel grid.
# NOTE(review): SVC and GS_wrapper are not imported explicitly in this notebook;
# presumably they arrive through `from utils import *` - confirm.
models = [
    GS_wrapper(SVC(decision_function_shape='ovo'), kwargs, cv=2, scoring='f1_macro', verbose=4, n_jobs=-1)
    for kwargs in params
]

In [None]:
# Fit every grid search in turn and report the wall-clock time each one took.
# NOTE(review): `time` is not imported explicitly in this notebook; presumably
# it is the time.time function re-exported by `from utils import *` - confirm.
for model in models:
    t0 = time()
    model.fit(x_train, y_train)
    elapsed = time() - t0
    mins, secs = divmod(elapsed, 60)
    # divmod on a float returns floats; cast so the minutes print as "3", not "3.0"
    print(f'{int(mins)} mins {secs:.4f} secs')

In [None]:
# Report, for every grid search, the winning parameter combination and its score.
for model in models:
    summary = (model.best_params_, model.best_score_)
    print(*summary)

In [None]:
# Have every fitted grid search compute its metrics on both splits.
# (Later cells read them back from model.train_metrics / model.test_metrics.)
train_split, test_split = (x_train, y_train), (x_test, y_test)
for model in models:
    model.calculate_train_test_metrics(*train_split, *test_split)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Earlier fixed-parameter kNN run, kept disabled inside a string literal.
'''
knn = KNeighborsClassifier(n_neighbors=3, weights='distance')
t0 = time()
knn.fit(x_train, y_train)
knn_time = time() - t0

knn_train_metrics = calculate_metrics(knn, x_train, y_train)
knn_test_metrics = calculate_metrics(knn, x_test, y_test)
'''
# Grid-search kNN over neighbourhood size and vote weighting.
knn_params = dict(
    n_neighbors=[1, 3, 5, 10, 30, 100, 200],
    weights=['uniform', 'distance'],
)
knn = GS_wrapper(KNeighborsClassifier(), knn_params, cv=3, scoring='f1_macro', verbose=4, n_jobs=-1)
knn.fit(x_train, y_train)

# Metrics are computed with the free helper rather than
# knn.calculate_train_test_metrics, so they live in plain variables.
knn_train_metrics, knn_test_metrics = (
    calculate_metrics(knn, x_train, y_train),
    calculate_metrics(knn, x_test, y_test),
)

In [None]:
from sklearn.neighbors import NearestCentroid

# Nearest-centroid baseline: fitted directly (no grid search) and timed by hand.
nc = NearestCentroid()

t0 = time()
nc.fit(x_train, y_train)
nc_time = time() - t0

nc_train_metrics, nc_test_metrics = (
    calculate_metrics(nc, x_train, y_train),
    calculate_metrics(nc, x_test, y_test),
)

In [None]:
# Collect, per model, everything the plots and tables below need.
# Each metric container is [train_values, test_values]; the metric tuples are
# indexed 0=accuracy, 1=precision, 2=recall, 3=F1.
names, times = [], []
acc, prec, recall, f1 = ([[], []] for _ in range(4))
metric_store = (acc, prec, recall, f1)

for model in models:
    # grid searches expose refit_time_; anything else is expected to carry train_time
    times.append(model.refit_time_ if hasattr(model, 'refit_time_') else model.train_time)

    for k, store in enumerate(metric_store):
        store[0].append(model.train_metrics[k])
        store[1].append(model.test_metrics[k])

    # Label: best grid-search parameters if available, otherwise the
    # constructor kwargs (prefixed with the kernel when the model has one).
    if hasattr(model, 'best_params_'):
        names.append("\n".join([f'{key}: {val}' for key, val in model.best_params_.items()]))
    elif model.init_kwargs is None:
        names.append(f'{model.kernel}')
    elif hasattr(model, 'kernel'):
        names.append(f'{model.kernel}, ' + ", ".join([f'{key}: {val}' for key, val in model.init_kwargs.items()]))
    else:
        names.append(", ".join([f'{key}: {val}' for key, val in model.init_kwargs.items()]))

# add kNN and nearest class centroid (in that order) after the SVMs
names.append("\n".join([f'{key}: {val}' for key, val in knn.best_params_.items()]))
names.append('Nearest Centroid')

times.append(knn.refit_time_ if hasattr(knn, 'refit_time_') else knn.train_time)
times.append(nc_time)

for k, store in enumerate(metric_store):
    store[0].append(knn_train_metrics[k])
    store[1].append(knn_test_metrics[k])
    store[0].append(nc_train_metrics[k])
    store[1].append(nc_test_metrics[k])

In [None]:
import matplotlib.pyplot as plt

# plot times
# Uses the explicit Axes API like the other plotting cells, so `ax` really is
# an Axes (plt.bar returns a BarContainer, which the old code stored in `ax`).
fig, ax = plt.subplots(figsize=(15,12))
ax.bar(names, times)
ax.set_title('Training time')
ax.set_xlabel('Model')
ax.set_ylabel('Time in seconds')
plt.xticks(rotation=45)  # the multi-line parameter labels overlap when horizontal
plt.show()

if SAVE_PLOTS:
    save_plots(fig, 'training_time', FIG_FOLDER)

In [None]:
# Grouped bar chart: train vs. test accuracy for every model.
fig, ax = plt.subplots(figsize=(15,12))

x_ = np.arange(len(names))
width = 0.35

# Train bars sit half a bar-width left of the tick, test bars half a width right.
for shift, scores, split in ((-width/2, acc[0], 'Train'), (width/2, acc[1], 'Test')):
    ax.bar(x_ + shift, scores, width, label=split)

ax.set(ylabel='Accuracy', title='Accuracy')
ax.set_xticks(x_)
ax.set_xticklabels(names)
ax.legend()
plt.xticks(rotation=45)
plt.show()

if SAVE_PLOTS:
    save_plots(fig, 'accuracy', FIG_FOLDER)

In [None]:
# Grouped bar chart: train vs. test precision for every model.
fig, ax = plt.subplots(figsize=(15,12))

x_ = np.arange(len(names))
width = 0.35

# Train bars sit half a bar-width left of the tick, test bars half a width right.
for shift, scores, split in ((-width/2, prec[0], 'Train'), (width/2, prec[1], 'Test')):
    ax.bar(x_ + shift, scores, width, label=split)

ax.set(ylabel='Precision', title='Precision')
ax.set_xticks(x_)
ax.set_xticklabels(names)
ax.legend()
plt.xticks(rotation=45)
plt.show()

if SAVE_PLOTS:
    save_plots(fig, 'precision', FIG_FOLDER)

In [None]:
# Grouped bar chart: train vs. test recall for every model.
fig, ax = plt.subplots(figsize=(15,12))

x_ = np.arange(len(names))
width = 0.35

# Train bars sit half a bar-width left of the tick, test bars half a width right.
for shift, scores, split in ((-width/2, recall[0], 'Train'), (width/2, recall[1], 'Test')):
    ax.bar(x_ + shift, scores, width, label=split)

ax.set(ylabel='Recall', title='Recall')
ax.set_xticks(x_)
ax.set_xticklabels(names)
ax.legend()
plt.xticks(rotation=45)
plt.show()

if SAVE_PLOTS:
    save_plots(fig, 'recall', FIG_FOLDER)

In [None]:
# Grouped bar chart: train vs. test F1 score for every model.
fig, ax = plt.subplots(figsize=(15,12))

x_ = np.arange(len(names))
width = 0.35

# Train bars sit half a bar-width left of the tick, test bars half a width right.
for shift, scores, split in ((-width/2, f1[0], 'Train'), (width/2, f1[1], 'Test')):
    ax.bar(x_ + shift, scores, width, label=split)

ax.set(ylabel='F1 score', title='F1 score')
ax.set_xticks(x_)
ax.set_xticklabels(names)
ax.legend()
plt.xticks(rotation=45)
plt.show()

if SAVE_PLOTS:
    save_plots(fig, 'f1', FIG_FOLDER)

In [None]:
# All four metrics side by side: top panel on the training split, bottom panel
# on the test split, one group of four bars per model.
fig, ax = plt.subplots(2, 1, figsize=(19,12))

plt.subplots_adjust(hspace=.3)

x_ = np.arange(len(names))
width = 0.8

div = 4  # bars per group; each bar is width/div wide
series = (('Accuracy', acc), ('Precision', prec), ('Recall', recall), ('F1', f1))
panels = (('Metrics during training', 0), ('Metrics during test', 1))

for axis, (title, split) in zip(ax, panels):
    for k, (label, values) in enumerate(series):
        # the k-th bar starts k bar-widths to the right of the group's left edge
        axis.bar(x_ - width/2 + k*width/div, values[split], width/div, label=label)

    axis.set_title(title)
    axis.set_xticks(x_)
    axis.set_xticklabels(names, rotation=45)
    axis.legend()

plt.show()
if SAVE_PLOTS:
    save_plots(fig, 'all_metrics', FIG_FOLDER)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Pick the best SVM grid search (based on test F1 score, index 3 of the metrics)
# and draw its confusion matrix on the test split.
# NOTE(review): choosing the model on the test set makes its reported test
# score optimistic; best_score_ (from the grid search) would avoid that.
best = max(models, key=lambda x: x.test_metrics[3])
best_name = ", ".join([f'{key}: {val}' for key, val in best.best_params_.items()])
pred = best.predict(x_test)

# Pin the label order explicitly and reuse it for the ticks. The matrix only
# has one row/column per class that actually occurs in y_test or pred, so a
# hard-coded range(10) mislabels (or fails) whenever a class is absent, e.g.
# on the reduced test subset.
cm_labels = np.union1d(y_test, pred)
tick_text = [str(label) for label in cm_labels]

fig = plt.figure(figsize=(15,12))
ax = sns.heatmap(confusion_matrix(y_test, pred, labels=cm_labels), annot=True, cmap='Purples', fmt='g')

ax.set_title(f'Confusion Matrix | {best_name}')
ax.set_xlabel('Predictions')
ax.set_ylabel('Actual Values')
ax.xaxis.set_ticklabels(tick_text)
ax.yaxis.set_ticklabels(tick_text)

plt.show()
if SAVE_PLOTS:
    save_plots(fig, 'confusion_matrix', FIG_FOLDER)

In [None]:
import pandas as pd

# Export the collected metrics as two spreadsheets under results/<FIG_FOLDER>/.
save_folder = f'results/{FIG_FOLDER}/'
create_if_not_exists(save_folder)

# (file name, column headers, one list per column); only the training sheet
# carries the timing column.
exports = (
    ('training_data.xlsx',
     ["Model", "Accuracy", "Precision", "Recall", "F1", "Training Time (seconds)"],
     (names, acc[0], prec[0], recall[0], f1[0], times)),
    ('test_data.xlsx',
     ["Model", "Accuracy", "Precision", "Recall", "F1"],
     (names, acc[1], prec[1], recall[1], f1[1])),
)

for file_name, col_names, columns in exports:
    data = zip(*columns)  # one row per model
    df = pd.DataFrame(data, columns=col_names)
    df.to_excel(save_folder+file_name, float_format="%.4f", index=False)

In [None]:
from math import ceil

# print wrong classifications
# Two figures of misclassified test images: first one example per *true* class,
# then (from the remaining errors) one example per *predicted* class.

def _plot_misclassified(sample_idxs, fig_name, ncols=3):
    """Show (and optionally save) the raw test images at ``sample_idxs``.

    Reads the notebook globals ``x_test`` (raw serialized images at call time),
    ``pred``, ``y_test`` and ``label_names``. Each image is titled with its
    predicted and actual class name.

    Parameters
    ----------
    sample_idxs : array of int
        Positions in the test split to display.
    fig_name : str
        File stem passed to ``save_plots`` when ``SAVE_PLOTS`` is set.
    ncols : int
        Number of images per row.
    """
    if len(sample_idxs) == 0:
        # plt.subplots rejects a grid with zero rows, so there is nothing to draw
        print(f'No misclassified samples to show for {fig_name}')
        return

    # serialized rows -> (n, 3, 32, 32) -> channels-last (n, 32, 32, 3) for imshow
    images = np.rollaxis(x_test[sample_idxs, :].reshape((-1, 3, 32, 32)), 1, 4)
    nrows = ceil(len(sample_idxs) / ncols)

    # squeeze=False keeps `ax` two-dimensional even when there is a single row
    fig, ax = plt.subplots(nrows, ncols, figsize=(15,12), squeeze=False)
    for cell, image, idx in zip(ax.flat, images, sample_idxs):
        cell.imshow(image, vmin=0, vmax=255)
        cell.set_title(f'Predicted: {label_names[pred[idx]]}, Actual: {label_names[y_test[idx]]}')

    # remove all axis, including the unused cells of the last row
    for cell in ax.flat:
        cell.axis('off')
    plt.show()
    if SAVE_PLOTS:
        save_plots(fig, fig_name, FIG_FOLDER)


# positions where prediction and ground truth disagree
wrong_classifications = np.where(pred != y_test)[0]
# first error of every true class (indices into wrong_classifications)
wrong_classes, wrong_idxs = np.unique(y_test[wrong_classifications], return_index=True)

# Reload the untransformed images: x_test currently holds the pipeline output.
# NOTE(review): this rebinds x_test to the raw (and, in test mode, unsliced)
# data, so earlier cells that predict on x_test cannot be re-run after this one.
x_test, _ = read_data.load_test_data()

_plot_misclassified(wrong_classifications[wrong_idxs], 'wrong_results_1')

# remove previous images, so we don't display them twice
wrong_classifications = np.delete(wrong_classifications, wrong_idxs)

# first remaining error of every predicted class
wrong_classes, wrong_idxs = np.unique(pred[wrong_classifications], return_index=True)
_plot_misclassified(wrong_classifications[wrong_idxs], 'wrong_results_2')