In [87]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from scipy.optimize import minimize
import math
from sklearn.preprocessing import normalize
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from scipy.spatial import distance
import h5py
import knn

from matplotlib.colors import ListedColormap
%matplotlib inline

## Item a
Preparando os dados para serem utilizados

In [45]:
def load_zip_data(zip_data_path):
    with h5py.File(zip_data_path, 'r') as hf:
        train = hf.get('train')
        X_tr = train.get('data')[:]
        y_tr = train.get('target')[:]
        test = hf.get('test')
        X_te = test.get('data')[:]
        y_te = test.get('target')[:]
    
    return X_tr, y_tr, X_te, y_te

def set_two_classes(y_train, y_test, digit):    
    y_train[y_train==digit] = 1
    y_test[y_test==digit] = 1
    
    y_train[y_train!=digit] = -1
    y_test[y_test!=digit] = -1
    return y_train, y_test

## Item b
Calculando os erros dentro e fora da amostra para 3-NN

In [46]:
# Separating data
X_train, y_train, X_test, y_test = load_zip_data('../data/usps.h5')

print('X_train: ', X_train.shape, 'y_train: ',y_train.shape, 'X_test: ',X_test.shape, 'y_test: ', y_test.shape)
unique, counts = np.unique(y_train, return_counts=True)
freqs = counts/len(y_train)
print('Frequencies of the digits: \n', dict(zip(unique, freqs)))

y_train, y_test = set_two_classes(y_train, y_test, 1)

X_train:  (7291, 256) y_train:  (7291,) X_test:  (2007, 256) y_test:  (2007,)
Frequencies of the digits: 
 {0: 0.16376354409546015, 1: 0.13784117405019888, 2: 0.10026059525442327, 3: 0.09024825126868742, 4: 0.08942531888629818, 5: 0.07625840076807022, 6: 0.09107118365107666, 7: 0.08846523110684405, 8: 0.07433822520916199, 9: 0.08832807570977919}


In [47]:
nbrs = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

In [48]:
e_in = 1 - accuracy_score(y_train, nbrs.predict(X_train))
e_out = 1 - accuracy_score(y_test, nbrs.predict(X_test))

In [49]:
print(f"In-sample error = {round(e_in, 4)} | Out of sample error = {round(e_out, 4)}")

In-sample error = 0.0015 | Out of sample error = 0.005


## Item c

In [110]:
def find_nn_idx(x, X, k):
    distances = dist(x, X).ravel()
    order = np.argsort(np.array(distances))
    return order[:k], distances[order[:k]]

def dist(x, z, dist_type= 'euclidean'):
    res = distance.cdist(x, z, dist_type)
    return res

class CNN:
    def __init__(self, k):
        self.k = k

    def init_cnn(self, X):
        N = X.shape[0]
        S_idx = np.random.choice(N, self.k)
        return S_idx

    def find_inconsistency(self, X, y, cnn, onn):
        found = False        
        for ix, x1 in enumerate(X):
            x1 = x1.reshape(1, -1)
            y1 = cnn.predict(x1)
            yo = onn.predict(x1)
            if y1 != yo:
                found = True
                break
        inconsistent_idx = ix if found else None
        return inconsistent_idx, x1, yo

    def setup_cnn(self, X, y, S_idx):
        S = X[S_idx, :]
        ys = y[S_idx]
        cnn = KNeighborsClassifier(n_neighbors=self.k).fit(S, ys)
        return cnn

    def augment_S(self, X, y, inconsistent_y, neighbors_idx, S_idx):
        found = False
        for ix in neighbors_idx:
            print(y.shape, y[ix].shape)
            if ix in S_idx:
                continue
            if np.equal(y[ix], inconsistent_y):
                found = True 
                break
        if found:
            S_idx = np.append(S_idx, ix)
        else:
            print("Can't find a new idx.")
        return S_idx

    def find_cnn(self, X, y):
        N, _ = X.shape
        S_idx = self.init_cnn(X)
        onn = KNeighborsClassifier(n_neighbors=self.k).fit(X, y)
        while True:
            cnn = self.setup_cnn(X, y, S_idx)
            inconsistent_idx, inconsistent_x, inconsistent_y = self.find_inconsistency(X, y, cnn, onn)
            if inconsistent_idx is None:
                break
            neighbors_idx = find_nn_idx(inconsistent_x, X, N)

            S_idx = self.augment_S(X, y, inconsistent_y, neighbors_idx, S_idx)
            if len(S_idx) == old_s:
                print('No new point added into S. Exit.')
                break
        S = X[S_idx, :]
        Sy = y[S_idx]
        return S_idx, S, Sy

In [111]:
k = 3
cnn = CNN(k)
S_idx, S, Sy = cnn.find_cnn(X_train, y_train)

X_cnn, y_cnn = X_train[S_idx, :], y_train[S_idx]


cnn_cls = KNearestNeighbors(n_neighbors=k).fit(X_cnn, y_cnn) # 3-NN with condensed data
cls = KNearestNeighbors(n_neighbors=k).fit(X_train, y_train) # 3-NN with original data

# print('---- Prediction with Condensed Data: In-Sample Error')
# y_pred_cnn_3 = cnn_cls.predict(X_cnn)
# diff_cnn_3 = y_cnn - y_pred_cnn_3
# diff_cnn_3[diff_cnn_3 !=0] = 1
# E_in_cnn_3 = np.sum(diff_cnn_3!=0)/len(y_cnn)
# print('3-NN with CNN E_{in}: ', E_in_cnn_3)


# print('------ Check if CNN is traing set consistent ------')
# y_pred_orig_train_cnn_3 = cnn_cls.predict(X_train)
# y_pred_orig_train_nn_3 = cls.predict(X_tr)
# diff_orig_train_cnn_3 = y_pred_orig_train_nn_3 - y_pred_orig_train_cnn_3
# diff_orig_train_cnn_3[diff_orig_train_cnn_3 !=0] = 1
# E_in_orig_train_cnn_3 = np.sum(diff_orig_train_cnn_3!=0)/len(y_train)
# print('Differences between 3NN using original data and condensed data: ', E_in_orig_train_cnn_3)

# print('------ Test error of CNN with condensed data ------') 
# y_pred_cnn_te_3 = cnn_cls.predict(X_te)
# diff_cnn_te_3 = y_test - y_pred_cnn_te_3
# diff_cnn_te_3[diff_cnn_te_3 !=0] = 1
# E_out_cnn_3 = np.sum(diff_cnn_te_3!=0)/len(y_test)
# print('3-NN with CNN on test data E_{out}: ', E_out_cnn_3)

(7291,) (7291,)




ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()