In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import sklearn.datasets

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load seaborn's bundled 'titanic' example dataset.
# NOTE(review): `titanic` is not referenced by any cell visible in this part of
# the notebook - confirm it is still needed before keeping this load.
titanic = sns.load_dataset('titanic')

In [3]:
class KNN():
    """K-nearest-neighbours classifier with a distance-based tie-break.

    Neighbours are ranked by squared Euclidean distance. The predicted class is
    the one with the most votes among the K nearest training rows. When several
    classes share the top vote count, the tied class whose neighbours have the
    smallest summed squared distance wins.

    Attributes:
        K: number of neighbours used by the most recent ``predict`` call.
        X_train: training features stored by ``fit`` (float array).
        y_train: training labels stored by ``fit``.
        vizinhos_index: one array per predicted sample holding the training-row
            indices of its K nearest neighbours (set by ``predict``).
    """

    def __init__(self):
        # Must be spelled ``__init__`` for Python to run it on construction;
        # otherwise a fresh instance has none of these attributes.
        self.K = None
        self.X_train = None
        self.y_train = None
        self.vizinhos_index = None

    def fit(self, X_train, y_train):
        """Store the training set.

        Args:
            X_train: 2-D array-like of numeric features, one row per sample.
            y_train: 1-D array-like of class labels, one per row of X_train.

        Raises:
            ValueError: if X_train and y_train have different lengths.
        """
        X_train = np.asarray(X_train, dtype=float)
        y_train = np.asarray(y_train)
        if len(X_train) != len(y_train):
            raise ValueError("X_train and y_train must have the same length")
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X, K=1):
        """Predict a class label for every row of X.

        Args:
            X: 2-D array-like of numeric features, one row per query sample.
            K: number of nearest neighbours that vote (default 1).

        Returns:
            numpy array with one predicted label per row of X.

        Raises:
            RuntimeError: if called before ``fit``.
            ValueError: if K is smaller than 1.
        """
        if self.X_train is None or self.y_train is None:
            raise RuntimeError("KNN.predict called before fit")
        if K < 1:
            raise ValueError("K must be a positive integer")

        self.K = K
        X = np.asarray(X, dtype=float)
        classes = np.unique(self.y_train)
        # ||b||^2 of every training row does not depend on the query row.
        train_sq_norms = (self.X_train**2).sum(axis=1)

        y_pred = []
        self.vizinhos_index = []

        for xi in X:
            # Squared distances via ||a - b||^2 = -2 a.b + ||a||^2 + ||b||^2.
            euclidistancias = -2 * xi @ self.X_train.T + (xi**2).sum() + train_sq_norms

            knn_index = np.argsort(euclidistancias)[0:K]
            self.vizinhos_index.append(knn_index)

            vizinho_classes = self.y_train[knn_index]
            vizinho_dist = euclidistancias[knn_index]

            # Per class: number of votes and summed distance of its neighbours.
            contagem = np.array(
                [(vizinho_classes == classe).sum() for classe in classes]
            )
            dist_total = np.array(
                [vizinho_dist[vizinho_classes == classe].sum() for classe in classes]
            )

            # Classes sharing the highest vote count (a single entry = no tie).
            indice_classes_empatadas = np.flatnonzero(contagem == contagem.max())

            # Pick the closest class *among the tied ones only*. Searching all
            # of dist_total for the minimum value could select a class that is
            # not tied but happens to have the same summed distance.
            mais_proxima = np.argmin(dist_total[indice_classes_empatadas])
            pred_index = indice_classes_empatadas[mais_proxima]
            y_pred.append(classes[pred_index])

        return np.array(y_pred)

In [4]:
from sklearn.datasets import load_breast_cancer

# Features and labels of scikit-learn's breast-cancer dataset.
X, y = load_breast_cancer(return_X_y=True)

# Hold out 25% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=13,
)

In [5]:
# Instantiate the hand-written classifier defined above.
knn = KNN()

In [6]:
# "Training" only stores the training split on the instance.
knn.fit(X_train, y_train)

In [8]:
# Classify every test row using its 4 nearest training neighbours.
y_pred = knn.predict(X_test, K=4)

In [10]:
# Inspect the neighbours chosen by the last predict call: one array of
# training-row indices per test sample.
knn.vizinhos_index

[array([381, 419, 107, 355], dtype=int64),
 array([ 58, 352,  14, 153], dtype=int64),
 array([ 99,  36, 132, 151], dtype=int64),
 array([282, 313,  94, 381], dtype=int64),
 array([330, 347, 152, 151], dtype=int64),
 array([159, 337, 110, 138], dtype=int64),
 array([ 42, 107, 419, 380], dtype=int64),
 array([400, 399, 176, 265], dtype=int64),
 array([281, 339, 133,  60], dtype=int64),
 array([101, 361,  41, 229], dtype=int64),
 array([130, 406, 391, 232], dtype=int64),
 array([397, 233,  27, 192], dtype=int64),
 array([378, 383,   0, 393], dtype=int64),
 array([158, 403, 410, 111], dtype=int64),
 array([194, 395, 353, 189], dtype=int64),
 array([314, 160, 125,  38], dtype=int64),
 array([163, 417, 186, 335], dtype=int64),
 array([329, 314, 160, 352], dtype=int64),
 array([372,  60, 339, 119], dtype=int64),
 array([111, 124, 316, 158], dtype=int64),
 array([ 16,  38, 272,  34], dtype=int64),
 array([282, 313,  94, 381], dtype=int64),
 array([128, 273,  45, 209], dtype=int64),
 array([269

In [9]:
# accuracy_score expects (y_true, y_pred); accuracy itself is symmetric, so the
# value is unchanged, but the documented order keeps the call unambiguous.
accuracy_score(y_test, y_pred)

0.916083916083916

In [37]:
# Model written as procedural code (no class), with sanity prints on ties.
# Names assigned inside the loops (euclidistancias, knn_index, contagem,
# dist_total, classe) stay in the notebook namespace after the cell runs and
# hold the values from the last test row processed.

K = 4
classes = np.unique(y_train)
y_pred = []
vizinhos_index = []

for xi in X_test:
    # Squared Euclidean distance from xi to every training row, expanded as
    # ||a - b||^2 = -2 a.b + ||a||^2 + ||b||^2.
    euclidistancias = -2 * xi @ X_train.T + (xi**2).sum() + (X_train**2).sum(axis=1)

    knn_index = np.argsort(euclidistancias)[0:K]
    vizinhos_index.append(knn_index)

    contagem = []
    dist_total = []

    # Per class: number of votes among the K neighbours and the summed
    # distance of the neighbours that belong to that class.
    for classe in classes:
        contagem.append((y_train[knn_index]==classe).sum())
        dist_classe = np.sum(euclidistancias[knn_index][y_train[knn_index]==classe])
        dist_total.append(dist_classe)

    contagem = np.array(contagem)
    dist_total = np.array(dist_total)

    # Indices of the classes that share the highest vote count.
    indice_classes_empatadas = np.flatnonzero(contagem == contagem.max())

    # Fill y_pred using the two decision rules:
    #   no tie -> class with the most votes
    #   tie    -> tied class with the smallest summed distance
    if len(indice_classes_empatadas) == 1:
        pred_index = np.argmax(contagem)
        y_pred.append(classes[pred_index])

    else:
        # Smallest summed distance among the tied classes.
        dist_empatadas = dist_total[indice_classes_empatadas]
        menor_valor = np.min(dist_empatadas)

        # Index (into `classes`) of that tied class. Looking the value up in
        # the full dist_total could return a class that is not part of the tie
        # but has the same summed distance, so search only the tied ones.
        indice_menor_dist = indice_classes_empatadas[np.argmin(dist_empatadas)]

        # Class with the smallest distance.
        y_pred.append(classes[indice_menor_dist])

        # Sanity-check output for tied instances.
        print("----------------------------Check de sanidade para instâncias com empate-----------------------------")

        print("contagem: ")
        print(contagem)
        print("dist_total: ")
        print(dist_total)
        print("menor_valor: ")
        print(menor_valor)
        print("indice_menor_dist: ")
        print(indice_menor_dist)

y_pred = np.array(y_pred)

----------------------------Check de sanidade para instâncias com empate-----------------------------
contagem: 
[2 2]
dist_total: 
[ 719.56582745 3255.53119999]
menor_valor: 
719.5658274518792
indice_menor_dist: 
0
----------------------------Check de sanidade para instâncias com empate-----------------------------
contagem: 
[2 2]
dist_total: 
[1268.75277352 1094.83786102]
menor_valor: 
1094.837861015927
indice_menor_dist: 
1
----------------------------Check de sanidade para instâncias com empate-----------------------------
contagem: 
[2 2]
dist_total: 
[1576.64861988 2072.00281733]
menor_valor: 
1576.648619877873
indice_menor_dist: 
0
----------------------------Check de sanidade para instâncias com empate-----------------------------
contagem: 
[2 2]
dist_total: 
[4066.93041466 4423.56557632]
menor_valor: 
4066.93041465641
indice_menor_dist: 
0
----------------------------Check de sanidade para instâncias com empate-----------------------------
contagem: 
[2 2]
dist_total: 
[4400

In [31]:
# Debug: which of the current neighbours carry the label `classe`?
# `knn_index` and `classe` are leftovers from the last loop iteration of the
# procedural cell, so this depends on that cell having been run.
y_train[knn_index] == classe

array([False, False, False, False])

In [32]:
# Debug: vote count for `classe` among the current neighbours (both names are
# leftovers from the last loop iteration of the procedural cell).
(y_train[knn_index] == classe).sum()

0

In [25]:
# Neighbour indices gathered by the procedural loop: one array of training-row
# indices per test sample.
vizinhos_index

[array([381, 419, 107, 355], dtype=int64),
 array([ 58, 352,  14, 153], dtype=int64),
 array([ 99,  36, 132, 151], dtype=int64),
 array([282, 313,  94, 381], dtype=int64),
 array([330, 347, 152, 151], dtype=int64),
 array([159, 337, 110, 138], dtype=int64),
 array([ 42, 107, 419, 380], dtype=int64),
 array([400, 399, 176, 265], dtype=int64),
 array([281, 339, 133,  60], dtype=int64),
 array([101, 361,  41, 229], dtype=int64),
 array([130, 406, 391, 232], dtype=int64),
 array([397, 233,  27, 192], dtype=int64),
 array([378, 383,   0, 393], dtype=int64),
 array([158, 403, 410, 111], dtype=int64),
 array([194, 395, 353, 189], dtype=int64),
 array([314, 160, 125,  38], dtype=int64),
 array([163, 417, 186, 335], dtype=int64),
 array([329, 314, 160, 352], dtype=int64),
 array([372,  60, 339, 119], dtype=int64),
 array([111, 124, 316, 158], dtype=int64),
 array([ 16,  38, 272,  34], dtype=int64),
 array([282, 313,  94, 381], dtype=int64),
 array([128, 273,  45, 209], dtype=int64),
 array([269

In [27]:
# Debug: squared distances to every training row for the last test row the
# procedural loop processed (the variable is overwritten on each iteration).
euclidistancias

array([1.09695151e+06, 1.61592160e+06, 1.56632284e+06, 1.13472053e+06,
       2.16381161e+06, 5.68793499e+05, 9.06532434e+05, 1.86373346e+06,
       1.00058391e+05, 1.65954408e+06, 9.74996369e+05, 5.01715281e+05,
       6.63751510e+05, 1.14678452e+06, 9.36827050e+05, 1.29269955e+05,
       8.87855507e+05, 1.44676653e+06, 1.28565487e+06, 6.46817384e+04,
       7.55693004e+04, 4.40708283e+04, 1.35825132e+06, 7.75607640e+05,
       1.23795470e+06, 1.75434770e+06, 1.69854910e+06, 1.29402784e+06,
       3.75373690e+04, 1.15645970e+06, 1.47344819e+05, 5.72797927e+05,
       6.43549419e+05, 1.95541379e+06, 8.99714695e+05, 2.04364349e+06,
       1.32517536e+06, 6.99185781e+05, 8.76451194e+05, 7.39359221e+04,
       4.78878727e+05, 4.63349434e+05, 6.08971562e+05, 1.95572370e+06,
       1.64074006e+06, 3.32130696e+05, 3.32412288e+05, 1.64517409e+06,
       8.50798129e+05, 7.78533792e+05, 5.01562327e+04, 2.62034655e+05,
       8.19324318e+05, 1.85072743e+06, 1.59840591e+06, 7.65017760e+05,
      

In [28]:
# Debug: neighbour indices of the last test row the procedural loop processed.
knn_index

array([145, 401,  65, 420], dtype=int64)

In [29]:
# Debug: per-class summed neighbour distance for the last test row processed;
# a class with no neighbours among the K nearest sums to 0.
dist_total

array([29568.91041199,     0.        ])

In [13]:
# accuracy_score expects (y_true, y_pred); accuracy itself is symmetric, so the
# value is unchanged, but the documented order keeps the call unambiguous.
accuracy_score(y_test, y_pred)

0.916083916083916

In [None]:
# Model written as procedural code (no class), without debug output.

K = 4
classes = np.unique(y_train)
y_pred = []
vizinhos_index = []

for xi in X_test:

    # Squared Euclidean distance from xi to every training row, expanded as
    # ||a - b||^2 = -2 a.b + ||a||^2 + ||b||^2.
    euclidistancias = -2 * xi @ X_train.T + (xi**2).sum() + (X_train**2).sum(axis=1)
    knn_index = np.argsort(euclidistancias)[0:K]
    vizinhos_index.append(knn_index)

    contagem = []
    dist_total = []

    # Per class: vote count and summed distance of its neighbours.
    for classe in classes:
        contagem.append((y_train[knn_index]==classe).sum())
        dist_classe = np.sum(euclidistancias[knn_index][y_train[knn_index]==classe])
        dist_total.append(dist_classe)

    contagem = np.array(contagem)
    dist_total = np.array(dist_total)

    # Indices of the classes that share the highest vote count.
    indice_classes_empatadas = np.flatnonzero(contagem == contagem.max())

    if len(indice_classes_empatadas) == 1:
        # No tie: the class with the most votes wins.
        pred_index = np.argmax(contagem)
        y_pred.append(classes[pred_index])

    else:
        # Tie: the tied class with the smallest summed distance wins. The
        # search is restricted to the tied classes so a non-tied class with an
        # equal summed distance can never be selected.
        dist_empatadas = dist_total[indice_classes_empatadas]
        menor_valor = np.min(dist_empatadas)
        indice_menor_dist = indice_classes_empatadas[np.argmin(dist_empatadas)]
        y_pred.append(classes[indice_menor_dist])

y_pred = np.array(y_pred)

In [20]:
# Scratch setup: the same initial state the procedural model builds before its
# loop. Note that this resets y_pred and vizinhos_index to empty lists.
K = 4
classes = np.unique(y_train)
y_pred, vizinhos_index = [], []

In [21]:
# Distinct labels found in y_train (sorted, as returned by np.unique).
classes

array([0, 1])