# Código do KNN

## Imports

In [11]:
from functools import reduce
import numpy as np
import numpy.linalg as LA
import math

## Classe KNN

In [39]:
class KNN:
    '''k-nearest-neighbours classifier.

    Every sample, training or query, is an indexable sequence whose last
    element is the class label and whose remaining elements (``sample[0:-1]``)
    are the features handed to the distance function.

    Attributes:
        samples: the training samples as given to ``train``.
        distance: callable ``(a, b) -> number`` comparing two feature vectors.
        k: how many neighbours take part in the vote.
        weight: reducer ``(acc, (label, distance)) -> acc`` that accumulates votes.
        adaptive: when true, distances are divided by each training sample's radius.
        sampleRadius: one radius per training sample, filled in by ``train``.
    '''

    @staticmethod
    def BLOCK_DISTANCE(a, b):
        '''Manhattan (city-block) distance: the sum of absolute coordinate differences.'''
        return np.sum(np.absolute(a - b))

    @staticmethod
    def EUCLIDEAN_DISTANCE(a, b):
        '''Straight-line distance: the norm of ``a - b`` as computed by ``numpy.linalg.norm``.'''
        return LA.norm(a - b)

    @staticmethod
    def NO_WEIGHT(acc, i):
        '''Vote reducer that adds one vote to the neighbour's label, ignoring its distance.

        ``acc`` maps label -> votes and is updated in place and returned;
        ``i`` is a ``(label, distance)`` pair.
        '''
        (label, _) = i
        acc[label] += 1
        return acc

    @staticmethod
    def WEIGHTED(acc, i):
        '''Vote reducer that weights a neighbour by the inverse of its squared distance.

        ``acc`` maps label -> votes and is updated in place and returned;
        ``i`` is a ``(label, distance)`` pair. A neighbour at distance zero
        contributes an infinite vote instead of dividing by zero.
        '''
        (label, distance) = i
        acc[label] += (1 / (distance ** 2)) if distance else math.inf
        return acc

    def __init__(self, distance=None, k=3, weight=None, adaptive=False):
        '''Configure the classifier.

        ``distance`` defaults to ``KNN.EUCLIDEAN_DISTANCE`` and ``weight`` to
        ``KNN.NO_WEIGHT`` when left as ``None``.
        '''
        self.samples = []
        # Kept in step with ``samples``; populated by ``train``.
        self.sampleRadius = []
        self.distance = KNN.EUCLIDEAN_DISTANCE if distance is None else distance
        self.k = k
        self.weight = KNN.NO_WEIGHT if weight is None else weight
        self.adaptive = adaptive

    def calculateRadius(self, sample):
        '''Return the distance from ``sample`` to the nearest training sample of another class.

        If the distance function is changed after training, the radii are
        stale and ``train`` should be run again.

        When no training sample carries a different label (for example a
        single-class training set) the radius is ``math.inf``. Every training
        sample then has the same label, so the vote is unaffected.
        '''
        own_label = sample[-1]
        features = sample[0:-1]
        rival_distances = (
            self.distance(features, other[0:-1])
            for other in self.samples
            if other[-1] != own_label
        )
        # ``default`` avoids a ValueError when there is no sample of another class.
        return min(rival_distances, default=math.inf)

    def train(self, training):
        '''Store ``training`` as the reference samples and compute one radius per sample.

        The radii are computed regardless of ``adaptive``, so the flag can be
        toggled afterwards without retraining.
        '''
        self.samples = training
        self.sampleRadius = [self.calculateRadius(i) for i in self.samples]

    def classify(self, sample):
        '''Return the label that collects the most votes among the k closest training samples.

        Ties go to the label that appears first in the training samples,
        because the tally is built in that order and ``max`` returns the
        first maximum. Raises ``ValueError`` if there are no training samples.
        '''
        closest = self.find_closest(sample)
        # Start every known label at zero so the reducers can use ``+=`` directly.
        votes = reduce(self.weight, closest, {i[-1]: 0 for i in self.samples})
        return max(votes, key=votes.get)

    def test(self, testSamples):
        '''Classify every sample in ``testSamples`` and compare with its own label.

        Returns a dict with ``"hitRate"`` and ``"missRate"``, the fractions of
        matching and non-matching predictions. Raises ``ZeroDivisionError``
        when ``testSamples`` is empty.
        '''
        realResults = [i[-1] for i in testSamples]
        testResults = [self.classify(i) for i in testSamples]
        matched = [a == b for (a, b) in zip(realResults, testResults)]
        total = len(realResults)
        return {
            "hitRate": matched.count(True) / total,
            "missRate": matched.count(False) / total
        }

    def find_closest(self, sample):
        '''Return ``(label, distance)`` pairs for the k training samples nearest to ``sample``.

        In adaptive mode each distance is divided by the training sample's
        radius. A zero radius makes that training sample infinitely far away.
        '''
        features = sample[0:-1]
        distances = []
        for (i, s) in enumerate(self.samples):
            if self.adaptive:
                radius = self.sampleRadius[i]
                scaled = self.distance(features, s[0:-1]) / radius if radius else math.inf
                distances.append((s[-1], scaled))
            else:
                distances.append((s[-1], self.distance(features, s[0:-1])))
        # list.sort is stable, so equally distant samples keep training order.
        distances.sort(key=lambda pair: pair[1])
        return distances[0:self.k]

# Avaliações

## Imports

In [13]:
from scipy.io import arff
import pandas as pd
from time import process_time
import random

## Funções p/ o experimento

In [14]:
# Number of folds used when splitting the dataset.
k = 10


def k_fold(sepSamples, k):
    '''Build k folds that each receive a similar share of every class.

    ``sepSamples`` maps a class key to that class's samples. Each class is cut
    into k consecutive chunks with ``np.array_split`` and the n-th chunk is
    appended to the n-th fold. Returns a list of k lists of samples.
    '''
    buckets = []
    for _ in range(k):
        buckets.append([])
    for label in sepSamples:
        chunks = np.array_split(sepSamples[label], k)
        for bucket, chunk in zip(buckets, chunks):
            bucket.extend(chunk)
    return buckets

In [15]:
def cross_validation(knn, folds):
    '''Run one cross-validation pass of ``knn`` over ``folds``.

    Each fold in turn is held out for testing while the samples of all the
    other folds are pooled, shuffled with ``random.shuffle`` and used for
    training. Returns a dict with the per-fold ``'results'`` (whatever
    ``knn.test`` returns) plus ``'trainTimes'`` and ``'testTimes'``, measured
    with ``process_time``.
    '''
    scores, fit_durations, eval_durations = [], [], []
    for held_out, evaluation_set in enumerate(folds):
        print(f"Testing on fold {held_out}")

        # Pool every fold except the held-out one.
        fitting_set = []
        for position, fold in enumerate(folds):
            if position != held_out:
                fitting_set.extend(fold)
        random.shuffle(fitting_set)

        started = process_time()
        knn.train(fitting_set)
        fit_durations.append(process_time() - started)

        started = process_time()
        scores.append(knn.test(evaluation_set))
        eval_durations.append(process_time() - started)
    return {'results': scores, 'trainTimes': fit_durations, 'testTimes': eval_durations}

In [16]:
# Values of k (number of neighbours) evaluated in every experiment.
kVariations = [1, 2, 3, 5, 7, 9, 11, 13, 15]


def experiment(knn, kVariations, folds, randomSeed):
    '''Cross-validate ``knn`` once for each value of k in ``kVariations``.

    Seeds ``random`` with ``randomSeed`` first, then sets ``knn.k`` to each
    value in turn and runs ``cross_validation`` on ``folds``. Returns a dict
    mapping each k to its cross-validation result.
    '''
    random.seed(randomSeed)
    outcomes = {}
    for neighbours in kVariations:
        print(f'Parameter k = {neighbours}')
        knn.k = neighbours
        outcomes[neighbours] = cross_validation(knn, folds)
    return outcomes

## Base de Dados 1 - CM1

### Preparação

In [17]:
# Load the CM1 dataset from its ARFF file; element 0 of the result is fed to the DataFrame.
data = arff.loadarff('Datasets/cm1.arff')
dataFrame = pd.DataFrame(data[0])
# Encode the `defects` column as integers: 1 where the raw value is b'true', 0 otherwise.
dataFrame.defects = [1 if i == b'true' else 0 for i in dataFrame.defects]
# Array form of the table; later cells read the class label from its last column.
rawData = dataFrame.values
rawData

array([[   1.1,    1.4,    1.4, ...,    1.2,    1.4,    0. ],
       [   1. ,    1. ,    1. , ...,    1. ,    1. ,    1. ],
       [  24. ,    5. ,    1. , ...,   19. ,    9. ,    0. ],
       ..., 
       [  82. ,   11. ,    3. , ...,  190. ,   21. ,    1. ],
       [  10. ,    2. ,    1. , ...,   13. ,    3. ,    1. ],
       [  28. ,    6. ,    5. , ...,   37. ,   11. ,    1. ]])

In [18]:
# Separating samples by class: map each label (last column) to the rows carrying it.
# dict.fromkeys yields the distinct labels in order of first appearance, so the
# rows are filtered once per label instead of once per row of the dataset.
classes = {label: rawData[rawData[:, -1] == label] for label in dict.fromkeys(rawData[:, -1])}
classes

{0.0: array([[  1.1,   1.4,   1.4, ...,   1.2,   1.4,   0. ],
        [ 24. ,   5. ,   1. , ...,  19. ,   9. ,   0. ],
        [ 20. ,   4. ,   4. , ...,  16. ,   7. ,   0. ],
        ..., 
        [  4. ,   1. ,   1. , ...,   6. ,   1. ,   0. ],
        [  3. ,   1. ,   1. , ...,   2. ,   1. ,   0. ],
        [ 12. ,   3. ,   1. , ...,  25. ,   5. ,   0. ]]),
 1.0: array([[   1.,    1.,    1., ...,    1.,    1.,    1.],
        [  31.,    4.,    1., ...,   51.,    7.,    1.],
        [  29.,    5.,    1., ...,   37.,    9.,    1.],
        ..., 
        [  82.,   11.,    3., ...,  190.,   21.,    1.],
        [  10.,    2.,    1., ...,   13.,    3.,    1.],
        [  28.,    6.,    5., ...,   37.,   11.,    1.]])}

In [19]:
# Separating the folds: spread the samples of every class over k folds.
folds = k_fold(classes, k)

### KNN sem peso

In [21]:
# Plain kNN: unweighted votes (NO_WEIGHT) and no adaptive distance.
weightlessKNN = KNN(weight=KNN.NO_WEIGHT, adaptive=False)
# Cross-validate for every k in kVariations, seeding `random` with 42.
weightlessExperiment = experiment(weightlessKNN, kVariations, folds, 42)
weightlessExperiment

Parameter k = 1
Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4
Testing on fold 5
Testing on fold 6
Testing on fold 7
Testing on fold 8
Testing on fold 9
Parameter k = 2
Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4
Testing on fold 5
Testing on fold 6
Testing on fold 7
Testing on fold 8
Testing on fold 9
Parameter k = 3
Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4
Testing on fold 5
Testing on fold 6
Testing on fold 7
Testing on fold 8
Testing on fold 9
Parameter k = 5
Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4
Testing on fold 5
Testing on fold 6
Testing on fold 7
Testing on fold 8
Testing on fold 9
Parameter k = 7
Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4
Testing on fold 5
Testing on fold 6
Testing on fold 7
Testing on fold 8
Testing on fold 9
Parameter k = 9
Test

{1: {'results': [{'hitRate': 0.86, 'missRate': 0.14},
   {'hitRate': 0.9, 'missRate': 0.1},
   {'hitRate': 0.86, 'missRate': 0.14},
   {'hitRate': 0.86, 'missRate': 0.14},
   {'hitRate': 0.86, 'missRate': 0.14},
   {'hitRate': 0.86, 'missRate': 0.14},
   {'hitRate': 0.76, 'missRate': 0.24},
   {'hitRate': 0.86, 'missRate': 0.14},
   {'hitRate': 0.84, 'missRate': 0.16},
   {'hitRate': 0.8958333333333334, 'missRate': 0.10416666666666667}],
  'testTimes': [0.390625,
   0.4375,
   0.171875,
   0.140625,
   0.15625,
   0.171875,
   0.1875,
   0.140625,
   0.140625,
   0.140625],
  'trainTimes': [0.265625,
   0.375,
   0.390625,
   0.328125,
   0.4375,
   0.328125,
   0.375,
   0.59375,
   0.265625,
   0.28125]},
 2: {'results': [{'hitRate': 0.86, 'missRate': 0.14},
   {'hitRate': 0.9, 'missRate': 0.1},
   {'hitRate': 0.88, 'missRate': 0.12},
   {'hitRate': 0.86, 'missRate': 0.14},
   {'hitRate': 0.8, 'missRate': 0.2},
   {'hitRate': 0.88, 'missRate': 0.12},
   {'hitRate': 0.88, 'missRate': 

### KNN com peso

In [38]:
# Distance-weighted kNN: votes use the WEIGHTED reducer, no adaptive distance.
weightedKNN = KNN(weight=KNN.WEIGHTED, adaptive=False)
# Cross-validate for every k in kVariations, seeding `random` with 42.
weightedExperiment = experiment(weightedKNN, kVariations, folds, 42)
weightedExperiment

Parameter k = 1
Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4
Testing on fold 5
Testing on fold 6
Testing on fold 7
Testing on fold 8
Testing on fold 9
Parameter k = 2
Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4
Testing on fold 5
Testing on fold 6
Testing on fold 7
Testing on fold 8
Testing on fold 9
Parameter k = 3
Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4
Testing on fold 5
Testing on fold 6
Testing on fold 7
Testing on fold 8
Testing on fold 9
Parameter k = 5
Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4
Testing on fold 5
Testing on fold 6
Testing on fold 7
Testing on fold 8
Testing on fold 9
Parameter k = 7
Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4
Testing on fold 5
Testing on fold 6
Testing on fold 7
Testing on fold 8
Testing on fold 9
Parameter k = 9
Test

{1: {'results': [{'hitRate': 0.86, 'missRate': 0.14},
   {'hitRate': 0.9, 'missRate': 0.1},
   {'hitRate': 0.86, 'missRate': 0.14},
   {'hitRate': 0.86, 'missRate': 0.14},
   {'hitRate': 0.86, 'missRate': 0.14},
   {'hitRate': 0.86, 'missRate': 0.14},
   {'hitRate': 0.76, 'missRate': 0.24},
   {'hitRate': 0.86, 'missRate': 0.14},
   {'hitRate': 0.84, 'missRate': 0.16},
   {'hitRate': 0.8958333333333334, 'missRate': 0.10416666666666667}],
  'testTimes': [0.140625,
   0.4375,
   0.203125,
   0.171875,
   0.28125,
   0.15625,
   0.328125,
   0.34375,
   0.25,
   0.3125],
  'trainTimes': [0.4375,
   0.375,
   0.28125,
   0.46875,
   0.25,
   0.328125,
   0.59375,
   0.671875,
   0.359375,
   0.3125]},
 2: {'results': [{'hitRate': 0.86, 'missRate': 0.14},
   {'hitRate': 0.9, 'missRate': 0.1},
   {'hitRate': 0.86, 'missRate': 0.14},
   {'hitRate': 0.86, 'missRate': 0.14},
   {'hitRate': 0.86, 'missRate': 0.14},
   {'hitRate': 0.86, 'missRate': 0.14},
   {'hitRate': 0.76, 'missRate': 0.24},
 

### KNN adaptativo sem peso

In [40]:
# Adaptive kNN: unweighted votes (NO_WEIGHT) with the adaptive distance enabled.
adaptiveKNN = KNN(weight=KNN.NO_WEIGHT, adaptive=True)
# Cross-validate for every k in kVariations, seeding `random` with 42.
adaptiveExperiment = experiment(adaptiveKNN, kVariations, folds, 42)
adaptiveExperiment

Parameter k = 1
Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4
Testing on fold 5
Testing on fold 6
Testing on fold 7
Testing on fold 8
Testing on fold 9
Parameter k = 2
Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4
Testing on fold 5
Testing on fold 6
Testing on fold 7
Testing on fold 8
Testing on fold 9
Parameter k = 3
Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4
Testing on fold 5
Testing on fold 6
Testing on fold 7
Testing on fold 8
Testing on fold 9
Parameter k = 5
Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4
Testing on fold 5
Testing on fold 6
Testing on fold 7
Testing on fold 8
Testing on fold 9
Parameter k = 7
Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4
Testing on fold 5
Testing on fold 6
Testing on fold 7
Testing on fold 8
Testing on fold 9
Parameter k = 9
Test

{1: {'results': [{'hitRate': 0.86, 'missRate': 0.14},
   {'hitRate': 0.94, 'missRate': 0.06},
   {'hitRate': 0.88, 'missRate': 0.12},
   {'hitRate': 0.86, 'missRate': 0.14},
   {'hitRate': 0.86, 'missRate': 0.14},
   {'hitRate': 0.9, 'missRate': 0.1},
   {'hitRate': 0.76, 'missRate': 0.24},
   {'hitRate': 0.86, 'missRate': 0.14},
   {'hitRate': 0.92, 'missRate': 0.08},
   {'hitRate': 0.8958333333333334, 'missRate': 0.10416666666666667}],
  'testTimes': [0.140625,
   0.203125,
   0.1875,
   0.1875,
   0.28125,
   0.3125,
   0.359375,
   0.28125,
   0.359375,
   0.203125],
  'trainTimes': [0.328125,
   0.25,
   0.296875,
   0.609375,
   0.34375,
   0.359375,
   0.390625,
   0.34375,
   0.4375,
   0.421875]},
 2: {'results': [{'hitRate': 0.88, 'missRate': 0.12},
   {'hitRate': 0.9, 'missRate': 0.1},
   {'hitRate': 0.88, 'missRate': 0.12},
   {'hitRate': 0.88, 'missRate': 0.12},
   {'hitRate': 0.84, 'missRate': 0.16},
   {'hitRate': 0.88, 'missRate': 0.12},
   {'hitRate': 0.88, 'missRate':

## Base de Dados 2 - 

### KNN sem peso

### KNN com peso

### KNN adaptativo sem peso

# Conclusão