# Código Learning Vector Quantization

## Imports

In [1]:
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import random

## Funções de Suporte

In [134]:
def sample_prototypes(dataset, n_prototypes = 2):
    '''Choose a number of samples from the dataset to use as prototypes. Will keep at least one sample of every class.'''
    classes = {x[-1]: [a for a in dataset if a[-1] == x[-1]] for x in dataset}
        
    if (n_prototypes < len(classes)):
        print("There aren't prototypes enough for all classes")
    elif (n_prototypes > len(dataset)):
        print("There aren't samples enough for this amount of prototypes")
    else:
        prototypes = [random.choice(classes[x]) for x in classes]
        if (n_prototypes > len(classes)):
            still_not_chosen = [data for data in dataset if np.not_equal(prototypes, data).any(1).all()]
            prototypes.extend(random.sample(still_not_chosen, n_prototypes - len(classes)))
        return prototypes

sample_prototypes(np.array([[1,0,0], [1,1,1], [2,0,2], [2,1,0]]), 4)

[array([2, 1, 0]), array([1, 1, 1]), array([2, 0, 2]), array([1, 0, 0])]

In [135]:
def calculate_test_results(real, predicted):
    matched = [a == b for (a, b) in zip(real, predicted)]
    recalls = {i: list(zip(real, matched)).count((i, True)) / real.count(i) for i in real}
    return {
        "precision": matched.count(True) / len(real),
        "recalls": recalls
    }

## Adaptador do KNN

In [153]:
class KNN_adapter:
    '''An adapter class to use a basic KNN calling the same functions of the LVQ implementations.'''
    def __init__(self, k = 1):
        self.knn = KNeighborsClassifier(n_neighbors = k)
    
    def train(self, training):
        samples = [sample[0:-1] for sample in training]
        labels = [sample[-1] for sample in training]
        self.knn.fit(samples, labels)
    
    def predict(self, sample):
        return self.knn.predict(sample)
    
    def test(self, testing):
        samples = [sample[0:-1] for sample in testing]
        labels = [sample[-1] for sample in testing]
        predicted_labels = self.predict(samples)
        result = calculate_test_results(labels, predicted_labels)
        return result
    
    def get_prototypes(self):
        return None

## Código do LVQ1

In [156]:
class LVQ1:
    def __init__(self, k = 1, n_prototypes = 2, alpha_0 = 0.8):
        self.knn = KNeighborsClassifier(n_neighbors = k)
        self.n_prototypes = n_prototypes
        self.alpha_0 = alpha_0
    
    def adjust_prototype(self, prototype_index, alpha, sample):
        prototype = self.samples[prototype_index]
        if (self.labels[prototype_index] == sample[-1]):
            self.samples[prototype_index] = prototype + (sample[0:-1] - prototype) * alpha
        else:
            self.samples[prototype_index] = prototype - (sample[0:-1] - prototype) * alpha
    
    def train(self, training):
        prototypes = sample_prototypes(training, self.n_prototypes)
        self.samples = [sample[0:-1] for sample in prototypes]
        self.labels = [sample[-1] for sample in prototypes]
        
        alpha_t = self.alpha_0
        for sample in training:
            self.knn.fit(self.samples, self.labels)
            (_, closest) = self.knn.kneighbors([sample[0:-1]])
            [self.adjust_prototype(prototype_index, alpha_t, sample) for prototype_index in closest[0]]
            alpha_t *= self.alpha_0
        
        self.knn.fit(self.samples, self.labels)
    
    def predict(self, sample):
        return self.knn.predict(sample)
    
    def test(self, testing):
        samples = [sample[0:-1] for sample in testing]
        labels = [sample[-1] for sample in testing]
        predicted_labels = self.predict(samples)
        result = calculate_test_results(labels, predicted_labels)
        return result
    
    def get_prototypes(self):
        return {'samples': self.samples, 'labels': self.labels}

In [157]:
lvq1 = LVQ1(n_prototypes=3)
lvq1.train(np.array([[1,0,0], [1,1,1], [2,0,2], [2,1,0]]))
#lvq1.test([[0, 1, 1], [1, 0.2, 0]])
lvq1.get_prototypes()

{'samples': [array([1., 0.]), array([0.5904, 1.    ]), array([2., 0.])],
 'labels': [0, 1, 2]}

## Código do LVQ2.1

In [138]:
class LVQ2:
    def __init__(self, k = 1, nPrototypes = 2):
        self.knn = NearestNeighbors(n_neighbors = k)
        self.nPrototypes = nPrototypes

## Código do LVQ3

In [139]:
class LVQ3:
    def __init__(self, k = 1, nPrototypes = 2):
        self.knn = NearestNeighbors(n_neighbors = k)
        self.nPrototypes = nPrototypes

# Avaliações

## Imports

In [140]:
%matplotlib inline

from scipy.io import arff
import pandas as pd
from time import process_time

import matplotlib
import matplotlib.pyplot as plt

import math

## Funções para o experimento

In [141]:
def normalize(raw):
    '''Normalizes an dataset so all of its attributes have the same weight.'''
    attribute_mins = np.min(raw, axis=0)
    attribute_maxs = np.max(raw, axis=0)
    return (raw - attribute_mins) / attribute_maxs

In [142]:
def k_fold(sep_samples, k = 5):
    '''Splits the samples in k groups with similar amounts of samples and distributions of every class.'''
    folds = [[] for _ in range(k)]
    for i in sep_samples:
        split_class = np.array_split(sep_samples[i], k)
        [a.extend(b) for (a, b) in zip(folds, split_class)]
    return folds

In [159]:
def cross_validation(machine, folds):
    random.seed(42)
    '''Evaluates an algorithm through cross validation.'''
    precisions = []
    recalls = []
    train_times = []
    test_times = []
    prototypes = []
    for i in range(len(folds)):
        print("Testing on fold " + str(i))
        train = [s for j, fold in enumerate(folds) if i != j for s in fold]
        test = folds[i]

        train_times.append(process_time())
        machine.train(train)
        train_times[-1] = process_time() - train_times[-1]

        test_times.append(process_time())
        test_results = machine.test(test)
        test_times[-1] = process_time() - test_times[-1]

        precisions.append(test_results['precision'])
        recalls.append(test_results['recalls'])
        prototypes.append(machine.get_prototypes())
    return {
        'precisions': precisions,
        'recalls': recalls,
        'train_times': train_times,
        'test_times': test_times,
        'prototypes': prototypes
    }

## Base de Dados 1 - CM1

### Preparação

In [144]:
data = arff.loadarff('Datasets/cm1.arff')
dataFrame = pd.DataFrame(data[0])
dataFrame.defects = [1 if i == b'true' else 0 for i in dataFrame.defects]
rawData = dataFrame.values
rawData

array([[  1.1,   1.4,   1.4, ...,   1.2,   1.4,   0. ],
       [  1. ,   1. ,   1. , ...,   1. ,   1. ,   1. ],
       [ 24. ,   5. ,   1. , ...,  19. ,   9. ,   0. ],
       ...,
       [ 82. ,  11. ,   3. , ..., 190. ,  21. ,   1. ],
       [ 10. ,   2. ,   1. , ...,  13. ,   3. ,   1. ],
       [ 28. ,   6. ,   5. , ...,  37. ,  11. ,   1. ]])

In [145]:
# Normalizing the dataset
normalizedData = normalize(rawData)
normalizedData

array([[2.36406619e-04, 4.16666667e-03, 1.33333333e-02, ...,
        1.47420147e-03, 2.46913580e-03, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        1.22850123e-03, 0.00000000e+00, 1.00000000e+00],
       [5.43735225e-02, 4.16666667e-02, 0.00000000e+00, ...,
        2.33415233e-02, 4.93827160e-02, 0.00000000e+00],
       ...,
       [1.91489362e-01, 1.04166667e-01, 6.66666667e-02, ...,
        2.33415233e-01, 1.23456790e-01, 1.00000000e+00],
       [2.12765957e-02, 1.04166667e-02, 0.00000000e+00, ...,
        1.59705160e-02, 1.23456790e-02, 1.00000000e+00],
       [6.38297872e-02, 5.20833333e-02, 1.33333333e-01, ...,
        4.54545455e-02, 6.17283951e-02, 1.00000000e+00]])

In [146]:
# Separating samples by class
classes = {i[-1]: normalizedData[np.where(rawData[:,-1] == i[-1])] for i in normalizedData}
classes

{0.0: array([[0.00023641, 0.00416667, 0.01333333, ..., 0.0014742 , 0.00246914,
         0.        ],
        [0.05437352, 0.04166667, 0.        , ..., 0.02334152, 0.04938272,
         0.        ],
        [0.04491726, 0.03125   , 0.1       , ..., 0.01965602, 0.03703704,
         0.        ],
        ...,
        [0.0070922 , 0.        , 0.        , ..., 0.00737101, 0.        ,
         0.        ],
        [0.00472813, 0.        , 0.        , ..., 0.002457  , 0.        ,
         0.        ],
        [0.02600473, 0.02083333, 0.        , ..., 0.03071253, 0.02469136,
         0.        ]]),
 1.0: array([[0.        , 0.        , 0.        , ..., 0.0012285 , 0.        ,
         1.        ],
        [0.07092199, 0.03125   , 0.        , ..., 0.06265356, 0.03703704,
         1.        ],
        [0.06619385, 0.04166667, 0.        , ..., 0.04545455, 0.04938272,
         1.        ],
        ...,
        [0.19148936, 0.10416667, 0.06666667, ..., 0.23341523, 0.12345679,
         1.        ],
  

In [147]:
# Separating the folds
folds = k_fold(classes)
folds

[[array([2.36406619e-04, 4.16666667e-03, 1.33333333e-02, 6.34920635e-03,
         1.44578313e-04, 7.59156005e-05, 1.00000000e+00, 1.03363282e-02,
         4.42658676e-03, 6.03615014e-07, 2.27670753e-01, 1.08650702e-05,
         2.50000000e-02, 5.89970501e-03, 1.21951220e-02, 1.00000000e+00,
         2.77777778e-03, 3.82165605e-03, 1.58604282e-04, 1.47420147e-03,
         2.46913580e-03, 0.00000000e+00]),
  array([0.05437352, 0.04166667, 0.        , 0.03174603, 0.02987952,
         0.01805215, 0.08461538, 0.07553471, 0.11080087, 0.0013636 ,
         0.01751313, 0.00136357, 0.0125    , 0.        , 0.03658537,
         0.        , 0.19444444, 0.0477707 , 0.03409992, 0.02334152,
         0.04938272, 0.        ]),
  array([0.04491726, 0.03125   , 0.1       , 0.01587302, 0.02216867,
         0.01258389, 0.04615385, 0.12721635, 0.04586625, 0.00160092,
         0.01225919, 0.00160093, 0.        , 0.        , 0.01829268,
         0.        , 0.20833333, 0.02547771, 0.02379064, 0.01965602,
     

### Execução do KNN básico

In [160]:
knn = KNN_adapter()
knn_results = cross_validation(knn, folds)
knn_results

Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4


{'precisions': [0.8, 0.85, 0.83, 0.87, 0.8979591836734694],
 'recalls': [{0.0: 0.8555555555555555, 1.0: 0.3},
  {0.0: 0.9111111111111111, 1.0: 0.3},
  {0.0: 0.9111111111111111, 1.0: 0.1},
  {0.0: 0.9555555555555556, 1.0: 0.1},
  {0.0: 0.9662921348314607, 1.0: 0.2222222222222222}],
 'train_times': [0.0, 0.015625, 0.0, 0.015625, 0.0],
 'test_times': [0.015625, 0.015625, 0.015625, 0.015625, 0.015625],
 'prototypes': [None, None, None, None, None]}

### Execução do LVQ1

In [161]:
lvq1 = LVQ1(n_prototypes = 5)
lvq1_results = cross_validation(lvq1, folds)
lvq1_results

Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4


{'precisions': [0.9, 0.9, 0.85, 0.81, 0.8877551020408163],
 'recalls': [{0.0: 0.9888888888888889, 1.0: 0.1},
  {0.0: 1.0, 1.0: 0.0},
  {0.0: 0.9333333333333333, 1.0: 0.1},
  {0.0: 0.8888888888888888, 1.0: 0.1},
  {0.0: 0.9662921348314607, 1.0: 0.1111111111111111}],
 'train_times': [0.375, 0.40625, 0.359375, 0.34375, 0.34375],
 'test_times': [0.015625, 0.015625, 0.015625, 0.015625, 0.015625],
 'prototypes': [{'samples': [array([4.38258540e-02, 1.55571242e-02, 8.26781413e-08, 2.12675696e-02,
           3.24269763e-02, 2.03169565e-02, 7.96807318e-02, 8.00888102e-02,
           1.18870377e-01, 1.63439945e-03, 2.08111147e-02, 1.63442296e-03,
           7.46208497e-07, 3.37722223e-04, 1.68350357e-02, 0.00000000e+00,
           1.96330650e-01, 6.13000264e-02, 3.31510882e-02, 3.13052256e-02,
           1.84380732e-02]),
    array([0.96926714, 0.75      , 0.96666667, 0.63492063, 0.72240964,
           0.74454342, 0.01538462, 0.34515385, 1.        , 0.25700935,
           0.74430823, 0.25700931,

### Execução do LVQ2.1

### Execução do LVQ3

## Base de Dados 2 - 