# Código Learning Vector Quantization

## Imports

In [1]:
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import numpy.linalg as LA
import random

## Funções de Suporte

In [2]:
def sample_prototypes(dataset, n_prototypes = 2):
    '''Choose a number of samples from the dataset to use as prototypes. Will keep at least one sample of every class.'''
    classes = {x[-1]: [a for a in dataset if a[-1] == x[-1]] for x in dataset}
        
    if (n_prototypes < len(classes)):
        print("There aren't prototypes enough for all classes")
    elif (n_prototypes > len(dataset)):
        print("There aren't samples enough for this amount of prototypes")
    else:
        prototypes = [random.choice(classes[x]) for x in classes]
        if (n_prototypes > len(classes)):
            still_not_chosen = [data for data in dataset if np.not_equal(prototypes, data).any(1).all()]
            prototypes.extend(random.sample(still_not_chosen, n_prototypes - len(classes)))
        return prototypes

sample_prototypes(np.array([[1,0,0], [1,1,1], [2,0,2], [2,1,0]]), 4)

[array([1, 0, 0]), array([1, 1, 1]), array([2, 0, 2]), array([2, 1, 0])]

In [3]:
def calculate_test_results(real, predicted):
    matched = [a == b for (a, b) in zip(real, predicted)]
    recalls = {i: list(zip(real, matched)).count((i, True)) / real.count(i) for i in real}
    return {
        "precision": matched.count(True) / len(real),
        "recalls": recalls
    }

## Adaptador do KNN

In [4]:
class KNN_adapter:
    '''An adapter class to use a basic KNN calling the same functions of the LVQ implementations.'''
    def __init__(self, k = 1):
        self.knn = KNeighborsClassifier(n_neighbors = k)
    
    def train(self, training):
        samples = [sample[0:-1] for sample in training]
        labels = [sample[-1] for sample in training]
        self.knn.fit(samples, labels)
    
    def predict(self, sample):
        return self.knn.predict(sample)
    
    def test(self, testing):
        samples = [sample[0:-1] for sample in testing]
        labels = [sample[-1] for sample in testing]
        predicted_labels = self.predict(samples)
        result = calculate_test_results(labels, predicted_labels)
        return result
    
    def get_prototypes(self):
        return None

## Código base das classes do LVQ

In [5]:
class LVQ:
    def __init__(self, k = 1, n_prototypes = 2, alpha_0 = 0.8):
        self.knn = KNeighborsClassifier(n_neighbors = k)
        self.n_prototypes = n_prototypes
        self.alpha_0 = alpha_0
    
    def predict(self, sample):
        return self.knn.predict(sample)
    
    def test(self, testing):
        samples = [sample[0:-1] for sample in testing]
        labels = [sample[-1] for sample in testing]
        predicted_labels = self.predict(samples)
        result = calculate_test_results(labels, predicted_labels)
        return result
    
    def get_prototypes(self):
        return {'samples': self.samples, 'labels': self.labels}

## Código do LVQ1

In [6]:
class LVQ1(LVQ):
    def adjust_prototype(self, prototype_index, alpha, sample):
        prototype = self.samples[prototype_index]
        if (self.labels[prototype_index] == sample[-1]):
            self.samples[prototype_index] = prototype + (sample[0:-1] - prototype) * alpha
        else:
            self.samples[prototype_index] = prototype - (sample[0:-1] - prototype) * alpha
    
    def train(self, training):
        prototypes = sample_prototypes(training, self.n_prototypes)
        self.samples = [sample[0:-1] for sample in prototypes]
        self.labels = [sample[-1] for sample in prototypes]
        
        alpha_t = self.alpha_0
        for sample in training:
            self.knn.fit(self.samples, self.labels)
            closest_index = self.knn.kneighbors([sample[0:-1]], n_neighbors = 1, return_distance = False)[0][0]
            self.adjust_prototype(closest_index, alpha_t, sample)
            alpha_t *= self.alpha_0
        
        self.knn.fit(self.samples, self.labels)

In [7]:
lvq1 = LVQ1(n_prototypes=3)
lvq1.train(np.array([[1,0,0], [1,1,1], [2,0,2], [2,1,0]]))
lvq1.test([[0, 1, 1], [1, 0.2, 0]])
#lvq1.get_prototypes()

{'precision': 0.5, 'recalls': {1: 1.0, 0: 0.0}}

## Código do LVQ2.1

In [8]:
class LVQ2(LVQ):
    def __init__(self, k = 1, n_prototypes = 2, alpha_0 = 0.8, w = 0.5):
        LVQ.__init__(self, k, n_prototypes, alpha_0)
        self.s = (1 - w) / (1 + w)
    
    def window_rule(self, sample, prototypes):
        distances = [LA.norm(prototype - sample[0:-1]) for prototype in prototypes]
        if 0 not in distances:
            min_dist = min(distances[0] / distances[1], distances[1] / distances[0])
            return min_dist > self.s
        return False
    
    def adjust_prototype(self, closest_indexes, alpha, sample):
        prototypes = [self.samples[index] for index in closest_indexes]
        if (self.window_rule(sample, prototypes)):
            if sample[-1] == self.labels[closest_indexes[0]] and sample[-1] != self.labels[closest_indexes[1]]:
                self.samples[closest_indexes[0]] = prototypes[0] + (sample[0:-1] - prototypes[0]) * alpha
                self.samples[closest_indexes[1]] = prototypes[1] - (sample[0:-1] - prototypes[1]) * alpha
            elif sample[-1] != self.labels[closest_indexes[0]] and sample[-1] == self.labels[closest_indexes[1]]:
                self.samples[closest_indexes[0]] = prototypes[0] - (sample[0:-1] - prototypes[0]) * alpha
                self.samples[closest_indexes[1]] = prototypes[1] + (sample[0:-1] - prototypes[1]) * alpha

    def train(self, training, prototypes = None):
        if prototypes == None:
            prototypes = sample_prototypes(training, self.n_prototypes)
            self.samples = [sample[0:-1] for sample in prototypes]
            self.labels = [sample[-1] for sample in prototypes]
        else:
            self.samples = prototypes['samples']
            self.labels = prototypes['labels']
        
        alpha_t = self.alpha_0
        for sample in training:
            self.knn.fit(self.samples, self.labels)
            closest_indexes = self.knn.kneighbors([sample[0:-1]], n_neighbors = 2, return_distance = False)[0]
            self.adjust_prototype(closest_indexes, alpha_t, sample)
            alpha_t *= self.alpha_0
        
        self.knn.fit(self.samples, self.labels)

In [9]:
lvq2 = LVQ2(n_prototypes=3)
lvq2.train(np.array([[1,0,0], [1,1,1], [2,0,2], [2,1,0]]))
#lvq2.test([[0, 1, 1], [1, 0.2, 0]])
lvq2.get_prototypes()

{'samples': [array([2, 1]), array([1, 1]), array([2, 0])], 'labels': [0, 1, 2]}

## Código do LVQ3

In [10]:
class LVQ3(LVQ2):
    def __init__(self, k = 1, n_prototypes = 2, alpha_0 = 0.8, w = 0.5, e = 0.5):
        LVQ2.__init__(self, k, n_prototypes, alpha_0, w)
        self.e = e
    
    def adjust_prototype(self, closest_indexes, alpha, sample):
        prototypes = [self.samples[index] for index in closest_indexes]
        if (self.window_rule(sample, prototypes)):
            if sample[-1] == self.labels[closest_indexes[0]] and sample[-1] != self.labels[closest_indexes[1]]:
                self.samples[closest_indexes[0]] = prototypes[0] + (sample[0:-1] - prototypes[0]) * alpha
                self.samples[closest_indexes[1]] = prototypes[1] - (sample[0:-1] - prototypes[1]) * alpha
            elif sample[-1] != self.labels[closest_indexes[0]] and sample[-1] == self.labels[closest_indexes[1]]:
                self.samples[closest_indexes[0]] = prototypes[0] - (sample[0:-1] - prototypes[0]) * alpha
                self.samples[closest_indexes[1]] = prototypes[1] + (sample[0:-1] - prototypes[1]) * alpha
            elif sample[-1] == self.labels[closest_indexes[0]]:
                self.samples[closest_indexes[0]] = prototypes[0] + (sample[0:-1] - prototypes[0]) * alpha * self.e
                self.samples[closest_indexes[1]] = prototypes[1] + (sample[0:-1] - prototypes[1]) * alpha * self.e

In [11]:
lvq3 = LVQ3(n_prototypes=3)
lvq3.train(np.array([[1,0,0], [1,1,1], [2,0,2], [2,1,0]]))
#lvq3.test([[0, 1, 1], [1, 0.2, 0]])
lvq3.get_prototypes()

{'samples': [array([1, 0]), array([1, 1]), array([2, 0])], 'labels': [0, 1, 2]}

# Avaliações

## Imports

In [12]:
%matplotlib inline

from scipy.io import arff
import pandas as pd
from time import process_time

import matplotlib
import matplotlib.pyplot as plt

import math

## Funções para o experimento

In [13]:
def normalize(raw):
    '''Normalizes an dataset so all of its attributes have the same weight.'''
    attribute_mins = np.min(raw, axis=0)
    attribute_maxs = np.max(raw, axis=0)
    return (raw - attribute_mins) / attribute_maxs

In [14]:
def k_fold(sep_samples, k = 5):
    '''Splits the samples in k groups with similar amounts of samples and distributions of every class.'''
    folds = [[] for _ in range(k)]
    for i in sep_samples:
        split_class = np.array_split(sep_samples[i], k)
        [a.extend(b) for (a, b) in zip(folds, split_class)]
    return folds

In [15]:
def cross_validation(machine, folds):
    '''Evaluates an algorithm through cross validation.'''
    random.seed(42)

    precisions = []
    recalls = []
    train_times = []
    test_times = []
    prototypes = []
    
    for i in range(len(folds)):
        print("Testing on fold " + str(i))
        train = [s for j, fold in enumerate(folds) if i != j for s in fold]
        test = folds[i]

        train_times.append(process_time())
        machine.train(train)
        train_times[-1] = process_time() - train_times[-1]

        test_times.append(process_time())
        test_results = machine.test(test)
        test_times[-1] = process_time() - test_times[-1]

        precisions.append(test_results['precision'])
        recalls.append(test_results['recalls'])
        prototypes.append(machine.get_prototypes())
    return {
        'precisions': precisions,
        'recalls': recalls,
        'train_times': train_times,
        'test_times': test_times,
        'prototypes': prototypes
    }

In [16]:
def cross_validation_lvq23(machine, folds, lvq1_prototypes):
    '''Evaluates an algorithm through cross validation.'''
    random.seed(42)

    precisions = []
    recalls = []
    train_times = []
    test_times = []
    prototypes = []
    
    for i in range(len(folds)):
        print("Testing on fold " + str(i))
        train = [s for j, fold in enumerate(folds) if i != j for s in fold]
        test = folds[i]

        train_times.append(process_time())
        machine.train(train, lvq1_prototypes[i])
        train_times[-1] = process_time() - train_times[-1]

        test_times.append(process_time())
        test_results = machine.test(test)
        test_times[-1] = process_time() - test_times[-1]

        precisions.append(test_results['precision'])
        recalls.append(test_results['recalls'])
        prototypes.append(machine.get_prototypes())
    return {
        'precisions': precisions,
        'recalls': recalls,
        'train_times': train_times,
        'test_times': test_times,
        'prototypes': prototypes
    }

## Base de Dados 1 - KC1

### Preparação

In [17]:
data = arff.loadarff('Datasets/kc1.arff')
dataFrame = pd.DataFrame(data[0])
dataFrame.defects = [1 if i == b'true' else 0 for i in dataFrame.defects]
rawData = dataFrame.values
rawData

array([[ 1.1,  1.4,  1.4, ...,  1.2,  1.4,  0. ],
       [ 1. ,  1. ,  1. , ...,  1. ,  1. ,  1. ],
       [83. , 11. ,  1. , ..., 64. , 21. ,  1. ],
       ...,
       [ 2. ,  1. ,  1. , ...,  1. ,  1. ,  0. ],
       [13. ,  1. ,  1. , ...,  8. ,  1. ,  0. ],
       [11. ,  2. ,  1. , ...,  9. ,  3. ,  0. ]])

In [18]:
# Normalizing the dataset
normalizedData = normalize(rawData)
normalizedData

array([[3.47222222e-04, 8.88888889e-03, 1.53846154e-02, ...,
        2.80373832e-03, 4.49438202e-03, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        2.33644860e-03, 0.00000000e+00, 1.00000000e+00],
       [2.84722222e-01, 2.22222222e-01, 0.00000000e+00, ...,
        1.49532710e-01, 2.24719101e-01, 1.00000000e+00],
       ...,
       [3.47222222e-03, 0.00000000e+00, 0.00000000e+00, ...,
        2.33644860e-03, 0.00000000e+00, 0.00000000e+00],
       [4.16666667e-02, 0.00000000e+00, 0.00000000e+00, ...,
        1.86915888e-02, 0.00000000e+00, 0.00000000e+00],
       [3.47222222e-02, 2.22222222e-02, 0.00000000e+00, ...,
        2.10280374e-02, 2.24719101e-02, 0.00000000e+00]])

In [19]:
# Separating samples by class
classes = {i[-1]: normalizedData[np.where(rawData[:,-1] == i[-1])] for i in normalizedData}
classes

{0.0: array([[0.00034722, 0.00888889, 0.01538462, ..., 0.00280374, 0.00449438,
         0.        ],
        [0.01388889, 0.        , 0.        , ..., 0.00934579, 0.        ,
         0.        ],
        [0.00694444, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.00347222, 0.        , 0.        , ..., 0.00233645, 0.        ,
         0.        ],
        [0.04166667, 0.        , 0.        , ..., 0.01869159, 0.        ,
         0.        ],
        [0.03472222, 0.02222222, 0.        , ..., 0.02102804, 0.02247191,
         0.        ]]),
 1.0: array([[0.        , 0.        , 0.        , ..., 0.00233645, 0.        ,
         1.        ],
        [0.28472222, 0.22222222, 0.        , ..., 0.14953271, 0.2247191 ,
         1.        ],
        [0.15625   , 0.15555556, 0.19230769, ..., 0.12149533, 0.15730337,
         1.        ],
        ...,
        [0.11805556, 0.08888889, 0.15384615, ..., 0.07943925, 0.08988764,
         1.        ],
  

In [20]:
# Separating the folds
folds = k_fold(classes)
folds

[[array([3.47222222e-04, 8.88888889e-03, 1.53846154e-02, 8.88888889e-03,
         1.17540687e-03, 1.64165873e-04, 6.50000000e-01, 2.41860465e-02,
         6.73365793e-03, 4.00241980e-06, 4.92424242e-01, 7.20435542e-05,
         7.63358779e-03, 4.54545455e-02, 3.44827586e-02, 1.66666667e-01,
         3.24324324e-02, 1.00000000e-02, 1.76991150e-03, 2.80373832e-03,
         4.49438202e-03, 0.00000000e+00]),
  array([0.01388889, 0.        , 0.        , 0.        , 0.00994575,
         0.00389957, 0.19      , 0.04967442, 0.05998135, 0.00025354,
         0.00378788, 0.00025326, 0.01145038, 0.        , 0.        ,
         0.        , 0.10810811, 0.025     , 0.01032448, 0.00934579,
         0.        , 0.        ]),
  array([0.00694444, 0.        , 0.        , 0.        , 0.00090416,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.00381679, 0.        , 0.        ,
         0.        , 0.02702703, 0.        , 0.00147493, 0.        ,
     

### Execução do KNN básico

In [21]:
knn = KNN_adapter()
knn_results = cross_validation(knn, folds)
knn_results

Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4


{'precisions': [0.7825059101654847,
  0.7890995260663507,
  0.7535545023696683,
  0.8194774346793349,
  0.7553444180522565],
 'recalls': [{0.0: 0.8543417366946778, 1.0: 0.3939393939393939},
  {0.0: 0.865546218487395, 1.0: 0.36923076923076925},
  {0.0: 0.8235294117647058, 1.0: 0.36923076923076925},
  {0.0: 0.9185393258426966, 1.0: 0.27692307692307694},
  {0.0: 0.8286516853932584, 1.0: 0.35384615384615387}],
 'train_times': [0.03125, 0.0, 0.015625, 0.0, 0.015625],
 'test_times': [0.21875, 0.28125, 0.25, 0.1875, 0.203125],
 'prototypes': [None, None, None, None, None]}

### Execução do LVQ1

In [22]:
lvq1 = LVQ1(n_prototypes = 20)
lvq1_results = cross_validation(lvq1, folds)
lvq1_results

Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4


{'precisions': [0.806146572104019,
  0.8293838862559242,
  0.8199052132701422,
  0.7672209026128266,
  0.7339667458432304],
 'recalls': [{0.0: 0.9243697478991597, 1.0: 0.16666666666666666},
  {0.0: 0.9411764705882353, 1.0: 0.2153846153846154},
  {0.0: 0.907563025210084, 1.0: 0.3384615384615385},
  {0.0: 0.8370786516853933, 1.0: 0.38461538461538464},
  {0.0: 0.8398876404494382, 1.0: 0.15384615384615385}],
 'train_times': [3.0625, 3.03125, 2.4375, 2.703125, 2.515625],
 'test_times': [0.1875, 0.21875, 0.265625, 0.3125, 0.1875],
 'prototypes': [{'samples': [array([1.22328607e-29, 2.04324716e-51, 0.00000000e+00, 2.04324716e-51,
           4.95516813e-52, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
           0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
           2.09176181e-51, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
           0.00000000e+00, 4.56701330e-51, 0.00000000e+00, 1.28047102e-51,
           2.06620499e-51]),
    array([ 3.81944444e-02,  2.22222222e

### Execução do LVQ2.1

In [23]:
lvq2 = LVQ2(n_prototypes = 20)
lvq2_results = cross_validation_lvq23(lvq2, folds, lvq1_results['prototypes'])
lvq2_results

Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4


{'precisions': [0.8250591016548463,
  0.8293838862559242,
  0.8270142180094787,
  0.7648456057007126,
  0.7292161520190024],
 'recalls': [{0.0: 0.9663865546218487, 1.0: 0.06060606060606061},
  {0.0: 0.9411764705882353, 1.0: 0.2153846153846154},
  {0.0: 0.9131652661064426, 1.0: 0.35384615384615387},
  {0.0: 0.8370786516853933, 1.0: 0.36923076923076925},
  {0.0: 0.8370786516853933, 1.0: 0.13846153846153847}],
 'train_times': [2.0, 1.40625, 1.328125, 1.515625, 1.546875],
 'test_times': [0.25, 0.203125, 0.171875, 0.171875, 0.171875],
 'prototypes': [{'samples': [array([1.22328607e-29, 2.04324716e-51, 0.00000000e+00, 2.04324716e-51,
           4.95516813e-52, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
           0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
           2.09176181e-51, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
           0.00000000e+00, 4.56701330e-51, 0.00000000e+00, 1.28047102e-51,
           2.06620499e-51]),
    array([ 3.82905319e-02,  2.2427

### Execução do LVQ3

In [24]:
lvq3 = LVQ3(n_prototypes = 20)
lvq3_results = cross_validation_lvq23(lvq3, folds, lvq1_results['prototypes'])
lvq3_results

Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4


{'precisions': [0.8416075650118203,
  0.8293838862559242,
  0.8246445497630331,
  0.7695961995249406,
  0.7292161520190024],
 'recalls': [{0.0: 0.988795518207283, 1.0: 0.045454545454545456},
  {0.0: 0.9411764705882353, 1.0: 0.2153846153846154},
  {0.0: 0.9103641456582633, 1.0: 0.35384615384615387},
  {0.0: 0.8426966292134831, 1.0: 0.36923076923076925},
  {0.0: 0.848314606741573, 1.0: 0.07692307692307693}],
 'train_times': [1.734375, 1.46875, 1.5, 1.328125, 1.3125],
 'test_times': [0.171875, 0.171875, 0.171875, 0.171875, 0.1875],
 'prototypes': [{'samples': [array([1.79324471e-29, 3.06487074e-51, 0.00000000e+00, 3.06487074e-51,
           7.43275220e-52, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
           0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
           3.13764272e-51, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
           0.00000000e+00, 6.85051994e-51, 0.00000000e+00, 1.92070653e-51,
           3.09930749e-51]),
    array([ 3.83877703e-02,  2.26344

## Base de Dados 2 - 

### Preparação

### Execução do KNN básico

### Execução do LVQ1

### Execução do LVQ2.1

### Execução do LVQ3