# Código One Class Classification

## Imports

In [38]:
from sklearn.neighbors import NearestNeighbors
import numpy as np
import numpy.linalg as LA
from functools import reduce

## Funções de Suporte

## K-distance

Este algoritmo é o que foi proposto na questão da prova correspondente a esta lista.
Para classificar uma amostra, são calculados os k pontos (amostras do treinamento) mais próximos e em seguida, para cada um deles é calculada a distância do k-ésimo ponto mais próximo dele. Para pertencer à mesma classe dos pontos, a distância entre a amostra e cada um deles deve ser menor ou igual à distância do k-ésimo ponto mais próximo de cada na maioria dos casos.

In [39]:
class KDistance:
    def __init__(self, k = 3, label = 0, positive = False):
        self.k = k
        self.nn = NearestNeighbors()
        self.label = label
        self.positive = positive
    
    def train(self, training):
        self.samples = training
        self.nn.fit(self.samples)
        self.distances = self.nn.kneighbors(n_neighbors=self.k)[0][:,-1]
        #print(self.distances)
    
    def predict(self, samples):
        def count_true(array):
            unique, counts = np.unique(array, return_counts=True)
            return counts[np.where(unique == True)][0] if True in unique else 0
            
        distances, closest = self.nn.kneighbors(X = samples, n_neighbors=self.k)
        distances_closest = self.distances[closest]
        #print(distances)
        #print(distances_closest)
        #print(np.less_equal(distances, distances_closest))
        votes = np.less_equal(distances, distances_closest)
        
        return np.array([(count_true(i) << 1) >= self.k for i in votes])
        #return np.less_equal(distances, distances_closest).all(axis=1)
    
    def test(self, testing):
        samples = testing[:, :-1]
        labels = (testing[:, -1] == self.label) == self.positive
        predicted_labels = self.predict(samples) == self.positive
        result = KDistance.calculate_test_results(labels, predicted_labels)
        return result
    
    @staticmethod
    def calculate_test_results(labels, predicted):
        #print(labels)
        #print(predicted)
        
        unique, counts = np.unique(np.logical_and(labels, predicted), return_counts=True)
        true_pos = counts[np.where(unique == True)][0] if True in unique else 0
        #print(true_pos)
        
        unique, counts = np.unique(np.logical_and(np.logical_not(labels), predicted), return_counts=True)
        false_pos = counts[np.where(unique == True)][0] if True in unique else 0
        #print(false_pos)
        
        unique, counts = np.unique(np.logical_and(labels, np.logical_not(predicted)), return_counts=True)
        false_neg = counts[np.where(unique == True)][0] if True in unique else 0
        #print(false_neg)
        
        precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) > 0 else 0
        #print(precision)
        recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) > 0 else 0
        #print(recall)
        
        return {
            "true_pos": true_pos / len(labels),
            "false_pos": false_pos / len(labels),
            "f1": 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        }

In [40]:
kdist = KDistance()
kdist.train(np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]))
#print(kdist.predict(np.array([[0, 0], [0, 5]])))
kdist.test(np.array([[0, 0, 0], [0, 3, 1], [0, 4, 1], [0, 5, 1]]))

{'true_pos': 0.25, 'false_pos': 0.0, 'f1': 0.5}

# Avaliações

## Imports

In [41]:
%matplotlib inline

from scipy.io import arff
import pandas as pd
from time import process_time

import matplotlib
import matplotlib.pyplot as plt
from statistics import mean

## Funções para o experimento

In [42]:
def k_fold(sepSamples, k):
    '''Splits the samples in k groups with similar amounts of samples and distributions of every class.'''
    folds = [[None, None] for _ in range(k)]
    splitClass = [None, None]
    splitClass[0] = np.array_split(sepSamples[0], k)
    splitClass[1] = np.array_split(sepSamples[1], k)
    for a in range(k):
        folds[a][0] = splitClass[0][a]
        folds[a][1] = splitClass[1][a]
    return folds

In [43]:
def cross_validation(machine, folds):
    '''Evaluates a machine through cross-validation.'''
    true_pos = []
    false_pos = []
    f1 = []
    train_times = []
    test_times = []
    for i in range(len(folds)):
        print("Testing on fold " + str(i))
        train_folds = [fold[0] for f, fold in enumerate(folds) if i != f]
        train = np.concatenate(train_folds)[:,:-1]
        test = np.concatenate((folds[i][0], folds[i][1]))
        
        train_times.append(process_time())
        machine.train(train)
        train_times[-1] = process_time() - train_times[-1]

        test_times.append(process_time())
        test_results = machine.test(test)
        test_times[-1] = process_time() - test_times[-1]

        true_pos.append(test_results['true_pos'])
        false_pos.append(test_results['false_pos'])
        f1.append(test_results['f1'])
    return {
        'true_pos': true_pos,
        'false_pos': false_pos,
        'f1': f1,
        'train_times': train_times,
        'test_times': test_times
    }

In [44]:
def experiment(machine, folds, kVariations = [1, 2, 3, 5, 7, 9, 11, 13, 15]):
    '''The code which will run the experiment.'''
    result = {}
    for i in kVariations:
        print('Parameter k = ' + str(i))
        machine.k = i
        result[i] = cross_validation(machine, folds)
    return result

## Base de Dados 1 - CM1

### Preparação

In [45]:
data = arff.loadarff('Datasets/cm1.arff')
dataFrame = pd.DataFrame(data[0])
dataFrame.defects = [1 if i == b'true' else 0 for i in dataFrame.defects]
rawData = dataFrame.values
#rawData

In [46]:
# Separating samples by class
classes = [None, None]
classes[0] = rawData[np.where(rawData[:,-1] == 0)]
classes[1] = rawData[np.where(rawData[:,-1] == 1)]
#classes

In [47]:
# Separating the folds
folds = k_fold(classes, k = 5)
#folds

### Execução

In [48]:
kdist = KDistance()
cm1_results = experiment(kdist, folds)
cm1_results

Parameter k = 1
Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4
Parameter k = 2
Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4
Parameter k = 3
Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4
Parameter k = 5
Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4
Parameter k = 7
Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4
Parameter k = 9
Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4
Parameter k = 11
Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4
Parameter k = 13
Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4
Parameter k = 15
Testing on fold 0
Testing on fold 1
Testing on fold 2
Testing on fold 3
Testing on fold 4


{1: {'true_pos': [0.06, 0.06, 0.04, 0.04, 0.02040816326530612],
  'false_pos': [0.32, 0.55, 0.35, 0.4, 0.336734693877551],
  'f1': [0.25,
   0.16901408450704225,
   0.163265306122449,
   0.14814814814814814,
   0.0909090909090909],
  'train_times': [0.015625, 0.0, 0.0, 0.0, 0.0],
  'test_times': [0.0, 0.015625, 0.0, 0.015625, 0.0]},
 2: {'true_pos': [0.02, 0.03, 0.0, 0.01, 0.0],
  'false_pos': [0.18, 0.28, 0.1, 0.19, 0.20408163265306123],
  'f1': [0.13333333333333333, 0.14634146341463414, 0, 0.06666666666666667, 0],
  'train_times': [0.0, 0.0, 0.0, 0.0, 0.0],
  'test_times': [0.015625, 0.0, 0.015625, 0.0, 0.0]},
 3: {'true_pos': [0.03, 0.04, 0.01, 0.01, 0.0],
  'false_pos': [0.25, 0.37, 0.2, 0.32, 0.2653061224489796],
  'f1': [0.15789473684210525,
   0.15686274509803924,
   0.06451612903225806,
   0.04651162790697675,
   0],
  'train_times': [0.0, 0.0, 0.0, 0.0, 0.0],
  'test_times': [0.0, 0.0, 0.015625, 0.0, 0.0]},
 5: {'true_pos': [0.03, 0.04, 0.01, 0.01, 0.02040816326530612],
  'fal

### Resultados

## Base de Dados 2 - JM1

### Preparação

In [49]:
data = arff.loadarff('Datasets/jm1.arff')
dataFrame = pd.DataFrame(data[0])
dataFrame.defects = [1 if i == b'true' else 0 for i in dataFrame.defects]
rawData = dataFrame.values
#rawData

In [50]:
# Remove rows with nans on them
def remove_nans(array):
    return array[~np.any(np.isnan(array), axis=1)]
rawData = remove_nans(rawData)

In [51]:
# Separating samples by class
classes = [None, None]
classes[0] = rawData[np.where(rawData[:,-1] == 0)]
classes[1] = rawData[np.where(rawData[:,-1] == 1)]
#classes

In [52]:
# Separating the folds
folds = k_fold(classes, k)
#folds

NameError: name 'k' is not defined

### Execução

In [None]:
kdist = KDistance()
jm1_results = experiment(kdist, folds)
jm1_results

### Resultados