In [2]:
from Datos import Datos
import numpy as np

In [7]:
from abc import ABCMeta,abstractmethod


class Clasificador:

    # Clase abstracta
    __metaclass__ = ABCMeta

    # Metodos abstractos que se implementan en casa clasificador concreto
    @abstractmethod
    # TODO: esta funcion debe ser implementada en cada clasificador concreto
    # datosTrain: matriz numpy con los datos de entrenamiento
    # atributosDiscretos: array bool con la indicatriz de los atributos nominales
    # diccionario: array de diccionarios de la estructura Datos utilizados para la codificacion de variables discretas
    def entrenamiento(self,datosTrain,atributosDiscretos,diccionario):
        pass


    @abstractmethod
    # TODO: esta funcion debe ser implementada en cada clasificador concreto
    # devuelve un numpy array con las predicciones
    def clasifica(self,datosTest,atributosDiscretos,diccionario):
        pass


    # Obtiene el numero de aciertos y errores para calcular la tasa de fallo
    # TODO: implementar
    def error(self,datos,pred):
        # Aqui se compara la prediccion (pred) con las clases reales y se calcula el error
        pass


    # Realiza una clasificacion utilizando una estrategia de particionado determinada
    # TODO: implementar esta funcion
    def validacion(self,particionado,dataset,clasificador,seed=None):
        # Creamos las particiones siguiendo la estrategia llamando a particionado.creaParticiones
        # - Para validacion cruzada: en el bucle hasta nv entrenamos el clasificador con la particion de train i
        # y obtenemos el error en la particion de test i
        # - Para validacion simple (hold-out): entrenamos el clasificador con la particion de train
        # y obtenemos el error en la particion test. Otra opci�n es repetir la validaci�n simple un n�mero especificado de veces, obteniendo en cada una un error. Finalmente se calcular�a la media.
        pass

##############################################################################

class ClasificadorNaiveBayes(Clasificador):

    # TODO: implementar
    def entrenamiento(self,datostrain,atributosDiscretos,diccionario):
        # First we calculate the priors.
        [ndata, nfeat] = datostrain.shape
        P = []
        for j in range(len(diccionario[nfeat-1])):
            cont = 0
            for i in range(ndata):
                cont = cont + (diccionario[nfeat-1].get(datostrain[i][nfeat-1]) == j)
            P.append(cont/ndata)
            
        # Now we calculate the conditional probabilities.
        CP = np.zeros((len(diccionario[nfeat-1]), nfeat-1, 3))

        for i in range (len(diccionario[nfeat-1])):
            for j in range (nfeat-1):
                cont = 0
                for l in range(len(diccionario[j])):
                    for k in range(ndata):
                        if(diccionario[nfeat-1].get(dataset.datos[k][nfeat-1]) == i):
                            cont = cont + (dataset.diccionario[j].get(dataset.datos[k][j]) == l)
                    CP[i][j][l] = cont/ndata
            


    # TODO: implementar
    def clasifica(self,datostest,atributosDiscretos,diccionario):
        Pred = []
        [ndata, nfeat] = datostest.shape
        for i in range (ndata):
            auxpred = []
            for k in range(len(diccionario[nfeat - 1])):
                aux = P[k]
                for j in range (nfeat-1):
                    aux = aux * CP[j][diccionario[j].get(datostest[i][j])][k]
                auxpred.append(aux)
            Pred.append(auxpred.index(max(auxpred)))

# CASO DISCRETO

In [23]:
dataset = Datos("../ConjuntosDatos/tic-tac-toe.data")

In [24]:
[ndata, nfeat] = dataset.datos.shape
print(ndata,nfeat)
print(dataset.nominalAtributos)

958 10
[True, True, True, True, True, True, True, True, True, True]


In [25]:
print ((dataset.diccionario[nfeat-1]))

{'negative': 0, 'positive': 1}


In [11]:
P = []
for j in range(len(dataset.diccionario[nfeat-1])):
    cont = 0
    for i in range(ndata):
        cont = cont + (dataset.diccionario[nfeat-1].get(dataset.datos[i][nfeat-1]) == j)
    P.append(cont/ndata)
    
print (P)

[0.3465553235908142, 0.6534446764091858]


In [36]:
nvals = 3
CP = np.zeros((nfeat-1, nvals, len(dataset.diccionario[nfeat-1])))

for i in range (len(dataset.diccionario[nfeat-1])):
    for j in range (nfeat-1):
        cont = 0
        for l in range(len(dataset.diccionario[j])):
            for k in range(ndata):
                if(dataset.diccionario[nfeat-1].get(dataset.datos[k][nfeat-1]) == i):
                    cont = cont + (dataset.diccionario[j].get(dataset.datos[k][j]) == l)
            CP[j][l][i] = cont/(ndata*P[i])
print (CP)

[[[0.18975904 0.22683706]
  [0.62951807 0.52875399]
  [1.         1.        ]]

 [[0.23493976 0.27476038]
  [0.53915663 0.64057508]
  [1.         1.        ]]

 [[0.18975904 0.22683706]
  [0.62951807 0.52875399]
  [1.         1.        ]]

 [[0.23493976 0.27476038]
  [0.53915663 0.64057508]
  [1.         1.        ]]

 [[0.14457831 0.17891374]
  [0.72289157 0.41533546]
  [1.         1.        ]]

 [[0.23493976 0.27476038]
  [0.53915663 0.64057508]
  [1.         1.        ]]

 [[0.18975904 0.22683706]
  [0.62951807 0.52875399]
  [1.         1.        ]]

 [[0.23493976 0.27476038]
  [0.53915663 0.64057508]
  [1.         1.        ]]

 [[0.18975904 0.22683706]
  [0.62951807 0.52875399]
  [1.         1.        ]]]


In [37]:
Pred = []
for i in range (ndata):
    auxpred = []
    for k in range(len(dataset.diccionario[nfeat - 1])):
        aux = P[k]
        for j in range (nfeat-1):
            aux = aux * CP[j][dataset.diccionario[j].get(dataset.datos[i][j])][k]
        auxpred.append(aux)
    Pred.append(auxpred.index(max(auxpred)))
print(Pred)
        
            

[1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 

# CASO CONTINUO

In [35]:
import statistics

In [28]:
dataset = Datos("../ConjuntosDatos/german.data")

In [29]:
[ndata, nfeat] = dataset.datos.shape
print(ndata,nfeat)
print(dataset.nominalAtributos)
print ((dataset.diccionario[nfeat-1]))

1000 21
[True, False, True, True, False, True, True, False, True, True, False, True, False, True, True, False, True, False, True, True, False]
{}


In [44]:
m1 = []
m2 = []
M = np.zeros((nfeat - 1, 2))
V = np.zeros((nfeat - 1, 2))
for j in range (nfeat - 1):
    if (dataset.nominalAtributos[j] == False):
        # We calculate the mean coditioned to each possible class
        for i in range(ndata):
            if (dataset.datos[i][nfeat-1] == 1):
                m1.append(dataset.datos[i][j])
            if(dataset.datos[i][nfeat-1] == 2):
                m2.append(dataset.datos[i][j])
        M[j][0] = np.mean(m1)
        V[j][0] = np.var(m1)
        M[j][1] = np.mean(m2)
        V[j][1] = np.var(m2)
        
        # We calculate the variance conditioned to each possible class
        

In [45]:
print(M)

[[   0.            0.        ]
 [  19.20714286   24.86      ]
 [   0.            0.        ]
 [   0.            0.        ]
 [1502.33214286 1981.49333333]
 [   0.            0.        ]
 [   0.            0.        ]
 [1002.52809524 1322.02777778]
 [   0.            0.        ]
 [   0.            0.        ]
 [ 752.60678571  992.23333333]
 [   0.            0.        ]
 [ 609.33028571  800.57933333]
 [   0.            0.        ]
 [   0.            0.        ]
 [ 508.01261905  667.37722222]
 [   0.            0.        ]
 [ 435.6044898   572.20238095]
 [   0.            0.        ]
 [   0.            0.        ]]


In [46]:
print(V)

[[0.00000000e+00 0.00000000e+00]
 [1.22581378e+02 1.75840400e+02]
 [0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00]
 [5.07913627e+06 1.00586731e+07]
 [0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00]
 [3.88569944e+06 7.57557208e+06]
 [0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00]
 [3.10165687e+06 6.00797248e+06]
 [0.00000000e+00 0.00000000e+00]
 [2.56346399e+06 4.95332811e+06]
 [0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00]
 [2.18754640e+06 4.21648749e+06]
 [0.00000000e+00 0.00000000e+00]
 [1.90649741e+06 3.66848166e+06]
 [0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00]]
