# Clasificador Bayesiano Gaussiano

In [69]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [70]:
# Load the dataset from the CSV file into a DataFrame
data = pd.read_csv("diabetes.csv")

In [71]:
# Split the dataset: 30% of the rows are held out for testing; the fixed
# random_state makes the split reproducible across runs
train_data, test_data = train_test_split(data, test_size=0.3, random_state=10)
train_data  # display the training partition

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
491,2,89,90,30,0,33.5,0.292,42,0
69,4,146,85,27,100,28.9,0.189,27,0
667,10,111,70,27,0,27.5,0.141,40,1
566,1,99,72,30,18,38.6,0.412,21,0
97,1,71,48,18,76,20.4,0.323,22,0
...,...,...,...,...,...,...,...,...,...
369,1,133,102,28,140,32.8,0.234,45,1
320,4,129,60,12,231,27.5,0.527,31,0
527,3,116,74,15,105,26.3,0.107,24,0
125,1,88,30,42,99,55.0,0.496,26,1


Etapa de aprendizaje

In [72]:
# Learn the class priors: index 0 = NE, index 1 = E (rows whose last column equals 1)
frecEnf = (train_data.iloc[:, -1] == 1).sum()  # number of training rows labelled 1
# Relative frequency of each class over the whole training set
probPriori = np.array([len(train_data) - frecEnf, frecEnf]) / len(train_data)
probPriori

array([0.66294227, 0.33705773])

In [73]:
# Learn the likelihood parameters: per-class mean and variance of every feature
nc = 2  # number of classes; labels are assumed to be the integers 0..nc-1
nvariables = train_data.shape[1]-1  # every column except the last one (the label) is a feature
# Column layout of `estadisticos`: [mean_0, var_0, mean_1, var_1, ...], one row per feature
estadisticos = np.zeros((nvariables,nc*2))
for i in range(nc): # classes
    # Feature columns of the training rows that belong to class i
    train_claseK = train_data[train_data.iloc[:,-1]==i].iloc[:,:nvariables]
    estadisticos[:,2*i] = train_claseK.mean(axis=0).to_numpy()
    # ddof=0 gives the population variance, matching np.var's default
    estadisticos[:,2*i+1] = train_claseK.var(axis=0, ddof=0).to_numpy()
# The stored parameters keep full precision: rounding them before inference
# distorts small-scale features and can turn a small variance into 0, which
# breaks the Gaussian density. Round only for display.
estadisticos.round(2)

array([[3.240000e+00, 8.510000e+00, 4.880000e+00, 1.452000e+01],
       [1.097800e+02, 7.252300e+02, 1.441000e+02, 1.007460e+03],
       [6.811000e+01, 3.369000e+02, 7.196000e+01, 4.145200e+02],
       [1.994000e+01, 2.086100e+02, 2.339000e+01, 3.102800e+02],
       [7.163000e+01, 1.070690e+04, 1.127600e+02, 2.300126e+04],
       [3.041000e+01, 5.970000e+01, 3.502000e+01, 4.955000e+01],
       [4.300000e-01, 9.000000e-02, 5.300000e-01, 1.400000e-01],
       [3.070000e+01, 1.263700e+02, 3.801000e+01, 1.292500e+02]])

In [74]:
def fnGaussiana(x,mu,s2):
    """Evaluate the univariate Gaussian density at ``x``.

    Parameters
    ----------
    x : scalar or numpy array
        Point(s) at which the density is evaluated.
    mu : scalar or numpy array
        Mean of the distribution.
    s2 : scalar or numpy array
        Variance of the distribution (not the standard deviation).

    Returns
    -------
    Density value(s); the arithmetic is element-wise, so array arguments
    are broadcast by NumPy.
    """
    exponente = -0.5 * ((x - mu) ** 2 / s2)        # -(x - mu)^2 / (2 * s2)
    normalizador = 1 / (np.sqrt(2 * np.pi * s2))   # 1 / sqrt(2 * pi * s2)
    return normalizador * np.exp(exponente)

Etapa de evaluación

In [87]:
def fnPredict(test_data,estadisticos,probPriori):
    """Classify every row of ``test_data`` with a Gaussian naive Bayes model.

    The number of features and classes is taken from the arguments
    themselves (not from notebook globals), and the posterior is
    accumulated in log space so that products of many small densities do
    not underflow to zero.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Rows to classify. The first ``estadisticos.shape[0]`` columns are
        read as features, in the same order as the rows of
        ``estadisticos``; any further columns (e.g. the label) are ignored.
    estadisticos : array-like, shape (n_features, 2 * n_classes)
        Gaussian parameters per feature: for class ``i`` column ``2*i``
        holds the mean and column ``2*i + 1`` the variance.
    probPriori : array-like, length n_classes
        Prior probability of each class.

    Returns
    -------
    numpy.ndarray of int, shape (len(test_data),)
        For each row, the index of the class with the largest posterior;
        ties go to the lowest class index.
    """
    estadisticos = np.asarray(estadisticos, dtype=float)
    probPriori = np.asarray(probPriori, dtype=float)
    n_variables = estadisticos.shape[0]
    n_clases = len(probPriori)

    # Feature matrix, shape (n_examples, n_variables)
    X = test_data.iloc[:, :n_variables].to_numpy(dtype=float)
    # De-interleave the parameter table into (n_variables, n_clases) arrays
    medias = estadisticos[:, 0:2*n_clases:2]
    varianzas = estadisticos[:, 1:2*n_clases:2]

    # A zero prior becomes -inf (that class can never win); silence the warning
    with np.errstate(divide="ignore"):
        log_priori = np.log(probPriori)

    # Broadcast to (n_examples, n_variables, n_clases) and evaluate the
    # Gaussian log-density of every feature under every class
    desvio = X[:, :, None] - medias[None, :, :]
    log_verosimilitud = -0.5 * (np.log(2*np.pi*varianzas)[None, :, :]
                                + desvio**2 / varianzas[None, :, :])

    # log posterior (up to a constant) = log prior + sum of feature log-likelihoods
    log_posteriori = log_priori[None, :] + log_verosimilitud.sum(axis=1)
    return np.argmax(log_posteriori, axis=1).astype(int)

In [91]:
# Ground-truth labels: the last column of the held-out set
real = test_data.iloc[:, -1]
# Classify every held-out row with the learned statistics and priors
prediccion = fnPredict(test_data, estadisticos, probPriori)
print(prediccion)
print(real)

[0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 1 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 1 0
 0 0 1 1 0 0 0 1 0 0 0 1 1 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 0 0 1 1 0 1 0
 1 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 0 0 0 1 1 0 1
 1 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 1 1 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0
 0 1 1 0 1 1 0 0 0]
568    0
620    0
456    0
197    1
714    0
      ..
345    0
408    1
304    0
686    0
202    0
Name: Outcome, Length: 231, dtype: int64
