In [317]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, mean_squared_error
from scipy.stats import norm
from sklearn import preprocessing
from random import random
import requests
import csv
from sklearn.naive_bayes import GaussianNB

In [318]:
#se leen datos del url
CSV_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data'
with requests.Session() as s:
    download = s.get(CSV_URL)
    decoded_content = download.content.decode('utf-8')
    cr = csv.reader(decoded_content.splitlines(), delimiter=',')
    lista = list(cr)

In [319]:
# se asigna un indice a cada uno de los renglones de la matriz de datos
index = np.array([i for i in range(len(lista))])

In [320]:
# se incluye en un Data frame tanto los datos como el indice correspondiente
df = pd.DataFrame(lista, index)
df = df.astype(float).fillna(0.0)

In [321]:
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
count,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,...,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0
mean,0.104553,0.213015,0.280656,0.065425,0.312223,0.095901,0.114208,0.105295,0.090067,0.239413,...,0.038575,0.13903,0.016976,0.269071,0.075811,0.044238,5.191515,52.172789,283.289285,0.394045
std,0.305358,1.290575,0.504143,1.395151,0.672513,0.273824,0.391441,0.401071,0.278616,0.644755,...,0.243471,0.270355,0.109394,0.815672,0.245882,0.429342,31.729449,194.89131,606.347851,0.488698
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.588,6.0,35.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.065,0.0,0.0,0.0,0.0,2.276,15.0,95.0,0.0
75%,0.0,0.0,0.42,0.0,0.38,0.0,0.0,0.0,0.0,0.16,...,0.0,0.188,0.0,0.315,0.052,0.0,3.706,43.0,266.0,1.0
max,4.54,14.28,5.1,42.81,10.0,5.88,7.27,11.11,5.26,18.18,...,4.385,9.752,4.081,32.478,6.003,19.829,1102.5,9989.0,15841.0,1.0


In [322]:
# se separan los datos entre entrenamiento y prueba al 75%
X_train, X_test, Y_train, Y_test = train_test_split(df[df.columns[0:-1]],df[df.columns[-1]], train_size=0.75)

In [323]:
index=np.array([1 if random() < 0.75 else 0 for i in range(len(df))])

In [324]:
# Separate both train and test as well as the response variable
X_train= np.array(df[df.columns[0:-1]])[index==1]
X_test=np.array(df[df.columns[0:-1]])[index==0]
Y_train=np.array(df[df.columns[-1]])[index==1]
Y_test=np.array(df[df.columns[-1]])[index==0]

In [325]:
# Normalizar no ayuda mucho pero sale igual al de sklearn. Para que las alturas del pdf signifiquen lo mismo 
scaler = preprocessing.StandardScaler().fit(X_train)
X_train=scaler.transform(X_train)
X_test=scaler.transform(X_test)

In [326]:
#Se obtienen medias y desviaciones estandar para cada uno de los casos NoSpam/Spam con los datos de training
MeanIsSpam = X_train[Y_train == 1].mean(axis = 0)
MeanNoSpam = X_train[Y_train == 0].mean(axis = 0)
StDevIsSpam = X_train[Y_train == 1].std(axis = 0)
StDevNoSpam = X_train[Y_train == 0].std(axis = 0)

In [327]:
# P(C = Spam/X1,....Xn) = log(P(C = Spam)) + Sum(log(P(xi/C= Spam)))
#Vamos a ir por partes, calculemos P(C = Spam)
PSpam = float(len(Y_train[Y_train==1])/float((len(Y_train[Y_train==0])+len(Y_train[Y_train==1]))))
PNSpam = 1 - PSpam
#Despues, calculemos Log(P(C = Spam))
LogPSpam = np.log(PSpam)
LogPSpam= np.log(PNSpam)

In [353]:
# definicion de funcion para acumular valores de comparacion entre las salidas de las probabilidades bayesaianas de Spam y no Spam
def FunDeterminacionSpam(X, MeanIsSpam, MeanNoSpam, StDevIsSpam, StDevNoSpam, LogPSpam):
    # logaritmo del complemento de la probabilidad de Spam
    LogPNSpam = np.log(1-np.exp(LogPSpam))
    comp = np.zeros((len(X_test),2))
    binar = np.zeros(len(X_test))
      
    for k in range (0, len(X)):    
        # evaluacion y acumulacion de las probabilidades bayesianas de que haya Spam por cada fila
        for i in range (0, MeanIsSpam.size):
            PSpam = norm.pdf(X[k][i], MeanIsSpam[i], StDevIsSpam[i])
            if (PSpam != 0):
                LogPSpam+=np.log(PSpam)
            else:
                LogPSpam+=-len(X)*100  
        comp[k][0]=LogPSpam
        LogPSpam=0.0
              
        # evaluacion y acumulacion de las probabilidades bayesianas de que no haya Spam por cada fila
        for l in range (0, MeanNoSpam.size):
            PNSpam = norm.pdf(X[k][l], MeanNoSpam[l], StDevNoSpam[l])
            if(PNSpam != 0):
                LogPNSpam+=np.log(PNSpam)
            else:  
                LogPNSpam+=-len(X)*100                
        comp[k][1]=LogPNSpam
        LogPNSpam=0.0
        
        # comparativo por fila en la cual se acumula un valor en caso de que la probabilidad bayesiana de que haya Spam sea mayor, a que no haya Spam  
        if(comp[k][0] > comp[k][1]):
            binar[k] = 1
        else:
            binar[k] = 0
            
    # EL resultado es el acumulado de cada comparativo                
    return binar

In [358]:
confusion_matrix(Y_test,FunDeterminacionSpam(X_test, MeanIsSpam, MeanNoSpam, StDevIsSpam, StDevNoSpam, LogPSpam))

array([[498, 185],
       [ 21, 412]])

In [359]:
from sklearn.naive_bayes import GaussianNB
NB=GaussianNB()
NB.fit(X_train,Y_train) 
pred=NB.predict(X_test) 
print(confusion_matrix(Y_test,pred))

[[500 183]
 [ 21 412]]
