# Spam Filter
### Alicia Brown

In [110]:
# Importar paquetes
from random import random

import numpy as np
import pandas as pd
from scipy.stats import norm
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [111]:
df = pd.read_csv("spambase.csv",header=None) # Read the file (it has no header row, hence header=None)

In [112]:
# The attributes have no names, so the columns are labelled with numbers
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
count,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,...,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0
mean,0.104553,0.213015,0.280656,0.065425,0.312223,0.095901,0.114208,0.105295,0.090067,0.239413,...,0.038575,0.13903,0.016976,0.269071,0.075811,0.044238,5.191515,52.172789,283.289285,0.394045
std,0.305358,1.290575,0.504143,1.395151,0.672513,0.273824,0.391441,0.401071,0.278616,0.644755,...,0.243471,0.270355,0.109394,0.815672,0.245882,0.429342,31.729449,194.89131,606.347851,0.488698
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.588,6.0,35.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.065,0.0,0.0,0.0,0.0,2.276,15.0,95.0,0.0
75%,0.0,0.0,0.42,0.0,0.38,0.0,0.0,0.0,0.0,0.16,...,0.0,0.188,0.0,0.315,0.052,0.0,3.706,43.0,266.0,1.0
max,4.54,14.28,5.1,42.81,10.0,5.88,7.27,11.11,5.26,18.18,...,4.385,9.752,4.081,32.478,6.003,19.829,1102.5,9989.0,15841.0,1.0


In [113]:
# NOTE(review): the result of this split is overwritten by the manual split in the
# following cells, which reassign X_train, X_test, Y_train and Y_test.
# random_state pins the shuffle so this split is reproducible whenever it is used.
X_train, X_test, Y_train, Y_test = train_test_split(
    df[df.columns[0:-1]], df[df.columns[-1]], train_size=0.75, random_state=0
)

In [114]:
# Each row is assigned to training (1) with probability 0.75, otherwise to test (0).
# A fixed seed keeps the split, and every result derived from it, identical
# across Restart & Run All.
split_rng = np.random.RandomState(0)
index = (split_rng.random_sample(len(df)) < 0.75).astype(int)

In [115]:
# Build the training and test samples from the 0/1 mask in `index`.
# The last column holds the label; every other column is a feature.
features = np.array(df[df.columns[0:-1]])
labels = np.array(df[df.columns[-1]])
in_train = index == 1
in_test = index == 0
X_train, X_test = features[in_train], features[in_test]
Y_train, Y_test = labels[in_train], labels[in_test]

In [116]:
# Standardizing does not help much, but with it the result matches sklearn's.
# It is done so that the pdf heights mean the same thing.
# The scaler is fitted on the training sample only and then applied to both samples.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [117]:
# Prior probabilities: fraction of training labels equal to 1 (spam) and its complement
n_train = len(Y_train)
p_spam = float(Y_train.sum()) / n_train
p_noS = 1 - p_spam

In [118]:
# Per-attribute means and standard deviations, computed separately for the
# spam (label 1) and non-spam (label 0) training rows
spam_rows = X_train[Y_train == 1]
nospam_rows = X_train[Y_train == 0]
mean_s, sd_s = spam_rows.mean(axis=0), spam_rows.std(axis=0)
mean_ns, sd_ns = nospam_rows.mean(axis=0), nospam_rows.std(axis=0)

In [119]:
# Calculo de los logaritmos de las probabilidades condicionales usando Naive Bayes 
def Bayes_Filter(x,mean_s,mean_ns,sd_s,sd_ns,p_spam,min_sd=1e-9):
    """Classify each row of ``x`` as spam (1) or not spam (0) with Gaussian Naive Bayes.

    For every sample the score of a class is the log prior plus the sum, over
    attributes, of the log of the normal density with that class's mean and
    standard deviation. The sample is labelled 1 when the spam score is
    strictly greater than the non-spam score, otherwise 0.

    Parameters
    ----------
    x : array-like, shape (n_samples, n_features)
        Samples to classify.
    mean_s, sd_s : array-like, shape (n_features,)
        Per-attribute mean and standard deviation of the spam class.
    mean_ns, sd_ns : array-like, shape (n_features,)
        Per-attribute mean and standard deviation of the non-spam class.
    p_spam : float
        Prior probability of spam; the non-spam prior is ``1 - p_spam``.
    min_sd : float, optional
        Lower bound applied to every standard deviation. An attribute that is
        constant within a class has a standard deviation of 0, which would
        otherwise make the density undefined (NaN).

    Returns
    -------
    numpy.ndarray of float, shape (n_samples,)
        1.0 for samples classified as spam, 0.0 otherwise.
    """
    x = np.asarray(x, dtype=float)
    if x.shape[0] == 0:
        return np.zeros(0)

    def _class_log_score(prior, mean, sd):
        # Log prior plus summed per-attribute log densities, one value per sample.
        mean = np.asarray(mean, dtype=float)
        sd = np.maximum(np.asarray(sd, dtype=float), min_sd)
        # Only the attributes that have class statistics take part in the score.
        log_like = norm.logpdf(x[:, :mean.shape[0]], mean, sd)
        # logpdf avoids the underflow of pdf; a density of exactly zero (log = -inf)
        # still gets a large finite penalty so one attribute cannot force -inf.
        log_like = np.where(np.isneginf(log_like), -10000.0, log_like)
        return np.log(prior) + log_like.sum(axis=1)

    # Scores are computed independently for every sample: nothing is carried
    # over from one row to the next.
    spam = _class_log_score(p_spam, mean_s, sd_s)
    nospam = _class_log_score(1 - p_spam, mean_ns, sd_ns)
    return (spam > nospam).astype(float)

In [None]:
# Test the filter: classify the test sample using the statistics estimated on the training sample
prediction = Bayes_Filter(X_test,mean_s,mean_ns,sd_s,sd_ns,p_spam)

In [None]:
# Compare the hand-written filter's predictions against the true test labels
confusion_matrix(Y_test,prediction)

In [None]:
# Results with sklearn: GaussianNB is fitted on the same training sample and
# evaluated on the same test sample, for comparison with the filter above.
# (GaussianNB is imported in the imports cell at the top of the notebook.)
NB = GaussianNB()
NB.fit(X_train, Y_train)
pred = NB.predict(X_test)
print(confusion_matrix(Y_test, pred))