## SPAM Filter

* Procedure
    * Divide data in train and test sets
    * Keep test data in a safe!
    * Transform test data (normalize, discretize, etc)
    * Train model
    * Transform test data with the parameters found in step 3
    * Test model with test data
    * Evaluate results

In [None]:
# Edwin Peter
# epeter@itam.mx

In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from scipy.stats import norm
from sklearn import preprocessing
from random import random

Perhaps easiest way to read in data is using Pandas. 
Pandas is a library for manipulating data. Similar to R's dataframes and very useful albeit in some cases confusing to combine with other libraries:

In [2]:
df = pd.read_csv("data/spambase/spambase.data",header=None)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.00,0.64,0.64,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.000,0.778,0.000,0.000,3.756,61,278,1
1,0.21,0.28,0.50,0.0,0.14,0.28,0.21,0.07,0.00,0.94,...,0.000,0.132,0.000,0.372,0.180,0.048,5.114,101,1028,1
2,0.06,0.00,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.010,0.143,0.000,0.276,0.184,0.010,9.821,485,2259,1
3,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.137,0.000,0.137,0.000,0.000,3.537,40,191,1
4,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.135,0.000,0.135,0.000,0.000,3.537,40,191,1
5,0.00,0.00,0.00,0.0,1.85,0.00,0.00,1.85,0.00,0.00,...,0.000,0.223,0.000,0.000,0.000,0.000,3.000,15,54,1
6,0.00,0.00,0.00,0.0,1.92,0.00,0.00,0.00,0.00,0.64,...,0.000,0.054,0.000,0.164,0.054,0.000,1.671,4,112,1
7,0.00,0.00,0.00,0.0,1.88,0.00,0.00,1.88,0.00,0.00,...,0.000,0.206,0.000,0.000,0.000,0.000,2.450,11,49,1
8,0.15,0.00,0.46,0.0,0.61,0.00,0.30,0.00,0.92,0.76,...,0.000,0.271,0.000,0.181,0.203,0.022,9.744,445,1257,1
9,0.06,0.12,0.77,0.0,0.19,0.32,0.38,0.00,0.06,0.00,...,0.040,0.030,0.000,0.244,0.081,0.000,1.729,43,749,1


In [3]:
# This data does not have headers so each attribute or field is simply enumerated
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
count,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,...,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0
mean,0.104553,0.213015,0.280656,0.065425,0.312223,0.095901,0.114208,0.105295,0.090067,0.239413,...,0.038575,0.13903,0.016976,0.269071,0.075811,0.044238,5.191515,52.172789,283.289285,0.394045
std,0.305358,1.290575,0.504143,1.395151,0.672513,0.273824,0.391441,0.401071,0.278616,0.644755,...,0.243471,0.270355,0.109394,0.815672,0.245882,0.429342,31.729449,194.89131,606.347851,0.488698
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.588,6.0,35.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.065,0.0,0.0,0.0,0.0,2.276,15.0,95.0,0.0
75%,0.0,0.0,0.42,0.0,0.38,0.0,0.0,0.0,0.0,0.16,...,0.0,0.188,0.0,0.315,0.052,0.0,3.706,43.0,266.0,1.0
max,4.54,14.28,5.1,42.81,10.0,5.88,7.27,11.11,5.26,18.18,...,4.385,9.752,4.081,32.478,6.003,19.829,1102.5,9989.0,15841.0,1.0


There are a few ways to split data into train and test. The first is using Sklearn, which is a machine learning library in python has a method for spliting data into train and test

In [4]:
# Here df.columns is a list of all the columns and df.columns[0:-1] is all columns minus the last which is y. 
# If the data had headers you could use column names: df[['column1','column2','etc']]
X_train, X_test, Y_train, Y_test = train_test_split(df[df.columns[0:-1]],df[df.columns[-1]], train_size=0.75)

Something important to note. Sklearn is able to take in pandas dataframes but returns arrays 

The other way to split data that is useful to know is:

In [5]:
# index for selecting data 0.75 is the percentage in training
index=np.array([1 if random() < 0.75 else 0 for i in range(len(df))])

In [6]:
# Separate both train and test as well as the response variable
X_train=np.array(df[df.columns[0:-1]])[index==1]
X_test=np.array(df[df.columns[0:-1]])[index==0]
Y_train=np.array(df[df.columns[-1]])[index==1]
Y_test=np.array(df[df.columns[-1]])[index==0]

The above method for spliting data can also be used for selecting a subset of an array using the values of an equally sized array. Useful for the current excercise. For example, to extract all instances of spam for the training data: 

In [7]:
# Normalizar no ayuda mucho pero sale igual al de sklearn. Para que las alturas del pdf signifiquen lo mismo 
scaler = preprocessing.StandardScaler().fit(X_train)
X_train=scaler.transform(X_train)
X_test=scaler.transform(X_test)

In [8]:
sum(Y_train)

1391

In [9]:
prob_spam = float(sum(Y_train))/len(Y_train)
prob_spam

prob_not_spam = 1 - prob_spam
prob_not_spam

0.6021167048054921

In [10]:
spam_mean = np.mean(X_train[Y_train==1], axis=0)
spam_std = np.std(X_train[Y_train==1], axis=0)

not_spam_mean = np.mean(X_train[Y_train==0], axis=0)
not_spam_std = np.std(X_train[Y_train==0], axis=0)


In [11]:
np.random.normal(spam_mean, spam_std)

array([-0.9351171 ,  0.32525833, -0.69564871,  1.14044185,  0.15761007,
        1.01416972,  1.80906668, -0.20991207,  0.1840495 ,  0.63232816,
       -1.00863045,  0.12742711,  0.04833999, -0.08780155,  1.91843097,
       -0.26853104,  1.99495684,  0.5456773 ,  0.23546502, -2.53509119,
        2.11874302,  0.47931323,  1.71703511, -0.98332992, -0.30298221,
       -0.12047757, -0.23689141, -0.65785202, -0.17030325,  0.03610253,
       -0.20838548, -0.17020342, -0.25203359, -0.19352441, -0.18472523,
       -0.48234319, -0.38049562,  0.27568089, -0.00842298,  0.52729706,
       -0.11699124, -0.12750779,  0.11433421, -0.19915579, -0.27382267,
       -0.09837515, -0.45832062, -0.06690193,  0.11531123, -0.04089398,
       -0.05143984,  0.32417331,  0.79345584,  1.59801115,  3.37819129,
       -1.40277778, -0.48801266])

In [70]:
def bayes(row):
    spam_or_not_spam = {"spam": 0, "notspam":0}
    for i in range(0, len(row)):
        x = i
        
        posteriorspam = np.ma.log(norm(spam_mean[x], spam_std[x]).pdf(row[x]))
        spam = np.log(prob_spam) + posteriorspam.sum()

        notposteriorspam = np.ma.log(norm(not_spam_mean[x], not_spam_std[x]).pdf(row[x]))
        not_spam = np.log(prob_not_spam) + notposteriorspam.sum()

        if spam >= not_spam:
            spam_or_not_spam["spam"] += 1
        else:
            spam_or_not_spam["notspam"] += 1
        
    if spam_or_not_spam["spam"] > spam_or_not_spam["notspam"]:
        return "Spam"
    else:
        return "Not spam"

In [80]:
spam = 0
not_spam = 0
for i in range(0, len(X_train)):
    if bayes(X_train[i]) == "Spam":
        spam += 1
    else:
        not_spam +=1
        
print spam
print not_spam

1281
2215
