In [107]:
import pandas as pd
import numpy as np
from sklearn import linear_model
# NOTE(review): sklearn.cross_validation is the legacy home of train_test_split;
# newer scikit-learn releases expose it from sklearn.model_selection instead.
# Confirm against the installed version before a fresh Restart & Run All.
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB

## Reading and Processing Data

In [108]:
# Input locations, given relative to the notebook's working directory.
data_file = "spambase/spambase.data"    # comma-separated records, read with pd.read_csv
names_file = "spambase/spambase.names"  # attribute descriptions, parsed for column headers

In [109]:
# Build the column headers for the data file from the .names file.
# Only the word_freq_* / char_freq_* attribute lines are parsed automatically;
# the three capital-run-length columns and the spam label are appended by hand.
FEATURE_PREFIXES = ("word_freq_", "char_freq_")

names = []
with open(names_file) as f:
    for line in f:
        for prefix in FEATURE_PREFIXES:
            if not line.startswith(prefix):
                continue
            # Attribute lines are expected to look like "<prefix><token>: <type>".
            # Strip the matched prefix (rather than a fixed-width slice) and cut
            # at the first ":"; a line with no ":" is skipped instead of raising.
            token, colon, _ = line[len(prefix):].partition(":")
            if colon:
                names.append(token)
            break

# Columns that do not follow the word/char naming pattern, in file order.
names.extend(["capital_avg", "capital_longest", "capital_total", "spam"])

In [110]:
len(names) # sanity check: this must equal the number of columns in the data file

58

In [111]:
texts = pd.read_csv(data_file, names=names) # read the data file, using the parsed names as column headers

In [112]:
len(texts) # total number of records (rows) read from the data file

4601

In [113]:
# Rough estimate of the test-set size for a 0.4 test fraction. int() truncates,
# and the actual split further down reports 1841 records, one more than this
# value. Presumably the splitter rounds up rather than down (TODO confirm).
int(0.4*len(texts))

1840

In [114]:
texts.head() # preview the first rows: columns and data look coherent

Unnamed: 0,make,address,all,3d,our,over,remove,internet,order,mail,...,;,(,[,!,$,#,capital_avg,capital_longest,capital_total,spam
0,0.0,0.64,0.64,0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0,0.135,0.0,0.0,3.537,40,191,1


## Breaking into training/test sets

In [115]:
# Convert the frame to a single NumPy array, then separate it column-wise:
# `a` holds every column except the last, `b` holds the last column only.
txt_array = np.asarray(texts)
a, b = txt_array[:, :-1], txt_array[:, -1]

In [116]:
# Hold out 40% of the records for testing; random_state is pinned so that
# re-running the cell produces the same partition.
train_data, test_data, train_scores, test_scores = train_test_split(
    a,
    b,
    test_size=0.4,
    random_state=42,
)

In [119]:
len(test_scores) # this is the actual number of records in the test set

1841

## Applying Naive Bayes Classification
This approach comes from the sklearn package. Its interface works like the prior linear regression tools (fit the model, then score it), but as a classifier it is more effective for this type of data set.

In [117]:
# Fit a multinomial Naive Bayes classifier using the training split only;
# the fit() call is left as the last expression so the cell displays the
# fitted estimator.
classifier = MultinomialNB()
classifier.fit(train_data, train_scores)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [118]:
# score() on a classifier reports mean accuracy (the fraction of records
# labelled correctly), not an R^2 or a correlation. The *_r2 variable names
# are kept so that any later cell reading them keeps working.
train_r2 = classifier.score(train_data, train_scores)
test_r2 = classifier.score(test_data, test_scores)
print(" Training accuracy: "+str(train_r2))
print(" Test accuracy: "+str(test_r2))

 Training correlation score: 0.782608695652
 Training correlation score: 0.781097229766


These scores match our expectations. The model is decent at predicting whether an email is spam within its training set, and it is a little bit worse at predicting whether an email in the test set is spam.