In [84]:
import random
import nltk
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import pandas as pd

In [85]:
# Pulling in the spam data from the net to take a look
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data', header=None)


In [86]:
# Without headers we get a generic enumeration of the column names
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [87]:
# The website also provides us with the actual column names

df.columns = ['word_freq_make','word_freq_address','word_freq_all','word_freq_3d','word_freq_our','word_freq_over','word_freq_remove','word_freq_internet','word_freq_order','word_freq_mail','word_freq_receive','word_freq_will','word_freq_people','word_freq_report','word_freq_addresses','word_freq_free','word_freq_business','word_freq_email','word_freq_you','word_freq_credit','word_freq_your','word_freq_font','word_freq_000','word_freq_money','word_freq_hp','word_freq_hpl','word_freq_george','word_freq_650','word_freq_lab','word_freq_labs','word_freq_telnet','word_freq_857','word_freq_data','word_freq_415','word_freq_85','word_freq_technology','word_freq_1999','word_freq_parts','word_freq_pm','word_freq_direct','word_freq_cs','word_freq_meeting','word_freq_original','word_freq_project','word_freq_re','word_freq_edu','word_freq_table','word_freq_conference','char_freq_;','char_freq_(','char_freq_[','char_freq_!','char_freq_$','char_freq_#','capital_run_length_average','capital_run_length_longest','capital_run_length_total','is_spam']

In [88]:
df.head()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,is_spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [89]:
# "natural language processing" has been done to an extent based on the columns above
# Our model can be created in sklearn such that the first 57 columns are features, and the last one is the label

# create a df named X that represents only the first 57 columns
X = df.iloc[:,:-1]

# create a df named y that represents only the last column
y = df.iloc[:,-1]

In [90]:
# sklearn lets us easily create testing and training data sets from an exisiting data set using the function train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y , test_size= 0.3, random_state = 42)


In [91]:
# check out the lengths of the dfs we just made to make sure: 
len(X_train)

3220

In [92]:
len(X_test)

1381

In [93]:
# the sum of the above should equal the length of the original df
len(df)

4601

In [94]:
# The first step is to create a model object using the type of model we want to make
# We use the Bernoulli naive bayes 
bernoulli_nb_classifier = BernoulliNB()
# The next step is to train the model using the training data we extracted before
# The function 'fit' takes the feature set and the labels as inputs and 'fits' the features to the labels
bernoulli_nb_classifier.fit(X_train, y_train)

# How to get a prediciton
# The BernoulliNB() object has a function 'predict' which takes in a single row of features and returns a predicted label
# Here we can use any random row from our 'X_test' data
bernoulli_nb_classifier.predict(X_test.iloc[[56]])
bernoulli_nb_classifier.predict(X_test.iloc[[60]])
bernoulli_nb_classifier.predict(X_test.iloc[[1000]])


# We can also pass in several rows of data from X_test and get predictions for all of them 
print(bernoulli_nb_classifier.predict(X_test[0:30]))

# We can then check the accuracy of the model against the dataset y_test, which contains the actual labels
print(y_test[0:30].values)

[0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 1 0 0 1 0 0 1 0 1 0 1 1]
[0 0 0 1 0 1 0 0 0 0 0 0 0 1 1 0 1 1 1 0 0 1 1 0 1 0 1 0 1 1]


In [95]:
# As we can see the accuracy of the predictions is not that good. We can repeat this with other classifiers to see if we get better results.

In [96]:
#Trying it again with another classifier, Gaussian Naive Bayes

gnbc = GaussianNB()
gnbc.fit(X_train, y_train)
print(gnbc.predict(X_test[0:30]))
print(y_test[0:30].values)

[1 0 0 1 0 1 0 0 0 1 0 1 0 1 0 0 1 0 1 0 0 1 1 0 1 1 1 0 1 1]
[0 0 0 1 0 1 0 0 0 0 0 0 0 1 1 0 1 1 1 0 0 1 1 0 1 0 1 0 1 1]


In [97]:
#Trying it again with Multinomial Naive Bayes

mnbc = MultinomialNB()
mnbc.fit(X_train, y_train)
print(mnbc.predict(X_test[0:30]))
print(y_test[0:30].values)

[1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1 0 1 0 1 1]
[0 0 0 1 0 1 0 0 0 0 0 0 0 1 1 0 1 1 1 0 0 1 1 0 1 0 1 0 1 1]


In [98]:
#Trying it again with Categorical Naive Bayes

cnbc = CategoricalNB()
cnbc.fit(X_train, y_train)
print(cnbc.predict(X_test[0:30]))
print(y_test[0:30].values)

[0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1 1]
[0 0 0 1 0 1 0 0 0 0 0 0 0 1 1 0 1 1 1 0 0 1 1 0 1 0 1 0 1 1]


In [99]:
#Trying it again with linear regression

lrc = LogisticRegression(solver="liblinear", random_state=0)
lrc.fit(X_train, y_train)
print(lrc.predict(X_test[0:30]))
print(y_test[0:30].values)

[0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 1 0 0 1 0 1 0 1 1]
[0 0 0 1 0 1 0 0 0 0 0 0 0 1 1 0 1 1 1 0 0 1 1 0 1 0 1 0 1 1]


In [132]:
# It looks like linear regression gives us the best predictions
# These scores can be improved of course with more data


In [None]:
# roc_auc_score is a function is scikitlearn that does area-under-curve analysis of a model's output
# the higher the number, the higher the ratio of true positives to false positives 
# we can see that linear regression provides the best model so far, followed by categorical naive bayes

In [127]:
roc_auc_score(y_train, lrc.predict_proba(X_train)[:,1], multi_class='ovr')

0.9747468012710094

In [128]:
roc_auc_score(y_train, bernoulli_nb_classifier.predict_proba(X_train)[:,1], multi_class='ovr')

0.9509557446628041

In [129]:
roc_auc_score(y_train, gnbc.predict_proba(X_train)[:,1], multi_class='ovr')

0.9453477741022027

In [130]:
roc_auc_score(y_train, mnbc.predict_proba(X_train)[:,1], multi_class='ovr')

0.8474456248695063

In [131]:
roc_auc_score(y_train, cnbc.predict_proba(X_train)[:,1], multi_class='ovr')

0.9669575454770853