# Naive Bayes (the easy way)

We'll cheat by using `sklearn.naive_bayes` to train a spam classifier! Most of the code is just loading our training data into a pandas DataFrame that we can play with, and then holding out 20% of it as a test set:

In [32]:
import os
import io
import numpy 
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

def readFiles(path):
    """Yield ``(file_path, message_body)`` for every file found under ``path``.

    The directory tree is walked recursively with ``os.walk``. Each file is
    read as latin1 text. Everything up to and including the first line that
    is exactly a newline is skipped; the lines after it form the body. A file
    with no such blank line yields an empty body.

    Args:
        path: Root directory to walk.

    Yields:
        Tuples ``(file_path, message)`` where ``file_path`` is the walked
        directory joined with the file name and ``message`` is the body text.
    """
    for root, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            # Use a separate name so the ``path`` argument is not overwritten.
            file_path = os.path.join(root, filename)

            in_body = False
            lines = []
            # ``with`` guarantees the handle is closed even if reading raises.
            with io.open(file_path, 'r', encoding='latin1') as f:
                for line in f:
                    if in_body:
                        lines.append(line)
                    elif line == '\n':
                        in_body = True

            # NOTE: every collected line still carries its own trailing
            # newline, so joining with '\n' doubles the line breaks. This is
            # kept as-is so the message text stays unchanged.
            yield file_path, '\n'.join(lines)


def dataFrameFromDirectory(path, classification):
    """Build a labelled DataFrame from every message file under ``path``.

    Args:
        path: Directory handed to ``readFiles``.
        classification: Label stored in the 'class' column of every row.

    Returns:
        DataFrame with 'message' and 'class' columns, indexed by the file
        path that ``readFiles`` reports for each message.
    """
    found = list(readFiles(path))
    file_paths = [file_path for file_path, _ in found]
    records = [
        {'message': body, 'class': classification}
        for _, body in found
    ]
    return pd.DataFrame(records, index=file_paths)

# Load both labelled folders and stack them into a single frame.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
data = pd.concat([
    dataFrameFromDirectory('emails/spam', 'spam'),
    dataFrameFromDirectory('emails/ham', 'ham'),
])

# Hold out 20% of the messages for evaluation. The fixed seed makes the
# split (and therefore every result below) reproducible across runs.
trainingSet, testSet = train_test_split(data, test_size=0.2, random_state=42)


Let's have a look at that DataFrame:

In [33]:
# Display the training split; the rendered table elides the middle rows.
trainingSet

Unnamed: 0,message,class
emails/ham/00173.253a0161257f4fe309df9d9ffabd5ef3,">peter fwded:\n\n>>Finally, Constable Evans hu...",ham
emails/ham/00748.d6ca40c29f4224487fc8d802cb5dca88,"Of the three lying politicians, which liar wou...",ham
emails/ham/00663.cbfae39e27122415329840060e7619e8,"On Thu, 19 Sep 2002, Mr. FoRK wrote:\n\n--]I t...",ham
emails/ham/00616.1111fc61de078f069db9d72e961ab5a1,Pity. Reading that woman's ad and knowing Roh...,ham
emails/spam/00124.db848e36f1b4c2705cbc16ef33a302d4,This is a multi-part message in MIME format.\n...,spam
...,...,...
emails/spam/00368.2c1ab4bc7f408e0fcb22dca9b2d5a113,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 3.2//E...",spam
emails/ham/01383.18c85b7ead9efe35b9a128c42e5170fc,Dan Kohn <dan@dankohn.com> writes:\n\n\n\n> Gu...,ham
emails/spam/00200.bacd4b2168049778b480367ca670254f,Dear zzzz =2C\n\n\n\n=3CBODY bgColor=3D#ffccff...,spam
emails/ham/01447.98e4b20ceb192594e992f7db9f8dfc53,"On Fri, Sep 06, 2002 at 06:22:52PM +0100, Matt...",ham


Now we will use a CountVectorizer to split up each message into its list of words, and throw that into a MultinomialNB classifier. Call fit() and we've got a trained spam filter ready to go! It's just that easy.

In [34]:
# Create both estimators up front.
vectorizer = CountVectorizer()
classifier = MultinomialNB()

# Features: per-message token counts, with the vocabulary learned from the
# training split only. Targets: the matching 'class' labels.
counts = vectorizer.fit_transform(trainingSet['message'].values)
targets = trainingSet['class'].values

# Kept as the last expression so the fitted estimator is shown as cell output.
classifier.fit(counts, targets)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [35]:
# Scratch check: compare, position by position, the labels of the last ten
# and the first ten held-out rows.
sample = testSet.tail(10)
sample2 = testSet.head(10)
# reset_index(drop=True) lines the two slices up by position rather than by
# their original index labels.
result = sample.reset_index(drop=True)['class'] == sample2.reset_index(drop=True)['class']

In [60]:
# Swap the existing index for a default 0..n-1 integer index, discarding the
# old labels (drop=True keeps them from being added back as a column).
testSet = testSet.reset_index(drop=True)

Let's try it out:

In [147]:
def evaluation(evalDF):
    """Score the trained classifier on labelled messages and tally the outcomes.

    Relies on the notebook-level ``vectorizer`` and ``classifier``, which must
    already be fitted. 'spam' is treated as the positive class.

    Args:
        evalDF: DataFrame with a 'message' column (message text) and a
            'class' column holding the true label.

    Returns:
        Series of counts indexed by outcome label ('True Negative',
        'False Positive', 'True Positive', 'False Negative'). Only outcomes
        that actually occur appear. Rows whose true or predicted label is
        neither 'ham' nor 'spam' are counted under 'Unknown'.
    """
    example_counts = vectorizer.transform(evalDF['message'])
    predictions = classifier.predict(example_counts)

    # Start from the two input columns and attach the predictions positionally.
    predictDF = evalDF[['message', 'class']].copy()
    predictDF['prediction'] = predictions

    actual = predictDF['class']
    predicted = predictDF['prediction']
    conditions = [
        (actual == 'ham') & (predicted == 'ham'),
        (actual == 'ham') & (predicted == 'spam'),
        (actual == 'spam') & (predicted == 'spam'),
        (actual == 'spam') & (predicted == 'ham'),
    ]
    outputs = [
        'True Negative',
        'False Positive',
        'True Positive',
        'False Negative',
    ]
    # numpy.select's implicit default is the integer 0, which has no common
    # dtype with the string choices on newer NumPy releases; pass an explicit
    # string fallback instead.
    predictDF['results'] = numpy.select(conditions, outputs, default='Unknown')
    return predictDF['results'].value_counts()

## Activity

Our data set is small, so our spam classifier isn't actually very good. Try running some different test emails through it and see if you get the results you expect.

If you really want to challenge yourself, try applying train/test to this spam classifier - see how well it can predict some subset of the ham and spam emails.

In [148]:
# Tally prediction outcomes on the held-out split; the bare name on the last
# line displays the resulting counts.
test = evaluation(testSet)
test

True Negative     502
True Positive      78
False Negative     20
Name: results, dtype: int64