In [1]:
import os
import io
import numpy
from pandas import DataFrame, concat
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
def readFiles(path):
    """Yield ``(file_path, body)`` for every file found under ``path``.

    The directory tree is walked recursively with ``os.walk``. Each file is
    read as Latin-1 text. Every line up to and including the first blank line
    is skipped (presumably the e-mail header block); the lines after it form
    the message body.

    Args:
        path: Root directory to scan.

    Yields:
        tuple: ``(file_path, message)``, where ``file_path`` is the walked
        directory joined with the file name and ``message`` is the body text.
        A file that contains no blank line yields an empty string.
    """
    for root, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            # Use a separate name so the ``path`` argument is not shadowed.
            file_path = os.path.join(root, filename)
            in_body = False
            body_lines = []
            # ``with`` guarantees the handle is closed even if reading raises.
            with io.open(file_path, 'r', encoding='latin') as f:
                for line in f:
                    if in_body:
                        body_lines.append(line)
                    elif line == '\n':
                        # First blank line: everything after it is body.
                        in_body = True
            # NOTE(review): each line still ends with '\n', so joining with
            # '\n' doubles the line breaks. Kept deliberately so the returned
            # text is unchanged for existing consumers.
            message = '\n'.join(body_lines)
            yield file_path, message

def dataFrameFromDirectory(path, classification):
    """Load every message under ``path`` into a labelled DataFrame.

    Args:
        path: Directory handed to ``readFiles``.
        classification: Label stored in the ``class`` column of every row.

    Returns:
        DataFrame: One row per file yielded by ``readFiles``, indexed by the
        file path, with columns ``message`` and ``class``.
    """
    # Materialise the generator once so both lists below see the same items.
    found = list(readFiles(path))
    index = [filename for filename, _ in found]
    rows = [
        {'message': message, 'class': classification}
        for _, message in found
    ]
    return DataFrame(rows, index=index)

# Build the labelled corpus with a single concat call. Seeding the result with
# an empty DataFrame and concatenating onto it repeatedly re-copies rows and
# relies on concat-with-empty-frame behaviour that newer pandas versions
# deprecate.
data = concat([
    dataFrameFromDirectory('emails/spam', 'spam'),
    dataFrameFromDirectory('emails/ham', 'ham'),
])

In [3]:
# Preview the first rows to confirm that message bodies and labels loaded.
data.head()

Unnamed: 0,message,class
emails/spam/00241.c28ade5771085a8fddd054a219566b7c,------=_NextPart_000_00B6_07E34C7A.C3030C43\n\...,spam
emails/spam/00312.75c839d7d4f6da9e860a11b617904fb5,"ilug ,\n\n\n\n From;Mr.Michael Kamah and Fami...",spam
emails/spam/00132.0ead3e293c6c41cbffb69670e8b85ae7,"As seen on NBC, CBS, CNN, and even Oprah! The ...",spam
emails/spam/00113.eebc11982ccc4730fb8759f94400ce19,Request A Free No Obligation Consultation!\n\n...,spam
emails/spam/00253.83b95b05e275286eddcf557ea581e754,There is NO stumbling on to it! \n\n\n\nThe gr...,spam


In [4]:
# Labels: one class string per message, in the same row order as the text.
targets = data['class'].values

# Turn each message body into a sparse vector of token counts; fitting the
# vectorizer here also fixes the vocabulary reused for later predictions.
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(data['message'].values)

# Train a multinomial naive Bayes model on the token counts.
classifier = MultinomialNB()
classifier.fit(counts, targets)

In [5]:
# Two hand-written messages used to try out the trained classifier.
examples = [
    'Free Viagra now',
    'Hi Bob, how about a game of golf tomorrow?',
]
# Use transform (not fit_transform) so the examples are encoded with the
# vocabulary learned from the training messages.
example_counts = vectorizer.transform(examples)
predictions = classifier.predict(example_counts)

In [8]:
# Display the predicted class label for each example message.
predictions

array(['spam', 'ham'], dtype='<U4')