In [2]:
import os
import numpy
import io # modified for Python 2.7 MP
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, f1_score

In [3]:
NEWLINE = '\n'

HAM = 'ham'
SPAM = 'spam'

SOURCES = [
    ('data/spam',        SPAM),
    ('data/easy_ham',    HAM),
    ('data/hard_ham',    HAM),
]
    
SOURCES_ALL = [
    ('data/spam',        SPAM),
    ('data/easy_ham',    HAM),
    ('data/hard_ham',    HAM),
    ('data/beck-s',      HAM),
    ('data/farmer-d',    HAM),
    ('data/kaminski-v',  HAM),
    ('data/kitchen-l',   HAM),
    ('data/lokay-m',     HAM),
    ('data/williams-w3', HAM),
    ('data/BG',          SPAM),
    ('data/GP',          SPAM),
    ('data/SH',          SPAM)
]

SKIP_FILES = {'cmds'}

In [4]:
def read_files(path):
    for root, dir_names, file_names in os.walk(path):
        for path in dir_names:
            read_files(os.path.join(root, path))
        for file_name in file_names:
            if file_name not in SKIP_FILES:
                file_path = os.path.join(root, file_name)
                if os.path.isfile(file_path):
                    past_header, lines = False, []
                    f = io.open(file_path, encoding="latin-1") # modified for Python 2.7 MP
                    for line in f:
                        if past_header:
                            lines.append(line)
                        elif line == NEWLINE:
                            past_header = True
                    f.close()
                    content = NEWLINE.join(lines)
                    yield file_path, content

In [5]:
def build_data_frame(path, classification):
    rows = []
    index = []
    for file_name, text in read_files(path):
        rows.append({'text': text, 'class': classification})
        index.append(file_name)

    data_frame = DataFrame(rows, index=index)
    return data_frame

data = DataFrame({'text': [], 'class': []})
for path, classification in SOURCES:
    data = data.append(build_data_frame(path, classification))

data = data.reindex(numpy.random.permutation(data.index))

pipeline = Pipeline([
    ('count_vectorizer',   CountVectorizer(ngram_range=(1, 2))),
    ('classifier',         MultinomialNB())
])

  data = data.append(build_data_frame(path, classification))
  data = data.append(build_data_frame(path, classification))
  data = data.append(build_data_frame(path, classification))


In [18]:
data

Unnamed: 0,text,class
data/easy_ham/02197.8ff83816cea0884898d358cd0423b356,URL: http://boingboing.net/#85514221\n\nDate: ...,ham
data/easy_ham/01100.3f3a79ad6a2cdd501aa86421fb5157a5,"On Wed, Feb 06, 2002 at 04:30:18PM +0200, Harr...",ham
data/spam/00240.2ff7f745285653a238214d975859406b,\n\nDear Sir or Madam\n\n\n\nIn the past you h...,spam
data/easy_ham/02034.bd799f09b362a83731ae6931a7916caf,URL: http://www.askbjoernhansen.com/archives/2...,ham
data/easy_ham/01651.7cafcb2d9dcaadd665afabc65c267f36,\n\n >> Ultimately I'd like to see tight in...,ham
...,...,...
data/easy_ham/02199.663c7327f5f9c7aa46d0fc56fbb68208,URL: http://diveintomark.org/archives/2002/10/...,ham
data/easy_ham/01341.34cf1021232db9d1c782888dcd1e5328,| \n\n| 0 hits here. :(\n\n| \n\n\n\nI also ge...,ham
data/easy_ham/01719.a401ddc61fc3d89fbaee70ea107a9956,[Neil Schemenauer]\n\n> These results are from...,ham
data/easy_ham/01053.9f4c2fea143d25bf2680c444e547df55,\n\n--------------Boundary-00=_OYOXHTVA0T2X8R5...,ham


In [22]:
n = len(data)
k_fold = KFold(n_splits=6)
scores = []
confusion = numpy.array([[0, 0], [0, 0]])
for i,(train_indices, test_indices) in enumerate(k_fold.split(data)):
    train_text = data.iloc[train_indices]['text'].values
    train_y = data.iloc[train_indices]['class'].values.astype(str)

    test_text = data.iloc[test_indices]['text'].values
    test_y = data.iloc[test_indices]['class'].values.astype(str)

    pipeline.fit(train_text, train_y)
    predictions = pipeline.predict(test_text)

    confusion += confusion_matrix(test_y, predictions)
    score = f1_score(test_y, predictions, pos_label=SPAM)
    scores.append(score)

print('Total emails classified:', len(data))
print('Score:', sum(scores)/len(scores))
print('Confusion matrix:')
print(confusion)

Total emails classified: 3250
Score: 0.9355973519816052
Confusion matrix:
[[2744    6]
 [  56  444]]
