In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [47]:
import os
import numpy
import io # modified for Python 2.7 MP
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, f1_score


In [21]:
!cp -r '/content/drive/MyDrive/Colab Notebooks/Lab_4/data' .

In [22]:
NEWLINE = '\n'

HAM = 'ham'
SPAM = 'spam'

SOURCES = [
    ('data/spam',        SPAM),
    ('data/easy_ham',    HAM),
    ('data/hard_ham',    HAM),
]
    
SOURCES_ALL = [
    ('data/spam',        SPAM),
    ('data/easy_ham',    HAM),
    ('data/hard_ham',    HAM),
    ('data/beck-s',      HAM),
    ('data/farmer-d',    HAM),
    ('data/kaminski-v',  HAM),
    ('data/kitchen-l',   HAM),
    ('data/lokay-m',     HAM),
    ('data/williams-w3', HAM),
    ('data/BG',          SPAM),
    ('data/GP',          SPAM),
    ('data/SH',          SPAM)
]

SKIP_FILES = {'cmds'}

In [17]:
def read_files(path):
    for root, dir_names, file_names in os.walk(path):
        for path in dir_names:
            read_files(os.path.join(root, path))
        for file_name in file_names:
            if file_name not in SKIP_FILES:
                file_path = os.path.join(root, file_name)
                if os.path.isfile(file_path):
                    past_header, lines = False, []
                    f = io.open(file_path, encoding="latin-1") # modified for Python 2.7 MP
                    for line in f:
                        if past_header:
                            lines.append(line)
                        elif line == NEWLINE:
                            past_header = True
                    f.close()
                    content = NEWLINE.join(lines)
                    yield file_path, content


In [18]:
def build_data_frame(path, classification):
    rows = []
    index = []
    for file_name, text in read_files(path):
        rows.append({'text': text, 'class': classification})
        index.append(file_name)

    data_frame = DataFrame(rows, index=index)
    return data_frame

In [23]:
data = DataFrame({'text': [], 'class': []})
for path, classification in SOURCES:
    data = data.append(build_data_frame(path, classification))

data = data.reindex(numpy.random.permutation(data.index))

In [24]:
data

Unnamed: 0,text,class
data/easy_ham/01948.d51fec6f7672e603b0a3113a86869d10,"URL: http://www.newsisfree.com/click/-3,825650...",ham
data/easy_ham/01419.97da4f8a986b55cbe1f81bb22836ac58,[Skip Montanaro]\n\n> Any thought to wrapping ...,ham
data/spam/00393.13d4d84cb98ea19954f895c629520bf8,NEED Health Insurance? \n\n In addition to fea...,spam
data/easy_ham/02073.1b332bbccca72969c7af61749d0f3b4c,URL: http://www.askbjoernhansen.com/archives/2...,ham
data/easy_ham/01794.e322c3e66406d3a985a61aba25902c5b,\n\nForwarded-by: William Knowles <wk@c4i.org>...,ham
...,...,...
data/hard_ham/00111.9b8da30db6709b590398fe8923a7180c,--16675500.1026989382816.JavaMail.root.umsan1\...,ham
data/easy_ham/01227.0c0989577c7476c986aa5328e4ef6118,"Once upon a time, Gary wrote :\n\n\n\n> On Wed...",ham
data/easy_ham/00762.c95ecbfb41e18e9ae4b9aceb4a7de176,"\n\nIn a message dated 9/23/2002 6:30:31 PM, k...",ham
data/easy_ham/01011.026a9a3bdab758181c61e6d828a4e212,"\n\n\n\n>>>>> On Fri, 13 Sep 2002, ""Tony"" == T...",ham


In [55]:
pipeline_Mnb = Pipeline([
    ('count_vectorizer',   CountVectorizer(ngram_range=(1, 2))),
    ('classifier',         MultinomialNB())
])

In [56]:
pipeline_Bnb = Pipeline([
    ('count_vectorizer',   CountVectorizer(ngram_range=(1, 2))),
    ('classifier',         BernoulliNB(binarize=0.0))
])

In [57]:
pipeline_lr = Pipeline([
    ('count_vectorizer',   CountVectorizer(ngram_range=(1, 2))),
    ('classifier',         LogisticRegression())
])


In [65]:
def do_pipeline(pipeline,data):
  scores = []
  confusion = numpy.array([[0, 0], [0, 0]])
  for i, (train_indices, test_indices) in enumerate(k_fold.split(data)):

    train_text = data.iloc[train_indices]['text'].values
    train_y = data.iloc[train_indices]['class'].values.astype(str)

    test_text = data.iloc[test_indices]['text'].values
    test_y = data.iloc[test_indices]['class'].values.astype(str)
    pipeline.fit(train_text, train_y)
    predictions = pipeline.predict(test_text)
    confusion += confusion_matrix(test_y, predictions)
    score = f1_score(test_y, predictions, average="macro")
    scores.append(score)

  print('Total emails classified:', len(data))
  print('Score:', sum(scores)/len(scores))
  print('Confusion matrix:')
  print(confusion)


In [66]:
k_fold = KFold(n_splits=3, random_state=4, shuffle=True)

multi = do_pipeline(pipeline_Mnb,data)
bernoulli = do_pipeline(pipeline_Bnb,data)
logicreg = do_pipeline(pipeline_lr,data)

Total emails classified: 3250
Score: 0.9583571933121945
Confusion matrix:
[[2746    4]
 [  63  437]]
Total emails classified: 3250
Score: 0.6072507409988185
Confusion matrix:
[[2737   13]
 [ 413   87]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Total emails classified: 3250
Score: 0.9637013677991456
Confusion matrix:
[[2724   26]
 [  35  465]]


In [50]:
print('Total emails classified:', len(data))
print('Score:', sum(scores)/len(scores))
print('Confusion matrix:')
print(confusion)

Total emails classified: 3250
Score: 0.9882049693752192
Confusion matrix:
[[2723   27]
 [  38  462]]
