In [2]:
import os
import numpy as np
import io # modified for Python 2.7 MP
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, f1_score

In [4]:
train_data = pd.read_csv('archive/Train.csv')
test_data = pd.read_csv('archive/Test.csv')
valid_data = pd.read_csv('archive/Valid.csv')

In [15]:
pipeline_mNB = Pipeline([
    ('count_vectorizer',   CountVectorizer(ngram_range=(1, 2),stop_words='english')),
    ('classifier',         MultinomialNB())
])

In [16]:
pipeline_bNB = Pipeline([
    ('count_vectorizer',   CountVectorizer(ngram_range=(1, 2),stop_words='english')),
    ('classifier',         BernoulliNB(binarize=0.0))
])

In [23]:
pipeline_lr = Pipeline([
    ('count_vectorizer',   CountVectorizer(ngram_range=(1, 2),stop_words='english')),
    ('classifier',         LogisticRegression(solver='lbfgs', max_iter=100))
])

In [21]:
def pipeline_run(pipeline,train_data,test_data,pipe_name):
    pipeline.fit(train_data.text, train_data.label)
    predictions = pipeline.predict(test_data.text)
    confusion = confusion_matrix(test_data.label, predictions)
    score = f1_score(test_data.label, predictions, average="macro")
    print(pipe_name)
    print('Total emails classified:', len(test_data))
    print('Score:', score)
    print('Confusion matrix:')
    print(confusion)
    print("\n")



In [24]:
pipeline_run(pipeline_mNB,train_data,test_data,"MultinomialNB")
pipeline_run(pipeline_bNB,train_data,test_data,"BernoulliNB")



MultinomialNB
Total emails classified: 5000
Score: 0.884394006985322
Confusion matrix:
[[2229  266]
 [ 312 2193]]


BernoulliNB
Total emails classified: 5000
Score: 0.868369325493924
Confusion matrix:
[[2315  180]
 [ 476 2029]]




In [25]:
pipeline_run(pipeline_lr,train_data,test_data,"LogisticRegression")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression
Total emails classified: 5000
Score: 0.8987801609115387
Confusion matrix:
[[2212  283]
 [ 223 2282]]




In [10]:
k_fold = KFold(n_splits=3, random_state=4, shuffle=True)
scores = []
confusion = np.array([[0, 0], [0, 0]])

data = train_data
for i, (train_indices, test_indices) in enumerate(k_fold.split(data)):

    train_text = data.iloc[train_indices]['text'].values
    train_y = data.iloc[train_indices]['label'].values.astype(str)

    test_text = data.iloc[test_indices]['text'].values
    test_y = data.iloc[test_indices]['label'].values.astype(str)
    pipeline.fit(train_text, train_y)
    predictions = pipeline.predict(test_text)
    confusion += confusion_matrix(test_y, predictions)
    score = f1_score(test_y, predictions, average="macro")
    scores.append(score)


In [11]:
print('Total emails classified:', len(data))
print('Score:', sum(scores)/len(scores))
print('Confusion matrix:')
print(confusion)

Total emails classified: 40000
Score: 0.8746224444911764
Confusion matrix:
[[17875  2144]
 [ 2869 17112]]
