## Evaluating my Multinomial Naive Bayes Implementation

In [1]:
import numpy as np
import pandas as pd
import time
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from MultinomialNaiveBayes import MyMultinomialNB as nb
from TextProcessing import BagOfWords as BoW
from prettytable import PrettyTable as pt

### The Data

The Spooky Author dataset contains short excerpts from the writing of Edgar Allan Poe, Mary Shelley, and HP Lovecraft. Based on these excerpts, we are tasked with predicting the author.

In [2]:
# Load Data
# Single place to change where the Spooky Author CSV files live; both reads
# below resolve to the same paths as before.
DATA_DIR = 'C:/docs/ml_algorithms/data/spooky_author'
data = pd.read_csv(DATA_DIR + '/train.csv')
test = pd.read_csv(DATA_DIR + '/test.csv')

In [3]:
# Use sklearn functions to process text

def bag_of_words(data, stop, max):
    """Build a bag-of-words count matrix from the ``'text'`` column of ``data``.

    Every document is normalised with ``clean_text`` before being counted.

    Parameters
    ----------
    data : object whose ``data['text']`` supports ``.tolist()``
        Source of the raw documents (a pandas DataFrame in this notebook).
    stop : forwarded to ``CountVectorizer`` as ``stop_words``.
    max : forwarded to ``CountVectorizer`` as ``max_features``.
        (The name shadows the builtin; it is kept so existing callers work.)

    Returns
    -------
    tuple
        ``(vectorizer, counts)`` where ``vectorizer`` is the fitted
        ``CountVectorizer`` and ``counts`` is the result of
        ``fit_transform(...).toarray()`` on the cleaned documents.
    """
    clean = [clean_text(t) for t in data['text'].tolist()]
    # Build the vectorizer exactly once, after cleaning. It used to be
    # re-created on every loop iteration, which was wasted work and left the
    # name undefined when there were no documents at all.
    vectorizer = CountVectorizer(analyzer='word', stop_words=stop, max_features=max)
    return vectorizer, vectorizer.fit_transform(clean).toarray()

def clean_text(text):
    """Lower-case ``text`` and strip everything except ASCII letters and spaces.

    Removed characters are deleted outright (not replaced by a space), so
    words separated only by punctuation or digits are joined together.
    """
    lowered = text.lower()
    return re.sub('[^a-zA-Z ]', '', lowered)

# Maps any positive value to 1; zero and negative values pass through unchanged
def binarize(a):
    """Return 1 when ``a`` is greater than zero, otherwise return ``a`` as is."""
    return 1 if a > 0 else a

# Vectorization of binarize for convenience (applies it element-wise)
v_binarize = np.vectorize(binarize)

In [4]:
# Use sklearn to create bag of words representations
feature_size = 12000

# Series.as_matrix() is deprecated and has been removed from current pandas;
# .values returns the same NumPy array of author labels on old and new versions.
y = data['author'].values

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump and may be coarse on some platforms.
start = time.perf_counter()
vectorizer, X_counts = bag_of_words(data, 'english', feature_size)
print('Clock Time: ' + str(time.perf_counter() - start))

# Split data into train and test sets.
# A fixed random_state makes the split identical on every run, so results
# computed from it can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X_counts, y, test_size=0.2, random_state=42
)

Clock Time: 0.7968251705169678


In [5]:
# For accuracy calculation of my implementation
def decode(value):
    """Translate a class code into its author label.

    0 -> 'EAP', 1 -> 'HPL', 2 -> 'MWS'. Any other value yields ``None``.
    """
    # Compare with == in code order rather than using a dict lookup, so the
    # accepted kinds of ``value`` stay the same as a chain of equality checks.
    for code, label in enumerate(('EAP', 'HPL', 'MWS')):
        if value == code:
            return label
    return None

In [6]:
# Sklearn implementation
# perf_counter() is used for all interval timing below: it is monotonic and
# high-resolution, whereas time.time() can be adjusted mid-run and may be
# too coarse for sub-second measurements.
start = time.perf_counter()
skl_clf = MultinomialNB()
skl_clf.fit(X_train, y_train)
sk_clock = time.perf_counter() - start
sk_acc = skl_clf.score(X_test, y_test)

# My implementation
# NOTE(review): only the constructor call is timed; presumably training
# happens inside it -- confirm against MyMultinomialNB.
start = time.perf_counter()
clf = nb(X_train, y_train, 1.0)
my_clock = time.perf_counter() - start

# Calculate accuracy of my implementation: predict one test row at a time,
# map the predicted class code to its author label, and count the matches.
n_test = X_test.shape[0]
correct = 0
for i in range(n_test):
    if decode(clf.predict(X_test[i, :])) == y_test[i]:
        correct += 1
my_acc = correct / n_test

# Display Results
table = pt()
table.field_names = ['Implemenation', 'Accuracy', 'Clock Time']
table.add_row(['Sklearn', sk_acc, sk_clock])
table.add_row(['My Implemenatation', my_acc, my_clock])
print(table)

+--------------------+--------------------+--------------------+
|   Implemenation    |      Accuracy      |     Clock Time     |
+--------------------+--------------------+--------------------+
|      Sklearn       |   0.812308478039   | 3.2614662647247314 |
| My Implemenatation | 0.8125638406537283 | 1.9472846984863281 |
+--------------------+--------------------+--------------------+


## On this dataset, my implementation performs comparably to sklearn's implementation