In [50]:
import scipy.sparse as sp
import numpy as np
import pandas as pd
import random
import json
import re
from tqdm import tqdm

In [21]:
featureMatrix = sp.load_npz("feature_matrix.npz")

In [22]:
featureMatrix

<19579x24951 sparse matrix of type '<class 'numpy.int64'>'
	with 250731 stored elements in Compressed Sparse Row format>

In [23]:
# Testing if generated matrix was okay or not
np.where(featureMatrix.toarray()[0] == 1)

(array([  453,  1286,  1636,  3702,  6102,  6862,  8189, 10730, 13337,
        13640, 13875, 16003, 16564, 17069, 18513, 19478, 19625, 23225,
        24168, 24416, 24641]),)

#### The code above was a test to check if everything seems fine after loading the featureMatrix from disk. Looks good to me

### TRAINING: 

In [24]:
train_df = pd.read_csv("dataset/train.csv")
train_df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [25]:
num_examples, num_vocab_words = featureMatrix.shape

In [26]:
class_labels = list(train_df["author"].unique())
class_labels

['EAP', 'HPL', 'MWS']

In [28]:
labelMap = {}
for idx, label in enumerate(class_labels):
    labelMap[label] = idx
labelMap

{'EAP': 0, 'HPL': 1, 'MWS': 2}

In [125]:
def train_test_split_indices(num_examples):
    all_indices = [i for i in range(num_examples)]
    random.shuffle(all_indices)
    num_training_examples = int(num_examples * 0.9)
    num_test_examples = num_examples - num_training_examples
    train_indices, test_indices = all_indices[:num_training_examples], all_indices[num_training_examples:]
    return train_indices, test_indices

In [54]:
stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]


In [30]:
with open("word_index.json", "r") as file:
    wordIndex = json.load(file)

In [121]:
class NaiveBayesClassifier():
    def __init__(
        self, 
        num_training_examples, 
        num_test_examples, 
        train_indices,
        test_indices, 
        train_df, 
        featureMatrix,
        labelMap
    ):
        self.num_train_examples = num_training_examples
        self.num_test_examples = num_test_examples
        self.train_indices = train_indices
        self.test_indices = test_indices
        self.dataframe = train_df
        self.featureMatrix = featureMatrix
        self.labelMap = labelMap
        self.trainFeatureMatrix = featureMatrix[train_indices]
        self.class_distribution = np.zeros((len(self.labelMap.items())))

    def train(self, labelMap):
        for i in train_indices:
            self.class_distribution[labelMap[self.dataframe["author"][i]]] += 1
            
        self.likelihood_probabilities = np.zeros((len(class_labels), self.featureMatrix.shape[1], 2))
        for i in tqdm(self.train_indices):
            class_label = self.labelMap[self.dataframe["author"][i]]
            text = self.dataframe["text"][i]
            words = re.findall("[a-z0-9]+", text.lower())
            for word in words:
                if word in stopwords:
                    continue
                self.likelihood_probabilities[class_label,wordIndex[word]][1] += 1
        for i in range(self.likelihood_probabilities.shape[0]):
            for j in range(self.likelihood_probabilities.shape[1]):
                self.likelihood_probabilities[i,j,0] = self.class_distribution[i] - self.likelihood_probabilities[i,j,1]
                self.likelihood_probabilities[i,j,0] /= self.class_distribution[i]
                self.likelihood_probabilities[i,j,1] /= self.class_distribution[i]
        for i in range(len(self.class_distribution)):
            self.class_distribution[i] /= self.num_train_examples
    def evaluateExample(self, feature_vector):
        """
        Parameters
        ------------
        featureVector : (X1,X2,X3,.....Xn) [n = num_vocab_words]


        Returns
        ------------
        The predicted label_id of the example
        
        """
        feature_vector = feature_vector.toarray()
        best_class, best_probability = None, 0
        for class_name, class_num in self.labelMap.items():
            cur_probability = self.class_distribution[class_num]
            for j in range(feature_vector.shape[1]):
                cur_probability *= self.likelihood_probabilities[class_num,j,feature_vector[0][j]]
            if best_class is None:
                best_class  = class_name
                best_probability = cur_probability
            elif cur_probability > best_probability:
                best_class  = class_name
                best_probability = cur_probability
        return best_class

    def test(self):
        """
        Predict Accuracy of All test samples
        """
        correctly_predicted = 0
        total_predicted = len(self.test_indices)
        for j in tqdm(range(len(self.test_indices))):
            if(self.evaluateExample(self.featureMatrix[self.test_indices[j]]) == self.dataframe["author"][self.test_indices[j]]):
                correctly_predicted += 1
        correctly_predicted /= total_predicted
        self.answer = correctly_predicted
        return correctly_predicted

In [126]:
train_indices, test_indices = train_test_split_indices(num_examples)

In [127]:
NBClassifier = NaiveBayesClassifier(
    num_training_examples, 
    num_test_examples, 
    train_indices, 
    test_indices, 
    train_df, 
    featureMatrix,
    labelMap
)

In [128]:
NBClassifier.train(labelMap)

100%|██████████| 17621/17621 [00:01<00:00, 15367.36it/s]


In [129]:
print(NBClassifier.test())

100%|██████████| 1958/1958 [01:00<00:00, 32.45it/s]

0.6235955056179775





In [15]:
featureMatrix[train_indices]

<13705x24951 sparse matrix of type '<class 'numpy.int64'>'
	with 176355 stored elements in Compressed Sparse Row format>

In [77]:
featureMatrix[test_indices[0]]

<1x24951 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

EAP MWS
HPL HPL
EAP EAP
EAP EAP
EAP EAP
EAP EAP
EAP EAP
EAP MWS
HPL HPL
HPL EAP
