In [1]:
import scipy.sparse as sp
import numpy as np
import pandas as pd
import random
import json
import re
from tqdm import tqdm

In [2]:
featureMatrix = sp.load_npz("feature_matrix.npz")

In [3]:
featureMatrix

<19579x24951 sparse matrix of type '<class 'numpy.int64'>'
	with 250731 stored elements in Compressed Sparse Row format>

In [4]:
# Testing if generated matrix was okay or not
np.where(featureMatrix.toarray()[0] == 1)

(array([  453,  1286,  1636,  3702,  6102,  6862,  8189, 10730, 13337,
        13640, 13875, 16003, 16564, 17069, 18513, 19478, 19625, 23225,
        24168, 24416, 24641]),)

#### The code above was a test to check if everything seems fine after loading the featureMatrix from disk. Looks good to me

### TRAINING: 

In [5]:
train_df = pd.read_csv("dataset/train.csv")
train_df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [6]:
num_examples, num_vocab_words = featureMatrix.shape

In [7]:
class_labels = list(train_df["author"].unique())
class_labels

['EAP', 'HPL', 'MWS']

In [8]:
labelMap = {}
for idx, label in enumerate(class_labels):
    labelMap[label] = idx
labelMap

{'EAP': 0, 'HPL': 1, 'MWS': 2}

In [9]:
def train_test_split_indices(num_examples, percentage_split):
    all_indices = [i for i in range(num_examples)]
    random.shuffle(all_indices)
    num_training_examples = int(num_examples * percentage_split / 100 )
    num_test_examples = num_examples - num_training_examples
    train_indices, test_indices = all_indices[:num_training_examples], all_indices[num_training_examples:]
    return train_indices, test_indices

In [10]:
stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]

In [11]:
with open("word_index.json", "r") as file:
    wordIndex = json.load(file)

In [12]:
class NaiveBayesClassifier():
    def __init__(
        self, 
        train_indices,
        test_indices, 
        train_df, 
        featureMatrix,
        labelMap
    ):
        self.train_indices = train_indices
        self.test_indices = test_indices
        
        self.num_train_examples = len(train_indices)
        self.num_test_examples = len(test_indices)
        
        self.dataframe = train_df
        self.featureMatrix = featureMatrix
        self.labelMap = labelMap
        self.trainFeatureMatrix = featureMatrix[train_indices]
        
        num_labels = len(labelMap.items())
        num_vocab_words = self.featureMatrix.shape[1]
        
        self.class_distribution = np.zeros( (num_labels) )
        self.likelihood_probabilities = np.zeros((num_labels, num_vocab_words, 2))

    def train(self):
        
        for i in train_indices:
            self.class_distribution[labelMap[self.dataframe["author"][i]]] += 1
            
        for i in tqdm(self.train_indices):
            class_label = self.labelMap[self.dataframe["author"][i]]
            text = self.dataframe["text"][i]
            words = set(re.findall("[a-z0-9]+", text.lower()))
            for word in words:
                if word in stopwords:
                    continue
                self.likelihood_probabilities[class_label,wordIndex[word]][1] += 1
        
        for i in range(self.likelihood_probabilities.shape[0]):
            for j in range(self.likelihood_probabilities.shape[1]):
                self.likelihood_probabilities[i,j,0] = self.class_distribution[i] - self.likelihood_probabilities[i,j,1]
                self.likelihood_probabilities[i,j,0] /= self.class_distribution[i]
                self.likelihood_probabilities[i,j,1] /= self.class_distribution[i]
        
#         for i in range(len(self.class_distribution)):
#             self.class_distribution[i] /= self.num_train_examples
            
#       Vectorize
        self.class_distribution /= self.num_train_examples

    
    def apply_laplace_correction(self, alpha):
        
        self.class_distribution = np.zeros(self.class_distribution.shape)
        self.likelihood_probabilities = np.zeros(self.likelihood_probabilities.shape)
        
        for i in train_indices:
            self.class_distribution[labelMap[self.dataframe["author"][i]]] += 1
            
        self.likelihood_probabilities = np.zeros((len(class_labels), self.featureMatrix.shape[1], 2))
        for i in tqdm(self.train_indices):
            class_label = self.labelMap[self.dataframe["author"][i]]
            text = self.dataframe["text"][i]
            words = set(re.findall("[a-z0-9]+", text.lower()))
            for word in words:
                if word in stopwords:
                    continue
                self.likelihood_probabilities[class_label,wordIndex[word]][1] += 1
           
        for i in range(self.likelihood_probabilities.shape[0]):
            for j in range(self.likelihood_probabilities.shape[1]):
                
                self.likelihood_probabilities[i,j,0] = self.class_distribution[i] - self.likelihood_probabilities[i,j,1]
                
                if self.likelihood_probabilities[i,j,0] == 0 or self.likelihood_probabilities[i,j,1] == 0 :
                    self.likelihood_probabilities[i,j,0] = (self.likelihood_probabilities[i,j,0] + alpha) / (3*alpha + self.class_distribution[i])
                    self.likelihood_probabilities[i,j,1] = (self.likelihood_probabilities[i,j,1] + alpha) / (3*alpha + self.class_distribution[i])
                else:
                    self.likelihood_probabilities[i,j,1] /= self.class_distribution[i]
                    self.likelihood_probabilities[i,j,0] /= self.class_distribution[i]
                 
                
#         for i in range(self.class_distribution.shape[0]):
#             self.class_distribution[i] /= self.num_train_examples
        
#       Vectorize
        self.class_distribution /= self.num_train_examples

    def evaluate_example(self, feature_vector):
        """
        Parameters
        ------------
        featureVector : (X1,X2,X3,.....Xn) [n = num_vocab_words]


        Returns
        ------------
        The predicted label of the example
        
        """
        
        feature_vector = feature_vector.toarray()[0]
        
        best_class, best_probability = None, 0
        
        for class_name, class_num in self.labelMap.items():
            
            cur_probability = self.class_distribution[class_num]
            
            
#             for j in range(feature_vector.shape[1]):
#                 cur_probability *= self.likelihood_probabilities[class_num, j,feature_vector[0][j]]
            
            feature_vector_row_indices = [i for i in range(feature_vector.shape[0])]
            vectorized = self.likelihood_probabilities[class_num, feature_vector_row_indices, feature_vector]
            
            cur_probability *= np.prod(vectorized)
    
            if best_class is None:
                best_class  = class_name
                best_probability = cur_probability
            elif cur_probability > best_probability:
                best_class  = class_name
                best_probability = cur_probability
        
        return best_class

    
    def test(self):
        """
        Predict Accuracy of All test samples
        """
        correctly_predicted = 0
        
        for j in tqdm(range(self.num_test_examples)):
            if(self.evaluate_example(self.featureMatrix[self.test_indices[j]]) == self.dataframe["author"][self.test_indices[j]]):
                correctly_predicted += 1
        
        correctly_predicted /= self.num_test_examples
        
        self.accuracy = correctly_predicted * 100
        
        return self.accuracy

In [32]:
train_indices, test_indices = train_test_split_indices(num_examples, percentage_split = 70)

In [33]:
NBClassifier = NaiveBayesClassifier(
    train_indices, 
    test_indices, 
    train_df, 
    featureMatrix,
    labelMap
)

In [34]:
NBClassifier.train()

100%|██████████| 13705/13705 [00:00<00:00, 18586.13it/s]


In [35]:
NBClassifier.test()

100%|██████████| 5874/5874 [00:40<00:00, 144.59it/s]


58.324821246169556

In [36]:
# for alpha in [0.01,0.1,1,10,100]:
#     NBClassifier.apply_laplace_correction(alpha)
#     print(f"{NBClassifier.test()} <== {alpha}")

# alpha  |   %accuracy
# 0.01 ------ 83
# 0.1 ------- 83
# 1 --------- 77-79
# 10 -------- 41
# 100 ------- 39

In [37]:
NBClassifier.apply_laplace_correction(alpha = 1)

100%|██████████| 13705/13705 [00:00<00:00, 17827.34it/s]


In [38]:
NBClassifier.test()

100%|██████████| 5874/5874 [00:41<00:00, 141.22it/s]


78.26012938372489

In [43]:
with open("test_indices.json","w") as f:
    f.write(json.dumps(test_indices, indent = 4))
    
with open("train_indices.json","w") as f:
    f.write(json.dumps(train_indices, indent = 4))