In [5]:
import scipy.sparse as sp
import numpy as np
import pandas as pd
import random
import json
import re
from tqdm import tqdm

In [6]:
featureMatrix = sp.load_npz("feature_matrix.npz")

In [7]:
featureMatrix

<19579x24951 sparse matrix of type '<class 'numpy.int64'>'
	with 250731 stored elements in Compressed Sparse Row format>

In [8]:
# Testing if generated matrix was okay or not
np.where(featureMatrix.toarray()[0] == 1)

(array([  453,  1286,  1636,  3702,  6102,  6862,  8189, 10730, 13337,
        13640, 13875, 16003, 16564, 17069, 18513, 19478, 19625, 23225,
        24168, 24416, 24641]),)

#### The code above was a test to check if everything seems fine after loading the featureMatrix from disk. Looks good to me

### TRAINING: 

In [9]:
train_df = pd.read_csv("dataset/train.csv")
train_df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [10]:
num_examples, num_vocab_words = featureMatrix.shape

In [11]:
class_labels = list(train_df["author"].unique())
class_labels

['EAP', 'HPL', 'MWS']

In [12]:
labelMap = {}
for idx, label in enumerate(class_labels):
    labelMap[label] = idx
labelMap

{'EAP': 0, 'HPL': 1, 'MWS': 2}

In [13]:
def train_test_split_indices(num_examples, percentage_split):
    all_indices = [i for i in range(num_examples)]
    random.shuffle(all_indices)
    num_training_examples = int(num_examples * percentage_split / 100 )
    num_test_examples = num_examples - num_training_examples
    train_indices, test_indices = all_indices[:num_training_examples], all_indices[num_training_examples:]
    return train_indices, test_indices

In [14]:
stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]

In [15]:
with open("word_index.json", "r") as file:
    wordIndex = json.load(file)

In [16]:
class NaiveBayesClassifier():
    def __init__(
        self, 
        train_indices,
        test_indices, 
        train_df, 
        featureMatrix,
        labelMap
    ):
        self.train_indices = train_indices
        self.test_indices = test_indices
        
        self.num_train_examples = len(train_indices)
        self.num_test_examples = len(test_indices)
        
        self.dataframe = train_df
        self.featureMatrix = featureMatrix
        self.labelMap = labelMap
        self.trainFeatureMatrix = featureMatrix[train_indices]
        
        num_labels = len(labelMap.items())
        num_vocab_words = self.featureMatrix.shape[1]
        
        self.class_distribution = np.zeros( (num_labels) )
        self.likelihood_probabilities = np.zeros((num_labels, num_vocab_words, 2))

    def train(self):
        
        for i in train_indices:
            self.class_distribution[labelMap[self.dataframe["author"][i]]] += 1
            
        for i in tqdm(self.train_indices):
            class_label = self.labelMap[self.dataframe["author"][i]]
            text = self.dataframe["text"][i]
            words = set(re.findall("[a-z0-9]+", text.lower()))
            for word in words:
                if word in stopwords:
                    continue
                self.likelihood_probabilities[class_label,wordIndex[word]][1] += 1
        
        for i in range(self.likelihood_probabilities.shape[0]):
            for j in range(self.likelihood_probabilities.shape[1]):
                self.likelihood_probabilities[i,j,0] = self.class_distribution[i] - self.likelihood_probabilities[i,j,1]
                self.likelihood_probabilities[i,j,0] /= self.class_distribution[i]
                self.likelihood_probabilities[i,j,1] /= self.class_distribution[i]
        
#         for i in range(len(self.class_distribution)):
#             self.class_distribution[i] /= self.num_train_examples
            
#       Vectorize
        self.class_distribution /= self.num_train_examples

    
    def apply_laplace_correction(self, alpha):
        
        self.class_distribution = np.zeros(self.class_distribution.shape)
        self.likelihood_probabilities = np.zeros(self.likelihood_probabilities.shape)
        
        for i in train_indices:
            self.class_distribution[labelMap[self.dataframe["author"][i]]] += 1
            
        self.likelihood_probabilities = np.zeros((len(class_labels), self.featureMatrix.shape[1], 2))
        for i in tqdm(self.train_indices):
            class_label = self.labelMap[self.dataframe["author"][i]]
            text = self.dataframe["text"][i]
            words = set(re.findall("[a-z0-9]+", text.lower()))
            for word in words:
                if word in stopwords:
                    continue
                self.likelihood_probabilities[class_label,wordIndex[word]][1] += 1
           
        for i in range(self.likelihood_probabilities.shape[0]):
            for j in range(self.likelihood_probabilities.shape[1]):
                
                self.likelihood_probabilities[i,j,0] = self.class_distribution[i] - self.likelihood_probabilities[i,j,1]
                
                if self.likelihood_probabilities[i,j,0] == 0 or self.likelihood_probabilities[i,j,1] == 0 :
                    self.likelihood_probabilities[i,j,0] = (self.likelihood_probabilities[i,j,0] + alpha) / (3*alpha + self.class_distribution[i])
                    self.likelihood_probabilities[i,j,1] = (self.likelihood_probabilities[i,j,1] + alpha) / (3*alpha + self.class_distribution[i])
                else:
                    self.likelihood_probabilities[i,j,1] /= self.class_distribution[i]
                    self.likelihood_probabilities[i,j,0] /= self.class_distribution[i]
                 
                
#         for i in range(self.class_distribution.shape[0]):
#             self.class_distribution[i] /= self.num_train_examples
        
#       Vectorize
        self.class_distribution /= self.num_train_examples

    def evaluate_example(self, feature_vector):
        """
        Parameters
        ------------
        featureVector : (X1,X2,X3,.....Xn) [n = num_vocab_words]


        Returns
        ------------
        The predicted label of the example
        
        """
        
        feature_vector = feature_vector.toarray()[0]
        
        best_class, best_probability = None, 0
        
        for class_name, class_num in self.labelMap.items():
            
            cur_probability = self.class_distribution[class_num]
            
            
#             for j in range(feature_vector.shape[1]):
#                 cur_probability *= self.likelihood_probabilities[class_num, j,feature_vector[0][j]]
            
            feature_vector_row_indices = [i for i in range(feature_vector.shape[0])]
            vectorized = self.likelihood_probabilities[class_num, feature_vector_row_indices, feature_vector]
            
            cur_probability *= np.prod(vectorized)
    
            if best_class is None:
                best_class  = class_name
                best_probability = cur_probability
            elif cur_probability > best_probability:
                best_class  = class_name
                best_probability = cur_probability
        
        return best_class

    
    def test(self):
        """
        Predict Accuracy of All test samples
        """
        correctly_predicted = 0
        results = []
        
        for j in tqdm(range(self.num_test_examples)):
            predicted = self.evaluate_example(self.featureMatrix[self.test_indices[j]])
            actual = self.dataframe["author"][self.test_indices[j]]
            if predicted == actual:
                correctly_predicted += 1
            results.append([actual, predicted])
        
        correctly_predicted /= self.num_test_examples
        
        self.accuracy = correctly_predicted
        self.predictions = results

In [17]:
train_indices, test_indices = train_test_split_indices(num_examples, percentage_split = 70)

In [18]:
NBClassifier = NaiveBayesClassifier(
    train_indices, 
    test_indices, 
    train_df, 
    featureMatrix,
    labelMap
)

In [19]:
NBClassifier.train()

100%|██████████| 13705/13705 [00:00<00:00, 17118.57it/s]


In [20]:
NBClassifier.test()

100%|██████████| 5874/5874 [00:39<00:00, 149.69it/s]


### Accuracy, Precision, Sensitivity(Recall) , Specificity,  F-Score

In [33]:
from statistics import mean
import math

def get_confidence_interval(score, number_of_samples):
    """
    Parameter
    ------------
    score: score of the metric we find to find out confidence interval of
    number_of_samples: Number of samples
    
    Returns 
    ------------
    Confidence interval
    """
    CI_CONST = 1.96
    confidence_interval_upper = score + CI_CONST * math.sqrt((score*(1-score))/number_of_samples)
    confidence_interval_lower = score - CI_CONST * math.sqrt((score*(1-score))/number_of_samples)
    return [confidence_interval_lower, confidence_interval_upper]

def generate_statistics(predictions, class_labels, labelMap):
    """
    Parameter
    ------------
    predictions : A list of list of the format [actual,predicted]
    class_labels : List of possible outcomes
    labelMap : Mapping of Outcome to integer
    
    Returns 
    ------------
    precision, f-score, sensitivity, specificity
    """
    
    num_classes = len(class_labels)
    
    true_positive, true_negative, false_positive, false_negative = [0] * num_classes,  [0] * num_classes,  [0] * num_classes,  [0] * num_classes
#     print(true_positive, true_negative, false_positive, false_negative)
    
    for [actual_label, predicted_label] in predictions:
        
        actual_label_id = labelMap[actual_label]
        predicted_label_id = labelMap[predicted_label]
        
        if actual_label == predicted_label:
            true_positive[actual_label_id] += 1
            
            for label_id in range(num_classes):
                if actual_label_id != label_id:
                    true_negative[label_id] += 1
        else: 
            false_positive[predicted_label_id] += 1
            false_negative[actual_label_id] += 1
            
    
    micro_precision = sum(true_positive) / ( sum(true_positive) + sum(false_positive) )
    micro_sensitivity = sum(true_positive) / ( sum(true_positive) + sum(false_negative) )
    micro_specificity = sum(true_negative) / ( sum(true_negative) + sum(false_positive) )
    micro_f_score = (2 * micro_precision * micro_sensitivity) / (micro_precision + micro_sensitivity)
    
    
    classwise_precision = [ true_positive[i] / (true_positive[i] + false_positive[i])  for i in range(num_classes)] 
    classwise_sensitivity = [ true_positive[i] / (true_positive[i] + false_negative[i])  for i in range(num_classes)]
    classwise_specificity = [ true_negative[i] / (true_negative[i] + false_positive[i])  for i in range(num_classes)]
    
    macro_precision = mean(classwise_precision)
    macro_sensitivity = mean(classwise_sensitivity)
    macro_specificity = mean(classwise_specificity)
    macro_f_score = mean([(2*classwise_precision[i] * classwise_sensitivity[i]) / (classwise_precision[i] + classwise_sensitivity[i]) for i in range(num_classes)])
    
    number_of_samples = len(predictions)
    macro_precision_ci = get_confidence_interval(macro_precision, number_of_samples)
    macro_sensitivity_ci = get_confidence_interval(macro_sensitivity, number_of_samples)
    macro_specificity_ci = get_confidence_interval(macro_specificity, number_of_samples)
    macro_f_score_ci = get_confidence_interval(macro_f_score, number_of_samples)
#     print(true_positive, true_negative, false_positive, false_negative)
#     print(number_of_samples)
    
    print("MICRO STATS")
    print(f"Micro Precision = {micro_precision}")
    print(f"Micro Sensitivity(Recall) = {micro_sensitivity}")    
    print(f"Micro Specificity = {micro_specificity}")    
    print(f"Micro F-Score = {micro_f_score}")    
    
    
    print("\n****************\n")
    print("MACRO STATS")
    print(f"Macro Precision = {macro_precision}, {macro_precision_ci}")
    print(f"Macro Sensitivity(Recall) = {macro_sensitivity}, {macro_sensitivity_ci}")    
    print(f"Macro Specificity = {macro_specificity}, {macro_specificity_ci}")    
    print(f"Macro F-Score = {macro_f_score}, {macro_f_score_ci}")    
    

In [34]:
NBClassifier.accuracy

0.5885257065032345

In [35]:
predictions = NBClassifier.predictions
generate_statistics(predictions, class_labels, labelMap)

MICRO STATS
Micro Precision = 0.5885257065032345
Micro Sensitivity(Recall) = 0.5885257065032345
Micro Specificity = 0.7409709570249705
Micro F-Score = 0.5885257065032345

****************

MACRO STATS
Macro Precision = 0.70153440127993, [0.6898323842128004, 0.7132364183470598]
Macro Sensitivity(Recall) = 0.5455080416028929, [0.5327743953575621, 0.5582416878482237]
Macro Specificity = 0.7572522567773409, [0.7462878033057502, 0.7682167102489316]
Macro F-Score = 0.5488136239457543, [0.5360879869433933, 0.5615392609481152]


### Observation
- For Micro Case, Precision, Accuracy, Fscore,Recall all should be same and we got it as well 

In [None]:
# for alpha in [0.01,0.1,1,10,100]:
#     NBClassifier.apply_laplace_correction(alpha)
#     print(f"{NBClassifier.test()} <== {alpha}")

# alpha  |   %accuracy
# 0.01 ------ 83
# 0.1 ------- 83
# 1 --------- 77-79
# 10 -------- 41
# 100 ------- 39

In [36]:
NBClassifier.apply_laplace_correction(alpha = 1)

100%|██████████| 13705/13705 [00:00<00:00, 18532.94it/s]


In [37]:
NBClassifier.test()

100%|██████████| 5874/5874 [00:41<00:00, 142.50it/s]


In [38]:
NBClassifier.accuracy

0.783792986040177

In [39]:
predictions = NBClassifier.predictions
generate_statistics(predictions, class_labels, labelMap)

MICRO STATS
Micro Precision = 0.783792986040177
Micro Sensitivity(Recall) = 0.783792986040177
Micro Specificity = 0.8787936629127696
Micro F-Score = 0.783792986040177

****************

MACRO STATS
Macro Precision = 0.8310914420531165, [0.8215098134613498, 0.8406730706448832]
Macro Sensitivity(Recall) = 0.7633631898637615, [0.7524940329518034, 0.7742323467757196]
Macro Specificity = 0.8751469128281495, [0.8666935555336789, 0.8836002701226201]
Macro F-Score = 0.7798771438366304, [0.7692813190989893, 0.7904729685742715]


In [None]:
# # Cheating for future reference
# with open("test_indices.json","w") as f:
#     f.write(json.dumps(test_indices, indent = 4))
    
# with open("train_indices.json","w") as f:
#     f.write(json.dumps(train_indices, indent = 4))

In [None]:
# with open("test_indices.json", "r") as file:
#     tind = json.load(file)
# tind