In [1]:
#!/usr/bin/env python
# coding: utf-8
#===============================================================================
#
#           FILE: unigramMNB_3_ntb.py
#         AUTHOR: Bianca Ciobanica
#	       EMAIL: bianca.ciobanica@student.uclouvain.be
#
#           BUGS: 
#        VERSION: 3.10.6
#        CREATED: 25-10-2023 
#
#===============================================================================
#    DESCRIPTION: sources used for this code : 
#               https://medium.com/@johnm.kovachi/implementing-a-multinomial-naive-bayes-classifier-from-scratch-with-python-e70de6a3b92e
#               https://pandas.pydata.org/docs/getting_started/intro_tutorials/03_subset_data.html
#               https://datasciencedojo.com/blog/naive-bayes-from-scratch-part-1/#
#               http://web.stanford.edu/~jurafsky/slp3/4.pdf
# 
#    
#          USAGE: 
#===============================================================================

In [2]:
import pandas as pd
import numpy as np
from nltk.tokenize import WordPunctTokenizer
from nltk.util import ngrams
from collections import Counter
from nltk.lm import Vocabulary
from math import log

In [3]:
def remove_html_tags(text):
  import re
  clean = re.compile('<.*?>')
  return re.sub(clean, '', text)

In [24]:
def create_vocabulary(df, threshold):
        tokenized_corpus = [token 
                            for row in df
                            for token in row] # flattened corpus

        word_counts = Counter(tokenized_corpus)
    
        restricted_voc = ['<UNK>' if word_counts[word] < threshold else word for word in tokenized_corpus]
    
        return Counter(restricted_voc)

In [5]:
# initialize training corpus
df_corpus = pd.read_csv("data/train.csv")
unk_threshold = 3
# preprocess 
df_corpus['Body'] = df_corpus['Body'].apply(lambda x: remove_html_tags(x))
df_corpus['Body_tokenized'] = df_corpus['Body'].apply(lambda x: WordPunctTokenizer().tokenize(x))


In [6]:
print(df_corpus.keys())
print(df_corpus.shape)

Index(['Body', 'Y', 'Body_tokenized'], dtype='object')
(14000, 3)


In [7]:
classes_counts = len(df_corpus['Y'])
classes = df_corpus['Y'].unique().tolist()
print(classes)

['HQ', 'LQ']


In [8]:
#print(len(tokenized_corpus))
#print("bigdoc voc size: ", bigdoc_vocab_size)

In [9]:
# P(class | doc) = (P(doc | class) * P(class)) / P(doc)
# p(  x   |  y ) = (P( y  |  x   ) * P( x ) ) / P( y )

In [51]:
class NaiveBayesClassifier:
    # This code is inspired by code generated with the GPT-3 model developed by OpenAI.
    # I initally created a NB model which i would then initialize per class
    # But then I realized i should create a model which iterates through each class as shown in SLP
    
    def __init__(self, docs, classes, alpha=None, voc = None):
        self.bigdoc = docs
        self.classes = classes
        self.alpha = alpha
        self.bigdoc_v = voc # is a Counter obj
        self.bigdoc_v_size = self.bigdoc_v.total()
        
        self.bow = {}
        self.logprior = {}
        self.logll = {}
        self.probability = {}
       
    def train(self):
        if self.bigdoc_v is None or len(self.bigdoc_v) == 0:
            raise ValueError("Cannot train on an empty vocabulary.")
        
        total_doc_counts = len(self.bigdoc)
        # go through each class
        for label in self.classes:
            print(self.alpha)
            # get features from class
            class_docs = self.bigdoc[self.bigdoc['Y'] == label]
            class_counts = len(class_docs)
            
            # get P(c)
            self.logprior[label] = log(class_counts / total_doc_counts)
            
            # generate bag of words with freq
            self.bow[label] = self.extract_features(class_docs)
            
            class_bow = self.bow[label] 
            total_class_tokens = class_bow.total()
            self.logll[label] = {}
            self.probability[label] = {}
            
             # calculate log likelihood
            for token in self.bigdoc_v: # store logll for each token in each class
                count_w = class_bow.get(token,0.0)
               # print(label, token, prob_word)
                self.logll[label][token] = log((count_w + self.alpha) / (total_class_tokens + self.bigdoc_v_size * self.alpha))
                if token in class_bow:
                    self.probability[label][token] = class_bow[token] / total_class_tokens

    def check_consistency(self):
        probs = {}
        for label in self.classes:
            p = 0
            for token in self.probability[label]:
                p += self.probability[label][token]
            probs[label] = p
            
        #print(probs)
        
        is_consistent = False
        p_class1, p_class2 = probs.values()
        if round(p_class1) == 1 and round (p_class2) == 1:
            is_consistent = True
        return "Is consistent : " + str(is_consistent)
        
        
    def test_model(self, test_doc):
        sums = {} # {"class" : logprior}
        #C_NB = argmax (logprior + sum logll)
        
        for label in self.classes:
            sums[label] = self.logprior[label]
            # go through each word
            for token in test_doc:
                if token in self.bigdoc_v:
                    sums[label] += self.logll[label][token]
        
        # get class with highest score
        argmax = max(sums, key=sums.get)
        return argmax

    def test_accuracy(self, data=None, data_type=None):
        if data is None or data.empty:
            raise ValueError("Cannot train on an empty corpus.")
            
        correct_predictions = 0;
        total_predictions = 0;
    
        for index, row in data.iterrows(): # used GPT to help me find how to iter on rows in my df
            true_label = row['Y']
            predicted_label = self.test_model(row[data_type])
           # print(true_label, "< true =?= predicted >", predicted_label)
    
            total_predictions += 1
            if predicted_label == true_label:
                correct_predictions += 1
    
        accuracy = correct_predictions / total_predictions
        return round(accuracy * 100, 3)
            
    def __repr__(self):
        s = ""
        s += "total docs    = " + str(len(self.bigdoc)) + "\n"
        s += "labels        = " + str(self.classes)+ "\n"
        s += "Voc          = " + str(self.bigdoc_v_size)+ "\n"
        s += "Alpha        = " + str(self.alpha)+ "\n"
        s+= "_____________________\n"

        for label in self.classes:
            s += str(label) + "\n"
            s += "Voc size    = " + str(sum(self.bow[label][token] for token in self.bow[label])) + "\n"
            s += "P(c)          = " + str(self.logprior[label]) + "\n"
            s+= "_____________________\n"
        return s 

In [44]:
class UnigramMultinomialNB(NaiveBayesClassifier):
    def __init__(self, docs, classes, alpha = None, voc = None):
        
        super().__init__(docs,classes, alpha = alpha, voc = voc)

    def extract_features (self, docs):
        """creates unigram counts per class
        """
        flattened_docs = [token 
                          for row in docs['Body_tokenized']
                          for token in row]
        
        return Counter(flattened_docs)
    

In [47]:
# initialize test corpus and prepocess it
test_corpus = pd.read_csv("data/test.csv")
# preprocess 
test_corpus['Body'] = test_corpus['Body'].apply(lambda x: remove_html_tags(x))
test_corpus['Body_tokenized'] = test_corpus['Body'].apply(lambda x: WordPunctTokenizer().tokenize(x))

In [48]:
bigdoc_voc = create_vocabulary(df_corpus['Body_tokenized'], unk_threshold) # restricted voc

In [49]:
# create multinomial class conditional unigram model

unigramMNB_model = UnigramMultinomialNB(df_corpus, classes, alpha=1, voc = bigdoc_voc) # Laplace smoothing
unigramMNB_model.train()

print(unigramMNB_model.check_consistency(), '\n')

print(unigramMNB_model)

1
1
Is consistent : True 

total docs    = 14000
labels        = ['HQ', 'LQ']
Voc          = 2653867
Alpha        = 1
_____________________
HQ
Voc size    = 1382815
P(c)          = -0.6931471805599453
_____________________
LQ
Voc size    = 1271052
P(c)          = -0.6931471805599453
_____________________



In [50]:
unigramMNB_accuracy =  unigramMNB_model.test_accuracy(test_corpus, data_type='Body_tokenized')
print("Accuracy prediction for unigram MNB on test data :\n", unigramMNB_accuracy )

  true_label = row[index]['Y']


TypeError: string indices must be integers, not 'str'