In [1]:
#!/usr/bin/env python
# coding: utf-8
#===============================================================================
#
#           FILE: binaryCounts_4_ntb.py
#         AUTHOR: Bianca Ciobanica
#	       EMAIL: bianca.ciobanica@student.uclouvain.be
#
#           BUGS: 
#        VERSION: 3.10.6
#        CREATED: 26-10-2023 
#
#===============================================================================
#    DESCRIPTION: used this ressource to learn how to remove duplicates
#                https://www.w3schools.com/python/python_howto_remove_duplicates.asp
# 
#    
#          USAGE: 
#===============================================================================

In [2]:
import pandas as pd
from nltk.tokenize import WordPunctTokenizer
from nltk.lm import Vocabulary
from math import log
from collections import Counter

In [3]:
def remove_html_tags(text):
  import re
  clean = re.compile('<.*?>')
  return re.sub(clean, '', text)

In [4]:
def create_vocabulary(df):
        tokenized_corpus = [token 
                            for row in df
                            for token in row] # flattened corpus
      
        return tokenized_corpus, Vocabulary(tokenized_corpus, unk_cutoff=unk_threshold)

In [5]:
def test_remove_duplicates():
    """
    for testing list(set(row))
    """
    t = [['banane','banane','pomme','poire','poire','citron','citron'],['kiwi','banane','poire','poire','citron'],['citron','mangue']]
    expected_counts = {
        'banane': 2,
        'pomme': 1,
        'poire': 2,
        'kiwi': 1,
        'citron': 3,
        'mangue': 1
    }
    test = [token for row in t for token in list(set(row))] # get only unique tokens
    test_voc = Vocabulary(test, unk_cutoff=1)
    print(test)
    # used chatGPT for formatted prints
    for token, expected_count in expected_counts.items():
        if test_voc[token] == expected_count:
            print(f"Test passed! Counted {expected_count} for {token}")
        else:
            print(f"Test failed! Expected {expected_count} for {token}, but got {test_voc[token]}")

    
#test_remove_duplicates()

In [6]:
# initialize training corpus
df_corpus = pd.read_csv("data/train.csv")
unk_threshold = 3
# preprocess 
df_corpus['Body'] = df_corpus['Body'].apply(lambda x: remove_html_tags(x))
df_corpus['Body_tokenized'] = df_corpus['Body'].apply(lambda x: WordPunctTokenizer().tokenize(x))
# remove duplicates
df_corpus['Body_tokenized_unique'] = df_corpus['Body_tokenized'].apply(lambda x: list(set(x)))

In [7]:
corpus_unique, bigdoc_voc = create_vocabulary(df_corpus['Body_tokenized_unique'])

In [8]:
print(df_corpus.keys())
print(df_corpus.shape)

Index(['Body', 'Y', 'Body_tokenized', 'Body_tokenized_unique'], dtype='object')
(14000, 4)


In [9]:
classes_counts = len(df_corpus['Y'])
classes = df_corpus['Y'].unique().tolist()

In [10]:
classes = df_corpus['Y'].unique().tolist()
#print(classes)

In [11]:
class NaiveBayesClassifier:
    # This code is inspired by code generated with the GPT-3 model developed by OpenAI.
    # I initally created a NB model which i would then initialize per class
    # But then I realized i should create a model which iterates through each class as shown in SLP
    
    def __init__(self, docs, classes, alpha=0, voc = None):
        self.bigdoc = docs
        self.classes = classes
        self.alpha = alpha
        self.bigdoc_v = voc
        self.bigdoc_v_size = self.bigdoc_v.total()
        
        self.bow = {}
        self.logprior = {}
        self.logll = {}
        self.probability = {}
       
    def train(self):
        if self.bigdoc_v is None or len(self.bigdoc_v) == 0:
            raise ValueError("Cannot train on an empty vocabulary.")
        
        total_doc_counts = len(self.bigdoc)
        #print("total_doc_counts",total_doc_counts)
        
        # go through each class
        for label in self.classes:
            # get features from class
            class_docs = self.bigdoc[self.bigdoc['Y'] == label] # select documents per class
            class_counts = len(class_docs)
            
            # get P(c)
            self.logprior[label] = log(class_counts / total_doc_counts)
            
            # generate bag of words
            self.bow[label] = self.extract_features(class_docs)
            logll = 0
            
            # calculate log likelihood
            class_bow = self.bow[label]
            total_class_tokens = class_bow.total()
            self.logll[label] = {}
            self.probability[label] = {}
            
            for token in self.bigdoc_v:
                prob_word = class_bow.get(token,0.0)
                self.logll[label][token] = log((prob_word + self.alpha) / (total_class_tokens + self.bigdoc_v_size * self.alpha))
                if token in class_bow:
                    self.probability[label][token] = class_bow[token] / total_class_tokens
                
    def check_consistency(self):
        probs = {}
        for label in self.classes:
            p = 0
            for token in self.probability[label]:
                p += self.probability[label][token]
            probs[label] = p
            
        #print(probs)
        
        is_consistent = False
        p_class1, p_class2 = probs.values()
        if round(p_class1) == 1 and round (p_class2) == 1:
            is_consistent = True
        return "Is consistent : " + str(is_consistent)
        
    def test_model(self, test_doc):
        sums = {} # {"class" : logprior}
        #C_NB = argmax (logprior + sum logll)
        
        for label in self.classes:
            sums[label] = self.logprior[label]
            # go through each word
            for token in test_doc:
                if token not in self.bigdoc_v:
                    continue
                sums[label] = sums[label] + self.logll[label][token]
        
        # get class with highest score
        argmax = max(sums, key=sums.get)
        return argmax

    def test_accuracy(self, data=None):
        if data is None or data.empty:
            raise ValueError("Cannot train on an empty corpus.")
            
        correct_predictions = 0;
        total_predictions = 0;
    
        for index, row in data.iterrows(): # used GPT to help me find how to iter on rows in my df
            true_label = row['Y']
            predicted_label = self.test_model(row['Body_tokenized'])
           # print(true_label, "< true =?= predicted >", predicted_label)
    
            total_predictions += 1
            if predicted_label == true_label:
                correct_predictions += 1
    
        accuracy = correct_predictions / total_predictions
        return round(accuracy * 100, 3)
            
    def __repr__(self):
        s = ""
        s += "total docs    = " + str(len(self.bigdoc)) + "\n"
        s += "labels        = " + str(self.classes)+ "\n"
        s += "Voc        = " + str(self.bigdoc_v_size)+ "\n"
        s+= "_____________________\n"

        for label in self.classes:
            s += str(label) + "\n"
            s += "Voc size    = " + str(sum(self.bow[label][token] for token in self.bow[label])) + "\n"
            s += "P(c)          = " + str(self.logprior[label]) + "\n"
           # s += "Logll         = " + str(self.logll[label]) + "\n"
            s+= "_____________________\n"
        return s 

In [12]:
class BinaryNB(NaiveBayesClassifier):
    def __init__(self, docs, classes, alpha = 0, voc = None):
        
        super().__init__(docs,classes, alpha = alpha, voc = voc)

    def extract_features (self, docs):
        """creates binary counts per class
        """
        flattened_docs = [token 
                          for row in docs['Body_tokenized_unique']
                          for token in row] # ignore oov tokens
        
        return Counter(flattened_docs)

In [13]:
test_corpus = pd.read_csv("data/test.csv")
# preprocess 
test_corpus['Body'] = test_corpus['Body'].apply(lambda x: remove_html_tags(x))
test_corpus['Body_tokenized'] = test_corpus['Body'].apply(lambda x: WordPunctTokenizer().tokenize(x))

print(test_corpus.keys())
print(test_corpus.shape)

Index(['Body', 'Y', 'Body_tokenized'], dtype='object')
(3500, 3)


In [15]:
# create binary class conditional model

tokens_no_dupl_counter = Counter(corpus_unique)

binaryNB_Model = BinaryNB(df_corpus, classes, alpha= 1, voc = tokens_no_dupl_counter) # Laplace smoothing
binaryNB_Model.train()
print(binaryNB_Model.check_consistency(), '\n')
print(binaryNB_Model)

Is consistent : True 

total docs    = 14000
labels        = ['HQ', 'LQ']
Voc        = 1172490
_____________________
HQ
Voc size    = 625117
P(c)          = -0.6931471805599453
_____________________
LQ
Voc size    = 547373
P(c)          = -0.6931471805599453
_____________________



In [16]:
binaryNB_accuracy = binaryNB_Model.test_accuracy(test_corpus)
print("Accuracy prediction for binary NB on test data :\n", binaryNB_accuracy)

Accuracy prediction for binary NB on test data :
 84.486
