In [115]:
#!/usr/bin/env python
# coding: utf-8
#===============================================================================
#
#           FILE: negativeNB_4_ntb.py 
#         AUTHOR: Bianca Ciobanica
#	       EMAIL: bianca.ciobanica@student.uclouvain.be
#
#           BUGS: 
#        VERSION: 3.10.6
#        CREATED: 25-10-2023 
#
#===============================================================================
#    DESCRIPTION:
#
#          USAGE: 
#===============================================================================

In [116]:
import pandas as pd
from nltk.tokenize import WordPunctTokenizer
from nltk.util import ngrams
from collections import Counter
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm import Vocabulary
from math import log

In [117]:
def remove_html_tags(text):
  import re
  clean = re.compile('<.*?>')
  return re.sub(clean, '', text)

In [118]:
def create_vocabulary(df):
        tokenized_corpus = [token 
                            for row in df
                            for token in row] # flattened corpus
      
        return Vocabulary(tokenized_corpus, unk_cutoff=unk_threshold)

In [119]:
def add_negation(tokens):
    """
        Theta(n) for applying _NOT to tokens between neg tokens and punctuation (or not)
        the idea is to keep pointers of whether to apply the _NOT or no
    """
    neg_tokens = ['no','not','never']
    punctuation = ['.', ',', ':', '?', '!']

    i = 0
    apply_negation = False
    
    while i < len(tokens):
        if tokens[i] in neg_tokens:
            apply_negation = True
            
        elif tokens[i] in punctuation and apply_negation:
            # stops adding _NOT because we reached punctuation
            apply_negation = False
            
        elif apply_negation:
            tokens[i] += '_NOT'
        i += 1

    return tokens

In [120]:
def test_first_ten_rows():
    print(add_negation(['I', 'did', 'not', 'like', 'this', 'movie','.', 'But', 'I', 'still', 'watched', 'it','.']))
    print(add_negation(['I', 'did', 'not', 'want', 'to', 'get', 'off', 'my', 'bed','not','to','sound','too','depressed', 'but', 'I', 'had', 'no', 'choice']))
    for i in range (10):
        row = df_corpus['Body_tokenized'][i]
       # print(add_negation(row))
#test_first_ten_rows()

In [121]:
# initialize training corpus
df_corpus = pd.read_csv("data/train.csv")
unk_threshold = 3

# preprocess 
df_corpus['Body'] = df_corpus['Body'].apply(lambda x: remove_html_tags(x))
df_corpus['Body_tokenized'] = df_corpus['Body'].apply(lambda x: WordPunctTokenizer().tokenize(x))

# replace any token t 
#between a negative token (['not', 'no', 'never']) 
# and a punctuation sign (['.', ',', ':', '?', '!']) by the token t_NOT.

df_corpus['Body_tokenized_Negation'] = df_corpus['Body_tokenized'].apply(lambda x: add_negation(x))

In [122]:
training_neg_voc = create_vocabulary(df_corpus['Body_tokenized_Negation'])
#print(training_neg_voc['like_NOT'])

In [123]:
print(df_corpus.keys())
print(df_corpus.shape)

Index(['Body', 'Y', 'Body_tokenized', 'Body_tokenized_Negation'], dtype='object')
(14000, 4)


In [124]:
classes_counts = len(df_corpus['Y'])
classes = df_corpus['Y'].unique().tolist()
#print(classes)

In [125]:
class NaiveBayesClassifier:
    # This code is inspired by code generated with the GPT-3 model developed by OpenAI.
    # I initally created a NB model which i would then initialize per class
    # But then I realized i should create a model which iterates through each class as shown in SLP
    
    def __init__(self, docs, classes, alpha=0, voc = None):
        self.bigdoc = docs
        self.classes = classes
        self.alpha = alpha
        self.bigdoc_v = voc
        self.bigdoc_v_size = sum(self.bigdoc_v[token] for token in self.bigdoc_v if token != '<UNK>')
        
        self.bow = {}
        self.logprior = {}
        self.logll = {}
        self.probability = {}
       
    def train(self):
        if self.bigdoc_v is None or len(self.bigdoc_v) == 0:
            raise ValueError("Cannot train on an empty vocabulary.")
        
        total_doc_counts = len(self.bigdoc)
        # go through each class
        for label in self.classes:
            # get features from class
            class_docs = self.bigdoc[self.bigdoc['Y'] == label]
            class_counts = len(class_docs)
            
            # get P(c)
            self.logprior[label] = log(class_counts / total_doc_counts)
            
            # generate bag of words
            self.bow[label] = self.extract_features(class_docs)
            logll = 0
            
            # calculate log likelihood
            class_bow = self.bow[label]
            total_class_tokens = sum(class_bow[token] for token in class_bow)
            self.logll[label] = {}
            self.probability[label] = {}
            
            for token in self.bigdoc_v:
                prob_word = class_bow.get(token,0)
                self.logll[label][token] = log((prob_word + self.alpha) / (total_class_tokens + self.bigdoc_v_size * self.alpha))
                if token in class_bow:
                    self.probability[label][token] = class_bow[token] / total_class_tokens


    def check_consistency(self):
        probs = {}
        for label in self.classes:
            p = 0
            for token in self.probability[label]:
                p += self.probability[label][token]
            probs[label] = p
            
        #print(probs)
        
        is_consistent = False
        p_class1, p_class2 = probs.values()
        if round(p_class1) == 1 and round (p_class2) == 1:
            is_consistent = True
        return "Is consistent : " + str(is_consistent)
        
    def test_model(self, test_doc):
        sums = {} # {"class" : logprior}
        #C_NB = argmax (logprior + sum logll)
        
        for label in self.classes:
            sums[label] = self.logprior[label]
            # go through each word
            for token in test_doc:
                if token not in self.bigdoc_v:
                    continue
                sums[label] = sums[label] + self.logll[label][token]
        
        # get class with highest score
        argmax = max(sums, key=sums.get)
        return argmax

    def test_accuracy(self, data=None, data_type=None):
        if data is None or data.empty:
            raise ValueError("Cannot train on an empty corpus.")
            
        correct_predictions = 0;
        total_predictions = 0;
    
        for index, row in data.iterrows(): # used GPT to help me find how to iter on rows in my df
            true_label = row['Y']
            predicted_label = self.test_model(row[data_type])
           # print(true_label, "< true =?= predicted >", predicted_label)
    
            total_predictions += 1
            if predicted_label == true_label:
                correct_predictions += 1
    
        accuracy = correct_predictions / total_predictions
        return round(accuracy * 100, 3)
            
    def __repr__(self):
        s = ""
        s += "total docs    = " + str(len(self.bigdoc)) + "\n"
        s += "labels        = " + str(self.classes)+ "\n"
        s += "Voc        = " + str(self.bigdoc_v)+ "\n"
        s+= "_____________________\n"

        for label in self.classes:
            s += str(label) + "\n"
            s += "Voc size    = " + str(sum(self.bow[label][token] for token in self.bow[label])) + "\n"
            s += "P(c)          = " + str(self.logprior[label]) + "\n"
           # s += "Logll         = " + str(self.logll[label]) + "\n"
            s+= "_____________________\n"
        return s 
        

In [126]:
class NegationNB(NaiveBayesClassifier):
    def __init__(self, docs, classes, alpha = 0, voc = None):
        
        super().__init__(docs,classes, alpha = alpha, voc = voc)

    def extract_features (self, docs):
        """creates unigram counts per class
        """
        flattened_docs = [token 
                          for row in docs['Body_tokenized_Negation']
                          for token in row if token in self.bigdoc_v]
        
        return Counter(flattened_docs)

In [127]:
# initialize test corpus and prepocess it
test_corpus = pd.read_csv("data/test.csv")
# preprocess 
test_corpus['Body'] = test_corpus['Body'].apply(lambda x: remove_html_tags(x))
test_corpus['Body_tokenized'] = test_corpus['Body'].apply(lambda x: WordPunctTokenizer().tokenize(x))
test_corpus['Body_tokenized_Negation'] = test_corpus['Body_tokenized'].apply(lambda x: add_negation(x))

print(test_corpus.keys())
print(test_corpus.shape)

Index(['Body', 'Y', 'Body_tokenized', 'Body_tokenized_Negation'], dtype='object')
(3500, 4)


In [128]:
# create binary class conditional model
negativeMNB_Model = NegationNB(df_corpus, classes, alpha=1, voc = training_neg_voc) # smoothing according to optimal alpha
negativeMNB_Model.train()
print(negativeMNB_Model.check_consistency(), '\n')

print(negativeMNB_Model)

Is consistent : True 

total docs    = 14000
labels        = ['HQ', 'LQ']
Voc        = <Vocabulary with cutoff=3 unk_label='<UNK>' and 37664 items>
_____________________
HQ
Voc size    = 1340383
P(c)          = -0.6931471805599453
_____________________
LQ
Voc size    = 1221078
P(c)          = -0.6931471805599453
_____________________



In [129]:
negativeMNB_accuracy =  negativeMNB_Model.test_accuracy(test_corpus, data_type='Body_tokenized_Negation')
print("Accuracy prediction for MNB with logical negation on test data :\n", negativeMNB_accuracy )

Accuracy prediction for MNB with logical negation on test data :
 83.114
