In [295]:
"""
NaiveBayes is a generative classifier based on the Naive assumption that features are independent from each other
P(w1, w2, ..., wn|y) = P(w1|y) P(w2|y) ... P(wn|y)
Thus argmax_{y} (P(y|w1,w2, ... wn)) can be modeled as argmax_{y} P(w1|y) P(w2|y) ... P(wn|y) P(y) using Bayes Rule
and P(w1, w2, ... ,wn) is constant with respect to argmax_{y} 
Please refer to lecture notes Chapter 4 for more details
"""
from work.Features import Features_NB
from work.Model import *
from collections import Counter, defaultdict
import numpy as np
import math
class NaiveBayes(Model):

    def __init__(self, model_file, threshold):
        super(NaiveBayes, self).__init__(model_file)
        self.count_word_label = None
        self.count_words_per_label = None
        self.count_label = None
        self.threshold = threshold

    def __count_frequency_word_label(self, sentences, labels):
        
        """
        :param sentences (list[list]): sentences tokenized
        :param labels (list): list of labels
        :return: count(c_j, w_i) refers to the count of word w_i in documents with label c_j
                _sum_{i=1}^{V}{count(c_j, w_i)} sum of the counts of each word in our vocabulary in class c_j 
                 count(c_j) refers to the count of label c_j 
        """
        count_word_label = []
        count_words_per_label = defaultdict(int)
        for sentence, label in zip(sentences, labels):
            for token in sentence:
                count_word_label.append((token, label))
                count_words_per_label[label] += 1
            
        # count_word_label = [(token, label) for sentence, label in zip(sentences, labels) for token in sentence]
        count_label = Counter(labels)
        return Counter(count_word_label), count_words_per_label, count_label
    
    def __compute_feature_weights(self, count_word_label, count_words_per_label, count_label, size_vocabulary, alpha=1):
        """
        :param alpha (int): Hyperparemeter alpha for Laplace Smoothing
        """
        feature_weights = defaultdict(dict)
        for word, label in count_word_label.keys():
            # Maximum Likelihood Estimates
            tmp = math.log((count_word_label[(word, label)] + alpha)/(size_vocabulary*alpha + count_words_per_label[label]))
            feature_weights[label][word] = tmp

        # Include Probability of each label: 
        total_documents = sum(count_label.values())
        for label in count_label.keys():
            probability_label_name = "prob_mu"
            feature_weights[label][probability_label_name] = math.log(count_label[label]/total_documents)
        return feature_weights

    
    def train(self, input_file):
        """
        This method is used to train your models and generated for a given input_file a trained model
        :param input_file: path to training file with a text and a label per each line
        :return: model: trained model 
        """

        # Instanciate Features_NB class:
        #   - Create Vocabulary
        features_naive_bayes = Features_NB(input_file, self.threshold)

        # Replace words that are not in vocabulary with OOV (Out-of-Vocabulary)
        # token
        updated_text = []
        labels = features_naive_bayes.labels
        for sentence in features_naive_bayes.tokenized_text:
            tmp = features_naive_bayes.replace_unknown_word_with_oov(sentence)
            updated_text.append(tmp)            

        # Compute Feature Weights
        count_word_label, count_words_per_label, count_label = self.__count_frequency_word_label(updated_text, labels)
        self.count_word_label = count_word_label
        self.count_words_per_label = count_words_per_label
        self.count_label = count_label
        feature_weights = self.__compute_feature_weights(count_word_label, count_words_per_label, count_label, len(features_naive_bayes.vocabulary))

        # Build Model
        nb_model = {
            "feature_weights": feature_weights,
            "Feature": features_naive_bayes
        }
        
        self.save_model(nb_model)
    
        

    def classify(self, input_file, model):
        """
        This method will be called by us for the validation stage and or you can call it for evaluating your code 
        on your own splits on top of the training sets seen to you
        :param input_file: path to input file with a text per line without labels
        :param model: the pretrained model
        :return: predictions list
        """ 
        feature_weights = model["feature_weights"]
        Feature_NB_class = model["Feature"]

        # Read Input File
        tokenized_text = Feature_NB_class.read_inference_file(input_file)

        preds = []

        # Choosing the label y which maximizes log p(x, y; μ, φ):
        for sentence in tokenized_text:
            sentence_features = Feature_NB_class.get_features(sentence, model)
            # print("Sentence Characters: ", len(sentence_features.keys()))
            class_predictions = defaultdict()

            for label in Feature_NB_class.labelset:
                # print(label)
                feature_weights_y = feature_weights[label]
                # Compute Inner Product: feature_weights*feature_vector
                # print(len(sentence_features))
                total_sum = 0 
                # print("Size match: @@@@@ ", len(sentence_features.keys() & feature_weights_y.keys()))
                for key in sentence_features.keys():
                    if key in feature_weights_y.keys():
                        # print(key)
                        total_sum += sentence_features[key] * feature_weights_y[key]
                # result = sum(sentence_features[key] * feature_weights_y[key] for key in sentence_features.keys() & feature_weights_y.keys())
                # print(result)

                class_predictions[label] = total_sum
            # Find the class with the highest value
            class_with_highest_value = max(class_predictions, key=lambda k: class_predictions[k])
            preds.append(class_with_highest_value)
        
        return preds

In [279]:
""" 
    Basic feature extractor
"""
from operator import methodcaller
import string 
import re

# def tokenize(text):
#     # TODO customize to your needs
#     text = text.translate(str.maketrans({key: " {0} ".format(key) for key in string.punctuation}))
#     # re.sub('[^a-zA-Z]', '', dataset['Text'][i])
#     return re.sub('[^a-zA-Z]', ' ', text.lower()).split()

def expand_contradictions(text):

    contraction_mapping = {
        "won't": "will not",
        "can't": "can not",
        "n't": " not",
        "'re": " are",
        "'s": " is",
        "'d": " would",
        "'ll": " will",
        "'ve": " have",
        "'m": " am"
    }

    pattern = re.compile(r"\b(?:" + "|".join(re.escape(contraction) for contraction in contraction_mapping.keys()) + r")\b")
    text = pattern.sub(lambda x: contraction_mapping[x.group()], text)
    
    return text

def remove_digits_and_words_digits(text):
    # Define a regular expression pattern to match words containing digits
    pattern = r'\b\w*\d\w*\b'
    text_without_words_with_digits = re.sub(pattern, '', text)

    return text_without_words_with_digits

def tokenize(text):
    # TODO customize to your needs
    text = text.translate(str.maketrans({key: " {0} ".format(key) for key in string.punctuation}))
    # re.sub('[^a-zA-Z]', '', dataset['Text'][i])

    # Text preprocessing techniques:
    # 1) Lowercase
    text = text.lower()

    # 2) Expand Contradictions
    text = expand_contradictions(text)

    # 3) Remove punctuations
    text = re.sub('[%s]' % re.escape(string.punctuation), '' , text)

    # 4) Remove digits and words with digits
    text = remove_digits_and_words_digits(text)

    return text.split()

class Features:

    def __init__(self, data_file):
        with open(data_file) as file:
            data = file.read().splitlines()

        data_split = map(methodcaller("rsplit", "\t", 1), data)
        texts, self.labels = map(list, zip(*data_split))

        self.tokenized_text = [tokenize(text) for text in texts]

        self.labelset = list(set(self.labels))

    @classmethod 
    def get_features(cls, tokenized, model):
        # TODO: implement this method by implementing different classes for different features 
        # Hint: try simple general lexical features first before moving to more resource intensive or dataset specific features 
        pass

In [280]:
########################
# Naive Bayes Features #
########################

class Features_NB(Features):

    def __init__(self, model_file, threshold):
        super(Features_NB, self).__init__(model_file)
        self.vocabulary = self.create_vocabulary(self.tokenized_text, threshold)

    def read_inference_file(self, input_file):
        """Read inference file that is in the form: <text> i.e. a line
        of text that does not contain a tab.
        """
        with open(input_file) as file:
            data = file.read().splitlines()

        texts = data

        tokenized_text = [tokenize(text) for text in texts]
        return tokenized_text
    
    def create_vocabulary(self, tokenized_text, threshold):
        """Creat vocabulary from training set, considering only words that have an occurence > threshold.
        """
        # Append everything together in a dictionary
        flattened_list = [item for sublist in tokenized_text for item in sublist]
        flattened_list_count = Counter(flattened_list)

        # Considering only words that have an occurence > threshold.
        flattened_list_count_filter = [word for word, count in flattened_list_count.items() if count > threshold]

        return flattened_list_count_filter

    def replace_unknown_word_with_oov(self, tokenized_sentence):
        """Replace words that are not in vocabulary with OOV (Out-of-Vocabulary)
        token
        """
        updated_sentence = []
        for word in tokenized_sentence:
            if word not in self.vocabulary:
                updated_sentence.append('OOV')
            else:
                updated_sentence.append(word)
        return updated_sentence
        
    def get_features(self, tokenized, model):
        """Bag-of-words: return column vector of word counts, including OOV (Out-of-Vocabulary) token, if present.
        Vector stores only non-zero values to improve performance
        """

        # Replace words that are not in vocabulary with OOV
        updated_text = model["Feature"].replace_unknown_word_with_oov(tokenized)

        bag_of_words = Counter(updated_text)
        # Include OffsetFeature "prob_mu" to 1; which allows to include the probability of the label
        # to the maximum likelihood estimation.

        bag_of_words["prob_mu"] = 1
        return bag_of_words

In [288]:
# questions
train_file = "work/datasets/questions/train.txt"
pred_file = "work/datasets/questions/val.test"
pred_true_labels = "work/datasets/questions/val.txt"
model_file_name = "nb.questions.model"
model_nb = NaiveBayes(model_file_name, threshold=0)

# # odiya
# train_file = "work/datasets/odiya/train.txt"
# pred_file = "work/datasets/odiya/val.test"
# pred_true_labels = "work/datasets/odiya/val.txt"
# model_file_name = "logreg.odiya.model"
# model_LR = LogisticRegression(model_file_name, learning_rate=0.000001, epochs=1000, threshold=10, max_features=1000)


# 4dim
# train_file = "work/datasets/4dim/train.txt"
# pred_file = "work/datasets/4dim/val.test"
# pred_true_labels = "work/datasets/4dim/val.txt"
# model_file_name = "logreg.4dim.model"
# model_LR = LogisticRegression(model_file_name, learning_rate=0.2, epochs=200, threshold=1, max_features=100)


# Products
# train_file = "work/datasets/products/train.txt"
# pred_file = "work/datasets/products/val.test"
# pred_true_labels = "work/datasets/products/val.txt"
# model_file_name = "nb.products.model"

# 4dim
# train_file = "work/datasets/4dim/train.txt"
# pred_file = "work/datasets/4dim/val.test"
# pred_true_labels = "work/datasets/4dim/val.txt"
# model_file_name = "nb.4dim.model"

In [289]:
model_nb.train(train_file)

In [290]:
preds = model_nb.classify(pred_file + ".txt", model_nb.load_model())

In [291]:
## Save the predictions: one label prediction per line
with open(pred_file + ".pred.txt", "w") as file:
    for pred in preds:
        file.write(pred+"\n")

# Evaluation

In [292]:
import pandas as pd

In [293]:
true_dataset = pd.read_csv(pred_true_labels, sep='\t', header=None, names=['text', 'true_label'])
pred_dataset = pd.read_csv(pred_file + ".pred.txt", sep='\t', header=None, names=['pred'])

In [294]:
# Check if the columns have the same name; adjust as needed
column_name = 'true_label'  # Change to the actual column name
pred_column_name = 'pred'  # Change to the actual predicted column name

# Merge the two DataFrames on a common index or key if available
merged_df = true_dataset.merge(pred_dataset, left_index=True, right_index=True)

# Calculate the accuracy by comparing the two columns
accuracy = (merged_df[column_name] == merged_df[pred_column_name]).mean()

# Print the accuracy as a percentage
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 5.87%
