In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [68]:
import pandas as pd

In [69]:
from operator import methodcaller
import string
import re
from collections import Counter, defaultdict
import numpy as np
from itertools import islice

def expand_contradictions(text):

    contraction_mapping = {
        "won't": "will not",
        "can't": "can not",
        "n't": " not",
        "'re": " are",
        "'s": " is",
        "'d": " would",
        "'ll": " will",
        "'ve": " have",
        "'m": " am"
    }

    pattern = re.compile(r"\b(?:" + "|".join(re.escape(contraction) for contraction in contraction_mapping.keys()) + r")\b")
    text = pattern.sub(lambda x: contraction_mapping[x.group()], text)
    
    return text

def remove_digits_and_words_digits(text):
    # Define a regular expression pattern to match words containing digits
    pattern = r'\b\w*\d\w*\b'
    text_without_words_with_digits = re.sub(pattern, '', text)

    return text_without_words_with_digits

def remove_stop_words(text):
    stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll",
                  "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
                  'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 
                  'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
                  'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
                  'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against',
                  'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on',
                  'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
                  'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than',
                  'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've',
                  'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven',
                  "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
                  'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
    text_clean = []
    for word in text:
        if word not in stop_words:
            text_clean.append(word)
    return text_clean


def tokenize(text, stop_words=True, split=True):
    # TODO customize to your needs
    text = text.translate(str.maketrans({key: " {0} ".format(key) for key in string.punctuation}))
    # re.sub('[^a-zA-Z]', '', dataset['Text'][i])

    # Text preprocessing techniques:
    # 1) Lowercase
    text = text.lower()

    # 2) Expand Contradictions
    text = expand_contradictions(text)

    # 3) Remove punctuations
    text = re.sub('[%s]' % re.escape(string.punctuation), '' , text)

    # 4) Remove digits and words with digits
    text = remove_digits_and_words_digits(text)

    if split:
        text = text.split()

    # # 5) Remove Stop Words
    # if stop_words:
    # text = remove_stop_words(text)

    return text

class Features:

    def __init__(self, data_file):
        with open(data_file) as file:
            data = file.read().splitlines()

        data_split = map(methodcaller("rsplit", "\t", 1), data)
        texts, self.labels = map(list, zip(*data_split))

        self.tokenized_text = [tokenize(text) for text in texts]

        self.labelset = list(set(self.labels))

    @classmethod 
    def get_features(cls, tokenized, model):
        # TODO: implement this method by implementing different classes for different features 
        # Hint: try simple general lexical features first before moving to more resource intensive or dataset specific features 
        pass

In [70]:
########################
# Naive Bayes Features #
########################

class Features_NB_TF_IDF(Features):

    def __init__(self, model_file, threshold):
        super(Features_NB_TF_IDF, self).__init__(model_file)
        self.tfidf_vectorizer = None
        self.threshold = threshold
        
    def get_features(self, input_data: pd.DataFrame):
        """Compute TF-IDF for input dataframe
        """
        # loading CountVectorizer
        tf_vectorizer = TfidfVectorizer(
            min_df=self.threshold
            )
        
        # Text Preprocessing
        input_data.apply(lambda x: tokenize(x, split=False))

        X_train_tf = tf_vectorizer.fit_transform(input_data)

        self.tfidf_vectorizer = tf_vectorizer
        return X_train_tf

In [71]:
"""
NaiveBayes is a generative classifier based on the Naive assumption that features are independent from each other
P(w1, w2, ..., wn|y) = P(w1|y) P(w2|y) ... P(wn|y)
Thus argmax_{y} (P(y|w1,w2, ... wn)) can be modeled as argmax_{y} P(w1|y) P(w2|y) ... P(wn|y) P(y) using Bayes Rule
and P(w1, w2, ... ,wn) is constant with respect to argmax_{y} 
Please refer to lecture notes Chapter 4 for more details
"""
from work.Features import Features_NB
from work.Model import *
from collections import Counter, defaultdict
import numpy as np
import math
class NaiveBayes_TF_IDF(Model):

    def __init__(self, model_file, threshold=None):
        super(NaiveBayes_TF_IDF, self).__init__(model_file)
        self.threshold = threshold # Minimum number of occurences of word
    
    def train(self, input_file):
        """
        This method is used to train your models and generated for a given input_file a trained model
        :param input_file: path to training file with a text and a label per each line
        :return: model: trained model 
        """
        # Read dataset

        # train_dataset = pd.read_csv(input_file, sep='\t', header=None, names=['text', 'true_label'])

        with open(input_file) as file:
            data = file.read().splitlines()

        data_split = map(methodcaller("rsplit", "\t", 1), data)
        texts, labels = map(list, zip(*data_split))
        train_dataset = pd.DataFrame({'text': texts, 'true_label': labels})

        
        X_train = train_dataset['text']
        y_train = train_dataset['true_label']


        # Instanciate Features_NB_TF_IDF class:
        #   - Create TF-IDF Matrix
        features_naive_bayes = Features_NB_TF_IDF(input_file, self.threshold)
        
        X_train_tf = features_naive_bayes.get_features(X_train)


        # Train Model
        naive_bayes = MultinomialNB()
        naive_bayes.fit(X_train_tf, y_train)

        # Build Model
        nb_model = {
            "NaiveBayes": naive_bayes,
            "feature_weights": features_naive_bayes.tfidf_vectorizer,
            "Feature": features_naive_bayes
        }
        
        self.save_model(nb_model)
    
        

    def classify(self, input_file, model):
        """
        This method will be called by us for the validation stage and or you can call it for evaluating your code 
        on your own splits on top of the training sets seen to you
        :param input_file: path to input file with a text per line without labels
        :param model: the pretrained model
        :return: predictions list
        """ 
        feature_weights = model["feature_weights"]
        Feature_NB_TF_IDF_class = model["Feature"]
        NaiveBayes = model["NaiveBayes"]

        # Read dataset
        # test_dataset = pd.read_csv(input_file, sep='\t', header=None, names=['text'])

        with open(input_file) as file:
            data = file.read().splitlines()

        texts = data
        test_dataset = pd.DataFrame({'text': data})
        
        X_test = test_dataset['text'].values.astype('U')

        # return X_test

        X_test_tfidf = feature_weights.transform(X_test)

        # Make Prediction
        y_pred = NaiveBayes.predict(X_test_tfidf)

        # Convert to string for saving
        preds = [str(num) for num in y_pred]
        
        return preds

In [72]:
#questions
# train_file = "work/datasets/questions/train.txt"
# pred_file = "work/datasets/questions/val.test"
# pred_true_labels = "work/datasets/questions/val.txt"
# model_file_name = "naivebayes_tfidf.questions.model"
# model_nb = NaiveBayes_TF_IDF(model_file_name, threshold=9)

# # odiya
# train_file = "work/datasets/odiya/train.txt"
# pred_file = "work/datasets/odiya/val.test"
# pred_true_labels = "work/datasets/odiya/val.txt"
# model_file_name = "naivebayes_tfidf.odiya.model"
# model_nb = NaiveBayes_TF_IDF(model_file_name, threshold=2)

#Products
train_file = "work/datasets/products/train.txt"
pred_file = "work/datasets/products/val.test"
pred_true_labels = "work/datasets/products/val.txt"
model_file_name = "naivebayes_tfidf.products.model"
model_nb = NaiveBayes_TF_IDF(model_file_name, threshold=2)

# # 4dim
# train_file = "work/datasets/4dim/train.txt"
# pred_file = "work/datasets/4dim/val.test"
# pred_true_labels = "work/datasets/4dim/val.txt"
# model_file_name = "naivebayes_tfidf.4dim.model"
# model_nb = NaiveBayes_TF_IDF(model_file_name, threshold=0)

In [73]:
model_nb.train(train_file)

In [62]:
preds = model_nb.classify(pred_file + ".txt", model_nb.load_model())

In [63]:
## Save the predictions: one label prediction per line
with open(pred_file + ".pred.txt", "w") as file:
    for pred in preds:
        file.write(pred+"\n")

# Evaluation

In [64]:
import pandas as pd

In [65]:
true_dataset = pd.read_csv(pred_true_labels, sep='\t', header=None, names=['text', 'true_label'])
pred_dataset = pd.read_csv(pred_file + ".pred.txt", sep='\t', header=None, names=['pred'])

In [66]:
# Check if the columns have the same name; adjust as needed
column_name = 'true_label'  # Change to the actual column name
pred_column_name = 'pred'  # Change to the actual predicted column name

# Merge the two DataFrames on a common index or key if available
merged_df = true_dataset.merge(pred_dataset, left_index=True, right_index=True)

# Calculate the accuracy by comparing the two columns
accuracy = (merged_df[column_name] == merged_df[pred_column_name]).mean()

# Print the accuracy as a percentage
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 66.35%
