In [1]:
""" 
    Basic feature extractor
"""
import string
from collections import Counter, defaultdict
from operator import methodcaller

import numpy as np

def tokenize(text):
    """Lower-case ``text`` and split it into word and punctuation tokens.

    Every character of ``string.punctuation`` is padded with a space on both
    sides, so each punctuation mark becomes a token of its own once the string
    is split on whitespace.

    :param text (str): raw input text
    :return (list[str]): lower-cased tokens, punctuation marks included
    """
    # TODO customize to your needs
    padded_punctuation = {mark: " " + mark + " " for mark in string.punctuation}
    spaced_text = text.translate(str.maketrans(padded_punctuation))
    return spaced_text.lower().split()

class Features:
    """Read a dataset file (one example per line) and tokenize every text.

    Attributes set by ``__init__``:
        tokenized_text (list[list[str]]): tokens of each example, in file order
        labels (list[str] | None): label of each example, ``None`` for unlabeled data
        labelset (list[str] | None): distinct labels, ``None`` for unlabeled data
    """

    def __init__(self, data_file, has_label=True):
        """
        :param data_file (str): path to the dataset file
        :param has_label (bool): when True every line must be ``text<TAB>label``
            (the label is whatever follows the LAST tab); when False every line
            is taken as text only
        :raises ValueError: if ``has_label`` is True and a line contains no tab
        """
        with open(data_file) as file:
            data = file.read().splitlines()

        if has_label:
            print('####### HAS LABEL')
            texts = []
            self.labels = []
            for line_number, line in enumerate(data, start=1):
                # rpartition splits on the last tab only, so tabs inside the
                # text itself are kept as part of the text.
                text, separator, label = line.rpartition("\t")
                if not separator:
                    raise ValueError(
                        "{0}: line {1} has no tab-separated label: {2!r}".format(
                            data_file, line_number, line))
                texts.append(text)
                self.labels.append(label)
            self.labelset = list(set(self.labels))
        else:
            print('####### DOES NOT HAVE LABEL')
            texts = data
            self.labels = None
            self.labelset = None

        self.tokenized_text = [tokenize(text) for text in texts]

    @classmethod
    def get_features(cls, tokenized, model):
        """Placeholder to be overridden by feature-specific subclasses.

        The base implementation does nothing and returns ``None``.
        """
        # TODO: implement this method by implementing different classes for different features
        # Hint: try simple general lexical features first before moving to more resource intensive or dataset specific features
        pass

In [91]:
class Features_NB(Features):
    """Count-based features for a multinomial Naive Bayes classifier.

    On construction the training file is read through ``Features``, the
    vocabulary is collected and the raw counts needed for add-one (Laplace)
    smoothed estimates are stored in ``self.feature_weights``.

    Attributes (in addition to those of ``Features``):
        vocabulary (list[str]): distinct training tokens, in first-seen order
        size_vocab (int): ``len(vocabulary)``
        feature_weights (dict): the counts, see ``generate_feature_weights``
    """

    def __init__(self, model_file, has_label=True):
        """
        :param model_file (str): path to the training DATA file (despite its
            name); it is passed straight to ``Features.__init__``
        :param has_label (bool): passed straight to ``Features.__init__``
        """
        super(Features_NB, self).__init__(model_file, has_label)
        self.vocabulary = self.create_vocabulary(self.tokenized_text)
        self.size_vocab = len(self.vocabulary)
        self.feature_weights = self.generate_feature_weights(laplace_smoothing=True)

    def read_input_file(self, input_file):
        """Read a file with one text per line and tokenize every line.

        Lines are tokenized in full; nothing is split off as a label.

        :param input_file (str): path to the file
        :return (list[list[str]]): tokens of each line, in file order
        """
        with open(input_file) as file:
            lines = file.read().splitlines()
        return [tokenize(line) for line in lines]

    def count_frequency_word_label(self, sentences, labels):
        """
        :param sentences (list[list]): sentences tokenized
        :param labels (list): list of labels
        :return: a 3-tuple
                 - Counter {(w_i, c_j): count of word w_i in documents with label c_j}
                 - defaultdict(int) {c_j: total number of tokens in documents with label c_j}
                 - Counter {c_j: number of documents with label c_j}
        """
        word_label_counts = Counter()
        words_per_label = defaultdict(int)
        # Count in place rather than first building a list of every pair.
        for sentence, label in zip(sentences, labels):
            for token in sentence:
                word_label_counts[(token, label)] += 1
                words_per_label[label] += 1
        return word_label_counts, words_per_label, Counter(labels)

    def create_vocabulary(self, tokenized_text):
        """Return the distinct tokens of ``tokenized_text`` in first-seen order.

        :param tokenized_text (list[list[str]]): tokenized sentences
        :return (list[str]): vocabulary without duplicates
        """
        # dict keys keep insertion order, which de-duplicates while preserving
        # the order of first occurrence.
        return list(dict.fromkeys(
            token for sentence in tokenized_text for token in sentence))

    def generate_feature_weights(self, laplace_smoothing=True):
        """Collect the raw training counts used at scoring time.

        No probabilities are computed here; ``get_features`` derives them from
        these counts.

        :param laplace_smoothing (bool): currently unused, kept for interface
            compatibility (smoothing is always applied in ``get_features``)
        :return (dict): ``{"count_word": ..., "count_words_label": ...,
            "count_label": ...}`` holding the three results of
            ``count_frequency_word_label`` in that order
        """
        count_word, count_words_label, count_label = self.count_frequency_word_label(
            self.tokenized_text, self.labels)
        return {
            "count_word": count_word,
            "count_words_label": count_words_label,
            "count_label": count_label,
        }

    @classmethod
    def get_features(cls, tokenized, target_label, model):
        """Score one tokenized sentence against one label.

        Computes ``log P(target_label) + sum_i log P(w_i | target_label)`` where
        ``P(w | c) = (count(w, c) + 1) / (count_words(c) + |V|)`` (add-one
        smoothing) and ``P(c) = count(c) / number of training documents``.

        :param tokenized (list[str]): tokens of the sentence to score
        :param target_label: label to score the sentence against
        :param model: object exposing ``feature_weights``, ``size_vocab`` and
            ``tokenized_text`` as set up by ``Features_NB.__init__``
        :return (float): the unnormalised log-probability
        """
        weights = model.feature_weights
        word_label_counts = weights["count_word"]
        # The denominator is the same for every token, so compute it once.
        # .get() is used so that scoring never inserts a key into the stored
        # defaultdict.
        denominator = weights["count_words_label"].get(target_label, 0) + model.size_vocab

        total_prob = 0.0
        for word in tokenized:
            # Laplace Smoothing
            prob = (word_label_counts[(word, target_label)] + 1) / denominator
            total_prob += np.log(prob)

        # add prior probability
        n_documents = len(model.tokenized_text)
        total_prob += np.log(weights["count_label"][target_label] / n_documents)
        return total_prob

In [92]:
from abc import ABCMeta, abstractmethod
import pickle


class Model(metaclass=ABCMeta):
    """Abstract base class for classifiers whose trained state lives in one pickle file.

    Subclasses must implement ``train`` and ``classify``.
    """

    def __init__(self, model_file):
        """
        :param model_file: path used by ``save_model`` and ``load_model``
        """
        self.model_file = model_file

    def save_model(self, model):
        """Pickle ``model`` to ``self.model_file``, overwriting any existing file."""
        with open(self.model_file, mode="wb") as sink:
            pickle.dump(model, sink)

    def load_model(self):
        """Unpickle and return the object stored in ``self.model_file``."""
        # CAUTION: unpickling can execute arbitrary code; only load model files
        # that come from a trusted source.
        with open(self.model_file, mode="rb") as source:
            return pickle.load(source)

    @abstractmethod
    def train(self, input_file):
        """Train on ``input_file``; to be implemented by subclasses."""

    @abstractmethod
    def classify(self, input_file, model):
        """Classify the examples of ``input_file`` with ``model``; to be implemented by subclasses."""


In [102]:
"""
NaiveBayes is a generative classifier based on the naive assumption that the features are conditionally independent of each other given the class y:
P(w1, w2, ..., wn|y) = P(w1|y) P(w2|y) ... P(wn|y)
Thus argmax_{y} P(y|w1, w2, ..., wn) can be computed as argmax_{y} P(w1|y) P(w2|y) ... P(wn|y) P(y): by Bayes' rule
P(y|w1, ..., wn) = P(w1, ..., wn|y) P(y) / P(w1, ..., wn), and the denominator P(w1, w2, ..., wn) does not depend on y, so it can be dropped from the argmax.
Please refer to lecture notes Chapter 4 for more details
"""

from work.Model import *
from collections import Counter, defaultdict
import numpy as np
class NaiveBayes(Model):
    """Naive Bayes text classifier.

    The trained "model" is a ``Features_NB`` instance, which holds the training
    counts and knows how to score a sentence against a label.
    """

    def __init__(self, model_file):
        """
        :param model_file: path where the trained model is pickled
        """
        super(NaiveBayes, self).__init__(model_file)

    def train(self, input_file):
        """
        This method is used to train your models and generated for a given input_file a trained model
        :param input_file: path to training file with a text and a label per each line
        :return: model: trained model (also pickled to ``self.model_file``)
        """
        features_naive_bayes = Features_NB(input_file, True)

        ## Save the model
        self.save_model(features_naive_bayes)
        return features_naive_bayes

    def classify(self, input_file, model):
        """
        This method will be called by us for the validation stage and or you can call it for evaluating your code 
        on your own splits on top of the training sets seen to you
        :param input_file: path to input file with a text per line without labels
        :param model: the pretrained model
        :return: predictions list
        """
        # Read Input File
        tokenized_text = model.read_input_file(input_file)

        # Build the candidate labels once. Sorting fixes the iteration order,
        # so ties between equally scored labels are always resolved the same
        # way (set order of strings can differ from one run to the next).
        candidate_labels = sorted(set(model.labels))

        preds = []
        for sentence in tokenized_text:
            # Score the sentence against every label
            class_predictions = {
                label: model.get_features(sentence, label, model)
                for label in candidate_labels
            }
            # Find the class with the highest value
            preds.append(max(class_predictions, key=class_predictions.get))

        return preds

In [103]:
# Classifier whose trained model is pickled to / loaded from this path.
nb = NaiveBayes(model_file="nb.4dim.model")

### Features

In [104]:
# Build the Naive Bayes count features directly from the training file.
# NOTE(review): `feat` is not referenced by any later cell shown here
# (nb.train builds its own Features_NB) — confirm whether this cell is still needed.
feat = Features_NB(model_file="work/datasets/4dim.train.txt")

####### HAS LABEL


### Training

In [105]:
# Fit on the training file and pickle the result to nb.model_file.
nb.train(input_file="work/datasets/4dim.train.txt")

####### HAS LABEL


In [106]:
# n_documents = len(labels)

In [107]:
# text_sample_1, label_1 = text[1], 'pos.dec'

#### Classify

In [115]:
# NOTE(review): this is the same labelled file that was used for training.
# classify() reads every line in full, so the trailing "<TAB>label" part would
# be tokenized as if it were text — presumably an unlabelled validation file
# is intended here; confirm.
val_dataset = "work/datasets/4dim.train.txt"
# Reload the pickled model written by nb.train().
loaded_model = nb.load_model()

In [116]:
# One predicted label per line of val_dataset, in file order.
preds = nb.classify(input_file=val_dataset, model=loaded_model)

In [118]:
## Save the predictions: one label prediction per line
output_file = "work/datasets/4dim.train" + ".pred.txt"

with open(output_file, "w") as file:
    # writelines does not add line breaks itself, so append one to each label
    file.writelines(pred + "\n" for pred in preds)