In [1]:
""" 
    Basic feature extractor
"""
from operator import methodcaller
import string 

def tokenize(text):
    # TODO customize to your needs
    text = text.translate(str.maketrans({key: " {0} ".format(key) for key in string.punctuation}))
    return text.split()

class Features:

    def __init__(self, data_file):
        with open(data_file) as file:
            data = file.read().splitlines()

        data_split = map(methodcaller("rsplit", "\t", 1), data)
        texts, self.labels = map(list, zip(*data_split))

        self.tokenized_text = [tokenize(text) for text in texts]

        self.labelset = list(set(self.labels))

    @classmethod 
    def get_features(cls, tokenized, model):
        # TODO: implement this method by implementing different classes for different features 
        # Hint: try simple general lexical features first before moving to more resource intensive or dataset specific features 
        pass


In [10]:
from abc import ABCMeta, abstractmethod
import pickle


class Model(object, metaclass=ABCMeta):
    def __init__(self, model_file):
        self.model_file = model_file

    def save_model(self, model):
        with open(self.model_file, "wb") as file:
            pickle.dump(model, file)

    def load_model(self):
        with open(self.model_file, "rb") as file:
            model = pickle.load(file)
        return model

    @abstractmethod
    def train(self, input_file):
        pass

    @abstractmethod
    def classify(self, input_file, model):
        pass


In [152]:
"""
NaiveBayes is a generative classifier based on the Naive assumption that features are independent from each other
P(w1, w2, ..., wn|y) = P(w1|y) P(w2|y) ... P(wn|y)
Thus argmax_{y} (P(y|w1,w2, ... wn)) can be modeled as argmax_{y} P(w1|y) P(w2|y) ... P(wn|y) P(y) using Bayes Rule
and P(w1, w2, ... ,wn) is constant with respect to argmax_{y} 
Please refer to lecture notes Chapter 4 for more details
"""

from work.Model import *
from collections import Counter, defaultdict
import numpy as np
class NaiveBayes(Model):
    
    def train(self, input_file):
        """
        This method is used to train your models and generated for a given input_file a trained model
        :param input_file: path to training file with a text and a label per each line
        :return: model: trained model 
        """
        ## TODO write your code here
        x = 3
        breakpoint()
        model = None
        ## Save the model
        self.save_model(model)
        return model
    
    def read_input_file(self):

        with open(self.model_file) as file:
            data = file.read().splitlines()

        data_split = map(methodcaller("rsplit", "\t", 1), data)
        texts, labels = map(list, zip(*data_split))

        tokenized_text = [tokenize(text.lower()) for text in texts]

        return tokenized_text, labels

    def create_vocabulary(self, tokenized_text):

        # Append everything together in a dictionary
        flattened_list = [item for sublist in tokenized_text for item in sublist]
        flattened_list_count = Counter(flattened_list)
        vocabulary = list(flattened_list_count.keys())
        return vocabulary

    def count_frequency_word_label(self, sentences, labels):
        
        """
        :param sentences (list[list]): sentecnces tokenized
        :param labels (list): list of labels
        :return: count(c_j, w_i) refers to the count of word w_i in documents with label c_j
                _sum_{i=1}^{V}{count(c_j, w_i)} sum of the counts of each word in our vocabulary in class c_j 
                 count(c_j) refers to the count of label c_j 
        """
        count_word_label = []
        count_words_per_label = defaultdict(int)
        for sentence, label in zip(sentences, labels):
            for token in sentence:
                count_word_label.append((token, label))
                count_words_per_label[label] += 1
            
        # count_word_label = [(token, label) for sentence, label in zip(sentences, labels) for token in sentence]
        count_label = Counter(labels)
        return Counter(count_word_label), count_words_per_label, count_label

    def compute_probability(self, sentence_tokenized, label, count_word, count_words_label, count_label, n_documents):
        total_prob = 0.0

        # Compute log(P(w_i|c_j)
        for word in sentence_tokenized:
            word_label = (word, label)
            prob = count_word[word_label]/count_words_label[label]
            if(prob == 0):
                print(word_label)
            prob_log = np.log(prob)
            total_prob += prob_log # save total

        # add prior probability
        total_prob += np.log(count_label[label]/n_documents)
        return total_prob
        

    def classify(self, input_file, model):
        """
        This method will be called by us for the validation stage and or you can call it for evaluating your code 
        on your own splits on top of the training sets seen to you
        :param input_file: path to input file with a text per line without labels
        :param model: the pretrained model
        :return: predictions list
        """ 
        ## TODO write your code here
        preds = None
        return preds

In [153]:
nb = NaiveBayes("work/datasets/4dim.train.txt")

In [154]:
text, labels = nb.read_input_file()

In [155]:
count_word, count_words_label, count_label =nb.count_frequency_word_label(text, labels)

In [156]:
n_documents = len(labels)

In [157]:
text_sample_1, label_1 = text[1], 'pos.dec'

In [158]:
nb.compute_probability(text_sample_1, label_1, count_word, count_words_label, count_label, n_documents)

('twin', 'pos.dec')
('brilliant', 'pos.dec')
('quicker', 'pos.dec')
('opposite', 'pos.dec')
('grabbing', 'pos.dec')
('croissant', 'pos.dec')
('bagel', 'pos.dec')


  prob_log = np.log(prob)


-inf

In [126]:
nb.compute_probability(text_sample_1, label_1, count_word, count_words_label, count_label, n_documents)

-1274.5210730012898

In [12]:
feat = Features("work/datasets/4dim.train.txt")

AttributeError: 'Features' object has no attribute 'data_split'