In [250]:
""" 
    Basic feature extractor
"""
from operator import methodcaller
import string
from collections import Counter, defaultdict
import numpy as np

def tokenize(text):
    """Split ``text`` into tokens, treating every punctuation mark as its own token.

    Each character listed in ``string.punctuation`` is surrounded by spaces
    before the text is split on whitespace, so ``"Hi, you!"`` becomes
    ``["Hi", ",", "you", "!"]``.

    :param text: the raw string to tokenize
    :return: list of token strings (empty list for empty/whitespace-only text)
    """
    # TODO customize to your needs
    padded_marks = {}
    for mark in string.punctuation:
        padded_marks[mark] = " {0} ".format(mark)
    spaced_text = text.translate(str.maketrans(padded_marks))
    return spaced_text.split()

class Features:
    """Base feature extractor: loads a labelled corpus and tokenizes it.

    After construction the instance exposes:

    * ``labels``         -- list with one label string per document
    * ``tokenized_text`` -- list with one token list per document
    * ``labelset``       -- sorted list of the distinct labels
    """

    def __init__(self, data_file):
        """Read ``data_file`` and tokenize every document in it.

        Each non-blank line must have the form ``<text>\\t<label>``; the line is
        split on its LAST tab, so the text itself may contain tabs.

        :param data_file: path to the tab-separated training file
        :raises ValueError: if a non-blank line has no tab, or if the file
            contains no labelled line at all
        """
        with open(data_file) as file:
            data = file.read().splitlines()

        texts = []
        self.labels = []
        for line_number, line in enumerate(data, start=1):
            # Blank lines carry no document; skipping them avoids the opaque
            # unpacking error they used to trigger.
            if not line.strip():
                continue
            fields = line.rsplit("\t", 1)
            if len(fields) != 2:
                raise ValueError(
                    "{0}: line {1} is not in '<text>\\t<label>' format".format(data_file, line_number)
                )
            texts.append(fields[0])
            self.labels.append(fields[1])

        if not texts:
            raise ValueError("{0}: no labelled lines found".format(data_file))

        self.tokenized_text = [tokenize(text) for text in texts]

        # Sorted so that the label order is reproducible between runs
        # (plain set iteration order is arbitrary for strings).
        self.labelset = sorted(set(self.labels))

    @classmethod
    def get_features(cls, tokenized, model):
        """Placeholder for feature extraction; currently does nothing and returns None.

        :param tokenized: tokenized input (unused by this stub)
        :param model: model object (unused by this stub)
        """
        # TODO: implement this method by implementing different classes for different features
        # Hint: try simple general lexical features first before moving to more resource intensive or dataset specific features
        pass

In [253]:
################################
# Logistic Regression Features #
################################

class Features_LR(Features):
    """TF-IDF bag-of-words features for the logistic-regression classifier.

    Attributes set by ``__init__`` (in addition to those of ``Features``):

    * ``vocabulary`` -- words kept from the corpus, most frequent first
    * ``word2index`` -- mapping from each vocabulary word to its column index
    """

    def __init__(self, model_file, threshold=2):
        """Load the corpus and build the vocabulary.

        :param model_file: path forwarded unchanged to ``Features.__init__``
            (despite the name, it is the labelled data file that gets read)
        :param threshold: a word is kept only if its corpus count is
            strictly greater than this value
        """
        super(Features_LR, self).__init__(model_file)
        self.vocabulary = self.create_vocabulary(self.tokenized_text, threshold)
        self.word2index = {word: i for i, word in enumerate(self.vocabulary)}

    def read_inference_file(self, input_file):
        """Read an unlabelled file (one text per line, no tab) and tokenize it.

        :param input_file: path to the inference file
        :return: list with one token list per line
        """
        with open(input_file) as file:
            texts = file.read().splitlines()

        return [tokenize(text) for text in texts]

    def create_vocabulary(self, tokenized_text, threshold):
        """Create the vocabulary from the training set.

        :param tokenized_text: list of token lists
        :param threshold: only words with an occurrence count > threshold are kept
        :return: list of words ordered by descending count (ties keep
            first-seen order)
        """
        counts = Counter(token for sentence in tokenized_text for token in sentence)
        # most_common() sorts by count, descending, with a stable sort.
        return [word for word, count in counts.most_common() if count > threshold]

    def get_features(self, tokenized_sentence, idf_array):
        """Convert one tokenized sentence to TF-IDF space.

        Term frequency is the count of each vocabulary word divided by the
        number of in-vocabulary tokens of the sentence; out-of-vocabulary
        tokens are ignored.

        :param tokenized_sentence: list of tokens
        :param idf_array: IDF weights, one per vocabulary word (as returned
            by ``tf_idf``)
        :return: array of length ``len(self.vocabulary)``; all zeros when the
            sentence contains no vocabulary word
        """
        tf_array = np.zeros(len(self.vocabulary))
        known_words = 0
        for word in tokenized_sentence:
            index_word = self.word2index.get(word)
            if index_word is not None:
                tf_array[index_word] += 1
                known_words += 1
        # Guard against 0/0 (which would produce NaNs) for sentences without
        # any vocabulary word.
        if known_words:
            tf_array /= known_words
        return tf_array * idf_array

    def tf_idf(self, tokenized_text):
        """Term frequency-inverse document frequency for a list of documents.

        :param tokenized_text: list of token lists, one per document
        :return: tuple ``(tf_idf, tf, idf)`` where ``tf_idf`` and ``tf`` have
            shape (n_documents, size_vocabulary) and ``idf`` has shape
            (size_vocabulary,)
        """
        size_vocabulary = len(self.vocabulary)
        n_documents = len(tokenized_text)
        tf_array = np.zeros((n_documents, size_vocabulary))
        document_frequency = np.zeros(size_vocabulary)  # number of documents containing each term
        words_per_document = np.zeros(n_documents)

        # Compute term counts and document frequencies in a single pass
        for d_i, sentence in enumerate(tokenized_text):
            seen_in_document = set()
            for word in sentence:
                index_word = self.word2index.get(word)
                if index_word is None:
                    continue  # out-of-vocabulary token
                tf_array[d_i, index_word] += 1
                words_per_document[d_i] += 1
                # A term counts once per document, however often it repeats
                if index_word not in seen_in_document:
                    seen_in_document.add(index_word)
                    document_frequency[index_word] += 1

        # Documents without any vocabulary word keep an all-zero row instead
        # of turning into NaNs through 0/0.
        safe_counts = np.where(words_per_document > 0, words_per_document, 1)
        tf = tf_array / safe_counts.reshape(-1, 1)

        # Smoothing: avoids division by zero and ensures that terms with zero
        # document frequency still get a finite, non-zero IDF score.
        idf = np.log((n_documents + 1) / (document_frequency + 1)) + 1

        tf_idf = tf * idf
        return tf_idf, tf, idf  # Shapes (n_documents, vocabulary) x2 and (vocabulary,)

In [287]:
"""
 Refer to Chapter 5 for more details on how to implement a LogisticRegression
"""
from work.Model import *

class LogisticRegression(Model):
    """Multinomial (softmax) logistic regression trained with batch gradient descent.

    Attributes:

    * ``weights``       -- array (num_features, num_labels), set by training
    * ``bias``          -- array (num_labels,), set by training
    * ``loss``          -- list with the training loss of every epoch
    * ``learning_rate`` -- gradient-descent step size
    * ``epochs``        -- number of full passes over the training set
    * ``Y_mapping``     -- dict mapping each label to its integer class index
    """

    def __init__(self, model_file, learning_rate=0.01, epochs=1000):
        """
        :param model_file: forwarded unchanged to ``Model.__init__``
        :param learning_rate: gradient-descent step size
        :param epochs: number of gradient-descent iterations run by ``train``
        """
        super(LogisticRegression, self).__init__(model_file)
        self.weights = None
        self.bias = None
        self.loss = []
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.Y_mapping = None  # Map Y label to numerical

    def initialize_weights(self, num_features, num_labels):
        """Reset ``weights`` and ``bias`` to zeros.

        :param num_features: number of input features (rows of ``weights``)
        :param num_labels: number of classes (columns of ``weights``)
        """
        self.weights = np.zeros((num_features, num_labels))
        self.bias = np.zeros(num_labels)

    def softmax(self, Z):
        """Softmax function: normalizing logit scores row by row.

        :param Z([num_documents, num_labels]): logits
        :return: e^Z / sum_k e^{Z_k}, same shape as ``Z``
        """
        # Subtracting the row maximum leaves the result unchanged but keeps
        # np.exp from overflowing on large logits.
        shifted = Z - np.max(Z, axis=1, keepdims=True)
        exp_scores = np.exp(shifted)
        return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    def predict(self, X, weights, bias):
        """Compute class probabilities for a batch of documents.

        :param X([num_documents, num_features]): feature matrix
        :param weights([num_features, num_labels])
        :param bias([num_labels])
        :return: probabilities of shape [num_documents, num_labels]
        """
        # z[num_documents, num_labels] = X[num_documents, num_features]*W[num_features, num_labels] + bias[num_labels]
        Z = np.dot(X, weights) + bias

        # Apply Softmax
        return self.softmax(Z)

    def cross_entropy_loss(self, S, target):
        """Calculate the cross-entropy averaged over documents.

        L = -1/n * sum_{i=1}^{n} sum_{c} y_ic * log(s_ic)
        The target of a document is a vector over the K classes where
        y_c = 1 for the correct class and every other element is 0.

        :param S[num_documents, num_labels]: probabilities after softmax
        :param target[num_documents, num_labels]: one-hot encoded targets
        :return: scalar loss
        """
        S = np.asarray(S, dtype=float)
        target = np.asarray(target, dtype=float)
        # Clip so that a probability of exactly 0 does not yield log(0) = -inf
        log_probs = np.log(np.clip(S, np.finfo(float).eps, 1.0))
        # Normalize by the number of documents only (not documents * labels),
        # which matches the gradient used in train().
        return -np.sum(target * log_probs) / S.shape[0]

    def OneHot(self, targets, num_labels):
        """Convert an array of integer targets to one-hot rows.

        :param targets([num_documents,]): integer class indices
        :param num_labels(int)
        :return Y[num_documents, num_labels]
        """
        Y = np.zeros((len(targets), num_labels))
        Y[np.arange(len(targets)), targets] = 1
        return Y

    def train(self, input_file):
        """
        Train the classifier on ``input_file`` and save the resulting model.

        :param input_file: path to training file with a text and a label per each line
        :return: model: dict with the learned ``weights`` and ``bias`` plus
            what is needed to featurize new text (``Y_mapping``,
            ``vocabulary``, ``idf``)
        """
        # Read dataset and create vocabulary
        features_lr_class = Features_LR(input_file)

        # Transform dataset to TF-IDF space.
        # tf_idf returns (tf_idf, tf, idf); X has shape (n_documents, size_vocabulary)
        X, _, idf = features_lr_class.tf_idf(features_lr_class.tokenized_text)

        # Y: map every label to an integer class index
        self.Y_mapping = {label: index for index, label in enumerate(np.unique(features_lr_class.labels))}
        Y = [self.Y_mapping[y] for y in features_lr_class.labels]

        # Initialize Weights: one row per feature, one column per label
        sample_size = len(features_lr_class.tokenized_text)
        n_features = len(features_lr_class.vocabulary)
        num_labels = len(self.Y_mapping)
        self.initialize_weights(n_features, num_labels)

        # One Hot encoded Y
        Y_onehot = self.OneHot(Y, num_labels)

        self.loss = []
        for _ in range(self.epochs):
            # prob = softmax(X*W + b)
            prob = self.predict(X, self.weights, self.bias)
            self.loss.append(self.cross_entropy_loss(prob, Y_onehot))

            # Gradients of the mean cross-entropy: dL/dW and dL/db
            error = prob - Y_onehot
            grad_w = np.dot(X.T, error) / sample_size
            grad_b = np.sum(error, axis=0) / sample_size

            # Gradient-descent step
            self.weights -= self.learning_rate * grad_w
            self.bias -= self.learning_rate * grad_b

        model = {
            "weights": self.weights,
            "bias": self.bias,
            "Y_mapping": self.Y_mapping,
            "vocabulary": features_lr_class.vocabulary,
            "idf": idf,
        }
        ## Save the model
        # NOTE(review): assumes save_model accepts an arbitrary serializable
        # object -- confirm against work.Model.
        self.save_model(model)
        return model

    def classify(self, input_file, model):
        """
        This method will be called by us for the validation stage and or you can call it for evaluating your code
        on your own splits on top of the training sets seen to you.

        Not implemented yet: currently always returns None.

        :param input_file: path to input file with a text per line without labels
        :param model: the pretrained model
        :return: predictions list
        """
        ## TODO write your code here (and change return)
        preds = None
        return preds


In [292]:
def OneHot(targets, num_labels):
    """Return a (len(targets), num_labels) float array of one-hot rows.

    Row ``i`` is all zeros except for a 1 in column ``targets[i]``.
    """
    n_rows = len(targets)
    row_ids = np.arange(n_rows)
    encoded = np.zeros((n_rows, num_labels))
    encoded[row_ids, targets] = 1
    return encoded

In [295]:
# Sanity check for OneHot: six integer class ids drawn from four classes.
targets = [1, 0, 3, 2, 1, 1]
num_labels = 4

In [296]:
# Each output row holds a single 1, in the column given by the matching target.
OneHot(targets, num_labels)

array([[0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.]])

In [283]:
# Scratch logits for the softmax experiments below.
# NOTE(review): no seed is set in the visible cells, so these values are not reproducible.
Z = np.random.rand(4, 5)

In [270]:
# Mean negative log-probability of the true class (row i, column y[i]).
# NOTE(review): `Softmax`, `z` and `y` are not defined in any visible cell, so this
# line depends on hidden kernel state. An assignment displays nothing, so the
# array shown underneath is stale output from some other cell.
loss = -np.mean(np.log(Softmax(z)[np.arange(len(y)), y]))

array([[0.9468947 , 0.68072595, 0.02457844, 0.71077654, 0.12075853],
       [0.33585364, 0.36489347, 0.42197878, 0.26787945, 0.7907006 ],
       [0.58635958, 0.12694744, 0.87740103, 0.17388748, 0.79550203],
       [0.67859676, 0.51188968, 0.80106312, 0.24684786, 0.45458267],
       [0.50807223, 0.9378627 , 0.43132378, 0.66662698, 0.15393962],
       [0.81975571, 0.40176412, 0.9147828 , 0.64112081, 0.81244211],
       [0.26190353, 0.56067029, 0.04882806, 0.62511781, 0.72490852],
       [0.35422805, 0.83007158, 0.8746002 , 0.78267619, 0.71202362],
       [0.64031492, 0.61272653, 0.69183811, 0.79761212, 0.8256599 ],
       [0.05098652, 0.8515289 , 0.99808452, 0.05089972, 0.79544978]])

In [286]:
# Largest logit in Z.
np.max(Z)

0.9417422323201528

In [280]:
# Row-wise softmax, the same expression used in LogisticRegression.softmax.
# NOTE(review): the 10x5 result below does not match the (4, 5) shape given to Z
# above; this cell (In [280]) presumably ran against an earlier Z.
np.exp(Z)/np.sum(np.exp(Z), axis=1, keepdims=True)

array([[0.29486943, 0.22596133, 0.11723921, 0.23285466, 0.12907537],
       [0.17763172, 0.18286575, 0.1936084 , 0.16595858, 0.27993555],
       [0.20558758, 0.12986039, 0.27503918, 0.13610137, 0.25341148],
       [0.22596802, 0.19127007, 0.25540739, 0.1467374 , 0.18061712],
       [0.18738325, 0.28799597, 0.1735399 , 0.21957871, 0.13150217],
       [0.21799882, 0.1435234 , 0.23973083, 0.18233666, 0.21641028],
       [0.16173181, 0.21804604, 0.13069469, 0.23256128, 0.25696618],
       [0.13779662, 0.22176591, 0.231864  , 0.21150042, 0.19707304],
       [0.18519756, 0.18015809, 0.19498962, 0.21674474, 0.22291   ],
       [0.11223106, 0.24991032, 0.289356  , 0.11222132, 0.2362813 ]])

In [275]:
# Hand-computed softmax denominator for the first row of the 10x5 array
# displayed under In [270].
np.exp(0.9468947) + np.exp(0.68072595) + np.exp(0.02457844) + np.exp(0.71077654) +  np.exp(0.12075853)

8.74181064171361

In [246]:
# NOTE(review): the variable is named train_file but the path points at test.txt --
# confirm which split is intended.
train_file = "work/datasets/test.txt"

In [247]:
# Load the file and build the vocabulary / word2index mapping.
feat = Features_LR(train_file)

In [248]:
# tf_idf returns the tuple (tf_idf, tf, idf); the IDF vector is kept as idf_arra.
tf_idf, tf, idf_arra = feat.tf_idf(feat.tokenized_text)

In [249]:
# Re-encode the first document on its own with get_features.
feat.get_features(feat.tokenized_text[0], idf_arra)

array([0.16666667, 0.16666667, 0.2821912 , 0.        , 0.34976871,
       0.34976871, 0.34976871, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ])

In [244]:
# Scratch arithmetic; 1/6 equals the first two entries of the vector above.
1/6

0.16666666666666666

In [165]:
# NOTE(review): this output has 17 entries while the feature vector above has 16,
# so the two outputs come from different runs (compare the In [..] counters).
idf_arra

array([1.        , 1.        , 1.69314718, 1.69314718, 2.09861229,
       2.09861229, 2.09861229, 2.09861229, 2.09861229, 2.09861229,
       2.09861229, 2.09861229, 2.09861229, 2.09861229, 2.09861229,
       2.09861229, 2.09861229])

In [65]:
# Scratch cells exploring NumPy broadcasting between a 2-D array `a` and a 1-D array `b`.
# NOTE(review): the cells appear out of execution order (see the In [..] counters)
# and `b` is used before the cell that creates it appears, so this section does
# not reproduce on a fresh kernel as laid out.
a = np.zeros((5, 10))

In [68]:
# Reshape b to a column so that each row of `a` is divided by one entry of b.
a/b.reshape(-1, 1)

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [42]:
a[0][0] = 2

In [43]:
a[1][4] =5

In [61]:
# Element-wise product; the result is consistent with the 10-entry `b` displayed
# under In [58] (one factor per column of `a`).
a*b

array([[ 6.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0., 25.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [52]:
b.shape

(5,)

In [58]:
b

array([3., 1., 1., 1., 5., 1., 1., 1., 1., 1.])

In [66]:
# One divisor per row of `a` (5 rows).
b = np.ones(5)

In [56]:
b[0] = 3

In [57]:
b[4] = 5

In [33]:
b[1] = 5

In [30]:
b

array([5., 1., 1., 1., 1.])