In [776]:
""" 
    Basic feature extractor
"""
from operator import methodcaller
import string
from collections import Counter, defaultdict
import numpy as np
import re
from itertools import islice

# def tokenize(text):
#     # TODO customize to your needs
#     text = text.translate(str.maketrans({key: " {0} ".format(key) for key in string.punctuation}))
#     return text.split()

def expand_contradictions(text):

    contraction_mapping = {
        "won't": "will not",
        "can't": "can not",
        "n't": " not",
        "'re": " are",
        "'s": " is",
        "'d": " would",
        "'ll": " will",
        "'ve": " have",
        "'m": " am"
    }

    pattern = re.compile(r"\b(?:" + "|".join(re.escape(contraction) for contraction in contraction_mapping.keys()) + r")\b")
    text = pattern.sub(lambda x: contraction_mapping[x.group()], text)
    
    return text

def remove_digits_and_words_digits(text):
    # Define a regular expression pattern to match words containing digits
    pattern = r'\b\w*\d\w*\b'
    text_without_words_with_digits = re.sub(pattern, '', text)

    return text_without_words_with_digits

def tokenize(text):
    # TODO customize to your needs
    text = text.translate(str.maketrans({key: " {0} ".format(key) for key in string.punctuation}))
    # re.sub('[^a-zA-Z]', '', dataset['Text'][i])

    # Text preprocessing techniques:
    # 1) Lowercase
    text = text.lower()

    # 2) Expand Contradictions
    text = expand_contradictions(text)

    # 3) Remove punctuations
    text = re.sub('[%s]' % re.escape(string.punctuation), '' , text)

    # 4) Remove digits and words with digits
    text = remove_digits_and_words_digits(text)

    return text.split()

class Features:

    def __init__(self, data_file):
        with open(data_file) as file:
            data = file.read().splitlines()

        data_split = map(methodcaller("rsplit", "\t", 1), data)
        texts, self.labels = map(list, zip(*data_split))

        self.tokenized_text = [tokenize(text) for text in texts]

        self.labelset = list(set(self.labels))

    @classmethod 
    def get_features(cls, tokenized, model):
        # TODO: implement this method by implementing different classes for different features 
        # Hint: try simple general lexical features first before moving to more resource intensive or dataset specific features 
        pass

In [1363]:
################################
# Logistic Regression Features #
################################

class Features_LR(Features):

    def __init__(self, model_file, threshold, max_features):
        super(Features_LR, self).__init__(model_file)
        self.vocabulary = self.create_vocabulary(self.tokenized_text, threshold, max_features)
        self.word2index = {word: i for i, word in enumerate(self.vocabulary, start=0)}
        self.idf = None # Need to save IDF values for inference

    def read_inference_file(self, input_file):
        """Read inference file that is in the form: <text> i.e. a line
        of text that does not contain a tab.
        """
        with open(input_file) as file:
            data = file.read().splitlines()

        texts = data

        tokenized_text = [tokenize(text) for text in texts]
        return tokenized_text
    
    def create_vocabulary(self, tokenized_text, threshold, max_features=None):
        """Creat vocabulary from training set, considering only words
        that have an occurence > threshold.
        """
        # Append everything together in a dictionary
        flattened_list = [item for sublist in tokenized_text for item in sublist]
        flattened_list_count = Counter(flattened_list)

        # Sort the dictionary by values in descending order
        flattened_list_count = dict(sorted(flattened_list_count.items(), key=lambda item: item[1], reverse=True))

        # Considering only words that have an occurence > threshold.
        flattened_list_count_filter = {word:count for word, count in flattened_list_count.items() if count > threshold}

        # Limit the size of the vocabulary based on max_features
        if max_features:
            flattened_list_count_filter = dict(islice(flattened_list_count_filter.items(), max_features))

        return list(flattened_list_count_filter.keys())
    
    def get_features(self, tokenized_sentence, idf_array):
        """Convert sentence to TF-IDF space
        """
        size_vocabulary = len(self.vocabulary)
        n_documents = 1
        tf_array = np.zeros(size_vocabulary)
        words_per_document = 0
        # Compute Term-Frequency
        words_in_document = []
        for word in tokenized_sentence:
            index_word = self.word2index.get(word)
            if word in self.word2index.keys():
                tf_array[index_word] += 1
                words_per_document += 1
        tf = (tf_array + 1)/(words_per_document+1) # with smoothin
        return tf*idf_array
        
    
    def tf_idf(self, tokenized_text):
        """Term frequency-inverse document frequency
        """
        size_vocabulary = len(self.vocabulary)
        n_documents = len(tokenized_text)
        tf_array = np.zeros((n_documents, size_vocabulary))
        idf_array = np.zeros(size_vocabulary) # Inverse Document Frequency
        words_per_document = np.zeros(n_documents)
        # Compute Term-Frequency
        for d_i, sentence in enumerate(tokenized_text, start=0):
            words_in_document = []
            for word in sentence:

                index_word = self.word2index.get(word)
                
                if word in self.word2index.keys():
                    tf_array[d_i][index_word] += 1
                    words_per_document[d_i] += 1
                    # Inverse Document Frequency
                    if word not in words_in_document: # does not count repeated words in the same document
                        words_in_document.append(word) 
                        idf_array[index_word] += 1 # number of documents containing the term
        tf = (tf_array + 1)/(words_per_document.reshape(-1, 1) + 1)
        # Smoothing: to avoid division by zero errors and to ensure that terms with zero document
        # frequency still get a non-zero IDF score
        idf = np.log((n_documents + 1)/(idf_array + 1)) + 1 # Smoothing

        self.idf = idf
        tf_idf = tf*idf
        return tf_idf # Shape (n_documents, vocabulary)

In [1364]:
"""
 Refer to Chapter 5 for more details on how to implement a LogisticRegression
"""
from work.Model import *

class LogisticRegression(Model):
    def __init__(self, model_file, learning_rate, epochs, threshold, max_features):
        super(LogisticRegression, self).__init__(model_file)
        self.weights = None
        self.bias = None
        self.loss = []
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.Y_to_categorical = None # Map Y label to numerical
        self.threshold = threshold
        self.max_features = max_features

    def initialize_weights(self, num_features, num_labels):
        self.weights = np.zeros((num_features, num_labels))
        self.bias = np.zeros(num_labels)
        # np.random.seed(0)
        # self.weights = np.random.rand(num_features, num_labels)
        # self.bias = np.random.rand(num_labels)

    def softmax(self, Z):
        """Softmax function: normalizing logit scores
        :param Z([num_documents, num_labels])
        :return e^Z/sum_{i=0}^{k}{e^{Z}}
        """
        return np.exp(Z - np.max(Z, axis=1, keepdims=True))/np.sum(np.exp(Z), axis=1, keepdims=True)
    
    def sigmoid(self, Z):
        """Sigmoid function for binary classification

        :param Z([num_documents, num_labels])
        :return 1/(1+e^{-Z})
        """
        return 1/(1 + np.exp(-Z))

        
    def predict_prob(self, X, weights, bias, multinomial):
        """Return prediction of shape [num_documents, num_labels]
        """
        # z[num_documents, num_labels] = X[num_documents, num_features]*W[num_features, num_labels] + bias[num_labels]
        Z = np.dot(X, weights) + bias

        if multinomial:
            # Apply Softmax
            S = self.softmax(Z)
        else:
            # Apply Sigmoid
            S = self.sigmoid(Z)
        return S

    def cross_entropy_loss(self, S, target):
        """Calculate the cross-entropy
        L = -1/n*_sum_{i=0}^{n}{y_i*log(s_i)} 
        y label is a vector containing K classes where yc = 1 if c is the correct class and the remaining elements will be 0.

        :param S[num_documents, num_labels]: probabilities of features after softmax
        :target [num_documents, num_labels]: target one hot encoded
        """
        return -np.mean(np.log(S)*target)
    
    def binary_cross_entropy_loss(self, S, target):
        """Calculate Binary cross-entropy
        """
        return  -np.mean(target*(np.log(S)) + (1-target)*np.log(1-S))

    def OneHot(self, targets, num_labels):
        """Convert arrary of targets to One Hot 
        :param targets([num_documents,])
        :param num_labels(int)
        :return Y[num_documents, num_labels]
        """
        Y_onehot = np.zeros((len(targets), num_labels))
        Y_onehot[np.arange(len(targets)), targets] = 1
        return Y_onehot
    
    def predict(self, X, weights, bias, multinomial):
        """Return prediction of X with the categorical values]
        """
        # z[num_documents, num_labels] = X[num_documents, num_features]*W[num_features, num_labels] + bias[num_labels]
        Z = np.dot(X, weights) + bias

        if multinomial:
            # Apply Softmax
            S = self.softmax(Z)

            # Rows with highest probability
            S_max = np.argmax(S, axis=1)
        else:
            # Apply Sigmoid
            S = self.sigmoid(Z)
            print(S)
            # Rows with highest probability
            S_max = [1 if i > 0.5 else 0 for i in S]

        return S_max
    

    def train(self, input_file, verbose=False):
        """
        This method is used to train your models and generated for a given input_file a trained model
        :param input_file: path to training file with a text and a label per each line
        :return: model: trained model 
        """
        # Read dataset and create vocabulary
        features_lr_class = Features_LR(input_file, self.threshold, self.max_features)

        # Transform dataset to TF-IDF space
        # Return features with format (n_documents, size_vocabulary)
        X = features_lr_class.tf_idf(features_lr_class.tokenized_text)
        
        # Y
        Y_mapping = {label: index for index, label in enumerate(np.unique(features_lr_class.labels))}
        self.Y_to_categorical = {index: label for label, index in Y_mapping.items()} # dictionary to convert back y's to categorical
        Y = [Y_mapping[y] for y in features_lr_class.labels]

        # Initialize Weights
        sample_size = len(features_lr_class.tokenized_text)
        n_features = len(features_lr_class.vocabulary)
        num_labels = len(features_lr_class.labelset)


        # Check if it's multinomial or binary classification
        if num_labels == 2:
            multinomial = False
            num_labels = 1 # Only one column to reference 0 or 1
        else:
            multinomial = True

        self.initialize_weights(n_features, num_labels)

        # One Hot encoded Y
        if multinomial:
            Y_onehot = self.OneHot(Y, num_labels)
        else:
            Y_onehot = np.array(Y).reshape(-1, 1)

        for i in range(self.epochs):
            # Z = softmax(X*W + b)
            prob = self.predict_prob(X, self.weights, self.bias, multinomial)

            # break            
            # dL/dW
            grad_w = (1/sample_size)*np.dot(X.T, prob - Y_onehot)
            grad_b =  (1/sample_size)*np.sum(prob - Y_onehot, axis=0)

            # Updating weights and bias
            # if balanced:
            #     grad_w = grad_w * class_weights

            self.weights = self.weights - (self.learning_rate*grad_w)
            self.bias = self.bias - (self.learning_rate*grad_b)

            # Computing cross-entropy loss
            if multinomial:
                loss = self.cross_entropy_loss(prob, Y_onehot)
            else:
                loss = self.binary_cross_entropy_loss(prob, Y_onehot)

            if verbose:
                print(f"Epoch: {i+1} - Loss: {loss}")

        model = {
            "feature_weights": {
                "weights": self.weights,
                "bias": self.bias,
                "Y_to_categorical": self.Y_to_categorical
            },
            "Feature": features_lr_class
        }
        ## Save the model
        self.save_model(model)
        return X, Y_onehot, prob


    def classify(self, input_file, model):
        """
        This method will be called by us for the validation stage and or you can call it for evaluating your code 
        on your own splits on top of the training sets seen to you
        :param input_fixle: path to input file with a text per line without labels
        :param model: the pretrained model
        :return: predictions list
        """
        ## TODO write your code here (and change return)

        feature_weights = model["feature_weights"]
        Feature_LR_class = model["Feature"]

        # Read Input File
        tokenized_text = Feature_LR_class.read_inference_file(input_file)
        X = []

        # Get features from inference file
        for sentence in tokenized_text:
            # Transform dataset to TF-IDF space
            # Return features with format (1, size_vocabulary)
            X_sentence = Feature_LR_class.get_features(sentence, Feature_LR_class.idf)

            # Concatenate A and B vertically
            X.append(X_sentence)

        X = np.vstack(X)

        # Prediction
        multinomial = True if len(feature_weights['Y_to_categorical'].keys()) > 2 else False
        preds_numerical = self.predict(X, feature_weights['weights'], feature_weights['bias'], multinomial)
        # Map indexes to Categorical space
        preds_label = []
        probs = self.predict_prob(X, feature_weights['weights'], feature_weights['bias'], multinomial)
        for y in preds_numerical:
            tmp = feature_weights['Y_to_categorical'][y]
            preds_label.append(tmp)
        
        return preds_label, probs

In [1406]:
# questions
train_file = "work/datasets/questions/train.txt"
pred_file = "work/datasets/questions/val.test"
pred_true_labels = "work/datasets/questions/val.txt"
model_file_name = "logreg.questions.model"
model_LR = LogisticRegression(model_file_name, learning_rate=0.00005, epochs=1000, threshold=1, max_features=500)

# # odiya
# train_file = "work/datasets/odiya/train.txt"
# pred_file = "work/datasets/odiya/val.test"
# pred_true_labels = "work/datasets/odiya/val.txt"
# model_file_name = "logreg.odiya.model"
# model_LR = LogisticRegression(model_file_name, learning_rate=0.000001, epochs=1000, threshold=10, max_features=1000)


# 4dim
# train_file = "work/datasets/4dim/train.txt"
# pred_file = "work/datasets/4dim/val.test"
# pred_true_labels = "work/datasets/4dim/val.txt"
# model_file_name = "logreg.4dim.model"
# model_LR = LogisticRegression(model_file_name, learning_rate=0.2, epochs=200, threshold=1, max_features=100)


# # Products
# train_file = "work/datasets/products/train.txt"
# pred_file = "work/datasets/products/val.test"
# pred_true_labels = "work/datasets/products/val.txt"
# model_file_name = "logreg.products.model"
# model_LR = LogisticRegression(model_file_name, learning_rate=0.9, epochs=1000, threshold=1, max_features=500)

In [1407]:
X, Y, prob  = model_LR.train(train_file, verbose=True)

Epoch: 1 - Loss: 0.29862657820467586
Epoch: 2 - Loss: 0.2988450038709276
Epoch: 3 - Loss: 0.2990650998372262
Epoch: 4 - Loss: 0.29928685496032864
Epoch: 5 - Loss: 0.29951025852295926
Epoch: 6 - Loss: 0.29973530022087197
Epoch: 7 - Loss: 0.29996197015032944
Epoch: 8 - Loss: 0.3001902587959862
Epoch: 9 - Loss: 0.3004201570191589
Epoch: 10 - Loss: 0.30065165604647276
Epoch: 11 - Loss: 0.3008847474588693
Epoch: 12 - Loss: 0.30111942318096463
Epoch: 13 - Loss: 0.3013556754707443
Epoch: 14 - Loss: 0.3015934969095842
Epoch: 15 - Loss: 0.3018328803925857
Epoch: 16 - Loss: 0.30207381911921477
Epoch: 17 - Loss: 0.3023163065842338
Epoch: 18 - Loss: 0.3025603365689159
Epoch: 19 - Loss: 0.3028059031325331
Epoch: 20 - Loss: 0.3030530006041075
Epoch: 21 - Loss: 0.30330162357441737
Epoch: 22 - Loss: 0.30355176688824864
Epoch: 23 - Loss: 0.3038034256368844
Epoch: 24 - Loss: 0.30405659515082223
Epoch: 25 - Loss: 0.30431127099271416
Epoch: 26 - Loss: 0.3045674489505195
Epoch: 27 - Loss: 0.304825125030864

In [1408]:
prob = model_LR.softmax(np.dot(X, model_LR.weights) + model_LR.bias)

In [1409]:
model_LR_loaded = model_LR.load_model()

In [1410]:
model_LR_loaded

{'feature_weights': {'weights': array([[ 3.25415550e-03,  3.40083798e-03, -2.39145014e-04,
           3.82157057e-03,  2.24468933e-03,  2.05746647e-03],
         [ 3.89265284e-03,  3.91929004e-03, -6.27197695e-05,
           2.66371856e-03,  1.98998596e-03,  1.48411806e-03],
         [ 5.09642459e-03,  3.73958991e-03, -1.54583134e-04,
           3.55029246e-03,  2.59382679e-03,  2.25011124e-03],
         ...,
         [ 1.09699452e-02,  9.36323969e-03, -4.38945047e-04,
           9.87727667e-03,  5.92061110e-03,  6.04844393e-03],
         [ 1.09704307e-02,  9.33492488e-03, -4.38859468e-04,
           9.89194017e-03,  5.92081974e-03,  6.04865668e-03],
         [ 1.09696609e-02,  9.33434809e-03, -4.38993752e-04,
           9.86221733e-03,  5.93609747e-03,  6.08473742e-03]]),
  'bias': array([ 0.0071371 ,  0.00863322, -0.00065039,  0.00850179,  0.00541161,
          0.00575694]),
  'Y_to_categorical': {0: '0', 1: '1', 2: '2', 3: '3', 4: '4', 5: '5'}},
 'Feature': <__main__.Features_LR at 

In [1411]:
preds, prob = model_LR.classify(pred_file + ".txt", model_LR_loaded)

In [1419]:
prob

array([[9.18802537e-04, 3.65680354e-04, 1.34041123e-06, 4.83723813e-04,
        5.16377425e-05, 5.56870046e-05],
       [2.36775021e-03, 1.09845466e-03, 1.01805903e-05, 1.38708610e-03,
        2.14370899e-04, 2.28075746e-04],
       [2.37192807e-03, 1.10075887e-03, 1.02260689e-05, 1.39341013e-03,
        2.15000688e-04, 2.28815065e-04],
       ...,
       [7.58660477e-03, 4.26695414e-03, 1.27146955e-04, 5.08164719e-03,
        1.25209013e-03, 1.31134259e-03],
       [2.32929378e-03, 1.07673851e-03, 9.82390274e-06, 1.36080264e-03,
        2.08984722e-04, 2.22684235e-04],
       [2.33349421e-03, 1.07851072e-03, 9.85910030e-06, 1.36272409e-03,
        2.09496960e-04, 2.23193498e-04]])

In [1412]:
set(preds)

{'0'}

In [1413]:
## Save the predictions: one label prediction per line
with open(pred_file + ".pred.txt", "w") as file:
    for pred in preds:
        file.write(pred+"\n")

# Evaluation

In [1414]:
import pandas as pd

In [1415]:
true_dataset = pd.read_csv(pred_true_labels, sep='\t', header=None, names=['text', 'true_label'])
pred_dataset = pd.read_csv(pred_file + ".pred.txt", sep='\t', header=None, names=['pred'])

In [1416]:
train_dataset = pd.read_csv(train_file, sep='\t', header=None, names=['text', 'true_label'])

In [1417]:
train_dataset['true_label'].value_counts()

true_label
1    755
3    755
0    688
5    522
4    498
2     53
Name: count, dtype: int64

In [1418]:
# Check if the columns have the same name; adjust as needed
column_name = 'true_label'  # Change to the actual column name
pred_column_name = 'pred'  # Change to the actual predicted column name

# Merge the two DataFrames on a common index or key if available
merged_df = true_dataset.merge(pred_dataset, left_index=True, right_index=True)

# Calculate the accuracy by comparing the two columns
accuracy = (merged_df[column_name] == merged_df[pred_column_name]).mean()

# Print the accuracy as a percentage
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 19.07%
