In [89]:
from operator import methodcaller
import string
import re
from collections import Counter, defaultdict
import numpy as np
from itertools import islice
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [90]:
def expand_contradictions(text):

    contraction_mapping = {
        "won't": "will not",
        "can't": "can not",
        "n't": " not",
        "'re": " are",
        "'s": " is",
        "'d": " would",
        "'ll": " will",
        "'ve": " have",
        "'m": " am"
    }

    pattern = re.compile(r"\b(?:" + "|".join(re.escape(contraction) for contraction in contraction_mapping.keys()) + r")\b")
    text = pattern.sub(lambda x: contraction_mapping[x.group()], text)
    
    return text

def remove_digits_and_words_digits(text):
    # Define a regular expression pattern to match words containing digits
    pattern = r'\b\w*\d\w*\b'
    text_without_words_with_digits = re.sub(pattern, '', text)

    return text_without_words_with_digits

def remove_stop_words(text):
    stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll",
                  "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
                  'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 
                  'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
                  'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
                  'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against',
                  'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on',
                  'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
                  'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than',
                  'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've',
                  'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven',
                  "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
                  'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
    text_clean = []
    for word in text:
        if word not in stop_words:
            text_clean.append(word)
    return text_clean


def tokenize(text, split=True):
    # TODO customize to your needs
    text = text.translate(str.maketrans({key: " {0} ".format(key) for key in string.punctuation}))
    # re.sub('[^a-zA-Z]', '', dataset['Text'][i])

    # Text preprocessing techniques:
    # 1) Lowercase
    text = text.lower()

    # 2) Expand Contradictions
    text = expand_contradictions(text)

    # 3) Remove punctuations
    text = re.sub('[%s]' % re.escape(string.punctuation), '' , text)

    # 4) Remove digits and words with digits
    text = remove_digits_and_words_digits(text)
    if split:
        text = text.split()

    # # 5) Remove Stop Words
    # if stop_words:
    # text = remove_stop_words(text)

    return text

class Features:

    def __init__(self, data_file):
        with open(data_file) as file:
            data = file.read().splitlines()

        data_split = map(methodcaller("rsplit", "\t", 1), data)
        texts, self.labels = map(list, zip(*data_split))

        self.tokenized_text = [tokenize(text) for text in texts]

        self.labelset = list(set(self.labels))

    @classmethod 
    def get_features(cls, tokenized, model):
        # TODO: implement this method by implementing different classes for different features 
        # Hint: try simple general lexical features first before moving to more resource intensive or dataset specific features 
        pass

In [91]:
class Features_FeedForward(Features):

    def __init__(self, input_file, embedding_file, threshold=0, max_features=None):
        super(Features_FeedForward, self).__init__(input_file)
        self.embedding_matrix = self.read_embedding_file(embedding_file) # Need to save EmbeddingMatrix values for inference
        # self.vocabulary = list(self.embedding_matrix.keys())
        # self.word2index = {word: i for i, word in enumerate(self.vocabulary, start=0)}
        # self.index2word = {i: word for i, word in enumerate(self.vocabulary, start=0)}
        self.threshold = threshold
        self.max_features = max_features
        self.vocabulary = None
        self.word2index = None
        self.index2word = None
        self.idf = None # Need to save IDF values for inference

    def adjust_max_seq_length(self, tokenized_text, max_seq_length):
        """Adjust size of data input to the max sequence length
        :param tokenized_text: data input
        :param max_seq_length: the max sequence length
        :return list: truncated sentences
        """
        new_tokenized_text = []
        for sentence in tokenized_text:
            new_tokenized_text.append(sentence[:max_seq_length])
        return new_tokenized_text

        
    def read_embedding_file(self, embedding_file):
        '''Read embedding file

        :param embedding_file (str):
        :return: dict: embedding matrix
        '''

        embedding_matrix = dict()
        try: 
            with open(embedding_file, "r") as file:
                for line in file:
                    values = line.strip().split()
                    word = values[0]
                    word_embedding = np.array([float(emb) for emb in values[1:]])
                    embedding_matrix[word] = word_embedding
            return embedding_matrix
        except OSError as e:
            print("Embedding file " + embedding_file + " is not available, please input the right parth to the file.")

    def read_inference_file(self, input_file):
        """Read inference file that is in the form: <text> i.e. a line
        of text that does not contain a tab.
        """
        with open(input_file) as file:
            data = file.read().splitlines()

        texts = data

        tokenized_text = [tokenize(text) for text in texts]
        return tokenized_text
    
    def create_vocabulary(self, tokenized_text, threshold, max_features=None):
        """Creat vocabulary from training set, considering only words
        that have an occurence > threshold.
        """
        # Append everything together in a dictionary
        flattened_list = [item for sublist in tokenized_text for item in sublist]
        flattened_list_count = Counter(flattened_list)

        # Sort the dictionary by values in descending order
        flattened_list_count = dict(sorted(flattened_list_count.items(), key=lambda item: item[1], reverse=True))

        # Considering only words that have an occurence > threshold.
        flattened_list_count_filter = {word:count for word, count in flattened_list_count.items() if count > threshold}

        # Limit the size of the vocabulary based on max_features
        if max_features:
            flattened_list_count_filter = dict(islice(flattened_list_count_filter.items(), max_features-1))

        # Add to vocabulary the Out-of-Vocabulary token
        return list(flattened_list_count_filter.keys()) + ['UNK']
    
    def tf_idf(self, tokenized_text):
        """Term frequency-inverse document frequency
        """
        # Create Vocabulary
        self.vocabulary = self.create_vocabulary(tokenized_text, self.threshold, self.max_features)
        self.word2index = {word: i for i, word in enumerate(self.vocabulary, start=0)}
        self.index2word = {i: word for i, word in enumerate(self.vocabulary, start=0)}

        size_vocabulary = len(self.vocabulary)
        n_documents = len(tokenized_text)
        tf_array = np.zeros((n_documents, size_vocabulary))
        idf_array = np.zeros(size_vocabulary) # Inverse Document Frequency
        words_per_document = np.zeros(n_documents)
        # Compute Term-Frequency
        for d_i, sentence in enumerate(tokenized_text, start=0):
            words_in_document = []
            for word in sentence:

                index_word = self.word2index.get(word)
                
                if word in self.word2index.keys():
                    tf_array[d_i][index_word] += 1
                    words_per_document[d_i] += 1
                    # Inverse Document Frequency
                    if word not in words_in_document: # does not count repeated words in the same document
                        words_in_document.append(word) 
                        idf_array[index_word] += 1 # number of documents containing the term
        tf = (tf_array + 1)/(words_per_document.reshape(-1, 1) + 1)
        # Smoothing: to avoid division by zero errors and to ensure that terms with zero document
        # frequency still get a non-zero IDF score
        idf = np.log((n_documents + 1)/(idf_array + 1)) + 1 # Smoothing

        self.idf = idf
        tf_idf = tf*idf
        return tf_idf # Shape (n_documents, vocabulary)
    
    def sort_by_tfidf(self, tfidf_matrix, max_seq_length):
        """Sort input documents based on tf*idf score.
        Return top "max_seq_length" words
        :param: tfidf_matrix
        :param: max_seq_length
        :return: sentences ordered by TF-IDF score
        """
        
        # Indices of sorted matrix in descending order
        indices = np.argsort(-tfidf_matrix, axis=1)
        tfidf_matrix_sorted = []

        # Create sorted matrix
        for i in range(tfidf_matrix.shape[0]):
            # sentence in orderd version
            tmp = [self.index2word[index] for index in indices[i][:max_seq_length]]
            tfidf_matrix_sorted.append(tmp)
    
        return tfidf_matrix_sorted
    
    def get_features_tfidf(self, tokenized_sentence, idf_array):
        """Convert sentence to TF-IDF space
        """
        size_vocabulary = len(self.vocabulary)
        tf_array = np.zeros(size_vocabulary)
        words_per_document = 0
        # Compute Term-Frequency
        words_in_document = []
        for word in tokenized_sentence:
            index_word = self.word2index.get(word)
            if word in self.word2index.keys():
                tf_array[index_word] += 1
                words_per_document += 1
        tf = (tf_array + 1)/(words_per_document+1) # with smoothinf
        return tf*idf_array
    
    def get_features(self, tokenized_sentence):
        """Convert sentence to word embeeding values.
        :param tokenized_sentence
        :return feature weights
        """
        sentence_embedding = []
        
        for word in tokenized_sentence:
            # get embedding of word if exists
            try:
                word_emb = self.embedding_matrix[word]
            except: # read UNK token embedding 
                word_emb = self.embedding_matrix["UNK"]
            sentence_embedding.append(word_emb)
        
        return sentence_embedding

In [170]:
from model import Model
from nn_layers import FeedForwardNetwork

class NeuralModel(Model):
    def __init__(self, embeddingfile,
                 max_seq_length,
                 hidden_units, minibatch_size,
                 learning_rate,
                 epochs,
                 hidden_units_other_layers=[],
                 tfidf=False,
                 max_features=None,
                 threshold=0,
                 momentum=0,
                 average_emb_sentence=False):
        '''
        :param embeddingfile: word embedding file
        :param hidden_units: number of hidden units
        :param minibatch_size: mini-batch size
        :param learning_rate: learning_rate: learning
        :param epochs: number of epochs to train for
        :param hidden_units_other_layers (list): number of hidden units in each layer
        :param tfidf: Enable TF-IDF ranking
        :param threshold: TF-IDF Vocabulary size
        :param momentum: TF-IDF Minimum word frequency required
        :param average_emb_sentence: Compute the average of the embeddings in the sentence instead of concatenation
        '''
        # self.network = FeedForwardNetwork()
        self.embeddingfile = embeddingfile
        self.embedding_dim = None
        self.max_seq_length = max_seq_length
        self.average_emb_sentence = average_emb_sentence

        self.hidden_units = [hidden_units] +  hidden_units_other_layers if len(hidden_units_other_layers) > 0 else [hidden_units]
        # self.hidden_units = hidden_units if type(hidden_units) == list else [hidden_units] # list or int
        self.n_hidden_layers = len(self.hidden_units)
        self.weights = [None]*(self.n_hidden_layers + 1)
        self.bias = [None]*(self.n_hidden_layers + 1)


        self.Y_to_categorical = None
        self.minibatch_size = minibatch_size
        self.epochs = epochs
        self.features_ff_class = None
        self.learning_rate = learning_rate
        self.loss = {}
        # TF-IDF Sorting
        self.tfidf = tfidf # enable sorting by tf-idf score
        self.max_features = max_features
        self.threshold = threshold
        # Momentum
        self.momentum = momentum
    
    def initialize_weights(self, n_inputs, n_output):
        # weights = np.zeros((n_inputs, n_output))
        # bias = np.zeros(n_output)
        # np.random.seed(0)
        weights = np.random.rand(n_inputs, n_output)
        bias = np.random.rand(n_output)
        return weights, bias
    
    def relu_function(self, A):
        '''A = x*W + b

        :return: Z = relut(x*A+b)
        '''
        return np.maximum(0, A)
    
    def relu_derivative(self, A):
        return np.where(A > 0, 1, 0)

    def cross_entropy_loss(self, S, target):
        """Calculate the cross-entropy
        L = -1/n*_sum_{i=0}^{n}{y_i*log(s_i)} 
        y label is a vector containing K classes where yc = 1 if c is the correct class and the remaining elements will be 0.

        :param S[num_documents, num_labels]: probabilities of features after softmax
        :target [num_documents, num_labels]: target one hot encoded
        """
        epsilon = 1e-15
        S = np.maximum(epsilon, S)
        S = np.minimum(1 - epsilon, S)
        return -np.mean(np.log(S)*target)

    def softmax(self, Z):
        """Softmax function: normalizing logit scores
        :param Z([num_documents, num_labels])
        :return e^Z/sum_{i=0}^{k}{e^{Z}}
        """
        return np.exp(Z - np.max(Z, axis=1, keepdims=True))/np.sum(np.exp(Z - np.max(Z, axis=1, keepdims=True)), axis=1, keepdims=True)
    
    def OneHot(self, targets, num_labels):
        """Convert arrary of targets to One Hot 
        :param targets([num_documents,])
        :param num_labels(int)
        :return Y[num_documents, num_labels]
        """
        Y_onehot = np.zeros((len(targets), num_labels))
        Y_onehot[np.arange(len(targets)), targets] = 1
        return Y_onehot
    
    def predict(self, X):
        """Return prediction of X with the categorical values]
        """
        # z[num_documents, num_labels] = X[num_documents, num_features]*W[num_features, num_labels] + bias[num_labels]
        Z_i = np.dot(X, self.weights[0]) + self.bias[0]
        A_i = self.relu_function(Z_i)
        i = 0
        if self.n_hidden_layers > 1:
            for i in range(self.n_hidden_layers-1):
                Z_i = np.dot(A_i, self.weights[i+1]) + self.bias[i+1]
                A_i = self.relu_function(Z_i)

            i = i + 1
        Z_i = np.dot(A_i, self.weights[i+1]) + self.bias[i+1]
        O = self.softmax(Z_i)

        # Rows with highest probability
        S_max = np.argmax(O, axis=1)

        return S_max
    
    def convert_to_embeddings(self, sentence, average_emb_sentence=False):
        '''Convert sentence to embeddings.

        :param average_sentence:  Compute the element-wise average of a list of sentence embedding
        '''
        emb = self.features_ff_class.get_features(sentence)
        if not average_emb_sentence:
            if emb: # if there is a word
                emb_concat = np.concatenate(emb, axis=0)
            else:
                emb_concat = []
            # If you need padding words (i.e., your input is too short), use a vector of zeroes
            if len(emb) < self.max_seq_length:
                # Missing words
                words_missing = self.max_seq_length - len(emb)
                # print(words_missing)
                emb_concat = np.pad(emb_concat, (0, words_missing*self.embedding_dim), 'constant')
        else:
            # Compute average of the sentence
            if len(emb) > 0:
                stacked_arrays = np.vstack(emb)
                emb_concat = np.mean(stacked_arrays, axis=0)
            else:
                emb_concat = np.zeros(self.embedding_dim)

        return emb_concat

    
    def train(self, input_file, verbose=False):

        # Read dataset and create vocabulary
        features_ff_class = Features_FeedForward(input_file, self.embeddingfile, threshold=self.threshold, max_features=self.max_features)
        self.features_ff_class = features_ff_class
        num_labels = len(features_ff_class.labelset)

        # Convert Y from categorical to integers values
        Y_mapping = {label: index for index, label in enumerate(np.unique(features_ff_class.labels))}
        self.Y_to_categorical = {index: label for label, index in Y_mapping.items()} # dictionary to convert back y's to categorical
        Y = [Y_mapping[y] for y in features_ff_class.labels]
        # Convert to OneHot for computing Loss
        Y_onehot = self.OneHot(Y, num_labels)

        # Get embedding dim
        self.embedding_dim = list(features_ff_class.embedding_matrix.values())[0].shape[0]

        # Number of sentences
        sample_size = len(features_ff_class.tokenized_text)

        # X_train: shape: 50f or 300f-dim × features (u)
        if not self.average_emb_sentence:
            n_inputs = self.max_seq_length*self.embedding_dim # number of features
        else:
            n_inputs = self.embedding_dim
        X_train = np.zeros((sample_size, n_inputs))

        if self.tfidf: # Truncate input to the max sequence length sorted by TF-IDF
            tf_idf = features_ff_class.tf_idf(features_ff_class.tokenized_text)
            trunc_tokenized_text = features_ff_class.sort_by_tfidf(
                tf_idf,
                self.max_seq_length
            )
        else:
            # Truncate input to the max sequence length
            trunc_tokenized_text = features_ff_class.adjust_max_seq_length(
                features_ff_class.tokenized_text,
                self.max_seq_length
            )
        # Convert to embeddings with zero-padding
        for i, sentence in enumerate(trunc_tokenized_text):
            sentence_emb = self.convert_to_embeddings(sentence, average_emb_sentence=self.average_emb_sentence)
            X_train[i] = sentence_emb

        minibatch_size = self.minibatch_size

        # Initialize Weights
        # Create W_a and b_a
        # W_0[n_documents, hidden_units (u)]
        # b_0[hidden_units (u)]

        list_of_sizes = [n_inputs] + self.hidden_units + [num_labels]
        for i in range(self.n_hidden_layers + 1):
            weights, bias = self.initialize_weights(list_of_sizes[i], list_of_sizes[i+1])
            self.weights[i] = weights
            self.bias[i] = bias

        # Initilze Momentum weights
        prev_dW_i = [0] * (self.n_hidden_layers + 1)
        prev_db_i = [0] * (self.n_hidden_layers + 1)

        # Permutate the dataset to increase randomness
        np.random.seed(0)
        permutation = np.random.permutation(sample_size)
        # X_train[n_documents, n_features]
        X_permutation = X_train[permutation]
        Y_permutation_onehot = Y_onehot[permutation]

        for n_epoch in range(self.epochs):
            # Mini-batch_size Implementation
            mini_batch_loss = []
            for j in range(0, sample_size, minibatch_size):
                X_mini_batch = X_permutation[j:j+minibatch_size]
                y_mini_batch = Y_permutation_onehot[j:j+minibatch_size]

                ##########################################################
                # ---------------------FORWARD PASS--------------------- #
                ##########################################################
                # List of outputs of each layer
                # A[0] -> Input Layer
                # A[.] => Hidden Layer
                # A[n] -> Ouput Layer
                A = [None]*(self.n_hidden_layers + 2) 
                Z = [None]*(self.n_hidden_layers + 2)
                # ---------------- Input Layer --------------- #
                A[0] = X_mini_batch
                Z[0] = X_mini_batch

                # ---------------- Hidden Layers --------------- #
                for i in range(self.n_hidden_layers):
                    # Z_i = np.dot(X_mini_batch, self.weights_i) + self.bias_i
                    # A_i = relu(Z_i)
                    Z_tmp = np.dot(A[i], self.weights[i]) + self.bias[i]
                    Z[i+1] = Z_tmp
                    A_tmp = self.relu_function(Z_tmp)
                    A[i+1] = A_tmp
                # ---------------- Hidden-to-Output Layer --------------- #

                i = i + 1
                # print(i)
                Z_output_layer = np.dot(A[i], self.weights[self.n_hidden_layers]) + self.bias[self.n_hidden_layers]
                Z[i+1] = Z_output_layer
                A_output_layer = self.softmax(Z_output_layer)
                A[i+1] = A_output_layer


                ##########################################################
                # -------------------BACKWARD PASS---------------------- #
                ##########################################################

                # Compute Gradients
                # List of output gradients of each layer
                # dZ[0] -> Input Layer
                # dZ[.] => Hidden Layer
                # dZ[n] -> Ouput Layer
                dZ = [None] * (self.n_hidden_layers + 1)

                # dW[previous_layer, next_layer]
                dW = [None] * (self.n_hidden_layers + 1)
                db = [None] * (self.n_hidden_layers + 1)
                dZ[-1] = A[-1] - y_mini_batch
                for i in range(self.n_hidden_layers, 0, -1):
                    dW[i] = (1/minibatch_size)*np.dot(A[i].T, dZ[i])
                    db[i] = (1/minibatch_size)*np.sum(dZ[i], axis=0, keepdims = True)
                    dZ[i-1] = np.dot(dZ[i], self.weights[i].T)*self.relu_derivative(Z[i])

                # print(dZ[i-1])
                dW[0] = (1/minibatch_size)*np.dot(X_mini_batch.T, dZ[i-1])
                db[0] = (1/minibatch_size)*np.sum(dZ[i-1], axis=0, keepdims = True)

                # Update Weights
                for i in range(self.n_hidden_layers + 1):
                    self.weights[i] = self.weights[i] - (self.learning_rate*dW[i] + self.momentum*prev_dW_i[i])
                    self.bias[i] = self.bias[i] - (self.learning_rate*db[i] + self.momentum*prev_db_i[i])
                    # Momentum
                    # Save previous gradients for Momentum
                    prev_dW_i[i] =  self.learning_rate*dW[i] + self.momentum*prev_dW_i[i]
                    prev_db_i[i] =  self.learning_rate*db[i] + self.momentum*prev_db_i[i]

                ########
                # Loss #
                ########
                # print(y_mini_batch)
                mini_batch_loss.append(self.cross_entropy_loss(A[-1], y_mini_batch))

            loss = np.mean(mini_batch_loss)
            self.loss[n_epoch] = loss
            if verbose:
                print(f"Epoch: {n_epoch+1} - Loss: {loss}")

    def classify(self, input_file):
        # Read Input File
        tokenized_text = self.features_ff_class.read_inference_file(input_file)

        if self.tfidf:
            tf_idf_inference = []
            # Get features from inference file
            for sentence in tokenized_text:
                # Transform dataset to TF-IDF space
                # Return features with format (1, size_vocabulary)
                X_sentence = self.features_ff_class.get_features_tfidf(sentence, self.features_ff_class.idf)
                tf_idf_inference.append(X_sentence)
            tf_idf_inference = np.stack(tf_idf_inference)
            trunc_tokenized_text = self.features_ff_class.sort_by_tfidf(
                tf_idf_inference,
                self.max_seq_length
            )
        else:
            # Truncate input to the max sequence length
            trunc_tokenized_text = self.features_ff_class.adjust_max_seq_length(
                tokenized_text,
                self.max_seq_length
            )

        X_test = []
        # Convert to embeddings with zero padding
        for i, sentence in enumerate(trunc_tokenized_text):
            sentence_emb = self.convert_to_embeddings(sentence, average_emb_sentence=self.average_emb_sentence)
            X_test.append(sentence_emb)
        X_test = np.vstack(X_test)

        # Make Prediction
        y_test = self.predict(X_test)
        preds_label = []
        for y in y_test:
            tmp = self.Y_to_categorical[y]
            preds_label.append(tmp)
        
        return preds_label

In [229]:
############
# Products #
############

# train_file = "datasets/products/train.txt"
# emb_file = "glove.6B.50d.txt"
# pred_file = "datasets/products/val.test"
# pred_true_labels = "datasets/products/val.txt"
# model_file_name = "products.model"
# loss_file = "datasets/products/loss.txt"
# nn_model = NeuralModel(emb_file, max_seq_length=15, hidden_units=10, minibatch_size=256, learning_rate=0.2, epochs=500, tfidf=True, max_features=500, threshold=2, momentum=0.4) # 65.6%

# Validation #


# 1 Layer
# nn_model = NeuralModel(emb_file, max_seq_length=15, hidden_units=10, minibatch_size=256, learning_rate=0.2, epochs=500) # 62.10%
# nn_model = NeuralModel(emb_file, max_seq_length=15, hidden_units=10, minibatch_size=256, learning_rate=0.2, epochs=500, tfidf=True, max_features=1000, threshold=2) # 66.23%
# nn_model = NeuralModel(emb_file, max_seq_length=15, hidden_units=10, minibatch_size=256, learning_rate=0.2, epochs=500, tfidf=True, max_features=1000, threshold=2, momentum=0.4) #66.17


# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=50, minibatch_size=256, learning_rate=0.2, epochs=500, tfidf=True, max_features=1000, threshold=1, momentum=0.9, average_emb_sentence=True) # 70.86%

# 2 Layers
# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=4, hidden_units_other_layers=[4], minibatch_size=32, learning_rate=0.02, epochs=200) # 67.33
# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=4, hidden_units_other_layers=[4], minibatch_size=32, learning_rate=0.02, epochs=200, tfidf=True, max_features=1000, threshold=0) # 59.01
# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=4, hidden_units_other_layers=[4], minibatch_size=32, learning_rate=0.02, epochs=200, tfidf=True, max_features=1000, threshold=0, momentum=0.1)

##BEST MODEL###
# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=10, hidden_units_other_layers=[10], minibatch_size=32, learning_rate=0.02, epochs=200, tfidf=True, max_features=1000, threshold=1, momentum=0.4, average_emb_sentence=True) 

# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=10, hidden_units_other_layers=[10, 10], minibatch_size=32, learning_rate=0.005, epochs=200, tfidf=True, max_features=1000, threshold=1, momentum=0.4, average_emb_sentence=True) 
# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=10, hidden_units_other_layers=[10], minibatch_size=32, learning_rate=0.02, epochs=200, tfidf=False, max_features=1000, threshold=1, momentum=0.4, average_emb_sentence=True) 

# nn_model = NeuralModel(emb_file, max_seq_length=10, hidden_units=5, minibatch_size=256, learning_rate=0.1, epochs=500, tfidf=True, max_features=500, threshold=2) # 64%
# nn_model = NeuralModel(emb_file, max_seq_length=10, hidden_units=5, minibatch_size=32, learning_rate=0.1, epochs=100)
# nn_model = NeuralModel(emb_file, max_seq_length=15, hidden_units=10, minibatch_size=256, learning_rate=0.2, epochs=500, tfidf=True, max_features=500, threshold=2)
# nn_model = NeuralModel(emb_file, max_seq_length=10, hidden_units=[15, 3], minibatch_size=32, learning_rate=0.002, epochs=100) # 65%
# nn_model = NeuralModel(emb_file, max_seq_length=10, hidden_units=5, minibatch_size=32, learning_rate=0.1, epochs=100)
# nn_model = NeuralModel(emb_file, max_seq_length=15, hidden_units=10, minibatch_size=256, learning_rate=0.2, epochs=500, tfidf=True, max_features=500, threshold=2, momentum=0.4)

# 2 Layers
# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=[20, 10], minibatch_size=256, learning_rate=0.001, epochs=100, tfidf=True, max_features=500, threshold=2, momentum=0.9) # 59.01%
# nn_model = NeuralModel(emb_file, max_seq_length=15, hidden_units=[10, 10], minibatch_size=256, learning_rate=0.2, epochs=500, tfidf=True, max_features=500, threshold=2, momentum=0.4) # 59.01%
# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=[10, 10], minibatch_size=32, learning_rate=0.0001, epochs=100) # 58.98%


# 3 Layers
# nn_model = NeuralModel(emb_file, max_seq_length=20, hidden_units=[2, 2, 2], minibatch_size=128, learning_rate=0.2, momentum=0, epochs=100)  # Accuracy: 62.73%
# nn_model = NeuralModel(emb_file, max_seq_length=20, hidden_units=[2, 2, 2], minibatch_size=128, learning_rate=0.2, momentum=0.1, epochs=100)  # Accuracy: 63.23%
# nn_model = NeuralModel(emb_file, max_seq_length=20, hidden_units=[3, 3, 3], minibatch_size=32, learning_rate=0.02, momentum=0.1, epochs=200)  # Accuracy: 65.77%
# nn_model = NeuralModel(emb_file, max_seq_length=30, hidden_units=[3, 3, 3], minibatch_size=32, learning_rate=0.02, momentum=0.1, epochs=200)  # Accuracy: 66.40%
# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=[3, 3, 3], minibatch_size=32, learning_rate=0.02, momentum=0.1, epochs=200)  # Accuracy: 68.27%
# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=[3, 3, 3], minibatch_size=32, learning_rate=0.015, momentum=0.1, epochs=200)


########
# 4dim #
########

# train_file = "datasets/4dim/train.txt"
# emb_file = "glove.6B.50d.txt"
# pred_file = "datasets/4dim/val.test"
# pred_true_labels = "datasets/4dim/val.txt"
# model_file_name = "4dim.model"
# loss_file = "datasets/4dim/loss.txt"
# nn_model = NeuralModel(emb_file, max_seq_length=15, hidden_units=20, minibatch_size=32, learning_rate=0.05, epochs=100)
# nn_model = NeuralModel(emb_file, max_seq_length=15, hidden_units=20, minibatch_size=128, learning_rate=0.05, epochs=100, tfidf=True, threshold=2, max_features=1000)
# nn_model = NeuralModel(emb_file, max_seq_length=50, hidden_units=100, minibatch_size=256, learning_rate=0.01, epochs=200) # 34.62%
# nn_model = NeuralModel(emb_file, max_seq_length=50, hidden_units=100, minibatch_size=256, learning_rate=0.005, epochs=200) # 35.58%
# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=100, minibatch_size=256, learning_rate=0.003, epochs=500, momentum=0.9) # 37.82%
# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=100, minibatch_size=256, learning_rate=0.0015, epochs=500, momentum=0.9) # 41.99%
# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=100, minibatch_size=256, learning_rate=0.0015, epochs=500, momentum=0.9, tfidf=True, threshold=2, max_features=1000) # Accuracy: 44.55%
# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=100, minibatch_size=256, learning_rate=0.0015, epochs=500, momentum=0.9, tfidf=True, threshold=0, max_features=10000) # Accuracy: 46.47%
# nn_model = NeuralModel(emb_file, max_seq_length=20, hidden_units=[80, 50], minibatch_size=128, learning_rate=0.0001, epochs=500, momentum=0.9) # 28.21%
# nn_model = NeuralModel(emb_file, max_seq_length=20, hidden_units=[5, 5, 5], minibatch_size=128, learning_rate=0.0001, epochs=500, momentum=0.9) 


# Validation #

# 1 Layer
# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=100, minibatch_size=32, learning_rate=0.002, epochs=500) # Accuracy: 46.47%
# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=100, minibatch_size=32, learning_rate=0.002, epochs=500, tfidf=True, threshold=0, max_features=10000) # Accuracy: 45.51%
# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=100, minibatch_size=32, learning_rate=0.002, epochs=500, tfidf=True, threshold=0, max_features=10000,momentum=0.4) # Accuracy: 45.51%
# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=100, minibatch_size=256, learning_rate=0.0015, epochs=500, momentum=0.9, tfidf=True, threshold=0, max_features=10000) # Accuracy: 46.47%

# AVG Embedding
# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=100, minibatch_size=256, learning_rate=0.0015, epochs=500, momentum=0.9, tfidf=True, threshold=0, max_features=10000, average_emb_sentence=True) # Accuracy: 56.09%

##BEST MODEL###
# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=80, minibatch_size=64, learning_rate=0.0015, epochs=500, momentum=0.95, tfidf=True, threshold=1, max_features=1000, average_emb_sentence=True) # Accuracy: 65.06%
# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=80, hidden_units_other_layers=[10, 10], minibatch_size=64, learning_rate=0.0001, epochs=500, momentum=0.95, tfidf=True, threshold=1, max_features=1000, average_emb_sentence=True) # Accuracy: 65.06%

# 2 Layer
# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=50, hidden_units_other_layers=[50], minibatch_size=64, learning_rate=0.001, epochs=200) # 28.53%
# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=50, hidden_units_other_layers=[50], minibatch_size=64, learning_rate=0.001, epochs=200, tfidf=True, threshold=0, max_features=10000)  # 29.81%
# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=50, hidden_units_other_layers=[50], minibatch_size=64, learning_rate=0.001, epochs=200, tfidf=True, threshold=0, max_features=10000, momentum=0.2) #28.85%
# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=50, hidden_units_other_layers=[50], minibatch_size=32, learning_rate=0.002, epochs=300, tfidf=True, threshold=1, max_features=10000, momentum=0.5, average_emb_sentence=True) # Accuracy: 57.37%


#############
# questions #
#############

# train_file = "datasets/questions/train.txt"
# emb_file = "ufvytar.100d.txt"
# pred_file = "datasets/questions/val.test"
# pred_true_labels = "datasets/questions/val.txt"
# model_file_name = "questions.model"
# loss_file = "datasets/questions/loss.txt"
# nn_model = NeuralModel(emb_file, max_seq_length=10, hidden_units=5, minibatch_size=32, learning_rate=0.1, epochs=100) # Accuracy: 65.77%
# nn_model = NeuralModel(emb_file, max_seq_length=30, hidden_units=9, minibatch_size=64, learning_rate=0.01, epochs=200, momentum=0.9) # Accuracy: 66.14%
# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=9, minibatch_size=64, learning_rate=0.01, epochs=200, momentum=0.9) # Accuracy: 67.24%
# nn_model = NeuralModel(emb_file, max_seq_length=50, hidden_units=[5, 5], minibatch_size=128, learning_rate=0.01, epochs=300, momentum=0.9)
# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=[5, 5, 5], minibatch_size=32, learning_rate=0.01, epochs=300, momentum=0.9) # Accuracy: 63.33%

# Validation
# 1 Layer

##BEST MODEL###
# nn_model = NeuralModel(emb_file, max_seq_length=10, hidden_units=10, minibatch_size=32, learning_rate=0.1, epochs=200, average_emb_sentence=True) # Accuracy: 65.77%
# nn_model = NeuralModel(emb_file, max_seq_length=10, hidden_units=5, minibatch_size=32, learning_rate=0.1, epochs=200, tfidf=True, threshold=0, max_features=1000) # Accuracy: 45.60%
# nn_model = NeuralModel(emb_file, max_seq_length=10, hidden_units=5, minibatch_size=32, learning_rate=0.1, epochs=200, tfidf=True, threshold=0, max_features=1000, momentum=0.2) # Accuracy: 45.60%
# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=50, minibatch_size=128, learning_rate=0.2, epochs=500, tfidf=True, threshold=1, max_features=1000, momentum=0.4, average_emb_sentence=True) # Accuracy: 60.27%

# 2 Layer
# nn_model = NeuralModel(emb_file, max_seq_length=50, hidden_units=5, hidden_units_other_layers=[5], minibatch_size=128, learning_rate=0.025, epochs=400) # Accuracy: 61.37%
# nn_model = NeuralModel(emb_file, max_seq_length=50, hidden_units=5, hidden_units_other_layers=[5], minibatch_size=128, learning_rate=0.025, epochs=400, tfidf=True, threshold=1, max_features=5000, momentum=0.4)
# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=5, hidden_units_other_layers=[5, 5], minibatch_size=128, learning_rate=0.2, epochs=400, tfidf=True, threshold=1, max_features=10000, momentum=0, average_emb_sentence=True) # Accuracy: 61.86%

#########
# odiya #
#########

train_file = "datasets/odiya/train.txt"
emb_file = "fasttext.wiki.300d.vec"
pred_file = "datasets/odiya/val.test"
pred_true_labels = "datasets/odiya/val.txt"
model_file_name = "odia.model"
loss_file = "datasets/odiya/loss.txt"

# Validation
# 1 Layer
# nn_model = NeuralModel(emb_file, max_seq_length=10, hidden_units=5, minibatch_size=32, learning_rate=0.1, epochs=100) # Accuracy: 75.89%
# nn_model = NeuralModel(emb_file, max_seq_length=10, hidden_units=5, minibatch_size=32, learning_rate=0.1, epochs=100,  tfidf=True, threshold=2, max_features=1000) # Accuracy: 74.84%
# nn_model = NeuralModel(emb_file, max_seq_length=10, hidden_units=5, minibatch_size=32, learning_rate=0.1, epochs=100,  tfidf=True, threshold=2, max_features=1000, momentum=0.2) # Accuracy: 75.66%


# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=40, minibatch_size=32, learning_rate=0.12, epochs=200,  tfidf=True, threshold=2, max_features=1000, momentum=0.4, average_emb_sentence=True) # Accuracy: 78.52%

# nn_model = NeuralModel(emb_file, max_seq_length=10, hidden_units=5, minibatch_size=32, learning_rate=0.1, epochs=100, momentum=0.9) # Accuracy: 78.29%
# nn_model = NeuralModel(emb_file, max_seq_length=30, hidden_units=5, minibatch_size=32, learning_rate=0.1, epochs=200, momentum=0.9) # Accuracy: 79.11%

# 2 Layer
# nn_model = NeuralModel(emb_file, max_seq_length=30, hidden_units=5, hidden_units_other_layers=[5], minibatch_size=128, learning_rate=0.01, epochs=200) # Accuracy: 77.20%
# nn_model = NeuralModel(emb_file, max_seq_length=30, hidden_units=5, hidden_units_other_layers=[5], minibatch_size=128, learning_rate=0.01, epochs=200, tfidf=True, threshold=1, max_features=1000) # Accuracy: 50.27%/
# nn_model = NeuralModel(emb_file, max_seq_length=30, hidden_units=5, hidden_units_other_layers=[5], minibatch_size=128, learning_rate=0.01, epochs=200, tfidf=True, threshold=1, max_features=1000, momentum=0.6) # Accuracy: 75.20%
##BEST MODEL###
# nn_model = NeuralModel(emb_file, max_seq_length=30, hidden_units=10, hidden_units_other_layers=[10], minibatch_size=128, learning_rate=0.01, epochs=200, tfidf=True, threshold=1, max_features=2000, momentum=0.9, average_emb_sentence=True)

nn_model = NeuralModel(emb_file, max_seq_length=30, hidden_units=10, hidden_units_other_layers=[10, 10], minibatch_size=128, learning_rate=0.0001, epochs=200, tfidf=True, threshold=1, max_features=2000, momentum=0.4, average_emb_sentence=True)

# nn_model = NeuralModel(emb_file, max_seq_length=30, hidden_units=5, hidden_units_other_layers=[5], minibatch_size=128, learning_rate=0.01, epochs=200, tfidf=True, threshold=1, max_features=1000, momentum=0.6, average_emb_sentence=False) # Accuracy: 50.27%
# nn_model = NeuralModel(emb_file, max_seq_length=30, hidden_units=[5, 5], minibatch_size=128, learning_rate=0.01, epochs=200, momentum=0.9) # Accuracy: 79.97%
# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=[5, 5], minibatch_size=128, learning_rate=0.01, epochs=200, momentum=0.9) # Accuracy: 79.97%

# nn_model = NeuralModel(emb_file, max_seq_length=40, hidden_units=5, hidden_units_other_layers=[5, 5], minibatch_size=32, learning_rate=0.001, epochs=200, momentum=0.9) # Accuracy: 81.32%

# nn_model = NeuralModel(emb_file, max_seq_length=10, hidden_units=10, minibatch_size=64, learning_rate=0.35, epochs=100)
# nn_model = NeuralModel(emb_file, max_seq_length=20, hidden_units=10, minibatch_size=64, learning_rate=0.1, epochs=1000)
# nn_model = NeuralModel(emb_file, max_seq_length=10, hidden_units=5, minibatch_size=32, learning_rate=0.1, epochs=100)


In [230]:
%%time

nn_model.train(train_file, verbose=True)

Epoch: 1 - Loss: 2.9633168168793698
Epoch: 2 - Loss: 0.35326435225323183
Epoch: 3 - Loss: 0.35325853458873785
Epoch: 4 - Loss: 0.35325271327523877
Epoch: 5 - Loss: 0.3532468884777687
Epoch: 6 - Loss: 0.3532410607977945
Epoch: 7 - Loss: 0.353235230300437
Epoch: 8 - Loss: 0.3532293969090031
Epoch: 9 - Loss: 0.3532235612113988
Epoch: 10 - Loss: 0.35321772272170215
Epoch: 11 - Loss: 0.35321188113960234
Epoch: 12 - Loss: 0.3532060365459039
Epoch: 13 - Loss: 0.3532001884944539
Epoch: 14 - Loss: 0.35319433741872147
Epoch: 15 - Loss: 0.3531884838676437
Epoch: 16 - Loss: 0.35318262805290257
Epoch: 17 - Loss: 0.3531767686086186
Epoch: 18 - Loss: 0.35317090590484823
Epoch: 19 - Loss: 0.35316503987619535
Epoch: 20 - Loss: 0.3531591711328338
Epoch: 21 - Loss: 0.3531532990014841
Epoch: 22 - Loss: 0.3531474239972993
Epoch: 23 - Loss: 0.35314154543655996
Epoch: 24 - Loss: 0.353135664331243
Epoch: 25 - Loss: 0.35312977955990993
Epoch: 26 - Loss: 0.35312389129732785
Epoch: 27 - Loss: 0.35311799968765933

In [231]:
loss = list(nn_model.loss.values())

import csv
with open(loss_file, mode='w', newline='') as file:
    writer = csv.writer(file)
    for item in loss:
        writer.writerow([item])

In [232]:
nn_model.save_model(model_file_name)

In [233]:
model = NeuralModel.load_model(model_file_name)

In [234]:

preds = model.classify(pred_file + ".txt")
# preds, t1, t2 = model.classify(pred_file + ".txt")


In [235]:
## Save the predictions: one label prediction per line
with open(pred_file + ".pred.txt", "w") as file:
    for pred in preds:
        file.write(pred+"\n")

# Evaluation

In [236]:
import pandas as pd

In [237]:
## 4dim

# pred_file = "datasets/4dim/val.test"
# pred_true_labels = "datasets/4dim/val.txt"

# odiya

# pred_file = "datasets/odiya/val.test"
# pred_true_labels = "datasets/odiya/val.txt"

# products
# pred_file = "datasets/products/val.test"
# pred_true_labels = "datasets/products/val.txt"

# questions
# pred_file = "datasets/questions/val.test"
# pred_true_labels = "datasets/questions/val.txt"

In [238]:
true_dataset = pd.read_csv(pred_true_labels, sep='\t', header=None, names=['text', 'true_label'])
pred_dataset = pd.read_csv(pred_file + ".pred.txt", sep='\t', header=None, names=['pred'])

In [239]:
true_dataset['true_label'].value_counts()

sports           1345
business          937
entertainment     758
Name: true_label, dtype: int64

In [240]:
pred_dataset['pred'].value_counts()

sports      3039
business       1
Name: pred, dtype: int64

In [241]:
# Check if the columns have the same name; adjust as needed
column_name = 'true_label'  # Change to the actual column name
pred_column_name = 'pred'  # Change to the actual predicted column name

# Merge the two DataFrames on a common index or key if available
merged_df = true_dataset.merge(pred_dataset, left_index=True, right_index=True)

# Calculate the accuracy by comparing the two columns
accuracy = (merged_df[column_name] == merged_df[pred_column_name]).mean()

# Print the accuracy as a percentage
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 44.28%
