In [2]:
from operator import methodcaller
import string
import re
from collections import Counter, defaultdict
import numpy as np
from itertools import islice
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [3]:
def expand_contradictions(text):

    contraction_mapping = {
        "won't": "will not",
        "can't": "can not",
        "n't": " not",
        "'re": " are",
        "'s": " is",
        "'d": " would",
        "'ll": " will",
        "'ve": " have",
        "'m": " am"
    }

    pattern = re.compile(r"\b(?:" + "|".join(re.escape(contraction) for contraction in contraction_mapping.keys()) + r")\b")
    text = pattern.sub(lambda x: contraction_mapping[x.group()], text)
    
    return text

def remove_digits_and_words_digits(text):
    # Define a regular expression pattern to match words containing digits
    pattern = r'\b\w*\d\w*\b'
    text_without_words_with_digits = re.sub(pattern, '', text)

    return text_without_words_with_digits

def remove_stop_words(text):
    stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll",
                  "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
                  'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 
                  'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
                  'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
                  'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against',
                  'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on',
                  'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
                  'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than',
                  'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've',
                  'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven',
                  "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
                  'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
    text_clean = []
    for word in text:
        if word not in stop_words:
            text_clean.append(word)
    return text_clean


def tokenize(text, split=True):
    # TODO customize to your needs
    text = text.translate(str.maketrans({key: " {0} ".format(key) for key in string.punctuation}))
    # re.sub('[^a-zA-Z]', '', dataset['Text'][i])

    # Text preprocessing techniques:
    # 1) Lowercase
    text = text.lower()

    # 2) Expand Contradictions
    text = expand_contradictions(text)

    # 3) Remove punctuations
    text = re.sub('[%s]' % re.escape(string.punctuation), '' , text)

    # 4) Remove digits and words with digits
    text = remove_digits_and_words_digits(text)
    if split:
        text = text.split()

    # # 5) Remove Stop Words
    # if stop_words:
    # text = remove_stop_words(text)

    return text

class Features:

    def __init__(self, data_file):
        with open(data_file) as file:
            data = file.read().splitlines()

        data_split = map(methodcaller("rsplit", "\t", 1), data)
        texts, self.labels = map(list, zip(*data_split))

        self.tokenized_text = [tokenize(text) for text in texts]

        self.labelset = list(set(self.labels))

    @classmethod 
    def get_features(cls, tokenized, model):
        # TODO: implement this method by implementing different classes for different features 
        # Hint: try simple general lexical features first before moving to more resource intensive or dataset specific features 
        pass

In [50]:
class Features_FeedForward(Features):

    def __init__(self, input_file, embedding_file, threshold=0, max_features=None):
        super(Features_FeedForward, self).__init__(input_file)
        self.embedding_matrix = self.read_embedding_file(embedding_file) # Need to save EmbeddingMatrix values for inference
        # self.vocabulary = list(self.embedding_matrix.keys())
        # self.word2index = {word: i for i, word in enumerate(self.vocabulary, start=0)}
        # self.index2word = {i: word for i, word in enumerate(self.vocabulary, start=0)}
        self.threshold = threshold
        self.max_features = max_features
        self.vocabulary = None
        self.word2index = None
        self.index2word = None
        self.idf = None # Need to save IDF values for inference

    def adjust_max_seq_length(self, tokenized_text, max_seq_length):
        """Adjust size of data input to the max sequence length
        :param tokenized_text: data input
        :param max_seq_length: the max sequence length
        :return list: truncated sentences
        """
        new_tokenized_text = []
        for sentence in tokenized_text:
            new_tokenized_text.append(sentence[:max_seq_length])
        return new_tokenized_text

        
    def read_embedding_file(self, embedding_file):
        '''Read embedding file

        :param embedding_file (str):
        :return: dict: embedding matrix
        '''

        embedding_matrix = dict()
        try: 
            with open(embedding_file, "r") as file:
                for line in file:
                    values = line.strip().split()
                    word = values[0]
                    word_embedding = np.array([float(emb) for emb in values[1:]])
                    embedding_matrix[word] = word_embedding
            return embedding_matrix
        except OSError as e:
            print("Embedding file " + embedding_file + " is not available, please input the right parth to the file.")

    def read_inference_file(self, input_file):
        """Read inference file that is in the form: <text> i.e. a line
        of text that does not contain a tab.
        """
        with open(input_file) as file:
            data = file.read().splitlines()

        texts = data

        tokenized_text = [tokenize(text) for text in texts]
        return tokenized_text
    
    def create_vocabulary(self, tokenized_text, threshold, max_features=None):
        """Creat vocabulary from training set, considering only words
        that have an occurence > threshold.
        """
        # Append everything together in a dictionary
        flattened_list = [item for sublist in tokenized_text for item in sublist]
        flattened_list_count = Counter(flattened_list)

        # Sort the dictionary by values in descending order
        flattened_list_count = dict(sorted(flattened_list_count.items(), key=lambda item: item[1], reverse=True))

        # Considering only words that have an occurence > threshold.
        flattened_list_count_filter = {word:count for word, count in flattened_list_count.items() if count > threshold}

        # Limit the size of the vocabulary based on max_features
        if max_features:
            flattened_list_count_filter = dict(islice(flattened_list_count_filter.items(), max_features-1))

        # Add to vocabulary the Out-of-Vocabulary token
        return list(flattened_list_count_filter.keys()) + ['UNK']
    
    def tf_idf(self, tokenized_text):
        """Term frequency-inverse document frequency
        """
        # Create Vocabulary
        self.vocabulary = self.create_vocabulary(tokenized_text, self.threshold, self.max_features)
        self.word2index = {word: i for i, word in enumerate(self.vocabulary, start=0)}
        self.index2word = {i: word for i, word in enumerate(self.vocabulary, start=0)}

        size_vocabulary = len(self.vocabulary)
        n_documents = len(tokenized_text)
        tf_array = np.zeros((n_documents, size_vocabulary))
        idf_array = np.zeros(size_vocabulary) # Inverse Document Frequency
        words_per_document = np.zeros(n_documents)
        # Compute Term-Frequency
        for d_i, sentence in enumerate(tokenized_text, start=0):
            words_in_document = []
            for word in sentence:

                index_word = self.word2index.get(word)
                
                if word in self.word2index.keys():
                    tf_array[d_i][index_word] += 1
                    words_per_document[d_i] += 1
                    # Inverse Document Frequency
                    if word not in words_in_document: # does not count repeated words in the same document
                        words_in_document.append(word) 
                        idf_array[index_word] += 1 # number of documents containing the term
        tf = (tf_array + 1)/(words_per_document.reshape(-1, 1) + 1)
        # Smoothing: to avoid division by zero errors and to ensure that terms with zero document
        # frequency still get a non-zero IDF score
        idf = np.log((n_documents + 1)/(idf_array + 1)) + 1 # Smoothing

        self.idf = idf
        tf_idf = tf*idf
        return tf_idf # Shape (n_documents, vocabulary)
    
    def sort_by_tfidf(self, tfidf_matrix, max_seq_length):
        """Sort input documents based on tf*idf score.
        Return top "max_seq_length" words
        :param: tfidf_matrix
        :param: max_seq_length
        :return: sentences ordered by TF-IDF score
        """
        
        # Indices of sorted matrix in descending order
        indices = np.argsort(-tfidf_matrix, axis=1)
        tfidf_matrix_sorted = []

        # Create sorted matrix
        for i in range(tfidf_matrix.shape[0]):
            # sentence in orderd version
            tmp = [self.index2word[index] for index in indices[i][:max_seq_length]]
            tfidf_matrix_sorted.append(tmp)
    
        return tfidf_matrix_sorted
    
    def get_features_tfidf(self, tokenized_sentence, idf_array):
        """Convert sentence to TF-IDF space
        """
        size_vocabulary = len(self.vocabulary)
        tf_array = np.zeros(size_vocabulary)
        words_per_document = 0
        # Compute Term-Frequency
        words_in_document = []
        for word in tokenized_sentence:
            index_word = self.word2index.get(word)
            if word in self.word2index.keys():
                tf_array[index_word] += 1
                words_per_document += 1
        tf = (tf_array + 1)/(words_per_document+1) # with smoothinf
        return tf*idf_array
    
    def get_features(self, tokenized_sentence):
        """Convert sentence to word embeeding values.
        :param tokenized_sentence
        :return feature weights
        """
        sentence_embedding = []
        
        for word in tokenized_sentence:
            # get embedding of word if exists
            try:
                word_emb = self.embedding_matrix[word]
            except: # read UNK token embedding 
                word_emb = self.embedding_matrix["UNK"]
            sentence_embedding.append(word_emb)
        
        return sentence_embedding

In [65]:
from model import Model
from nn_layers import FeedForwardNetwork

class NeuralModel(Model):
    def __init__(self, embeddingfile, max_seq_length, hidden_units, minibatch_size, learning_rate, epochs, tfidf=False, max_features=None, threshold=0): 
        # self.network = FeedForwardNetwork()
        self.embeddingfile = embeddingfile
        self.embedding_dim = None
        self.max_seq_length = max_seq_length
        self.hidden_units = hidden_units
        self.weights_1 = None
        self.bias_1 = None
        self.weights_2 = None
        self.bias_2 = None
        self.Y_to_categorical = None
        self.minibatch_size = minibatch_size
        self.epochs = epochs
        self.features_ff_class = None
        self.learning_rate = learning_rate
        self.loss = {}
        # TF-IDF Sorting
        self.tfidf = tfidf # enable sorting by tf-idf score
        self.max_features = max_features
        self.threshold = threshold
    
    def initialize_weights(self, n_inputs, n_output):
        # weights = np.zeros((n_inputs, n_output))
        # bias = np.zeros(n_output)
        # np.random.seed(0)
        weights = np.random.rand(n_inputs, n_output)
        bias = np.random.rand(n_output)
        return weights, bias
    
    def relu_function(self, A):
        '''A = x*W + b

        :return: Z = relut(x*A+b)
        '''
        return np.maximum(0, A)
    
    def relu_derivative(self, A):
        return np.where(A > 0, 1, 0)

    def cross_entropy_loss(self, S, target):
        """Calculate the cross-entropy
        L = -1/n*_sum_{i=0}^{n}{y_i*log(s_i)} 
        y label is a vector containing K classes where yc = 1 if c is the correct class and the remaining elements will be 0.

        :param S[num_documents, num_labels]: probabilities of features after softmax
        :target [num_documents, num_labels]: target one hot encoded
        """
        return -np.mean(np.log(S)*target)

    def softmax(self, Z):
        """Softmax function: normalizing logit scores
        :param Z([num_documents, num_labels])
        :return e^Z/sum_{i=0}^{k}{e^{Z}}
        """
        return np.exp(Z - np.max(Z, axis=1, keepdims=True))/np.sum(np.exp(Z - np.max(Z, axis=1, keepdims=True)), axis=1, keepdims=True)
    
    def OneHot(self, targets, num_labels):
        """Convert arrary of targets to One Hot 
        :param targets([num_documents,])
        :param num_labels(int)
        :return Y[num_documents, num_labels]
        """
        Y_onehot = np.zeros((len(targets), num_labels))
        Y_onehot[np.arange(len(targets)), targets] = 1
        return Y_onehot
    
    def predict(self, X):
        """Return prediction of X with the categorical values]
        """
        # z[num_documents, num_labels] = X[num_documents, num_features]*W[num_features, num_labels] + bias[num_labels]
        A = np.dot(X, self.weights_1) + self.bias_1
        h = self.relu_function(A)

        A_2 = np.dot(h, self.weights_2) + self.bias_2

        O = self.softmax(A_2)

        # Rows with highest probability
        S_max = np.argmax(O, axis=1)

        return S_max
    
    def convert_to_embeddings(self, sentence):
        '''Convert sentence to embeddings
        '''
        emb = self.features_ff_class.get_features(sentence)
            # try:
        if emb: # if there is a word
            emb_concat = np.concatenate(emb, axis=0)
        else:
            emb_concat = []
        # If you need padding words (i.e., your input is too short), use a vector of zeroes
        if len(emb) < self.max_seq_length:
            # Missing words
            words_missing = self.max_seq_length - len(emb)
            # print(words_missing)
            emb_concat = np.pad(emb_concat, (0, words_missing*self.embedding_dim), 'constant')
        return emb_concat

    
    def train(self, input_file, verbose=False):

        # Read dataset and create vocabulary
        features_ff_class = Features_FeedForward(input_file, self.embeddingfile, threshold=self.threshold, max_features=self.max_features)
        self.features_ff_class = features_ff_class
        num_labels = len(features_ff_class.labelset)

        # Convert Y from categorical to integers values
        Y_mapping = {label: index for index, label in enumerate(np.unique(features_ff_class.labels))}
        self.Y_to_categorical = {index: label for label, index in Y_mapping.items()} # dictionary to convert back y's to categorical
        Y = [Y_mapping[y] for y in features_ff_class.labels]
        # Convert to OneHot for computing Loss
        Y_onehot = self.OneHot(Y, num_labels)

        # Get embedding dim
        self.embedding_dim = list(features_ff_class.embedding_matrix.values())[0].shape[0]

        # Number of sentences
        sample_size = len(features_ff_class.tokenized_text)

        # X_train: shape: 50f or 300f-dim × features (u)
        n_inputs = self.max_seq_length*self.embedding_dim # number of features
        X_train = np.zeros((sample_size, n_inputs))

        print("Computing TFIDF")
        if self.tfidf: # Truncate input to the max sequence length sorted by TF-IDF
            tf_idf = features_ff_class.tf_idf(features_ff_class.tokenized_text)
            trunc_tokenized_text = features_ff_class.sort_by_tfidf(
                tf_idf,
                self.max_seq_length
            )
        else:
            # Truncate input to the max sequence length
            trunc_tokenized_text = features_ff_class.adjust_max_seq_length(
                features_ff_class.tokenized_text,
                self.max_seq_length
            )
        print("Computing Embedding")
        # Convert to embeddings with zero-padding
        for i, sentence in enumerate(trunc_tokenized_text):
            sentence_emb = self.convert_to_embeddings(sentence)
            X_train[i] = sentence_emb

        minibatch_size = self.minibatch_size

        # Initialize Wieghts
        # Create W_a and b_a
        # W_a[n_documents, hidden_units (u)]
        # b_a[hidden_units (u)]
        W_1, b_1 = self.initialize_weights(n_inputs, self.hidden_units)
        # Create Wb and b_b
        # W_b[hidden_units (u), num_labels (d)]
        # b_b[num_labels]
        W_2, b_2 = self.initialize_weights(self.hidden_units, num_labels)

        # Permutate the dataset to increase randomness
        np.random.seed(0)
        permutation = np.random.permutation(sample_size)
        # X_train[n_documents, n_features]
        X_permutation = X_train[permutation]
        Y_permutation_onehot = Y_onehot[permutation]

        self.weights_1 = W_1
        self.bias_1 = b_1
        self.weights_2 = W_2
        self.bias_2 = b_2
        for i in range(self.epochs):
            # Mini-batch_size Implementation
            mini_batch_loss = []
            for j in range(0, sample_size, minibatch_size):
                X_mini_batch = X_permutation[j:j+minibatch_size]
                y_mini_batch = Y_permutation_onehot[j:j+minibatch_size]

                ##########################################################
                # ---------------------FORWARD PASS--------------------- #
                ##########################################################
            
                # ---------------- Input-to-Hidden Layer --------------- #
                # Z1 = W_a*X + b_a
                # Z1[n_documents, hidden_units (u)]
                Z_1 = np.dot(X_mini_batch, self.weights_1) + self.bias_1
                # Hidden Unit
                # h = relu(A)
                # h[n_documents, hidden_units (u)]
                A_1 = self.relu_function(Z_1)

                # ---------------- Hidden-to-Output Layer --------------- #
                #  = W_b*h + b_b
                # A_2[n_documents, num_labels (d)]
                Z_2 = np.dot(A_1, self.weights_2) + self.bias_2
                # Output Layer
                # A_2 = softmax(Z_2)
                # A_2[n_documents, num_labels (d)]
                A_2 = self.softmax(Z_2)
                # print(A_2)

                ##########################################################
                # -------------------BACKWARD PASS---------------------- #
                ##########################################################

                # Compute Gradients

                dZ_2 = A_2 - y_mini_batch # [n_documents, num_labels (d)]
                # np.dot(A_2, dZ_2) => (hidden_units, n_documents) X (n_documents, num_labels) = (hidden_units, num_labels)
                dW_2 = (1/minibatch_size)*np.dot(A_1.T, dZ_2)
                db_2 = (1/minibatch_size)*np.sum(dZ_2, axis=0, keepdims = True) # [num_labels]
                # np.dot(self.weights_b, dZ_2) => [n_documents, num_labels (d)] X [num_labels (d), hidden_units (u)] => [n_documents, hidden_units]
                dZ_1 = np.dot(dZ_2, self.weights_2.T)*self.relu_derivative(Z_1)
                # np.dot(X, dZ_1) => (features, n_documents) X (n_documents, hidden_units) = (hidden_units, num_labels)
                dW_1 = (1/minibatch_size)*np.dot(X_mini_batch.T, dZ_1)
                db_1 = (1/minibatch_size)*np.sum(dZ_1, axis=0, keepdims = True) # [hidden_units]

                # Update weights
                self.weights_1 = self.weights_1 - self.learning_rate*dW_1
                self.bias_1 = self.bias_1 - self.learning_rate*db_1
                self.weights_2 = self.weights_2 - self.learning_rate*dW_2
                self.bias_2 = self.bias_2 - self.learning_rate*db_2

                ########
                # Loss #
                ########
                mini_batch_loss.append(self.cross_entropy_loss(A_2, y_mini_batch))

            loss = np.mean(mini_batch_loss)
            self.loss[i] = loss
            if verbose:
                print(f"Epoch: {i+1} - Loss: {loss}")

    def classify(self, input_file):
        # Read Input File
        tokenized_text = self.features_ff_class.read_inference_file(input_file)

        if self.tfidf:
            tf_idf_inference = []
            # Get features from inference file
            for sentence in tokenized_text:
                # Transform dataset to TF-IDF space
                # Return features with format (1, size_vocabulary)
                X_sentence = self.features_ff_class.get_features_tfidf(sentence, self.features_ff_class.idf)
                tf_idf_inference.append(X_sentence)
            tf_idf_inference = np.stack(tf_idf_inference)
            trunc_tokenized_text = self.features_ff_class.sort_by_tfidf(
                tf_idf_inference,
                self.max_seq_length
            )
        else:
            # Truncate input to the max sequence length
            trunc_tokenized_text = self.features_ff_class.adjust_max_seq_length(
                tokenized_text,
                self.max_seq_length
            )

        X_test = []
        # Convert to embeddings with zero padding
        for i, sentence in enumerate(trunc_tokenized_text):
            sentence_emb = self.convert_to_embeddings(sentence)
            X_test.append(sentence_emb)
        X_test = np.vstack(X_test)

        # Make Prediction
        y_test = self.predict(X_test)
        preds_label = []
        for y in y_test:
            tmp = self.Y_to_categorical[y]
            preds_label.append(tmp)
        
        return preds_label

In [116]:
############
# Products #
############

train_file = "datasets/products/train.txt"
emb_file = "glove.6B.50d.txt"
pred_file = "datasets/products/val.test"
pred_true_labels = "datasets/products/val.txt"
model_file_name = "products.model"
loss_file = "datasets/products/loss.txt"
nn_model = NeuralModel(emb_file, max_seq_length=15, hidden_units=10, minibatch_size=256, learning_rate=0.2, epochs=500, tfidf=True, max_features=500, threshold=2) # 65%
# nn_model = NeuralModel(emb_file, max_seq_length=10, hidden_units=5, minibatch_size=256, learning_rate=0.1, epochs=500, tfidf=True, max_features=500, threshold=2) # 64%
# nn_model = NeuralModel(emb_file, max_seq_length=10, hidden_units=5, minibatch_size=32, learning_rate=0.1, epochs=100)
# nn_model = NeuralModel(emb_file, max_seq_length=15, hidden_units=10, minibatch_size=256, learning_rate=0.2, epochs=500, tfidf=True, max_features=500, threshold=2)


########
# 4dim #
########

# train_file = "datasets/4dim/train.txt"
# emb_file = "glove.6B.50d.txt"
# pred_file = "datasets/4dim/val.test"
# pred_true_labels = "datasets/4dim/val.txt"
# model_file_name = "4dim.model"
# loss_file = "datasets/4dim/loss.txt"
# nn_model = NeuralModel(emb_file, max_seq_length=15, hidden_units=20, minibatch_size=32, learning_rate=0.05, epochs=100)
# nn_model = NeuralModel(emb_file, max_seq_length=15, hidden_units=20, minibatch_size=128, learning_rate=0.05, epochs=100, tfidf=True, threshold=2, max_features=1000)


#############
# questions #
#############

# train_file = "datasets/questions/train.txt"
# emb_file = "ufvytar.100d.txt"
# pred_file = "datasets/questions/val.test"
# pred_true_labels = "datasets/questions/val.txt"
# model_file_name = "questions.model"
# loss_file = "datasets/questions/loss.txt"
# nn_model = NeuralModel(emb_file, max_seq_length=10, hidden_units=5, minibatch_size=32, learning_rate=0.1, epochs=100)

#########
# odiya #
#########

# train_file = "datasets/odiya/train.txt"
# emb_file = "fasttext.wiki.300d.vec"
# pred_file = "datasets/odiya/val.test"
# pred_true_labels = "datasets/odiya/val.txt"
# model_file_name = "odiya.model"
# loss_file = "datasets/odiya/loss.txt"
# nn_model = NeuralModel(emb_file, max_seq_length=10, hidden_units=5, minibatch_size=32, learning_rate=0.1, epochs=100)

# nn_model = NeuralModel(emb_file, max_seq_length=10, hidden_units=10, minibatch_size=64, learning_rate=0.35, epochs=100)
# nn_model = NeuralModel(emb_file, max_seq_length=20, hidden_units=10, minibatch_size=64, learning_rate=0.1, epochs=1000)
# nn_model = NeuralModel(emb_file, max_seq_length=10, hidden_units=5, minibatch_size=32, learning_rate=0.1, epochs=100)


In [117]:
nn_model.train(train_file, verbose=True)

Computing TFIDF
Computing Embedding
Epoch: 1 - Loss: 1.1995253731079238
Epoch: 2 - Loss: 0.34003656586685055
Epoch: 3 - Loss: 0.33820896163287006
Epoch: 4 - Loss: 0.33728433314037093
Epoch: 5 - Loss: 0.33668541495136567
Epoch: 6 - Loss: 0.33625604639948253
Epoch: 7 - Loss: 0.3358474604084416
Epoch: 8 - Loss: 0.3354807657467681
Epoch: 9 - Loss: 0.33518050778562486
Epoch: 10 - Loss: 0.33489039296914297
Epoch: 11 - Loss: 0.33463438569789705
Epoch: 12 - Loss: 0.33438132736914516
Epoch: 13 - Loss: 0.33414969417971907
Epoch: 14 - Loss: 0.3339378588739833
Epoch: 15 - Loss: 0.3337139467964728
Epoch: 16 - Loss: 0.3335051712049158
Epoch: 17 - Loss: 0.33328295810856307
Epoch: 18 - Loss: 0.33306123448301006
Epoch: 19 - Loss: 0.3328403879552119
Epoch: 20 - Loss: 0.3326107251649328
Epoch: 21 - Loss: 0.3323682654001583
Epoch: 22 - Loss: 0.3321299059778196
Epoch: 23 - Loss: 0.3318693062620282
Epoch: 24 - Loss: 0.3315846307940797
Epoch: 25 - Loss: 0.3312837134646719
Epoch: 26 - Loss: 0.3309808720408219

In [118]:
nn_model.save_model(model_file_name)

In [119]:
model = NeuralModel.load_model(model_file_name)

In [120]:
preds = model.classify(pred_file + ".txt")
# preds, t1, t2 = model.classify(pred_file + ".txt")


In [121]:
## Save the predictions: one label prediction per line
with open(pred_file + ".pred.txt", "w") as file:
    for pred in preds:
        file.write(pred+"\n")

# Evaluation

In [122]:
import pandas as pd

In [123]:
## 4dim

# pred_file = "datasets/4dim/val.test"
# pred_true_labels = "datasets/4dim/val.txt"

# odiya

# pred_file = "datasets/odiya/val.test"
# pred_true_labels = "datasets/odiya/val.txt"

# products
# pred_file = "datasets/products/val.test"
# pred_true_labels = "datasets/products/val.txt"

# questions
# pred_file = "datasets/questions/val.test"
# pred_true_labels = "datasets/questions/val.txt"

In [124]:
true_dataset = pd.read_csv(pred_true_labels, sep='\t', header=None, names=['text', 'true_label'])
pred_dataset = pd.read_csv(pred_file + ".pred.txt", sep='\t', header=None, names=['pred'])

In [125]:
true_dataset['true_label'].value_counts()

pos    3844
neg    2670
Name: true_label, dtype: int64

In [126]:
pred_dataset['pred'].value_counts()

neg    3398
pos    3116
Name: pred, dtype: int64

In [127]:
# Check if the columns have the same name; adjust as needed
column_name = 'true_label'  # Change to the actual column name
pred_column_name = 'pred'  # Change to the actual predicted column name

# Merge the two DataFrames on a common index or key if available
merged_df = true_dataset.merge(pred_dataset, left_index=True, right_index=True)

# Calculate the accuracy by comparing the two columns
accuracy = (merged_df[column_name] == merged_df[pred_column_name]).mean()

# Print the accuracy as a percentage
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 65.21%
