In [53]:
from operator import methodcaller
import string
import re
from collections import Counter, defaultdict
import numpy as np
from itertools import islice
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import torch
import torch.optim as optim
import torch.nn as nn

In [3]:
def expand_contradictions(text):

    contraction_mapping = {
        "won't": "will not",
        "can't": "can not",
        "n't": " not",
        "'re": " are",
        "'s": " is",
        "'d": " would",
        "'ll": " will",
        "'ve": " have",
        "'m": " am"
    }

    pattern = re.compile(r"\b(?:" + "|".join(re.escape(contraction) for contraction in contraction_mapping.keys()) + r")\b")
    text = pattern.sub(lambda x: contraction_mapping[x.group()], text)
    
    return text

def remove_digits_and_words_digits(text):
    # Define a regular expression pattern to match words containing digits
    pattern = r'\b\w*\d\w*\b'
    text_without_words_with_digits = re.sub(pattern, '', text)

    return text_without_words_with_digits

def remove_stop_words(text):
    stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll",
                  "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
                  'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 
                  'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
                  'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
                  'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against',
                  'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on',
                  'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
                  'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than',
                  'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've',
                  'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven',
                  "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
                  'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
    text_clean = []
    for word in text:
        if word not in stop_words:
            text_clean.append(word)
    return text_clean


def tokenize(text, split=True):
    # TODO customize to your needs
    text = text.translate(str.maketrans({key: " {0} ".format(key) for key in string.punctuation}))
    # re.sub('[^a-zA-Z]', '', dataset['Text'][i])

    # Text preprocessing techniques:
    # 1) Lowercase
    text = text.lower()

    # 2) Expand Contradictions
    text = expand_contradictions(text)

    # 3) Remove punctuations
    text = re.sub('[%s]' % re.escape(string.punctuation), '' , text)

    # 4) Remove digits and words with digits
    text = remove_digits_and_words_digits(text)
    if split:
        text = text.split()

    # # 5) Remove Stop Words
    # if stop_words:
    # text = remove_stop_words(text)

    return text

class Features:

    def __init__(self, data_file):
        with open(data_file) as file:
            data = file.read().splitlines()

        data_split = map(methodcaller("rsplit", "\t", 1), data)
        texts, self.labels = map(list, zip(*data_split))

        self.tokenized_text = [tokenize(text) for text in texts]

        self.labelset = list(set(self.labels))

    @classmethod 
    def get_features(cls, tokenized, model):
        # TODO: implement this method by implementing different classes for different features 
        # Hint: try simple general lexical features first before moving to more resource intensive or dataset specific features 
        pass


class Features_FeedForward(Features):

    def __init__(self, input_file, embedding_file):
        super(Features_FeedForward, self).__init__(input_file)
        self.embedding_matrix = self.read_embedding_file(embedding_file) # Need to save EmbeddingMatrix values for inference

    def adjust_max_seq_length(self, tokenized_text, max_seq_length):
        """Adjust size of data input to the max sequence length
        :param tokenized_text: data input
        :param max_seq_length: the max sequence length
        :return list: truncated sentences
        """
        new_tokenized_text = []
        for sentence in tokenized_text:
            new_tokenized_text.append(sentence[:max_seq_length])
        return new_tokenized_text

        
    def read_embedding_file(self, embedding_file):
        '''Read embedding file

        :param embedding_file (str):
        :return: dict: embedding matrix
        '''

        embedding_matrix = dict()
        try: 
            with open(embedding_file, "r") as file:
                for line in file:
                    values = line.strip().split()
                    word = values[0]
                    word_embedding = np.array([float(emb) for emb in values[1:]])
                    embedding_matrix[word] = word_embedding
            return embedding_matrix
        except OSError as e:
            print("Embedding file " + embedding_file + " is not available, please input the right parth to the file.")

    def read_inference_file(self, input_file):
        """Read inference file that is in the form: <text> i.e. a line
        of text that does not contain a tab.
        """
        with open(input_file) as file:
            data = file.read().splitlines()

        texts = data

        tokenized_text = [tokenize(text) for text in texts]
        return tokenized_text
    
    def get_features(self, tokenized_sentence):
        """Convert sentence to word embeeding values.
        :param tokenized_sentence
        :return feature weights
        """
        sentence_embedding = []
        
        for word in tokenized_sentence:
            # get embedding of word if exists
            try:
                word_emb = self.embedding_matrix[word]
            except: # read UNK token embedding 
                word_emb = self.embedding_matrix["UNK"]
            sentence_embedding.append(word_emb)
        
        return sentence_embedding

In [30]:
class NeuralNetworkTorch(nn.Module):
    def __init__(self, input_dim, hidden_units, n_labels):
        super(NeuralNetworkTorch, self).__init__()
        self.weights_1 = nn.Linear(input_dim, hidden_units)
        self.relu = nn.ReLU()
        self.weights_2 = nn.Linear(hidden_units, n_labels)
        self.softmax = nn.Softmax(dim=1)  # Apply softmax along the output dimension

    def forward(self, x):
        # ---------------- Input-to-Hidden Layer --------------- #
        x = self.weights_1(x)
        x = self.relu(x)
        # ---------------- Hidden-to-Output Layer --------------- #
        x = self.weights_2(x)
        x = self.softmax(x)
        return x

In [116]:
from model import Model

class NeuralModel_Torch(Model):
    def __init__(self, embeddingfile, max_seq_length, hidden_units, minibatch_size, learning_rate, epochs): 
        self.embeddingfile = embeddingfile
        self.embedding_dim = None
        self.max_seq_length = max_seq_length
        self.hidden_units = hidden_units
        # Layers
        self.model_torch = None
        self.Y_to_categorical = None
        self.criterion = nn.CrossEntropyLoss()
        self.minibatch_size = minibatch_size
        self.epochs = epochs
        self.features_ff_class = None
        self.learning_rate = learning_rate
        self.loss = {}
        
    def convert_to_embeddings(self, sentence):
        '''Convert sentence to embeddings
        '''
        emb = self.features_ff_class.get_features(sentence)
            # try:
        if emb: # if there is a word
            emb_concat = np.concatenate(emb, axis=0)
        else:
            emb_concat = []
        # If you need padding words (i.e., your input is too short), use a vector of zeroes
        if len(emb) < self.max_seq_length:
            # Missing words
            words_missing = self.max_seq_length - len(emb)
            # print(words_missing)
            emb_concat = np.pad(emb_concat, (0, words_missing*self.embedding_dim), 'constant')
        return emb_concat

    
    def train(self, input_file, verbose=False):

        # Read dataset and create vocabulary
        features_ff_class = Features_FeedForward(input_file, self.embeddingfile)
        self.features_ff_class = features_ff_class
        num_labels = len(features_ff_class.labelset)

        # Convert Y from categorical to integers values
        Y_mapping = {label: index for index, label in enumerate(np.unique(features_ff_class.labels))}
        self.Y_to_categorical = {index: label for label, index in Y_mapping.items()} # dictionary to convert back y's to categorical
        Y = [Y_mapping[y] for y in features_ff_class.labels]
        Y = np.array(Y)
        # Convert to OneHot for computing Loss
        # Y_onehot = self.OneHot(Y, num_labels)

        # Get embedding dim
        self.embedding_dim = list(features_ff_class.embedding_matrix.values())[0].shape[0]

        # Number of sentences
        sample_size = len(features_ff_class.tokenized_text)

        # X_train: shape: 50f or 300f-dim × features (u)
        n_inputs = self.max_seq_length*self.embedding_dim # number of features
        X_train = np.zeros((sample_size, n_inputs))

        # Truncate input to the max sequence length
        trunc_tokenized_text = features_ff_class.adjust_max_seq_length(
            features_ff_class.tokenized_text,
            self.max_seq_length
        )

        # Convert to embeddings with zero-padding
        for i, sentence in enumerate(trunc_tokenized_text):
            sentence_emb = self.convert_to_embeddings(sentence)
            X_train[i] = sentence_emb

        minibatch_size = self.minibatch_size

        # Initialize Torch Model
        self.model_torch = NeuralNetworkTorch(n_inputs, self.hidden_units, num_labels)
        # Optimzer
        optimizer = optim.SGD(self.model_torch.parameters(), lr=self.learning_rate)


        #################
        # Torch Tensors #
        #################
        # Permutate the dataset to increase randomness
        np.random.seed(0)
        permutation = np.random.permutation(sample_size)
        # X_train[n_documents, n_features]
        X_permutation = X_train[permutation]
        Y_permutation = Y[permutation]


        # Torch Tensors
        X_permutation = torch.tensor(X_permutation, dtype=torch.float32)
        Y_permutation = torch.tensor(Y_permutation)

        for i in range(self.epochs):
            # Mini-batch_size Implementation
            mini_batch_loss = []
            for j in range(0, sample_size, minibatch_size):
                X_mini_batch = X_permutation[j:j+minibatch_size]
                y_mini_batch = Y_permutation[j:j+minibatch_size]

                ##########################################################
                # ---------------------FORWARD PASS--------------------- #
                ##########################################################
                outputs = self.model_torch(X_mini_batch)

                loss = self.criterion(outputs, y_mini_batch)

                ##########################################################
                # -------------------BACKWARD PASS---------------------- #
                ##########################################################
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                mini_batch_loss.append(loss.item())

            
            self.loss[i] = np.mean(mini_batch_loss)
            if verbose:
                print(f"Epoch: {i+1} - Loss: {self.loss[i]}")

    def classify(self, input_file):
        # Read Input File
        tokenized_text = self.features_ff_class.read_inference_file(input_file)

        # Truncate input to the max sequence length
        trunc_tokenized_text = self.features_ff_class.adjust_max_seq_length(
            tokenized_text,
            self.max_seq_length
        )

        X_test = []
        # Convert to embeddings with zero padding
        for i, sentence in enumerate(trunc_tokenized_text):
            sentence_emb = self.convert_to_embeddings(sentence)
            X_test.append(sentence_emb)
        X_test = np.vstack(X_test)

        # Convert to tensor
        X_test = torch.tensor(X_test, dtype=torch.float32)

        # Make Prediction
        preds_label = []
        with torch.no_grad():
            predicted = self.model_torch(X_test)
            _, y_test = torch.max(predicted, 1)
            for y in y_test:
                tmp = self.Y_to_categorical[y.item()] # Convert to original class
                preds_label.append(tmp)
        
        return preds_label


In [156]:
############
# Products #
############

train_file = "datasets/products/train.txt"
emb_file = "glove.6B.50d.txt"
pred_file = "datasets/products/val.test"
pred_true_labels = "datasets/products/val.txt"
model_file_name = "torch.products.model"
loss_file = "datasets/products/loss.txt"

nn_model = NeuralModel_Torch(emb_file, max_seq_length=10, hidden_units=5, minibatch_size=32, learning_rate=0.1, epochs=100)


########
# 4dim #
########

# train_file = "datasets/4dim/train.txt"
# emb_file = "glove.6B.50d.txt"
# pred_file = "datasets/4dim/val.test"
# pred_true_labels = "datasets/4dim/val.txt"
# model_file_name = "torch.4dim.model"
# loss_file = "datasets/4dim/loss.txt"
# nn_model = NeuralModel_Torch(emb_file, max_seq_length=10, hidden_units=5, minibatch_size=32, learning_rate=0.1, epochs=100)


#############
# questions #
#############

# train_file = "datasets/questions/train.txt"
# emb_file = "ufvytar.100d.txt"
# pred_file = "datasets/questions/val.test"
# pred_true_labels = "datasets/questions/val.txt"
# model_file_name = "torch.questions.model"
# loss_file = "datasets/questions/loss.txt"
# nn_model = NeuralModel_Torch(emb_file, max_seq_length=10, hidden_units=5, minibatch_size=32, learning_rate=0.1, epochs=100)

#########
# odiya #
#########

# train_file = "datasets/odiya/train.txt"
# emb_file = "fasttext.wiki.300d.vec"
# pred_file = "datasets/odiya/val.test"
# pred_true_labels = "datasets/odiya/val.txt"
# model_file_name = "torch.odiya.model"
# loss_file = "datasets/odiya/loss.txt"
# nn_model = NeuralModel_Torch(emb_file, max_seq_length=10, hidden_units=5, minibatch_size=32, learning_rate=0.1, epochs=100)

# nn_model = NeuralModel(emb_file, max_seq_length=10, hidden_units=10, minibatch_size=64, learning_rate=0.35, epochs=100)
# nn_model = NeuralModel(emb_file, max_seq_length=20, hidden_units=10, minibatch_size=64, learning_rate=0.1, epochs=1000)
# nn_model = NeuralModel(emb_file, max_seq_length=10, hidden_units=5, minibatch_size=32, learning_rate=0.1, epochs=100)


In [158]:
nn_model.train(train_file, verbose=True)

Epoch: 1 - Loss: 0.6493482973677981
Epoch: 2 - Loss: 0.6323923690918765
Epoch: 3 - Loss: 0.626327877871098
Epoch: 4 - Loss: 0.6226238647121594
Epoch: 5 - Loss: 0.6191641749048525
Epoch: 6 - Loss: 0.6162016637851856
Epoch: 7 - Loss: 0.6136381490830264
Epoch: 8 - Loss: 0.6112433095285498
Epoch: 9 - Loss: 0.6092542092858648
Epoch: 10 - Loss: 0.6071236630158922
Epoch: 11 - Loss: 0.6047104695457622
Epoch: 12 - Loss: 0.6018602465193695
Epoch: 13 - Loss: 0.599180631988619
Epoch: 14 - Loss: 0.5976826925950548
Epoch: 15 - Loss: 0.5960224956091196
Epoch: 16 - Loss: 0.5937833718361298
Epoch: 17 - Loss: 0.5920294069439356
Epoch: 18 - Loss: 0.5904260096008792
Epoch: 19 - Loss: 0.5886665507328291
Epoch: 20 - Loss: 0.5871475106733708
Epoch: 21 - Loss: 0.5866939762618644
Epoch: 22 - Loss: 0.5847412821339684
Epoch: 23 - Loss: 0.5828933919134316
Epoch: 24 - Loss: 0.5824078328404689
Epoch: 25 - Loss: 0.5808292337356169
Epoch: 26 - Loss: 0.5800598656107312
Epoch: 27 - Loss: 0.5784858658635543
Epoch: 28 - 

In [159]:
nn_model.save_model(model_file_name)

In [160]:
model = NeuralModel_Torch.load_model(model_file_name)

In [161]:
preds = model.classify(pred_file + ".txt")

In [162]:
## Save the predictions: one label prediction per line
with open(pred_file + ".pred.txt", "w") as file:
    for pred in preds:
        file.write(pred+"\n")

# Evaluation

In [163]:
import pandas as pd

In [164]:
# products
pred_file = "datasets/products/val.test"
pred_true_labels = "datasets/products/val.txt"

# 4dim

# pred_file = "datasets/4dim/val.test"
# pred_true_labels = "datasets/4dim/val.txt"


# questions
# pred_file = "datasets/questions/val.test"
# pred_true_labels = "datasets/questions/val.txt"

# odiya

# pred_file = "datasets/odiya/val.test"
# pred_true_labels = "datasets/odiya/val.txt"

In [165]:
true_dataset = pd.read_csv(pred_true_labels, sep='\t', header=None, names=['text', 'true_label'])
pred_dataset = pd.read_csv(pred_file + ".pred.txt", sep='\t', header=None, names=['pred'])

In [166]:
# Check if the columns have the same name; adjust as needed
column_name = 'true_label'  # Change to the actual column name
pred_column_name = 'pred'  # Change to the actual predicted column name

# Merge the two DataFrames on a common index or key if available
merged_df = true_dataset.merge(pred_dataset, left_index=True, right_index=True)

# Calculate the accuracy by comparing the two columns
accuracy = (merged_df[column_name] == merged_df[pred_column_name]).mean()

# Print the accuracy as a percentage
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 63.34%
