In [37]:
from operator import methodcaller
import string
import re
from collections import Counter, defaultdict
import numpy as np
from itertools import islice
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [38]:
def expand_contradictions(text):

    contraction_mapping = {
        "won't": "will not",
        "can't": "can not",
        "n't": " not",
        "'re": " are",
        "'s": " is",
        "'d": " would",
        "'ll": " will",
        "'ve": " have",
        "'m": " am"
    }

    pattern = re.compile(r"\b(?:" + "|".join(re.escape(contraction) for contraction in contraction_mapping.keys()) + r")\b")
    text = pattern.sub(lambda x: contraction_mapping[x.group()], text)
    
    return text

def remove_digits_and_words_digits(text):
    # Define a regular expression pattern to match words containing digits
    pattern = r'\b\w*\d\w*\b'
    text_without_words_with_digits = re.sub(pattern, '', text)

    return text_without_words_with_digits

def remove_stop_words(text):
    stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll",
                  "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
                  'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 
                  'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
                  'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
                  'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against',
                  'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on',
                  'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
                  'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than',
                  'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've',
                  'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven',
                  "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
                  'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
    text_clean = []
    for word in text:
        if word not in stop_words:
            text_clean.append(word)
    return text_clean


def tokenize(text, split=True):
    # TODO customize to your needs
    text = text.translate(str.maketrans({key: " {0} ".format(key) for key in string.punctuation}))
    # re.sub('[^a-zA-Z]', '', dataset['Text'][i])

    # Text preprocessing techniques:
    # 1) Lowercase
    text = text.lower()

    # 2) Expand Contradictions
    text = expand_contradictions(text)

    # 3) Remove punctuations
    text = re.sub('[%s]' % re.escape(string.punctuation), '' , text)

    # 4) Remove digits and words with digits
    text = remove_digits_and_words_digits(text)
    if split:
        text = text.split()

    # # 5) Remove Stop Words
    # if stop_words:
    # text = remove_stop_words(text)

    return text

class Features:

    def __init__(self, data_file):
        with open(data_file) as file:
            data = file.read().splitlines()

        data_split = map(methodcaller("rsplit", "\t", 1), data)
        texts, self.labels = map(list, zip(*data_split))

        self.tokenized_text = [tokenize(text) for text in texts]

        self.labelset = list(set(self.labels))

    @classmethod 
    def get_features(cls, tokenized, model):
        # TODO: implement this method by implementing different classes for different features 
        # Hint: try simple general lexical features first before moving to more resource intensive or dataset specific features 
        pass

In [40]:
class Features_FeedForward(Features):

    def __init__(self, input_file, embedding_file):
        super(Features_FeedForward, self).__init__(input_file)
        self.embedding_matrix = self.read_embedding_file(embedding_file) # Need to save EmbeddingMatrix values for inference

    def adjust_max_seq_length(self, tokenized_text, max_seq_length):
        """Adjust size of data input to the max sequence length
        :param tokenized_text: data input
        :param max_seq_length: the max sequence length
        :return list: truncated sentences
        """
        new_tokenized_text = []
        for sentence in tokenized_text:
            new_tokenized_text.append(sentence[:max_seq_length])
        return new_tokenized_text

        
    def read_embedding_file(self, embedding_file):
        '''Read embedding file

        :param embedding_file (str):
        :return: dict: embedding matrix
        '''

        embedding_matrix = dict()
        try: 
            with open(embedding_file, "r") as file:
                for line in file:
                    values = line.strip().split()
                    word = values[0]
                    word_embedding = np.array([float(emb) for emb in values[1:]])
                    embedding_matrix[word] = word_embedding
            return embedding_matrix
        except OSError as e:
            print("Embedding file " + embedding_file + " is not available, please input the right parth to the file.")

    def read_inference_file(self, input_file):
        """Read inference file that is in the form: <text> i.e. a line
        of text that does not contain a tab.
        """
        with open(input_file) as file:
            data = file.read().splitlines()

        texts = data

        tokenized_text = [tokenize(text) for text in texts]
        return tokenized_text
    
    def get_features(self, tokenized_sentence):
        """Convert sentence to word embeeding values.
        :param tokenized_sentence
        :return feature weights
        """
        sentence_embedding = []
        
        for word in tokenized_sentence:
            # get embedding of word if exists
            try:
                word_emb = self.embedding_matrix[word]
            except: # read UNK token embedding 
                word_emb = self.embedding_matrix["UNK"]
            sentence_embedding.append(word_emb)
        
        return sentence_embedding

In [41]:
features_class = Features_FeedForward("datasets/questions.train.txt", "glove.6B.50d.txt")

In [151]:
len(features_class.embedding_matrix['UNK'])

50

In [95]:
from model import Model
from nn_layers import FeedForwardNetwork

class NeuralModel(Model):
    def __init__(self, embeddingfile, max_seq_length, hidden_units, minibatch_size, epochs): 
        # self.network = FeedForwardNetwork()
        self.embeddingfile = embeddingfile
        self.embedding_dim = None
        self.max_seq_length = max_seq_length
        self.hidden_units = hidden_units
        self.weights_a = None
        self.bias_a = None
        self.weights_b = None
        self.bias_b = None
        self.Y_to_categorical = None
        self.minibatch_size = minibatch_size
        self.epochs = epochs
        self.features_ff_class = None
    
    def initialize_weights(self, n_inputs, n_output):
        # weights = np.zeros((n_inputs, n_output))
        # bias = np.zeros(n_output)
        # np.random.seed(0)
        weights = np.random.rand(n_inputs, n_output)
        bias = np.random.rand(n_output)
        return weights, bias
    
    def relu_function(self, A):
        '''A = x*W + b

        :return: Z = relut(x*A+b)
        '''
        return np.maximum(0, A)

    def cross_entropy_loss(self, S, target):
        """Calculate the cross-entropy
        L = -1/n*_sum_{i=0}^{n}{y_i*log(s_i)} 
        y label is a vector containing K classes where yc = 1 if c is the correct class and the remaining elements will be 0.

        :param S[num_documents, num_labels]: probabilities of features after softmax
        :target [num_documents, num_labels]: target one hot encoded
        """
        return -np.mean(np.log(S)*target)

    def softmax(self, Z):
        """Softmax function: normalizing logit scores
        :param Z([num_documents, num_labels])
        :return e^Z/sum_{i=0}^{k}{e^{Z}}
        """
        return np.exp(Z - np.max(Z, axis=1, keepdims=True))/np.sum(np.exp(Z - np.max(Z, axis=1, keepdims=True)), axis=1, keepdims=True)
    
    def OneHot(self, targets, num_labels):
        """Convert arrary of targets to One Hot 
        :param targets([num_documents,])
        :param num_labels(int)
        :return Y[num_documents, num_labels]
        """
        Y_onehot = np.zeros((len(targets), num_labels))
        Y_onehot[np.arange(len(targets)), targets] = 1
        return Y_onehot
    
    def predict(self, X):
        """Return prediction of X with the categorical values]
        """
        # z[num_documents, num_labels] = X[num_documents, num_features]*W[num_features, num_labels] + bias[num_labels]
        A = np.dot(X, self.weights_a) + self.bias_a
        h = self.relu_function(A)

        A_2 = np.dot(h, self.weights_b) + self.bias_b

        O = self.softmax(A_2)

        # Rows with highest probability
        S_max = np.argmax(O, axis=1)

        return S_max

    
    def train(self, input_file, verbose=False):

        # Read dataset and create vocabulary
        features_ff_class = Features_FeedForward(input_file, self.embeddingfile)
        self.features_ff_class = features_ff_class
        num_labels = len(features_ff_class.labelset)

        # Convert Y from categorical to integers values
        Y_mapping = {label: index for index, label in enumerate(np.unique(features_ff_class.labels))}
        self.Y_to_categorical = {index: label for label, index in Y_mapping.items()} # dictionary to convert back y's to categorical
        Y = [Y_mapping[y] for y in features_ff_class.labels]
        # Convert to OneHot for computing Loss
        Y_onehot = self.OneHot(Y, num_labels)

        # Get embedding dim
        self.embedding_dim = list(features_ff_class.embedding_matrix.values())[0].shape[0]

        # Number of sentences
        sample_size = len(features_ff_class.tokenized_text)

        # X_train: shape: 50f or 300f-dim × u
        n_inputs = self.max_seq_length*self.embedding_dim # number of features
        X_train = np.zeros((sample_size, n_inputs))

        # Truncate input to the max sequence length
        trunc_tokenized_text = features_ff_class.adjust_max_seq_length(
            features_ff_class.tokenized_text,
            self.max_seq_length
        )

        # Convert to embeddings
        for i, sentence in enumerate(trunc_tokenized_text):
            emb = features_ff_class.get_features(sentence)
            emb_concat = np.concatenate(emb, axis=0)
            # If you need padding words (i.e., your input is too short), use a vector of zeroes
            if len(emb) < self.max_seq_length:
                # Missing words
                words_missing = self.max_seq_length - len(emb)
                # print(words_missing)
                emb_concat = np.pad(emb_concat, (0, words_missing*self.embedding_dim), 'constant')
                # print(emb_concat.shape)

            X_train[i] = emb_concat

        minibatch_size = self.minibatch_size

        # Initialize Wieghts
        # Create W_a and b_a
        # W_a[n_documents, hidden_units (u)]
        # b_a[hidden_units (u)]
        W_a, b_a = self.initialize_weights(n_inputs, self.hidden_units)
        # Create Wb and b_b
        # W_b[hidden_units (u), num_labels (d)]
        # b_b[num_labels]
        W_b, b_b = self.initialize_weights(self.hidden_units, num_labels)

        # Permutate the dataset to increase randomness
        np.random.seed(0)
        permutation = np.random.permutation(sample_size)
        # X_train[n_documents, n_features]
        X_permutation = X_train[permutation]
        Y_permutation_onehot = Y_onehot[permutation]

        self.weights_a = W_a
        self.bias_a = b_a
        self.weights_b = W_b
        self.bias_b = b_b
        for i in range(self.epochs):
            # Mini-batch_size Implementation
            mini_batch_loss = []
            for j in range(0, sample_size, minibatch_size):
                X_mini_batch = X_permutation[j:j+minibatch_size]
                y_mini_batch = Y_permutation_onehot[j:j+minibatch_size]

                ##########################################################
                # ---------------------FORWARD PASS--------------------- #
                ##########################################################
            
                # ---------------- Input-to-Hidden Layer --------------- #
                # A = W_a*X + b_a
                # A[n_documents, hidden_units (u)]
                A = np.dot(X_mini_batch, self.weights_a) + self.bias_a
                # Hidden Unit
                # h = relu(A)
                # h[n_documents, hidden_units (u)]
                h = self.relu_function(A)

                # ---------------- Hidden-to-Output Layer --------------- #
                # A_2 = W_b*h + b_b
                # A_2[n_documents, num_labels (d)]
                A_2 = np.dot(h, self.weights_b) + self.bias_b
                # Output Layer
                # O = softmax(A_2)
                # O[n_documents, num_labels (d)]
                O = self.softmax(A_2)

                ##########################################################
                # -------------------BACKWARD PASS---------------------- #
                ##########################################################

                ########
                # Loss #
                ########
                mini_batch_loss.append(self.cross_entropy_loss(O, y_mini_batch))

            loss = np.mean(mini_batch_loss)
            if verbose:
                print(f"Epoch: {i+1} - Loss: {loss}")

        # model = {
        #     "feature_weights": {
        #         "weights_a": self.weights_a,
        #         "bias_a": self.bias_a,
        #         "weights_b": self.weights_b,
        #         "bias_b": self.bias_b,
        #         "Y_to_categorical": self.Y_to_categorical
        #     },
        #     "Feature": features_ff_class
        # }

        # ## Save the model
        # self.save_model(model)
        # return model 

    def classify(self, input_file):
        # Read Input File
        tokenized_text = self.features_ff_class.read_inference_file(input_file)

        # Truncate input to the max sequence length
        trunc_tokenized_text = self.features_ff_class.adjust_max_seq_length(
            tokenized_text,
            self.max_seq_length
        )

        X_test = []
        # Convert to embeddings
        for i, sentence in enumerate(trunc_tokenized_text):
            emb = self.features_ff_class.get_features(sentence)
            emb_concat = np.concatenate(emb, axis=0)
            # If you need padding words (i.e., your input is too short), use a vector of zeroes
            if len(emb) < self.max_seq_length:
                # Missing words
                words_missing = self.max_seq_length - len(emb)
                # print(words_missing)
                emb_concat = np.pad(emb_concat, (0, words_missing*self.embedding_dim), 'constant')
                # print(emb_concat.shape)

            X_test.append(emb_concat)
        X_test = np.vstack(X_test)


In [96]:
nn_model = NeuralModel("glove.6B.50d.txt", 10, 5, 32, 10)
# embeddingfile, max_seq_length, hidden_units): 

In [97]:
nn_model.train("datasets/4dim.train.txt", verbose=True)

Epoch: 1 - Loss: 2.0132703239839955
Epoch: 2 - Loss: 2.0132703239839955
Epoch: 3 - Loss: 2.0132703239839955
Epoch: 4 - Loss: 2.0132703239839955
Epoch: 5 - Loss: 2.0132703239839955
Epoch: 6 - Loss: 2.0132703239839955
Epoch: 7 - Loss: 2.0132703239839955
Epoch: 8 - Loss: 2.0132703239839955
Epoch: 9 - Loss: 2.0132703239839955
Epoch: 10 - Loss: 2.0132703239839955


In [98]:
model_name = "nn.4dim.model"
nn_model.save_model(model_name)

In [99]:
model = NeuralModel.load_model(model_name)

In [100]:
input_file = "datasets/4dim/val.test.txt"

# Read Input File
tokenized_text = model.features_ff_class.read_inference_file(input_file)

In [101]:
tokenized_text

[['the',
  'hyatt',
  'regency',
  'chicago',
  'hotel',
  'is',
  'perfecty',
  'located',
  'in',
  'the',
  'center',
  'of',
  'downtown',
  'chicago',
  'whether',
  'you',
  'are',
  'going',
  'there',
  'for',
  'business',
  'or',
  'pleasure',
  'it',
  'is',
  'in',
  'the',
  'perfect',
  'place',
  'the',
  'rooms',
  'are',
  'large',
  'and',
  'beautiful',
  'and',
  'the',
  'ball',
  'room',
  'took',
  'my',
  'breath',
  'away',
  'the',
  'wi',
  'fi',
  'connection',
  'was',
  'perfect',
  'for',
  'the',
  'work',
  'i',
  'needed',
  'to',
  'do',
  'and',
  'the',
  'show',
  'at',
  'the',
  'navy',
  'pier',
  'was',
  'perfect',
  'for',
  'when',
  'i',
  'needed',
  'a',
  'break',
  'other',
  'hotels',
  'have',
  'nothing',
  'on',
  'the',
  'hyatt',
  'i',
  'just',
  'wish',
  'there',
  'was',
  'a',
  'hyatt',
  'regency',
  'in',
  'every',
  'city',
  'for',
  'all',
  'of',
  'my',
  'business',
  'trips'],
 ['recently',
  'returned',
  'from',

In [167]:
    

    preds = model.classify(args.i)

(1560, 100)