In [10]:
import gensim.downloader as api
from gensim.models import KeyedVectors

In [5]:
wv = api.load('word2vec-google-news-300');

In [8]:
wv.save("word2vec.wordvectors")

In [11]:
model_embeddings = KeyedVectors.load("word2vec.wordvectors")

In [21]:
model_embeddings["computer"].shape

(300,)

In [15]:
""" 
    Basic feature extractor
"""
from operator import methodcaller
import string
from collections import Counter, defaultdict
import numpy as np
import re
from itertools import islice

def expand_contradictions(text):

    contraction_mapping = {
        "won't": "will not",
        "can't": "can not",
        "n't": " not",
        "'re": " are",
        "'s": " is",
        "'d": " would",
        "'ll": " will",
        "'ve": " have",
        "'m": " am"
    }

    pattern = re.compile(r"\b(?:" + "|".join(re.escape(contraction) for contraction in contraction_mapping.keys()) + r")\b")
    text = pattern.sub(lambda x: contraction_mapping[x.group()], text)
    
    return text

def remove_digits_and_words_digits(text):
    # Define a regular expression pattern to match words containing digits
    pattern = r'\b\w*\d\w*\b'
    text_without_words_with_digits = re.sub(pattern, '', text)

    return text_without_words_with_digits

def remove_stop_words(text):
    stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll",
                  "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
                  'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 
                  'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
                  'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
                  'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against',
                  'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on',
                  'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
                  'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than',
                  'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've',
                  'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven',
                  "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
                  'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
    text_clean = []
    for word in text:
        if word not in stop_words:
            text_clean.append(word)
    return text_clean

def tokenize(text):
    # TODO customize to your needs
    text = text.translate(str.maketrans({key: " {0} ".format(key) for key in string.punctuation}))
    # re.sub('[^a-zA-Z]', '', dataset['Text'][i])

    # Text preprocessing techniques:
    # 1) Lowercase
    text = text.lower()

    # 2) Expand Contradictions
    text = expand_contradictions(text)

    # 3) Remove punctuations
    text = re.sub('[%s]' % re.escape(string.punctuation), '' , text)

    # 4) Remove digits and words with digits
    text = remove_digits_and_words_digits(text)

    text = text.split()

    # 5) Remove Stop Words
    # text = remove_stop_words(text)

    return text

class Features:

    def __init__(self, data_file):
        with open(data_file) as file:
            data = file.read().splitlines()

        data_split = map(methodcaller("rsplit", "\t", 1), data)
        texts, self.labels = map(list, zip(*data_split))

        self.tokenized_text = [tokenize(text) for text in texts]

        self.labelset = list(set(self.labels))

    @classmethod 
    def get_features(cls, tokenized, model):
        # TODO: implement this method by implementing different classes for different features 
        # Hint: try simple general lexical features first before moving to more resource intensive or dataset specific features 
        pass

In [47]:
def get_features_word2vec(tokenized_sentence, embedding_matrix):
    """Convert sentence to Word2Vec Embeeding.
    Each sentence is represented by the average of all of word embeddings 
    of that sentence.
    """
    sentence_embedding = []
    
    for word in tokenized_sentence:
        # get embedding of word if exists
        try:
            word_emb = embedding_matrix[word]
            sentence_embedding.append(word_emb)
        except:
            pass
    
    # Compute average of the sentence
    if len(sentence_embedding) > 0:
        stacked_arrays = np.vstack(sentence_embedding)
        elementwise_average = np.mean(stacked_arrays, axis=0)
    else:
        elementwise_average = np.zeros(embedding_matrix.shape[1])
    return elementwise_average

In [114]:
################################
# Logistic Regression Features #
################################

class Features_LR_Word2Vec(Features):

    def __init__(self, model_file, embedding_matrix):
        super(Features_LR_Word2Vec, self).__init__(model_file)
        self.embedding_matrix = embedding_matrix # Need to save IDF values for inference

    def read_inference_file(self, input_file):
        """Read inference file that is in the form: <text> i.e. a line
        of text that does not contain a tab.
        """
        with open(input_file) as file:
            data = file.read().splitlines()

        texts = data

        tokenized_text = [tokenize(text) for text in texts]
        return tokenized_text
    
    def get_features_word2vec(self, tokenized_sentence):
        """Convert sentence to Word2Vec Embeeding.
        Each sentence is represented by the average of all of word embeddings 
        of that sentence.
        """
        sentence_embedding = []
        
        for word in tokenized_sentence:
            # get embedding of word if exists
            try:
                word_emb = self.embedding_matrix[word]
                sentence_embedding.append(word_emb)
            except: # remove Out-of-Vocabulary words
                pass
        
        # Compute average of the sentence
        if len(sentence_embedding) > 0:
            stacked_arrays = np.vstack(sentence_embedding)
            elementwise_average = np.mean(stacked_arrays, axis=0)
        else:
            elementwise_average = np.zeros(self.embedding_matrix.vector_size)
        return elementwise_average

In [148]:
"""
 Refer to Chapter 5 for more details on how to implement a LogisticRegression
"""
from work.Model import *

class LogisticRegressionWord2Vec(Model):
    def __init__(self, model_file, learning_rate=None, epochs=None, batch_size=None, embedding_matrix=None):
        super(LogisticRegressionWord2Vec, self).__init__(model_file)
        self.weights = None
        self.bias = None
        self.loss = []
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.Y_to_categorical = None # Map Y label to numerical
        self.batch_size = batch_size
        self.embedding_matrix = embedding_matrix

    def initialize_weights(self, num_features, num_labels):
        self.weights = np.zeros((num_features, num_labels))
        self.bias = np.zeros(num_labels)
        # np.random.seed(0)
        # self.weights = np.random.rand(num_features, num_labels)
        # self.bias = np.random.rand(num_labels)

    def softmax(self, Z):
        """Softmax function: normalizing logit scores
        :param Z([num_documents, num_labels])
        :return e^Z/sum_{i=0}^{k}{e^{Z}}
        """
        return np.exp(Z - np.max(Z, axis=1, keepdims=True))/np.sum(np.exp(Z - np.max(Z, axis=1, keepdims=True)), axis=1, keepdims=True)
    
    def sigmoid(self, Z):
        """Sigmoid function for binary classification

        :param Z([num_documents, num_labels])
        :return 1/(1+e^{-Z})
        """
        return 1/(1 + np.exp(-Z))

        
    def predict_prob(self, X, weights, bias, multinomial):
        """Return prediction of shape [num_documents, num_labels]
        """
        # z[num_documents, num_labels] = X[num_documents, num_features]*W[num_features, num_labels] + bias[num_labels]
        Z = np.dot(X, weights) + bias

        if multinomial:
            # Apply Softmax
            S = self.softmax(Z)
        else:
            # Apply Sigmoid
            S = self.sigmoid(Z)
        return S

    def cross_entropy_loss(self, S, target):
        """Calculate the cross-entropy
        L = -1/n*_sum_{i=0}^{n}{y_i*log(s_i)} 
        y label is a vector containing K classes where yc = 1 if c is the correct class and the remaining elements will be 0.

        :param S[num_documents, num_labels]: probabilities of features after softmax
        :target [num_documents, num_labels]: target one hot encoded
        """
        return -np.mean(np.log(S)*target)
    
    def binary_cross_entropy_loss(self, S, target):
        """Calculate Binary cross-entropy
        """
        return  -np.mean(target*(np.log(S)) + (1-target)*np.log(1-S))

    def OneHot(self, targets, num_labels):
        """Convert arrary of targets to One Hot 
        :param targets([num_documents,])
        :param num_labels(int)
        :return Y[num_documents, num_labels]
        """
        Y_onehot = np.zeros((len(targets), num_labels))
        Y_onehot[np.arange(len(targets)), targets] = 1
        return Y_onehot
    
    def predict(self, X, weights, bias, multinomial):
        """Return prediction of X with the categorical values]
        """
        # z[num_documents, num_labels] = X[num_documents, num_features]*W[num_features, num_labels] + bias[num_labels]
        Z = np.dot(X, weights) + bias

        if multinomial:
            # Apply Softmax
            S = self.softmax(Z)

            # Rows with highest probability
            S_max = np.argmax(S, axis=1)
        else:
            # Apply Sigmoid
            S = self.sigmoid(Z)
            # Rows with highest probability
            S_max = [1 if i > 0.5 else 0 for i in S]

        return S_max
    

    def train(self, input_file, verbose=False):
        """
        This method is used to train your models and generated for a given input_file a trained model
        :param input_file: path to training file with a text and a label per each line
        :return: model: trained model 
        """
        # Read dataset and create vocabulary
        features_lr_class = Features_LR_Word2Vec(input_file, self.embedding_matrix)
        embedding_size = self.embedding_matrix.vector_size

        # Replace words that are not in vocabulary with OOV (Out-of-Vocabulary)
        # token
        updated_text = []
        for sentence in enumerate(features_lr_class.tokenized_text):
            tmp = features_lr_class.get_features_word2vec(sentence)
            updated_text.append(tmp)
        
        # Transform dataset to Word2Vec space
        # Return features with format (n_documents, embedding_size=300)
        X = np.array(updated_text)
        
        # Y
        Y_mapping = {label: index for index, label in enumerate(np.unique(features_lr_class.labels))}
        self.Y_to_categorical = {index: label for label, index in Y_mapping.items()} # dictionary to convert back y's to categorical
        Y = [Y_mapping[y] for y in features_lr_class.labels]

        # Initialize Weights
        sample_size = len(features_lr_class.tokenized_text)
        n_features = embedding_size
        num_labels = len(features_lr_class.labelset)


        # Check if it's multinomial or binary classification
        if num_labels == 2:
            multinomial = False
            num_labels = 1 # Only one column to reference 0 or 1
        else:
            multinomial = True

        self.initialize_weights(n_features, num_labels)

        # One Hot encoded Y
        if multinomial:
            Y_onehot = self.OneHot(Y, num_labels)
        else:
            Y_onehot = np.array(Y).reshape(-1, 1)

        np.random.seed(0)
        permutation = np.random.permutation(sample_size)
        X_permutation = X[permutation]
        Y_permutation_onehot = Y_onehot[permutation]

        batch_size = self.batch_size

        for i in range(self.epochs):
            
            # Batch_size implementation
            for j in range(0, sample_size, batch_size):
                X_mini_batch = X_permutation[j:j+batch_size]
                y_mini_batch = Y_permutation_onehot[j:j+batch_size]

                # Z = softmax(X*W + b)
                prob = self.predict_prob(X_mini_batch, self.weights, self.bias, multinomial)

                # dL/dW
                grad_w = (1/batch_size)*np.dot(X_mini_batch.T, prob - y_mini_batch)
                grad_b =  (1/batch_size)*np.sum(prob - y_mini_batch, axis=0)

            # # break            
            # dL/dW
                # grad_w = (1/sample_size)*np.dot(X.T, prob - Y_onehot)
                # grad_b =  (1/sample_size)*np.sum(prob - Y_onehot, axis=0)

                self.weights = self.weights - (self.learning_rate*grad_w)
                self.bias = self.bias - (self.learning_rate*grad_b)

            # Computing cross-entropy loss
            if multinomial:
                loss = self.cross_entropy_loss(prob, y_mini_batch)
            else:
                loss = self.binary_cross_entropy_loss(prob, y_mini_batch)

            if verbose:
                print(f"Epoch: {i+1} - Loss: {loss}")

        model = {
            "feature_weights": {
                "weights": self.weights,
                "bias": self.bias,
                "Y_to_categorical": self.Y_to_categorical
            },
            "Feature": features_lr_class
        }
        ## Save the model
        self.save_model(model)
        return X, Y_onehot, prob, 


    def classify(self, input_file, model):
        """
        This method will be called by us for the validation stage and or you can call it for evaluating your code 
        on your own splits on top of the training sets seen to you
        :param input_fixle: path to input file with a text per line without labels
        :param model: the pretrained model
        :return: predictions list
        """

        feature_weights = model["feature_weights"]
        Feature_LR_class = model["Feature"]

        # Read Input File
        tokenized_text = Feature_LR_class.read_inference_file(input_file)
        # Replace words that are not in vocabulary with OOV (Out-of-Vocabulary)
        # token
        updated_text = []
        for sentence in enumerate(tokenized_text):
            tmp = Feature_LR_class.get_features_word2vec(sentence)
            updated_text.append(tmp)

        tokenized_text = updated_text
        

        X = np.vstack(tokenized_text)

        print(X.shape)

        # Prediction
        multinomial = True if len(feature_weights['Y_to_categorical'].keys()) > 2 else False
        preds_numerical = self.predict(X, feature_weights['weights'], feature_weights['bias'], multinomial)
        # Map indexes to Categorical space
        preds_label = []
        probs = self.predict_prob(X, feature_weights['weights'], feature_weights['bias'], multinomial)
        for y in preds_numerical:
            tmp = feature_weights['Y_to_categorical'][y]
            preds_label.append(tmp)
        
        return preds_label, probs, tokenized_text

In [178]:
# questions
# train_file = "work/datasets/questions/train.txt"
# pred_file = "work/datasets/questions/val.test"
# pred_true_labels = "work/datasets/questions/val.txt"
# model_file_name = "logreg.questions.model"
# # model_LR = LogisticRegression(model_file_name, learning_rate=0.1, epochs=1000, threshold=0, max_features=10)
# model_LR = LogisticRegression(model_file_name, learning_rate=0.15, epochs=500, threshold=0, max_features=150)
# X, Y, prob  = model_LR.train(train_file, batch_size=32, verbose=True)

# odiya
# train_file = "work/datasets/odiya/train.txt"
# pred_file = "work/datasets/odiya/val.test"
# pred_true_labels = "work/datasets/odiya/val.txt"
# model_file_name = "logreg.odiya.model"
# model_LR = LogisticRegression(model_file_name, learning_rate=0.01, epochs=1000, threshold=10, max_features=1000)
# X, Y, prob  = model_LR.train(train_file, batch_size=256, verbose=True)


# 4dim
train_file = "work/datasets/4dim/train.txt"
pred_file = "work/datasets/4dim/val.test"
pred_true_labels = "work/datasets/4dim/val.txt"
model_file_name = "word2vec_log.4dim.model"
model_LR = LogisticRegressionWord2Vec(model_file_name, learning_rate=0.9, epochs=500, batch_size=128, embedding_matrix=model_embeddings)
X, Y, prob  = model_LR.train(train_file, verbose=True)


# #Products
# train_file = "work/datasets/products/train.txt"
# pred_file = "work/datasets/products/val.test"
# pred_true_labels = "work/datasets/products/val.txt"
# model_file_name = "word2vec_log.products.model"
# # model_LR = LogisticRegression(model_file_name, learning_rate=0.9, epochs=1000, threshold=2, max_features=500)
# #80% of the dataset
# model_LR = LogisticRegressionWord2Vec(model_file_name, learning_rate=0.9, epochs=100, batch_size=16, embedding_matrix=model_embeddings)

Epoch: 1 - Loss: 0.3455683356205985
Epoch: 2 - Loss: 0.3388484446076833
Epoch: 3 - Loss: 0.3333682782564171
Epoch: 4 - Loss: 0.3287315025056436
Epoch: 5 - Loss: 0.32471381844996877
Epoch: 6 - Loss: 0.3211664071541381
Epoch: 7 - Loss: 0.3179870322245108
Epoch: 8 - Loss: 0.31510333073549973
Epoch: 9 - Loss: 0.31246266812888995
Epoch: 10 - Loss: 0.3100257911866595
Epoch: 11 - Loss: 0.30776274499194595
Epoch: 12 - Loss: 0.305650178889223
Epoch: 13 - Loss: 0.3036695282891939
Epoch: 14 - Loss: 0.301805762717811
Epoch: 15 - Loss: 0.30004650815682354
Epoch: 16 - Loss: 0.29838142164776377
Epoch: 17 - Loss: 0.2968017388352722
Epoch: 18 - Loss: 0.29529994187029457
Epoch: 19 - Loss: 0.29386951222571733
Epoch: 20 - Loss: 0.29250474417142036
Epoch: 21 - Loss: 0.29120060209778403
Epoch: 22 - Loss: 0.2899526098988806
Epoch: 23 - Loss: 0.2887567640602617
Epoch: 24 - Loss: 0.28760946447092123
Epoch: 25 - Loss: 0.28650745863812266
Epoch: 26 - Loss: 0.28544779615368454
Epoch: 27 - Loss: 0.2844277910923253

In [179]:
X, Y, prob  = model_LR.train(train_file, verbose=True)

Epoch: 1 - Loss: 0.3455683356205985
Epoch: 2 - Loss: 0.3388484446076833
Epoch: 3 - Loss: 0.3333682782564171
Epoch: 4 - Loss: 0.3287315025056436
Epoch: 5 - Loss: 0.32471381844996877
Epoch: 6 - Loss: 0.3211664071541381
Epoch: 7 - Loss: 0.3179870322245108
Epoch: 8 - Loss: 0.31510333073549973
Epoch: 9 - Loss: 0.31246266812888995
Epoch: 10 - Loss: 0.3100257911866595
Epoch: 11 - Loss: 0.30776274499194595
Epoch: 12 - Loss: 0.305650178889223
Epoch: 13 - Loss: 0.3036695282891939
Epoch: 14 - Loss: 0.301805762717811
Epoch: 15 - Loss: 0.30004650815682354
Epoch: 16 - Loss: 0.29838142164776377
Epoch: 17 - Loss: 0.2968017388352722
Epoch: 18 - Loss: 0.29529994187029457
Epoch: 19 - Loss: 0.29386951222571733
Epoch: 20 - Loss: 0.29250474417142036
Epoch: 21 - Loss: 0.29120060209778403
Epoch: 22 - Loss: 0.2899526098988806
Epoch: 23 - Loss: 0.2887567640602617
Epoch: 24 - Loss: 0.28760946447092123
Epoch: 25 - Loss: 0.28650745863812266
Epoch: 26 - Loss: 0.28544779615368454
Epoch: 27 - Loss: 0.2844277910923253

In [180]:
preds, prob, X = model_LR.classify(pred_file + ".txt", model_LR.load_model())

(312, 300)


In [181]:
## Save the predictions: one label prediction per line
with open(pred_file + ".pred.txt", "w") as file:
    for pred in preds:
        file.write(pred+"\n")

# Evaluation

In [182]:
import pandas as pd

In [183]:
true_dataset = pd.read_csv(pred_true_labels, sep='\t', header=None, names=['text', 'true_label'])
pred_dataset = pd.read_csv(pred_file + ".pred.txt", sep='\t', header=None, names=['pred'])

In [184]:
train_dataset = pd.read_csv(train_file, sep='\t', header=None, names=['text', 'true_label'])

In [185]:
true_dataset['true_label'].value_counts()

pos.dec    94
neg.tru    83
pos.tru    70
neg.dec    65
Name: true_label, dtype: int64

In [186]:
true_dataset.shape

(312, 2)

In [187]:
pred_dataset.shape

(312, 1)

In [188]:
# Check if the columns have the same name; adjust as needed
column_name = 'true_label'  # Change to the actual column name
pred_column_name = 'pred'  # Change to the actual predicted column name

# Merge the two DataFrames on a common index or key if available
merged_df = true_dataset.merge(pred_dataset, left_index=True, right_index=True)

# Calculate the accuracy by comparing the two columns
accuracy = (merged_df[column_name] == merged_df[pred_column_name]).mean()

# Print the accuracy as a percentage
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 25.64%
