### Import Libraries 

In [1]:
import os
import pickle

import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import gensim.downloader as api
import spacy

from datasets import load_dataset
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


## Part 0. Dataset Preparation

In [2]:
from datasets import load_dataset
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset['train'] 
validation_dataset = dataset['validation']
test_dataset = dataset['test']

### Dataset Exploration

In [3]:
#Number of sentences in each set 
print(f"Size of training set: {train_dataset.num_rows} sentences")
print(f"Size of validation set: {validation_dataset.num_rows} sentences")
print(f"Size of test set: {test_dataset.num_rows} sentences")

Size of training set: 8530 sentences
Size of validation set: 1066 sentences
Size of test set: 1066 sentences


In [4]:
print(f"Sample sentence from train dataset: {test_dataset[0]['text']}")
print(f"Label: {'Positive' if test_dataset[0]['label'] == 1 else 'Negative'}")

Sample sentence from train dataset: lovingly photographed in the manner of a golden book sprung to life , stuart little 2 manages sweetness largely without stickiness .
Label: Positive


## Part 1. Preparing Word Embeddings

### Question 1 Word Embedding

#### (a) What is the size of the vocabulary formed in your training data

In [5]:
# Load SpaCy English model
nlp = spacy.load("en_core_web_sm")

def build_vocab(train_dataset):
    vocab = set()
    train_tokenized = []  # Store tokenized sentences

    # Loop through each sentence in the dataset
    for sentence in train_dataset['text']:
        # Use SpaCy's tokenizer
        doc = nlp(sentence.lower())  # Lowercase for consistency

        # Extract tokens
        word_list = [token.text for token in doc]  # Tokenized words

        # Add cleaned words into the vocabulary (no need to strip quotes with SpaCy)
        vocab.update(word_list)

        # Store tokenized sentence
        train_tokenized.append(word_list)

    vocab.discard('')  # Remove any empty string from the vocabulary
    
    # Convert vocab set to a list to index it
    vocab_list = sorted(vocab)
    return vocab, train_tokenized

vocab_list, train_tokenized = build_vocab(train_dataset)

# Show the number of words in the vocabulary
print(f"Number of words in the vocabulary: {len(vocab_list)}")

# Print a sample tokenized sentence
print("Sample tokenized sentence:", train_tokenized[0])

Number of words in the vocabulary: 16631
Sample tokenized sentence: ['the', 'rock', 'is', 'destined', 'to', 'be', 'the', '21st', 'century', "'s", 'new', '"', 'conan', '"', 'and', 'that', 'he', "'s", 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', ',', 'jean', '-', 'claud', 'van', 'damme', 'or', 'steven', 'segal', '.']


#### (b) We use OOV (out-of-vocabulary) to refer to those words appeared in the training data but not in the Word2vec (or Glove) dictionary. How many OOV words exist in your training data?

In [6]:
# Initialize word_to_index with <PAD> at 0 and <UNK> at 1
word_to_index = {"<PAD>": 0, "<UNK>": 1}

# Assign indices to words in vocab_list starting from 2
for idx, word in enumerate(vocab_list, start=2):
    word_to_index[word] = idx

print("Sample word-to-index mapping:", list(word_to_index.items())[:10])  # Display first 10 mappings

Sample word-to-index mapping: [('<PAD>', 0), ('<UNK>', 1), ('indecipherable', 2), ('bonding', 3), ('corcuera', 4), ('hide', 5), ('boast', 6), ('gesture', 7), ('1989', 8), ('sencillamente', 9)]


In [7]:
def load_glove_embeddings(file_path, embedding_size=300):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            if len(vector) == embedding_size:
                embeddings[word] = vector
    return embeddings

# Path to GloVe embeddings file and embedding size
glove_path = '/Users/anushreearora/Downloads/glove/glove.6B.300d.txt'  
embedding_size = 300
glove_embeddings = load_glove_embeddings(glove_path, embedding_size)

In [8]:
# Initialize the embedding matrix with None values for all words
embedding_matrix = [None] * (len(vocab_list) + 2)  # +2 for <PAD> and <UNK> tokens

# Define <UNK> and <PAD> embeddings
unk_vector = np.random.uniform(-0.25, 0.25, embedding_size)  # Random vector for <UNK>
pad_vector = np.zeros(embedding_size)  # Zero vector for <PAD>

# Set <PAD> and <UNK> embeddings initially
embedding_matrix[0] = pad_vector  # <PAD> at index 0
embedding_matrix[1] = unk_vector  # <UNK> at index 1

# Populate the embedding matrix
for word, idx in word_to_index.items():
    if word in glove_embeddings:
        embedding_matrix[idx] = glove_embeddings[word]  # Assign GloVe embedding
    # If the word is not in GloVe, it remains None 

In [9]:
# Derive OOV words from the embedding matrix
oov_words = [word for word, idx in word_to_index.items() if embedding_matrix[idx] is None]

oov_words = sorted(oov_words)

# Print results
print(f"Number of OOV words: {len(oov_words)}")
for word in oov_words:
    print(word)

Number of OOV words: 690
--with
-a
-after
-doing
-greaseballs
-hollywood
-inevitable
-of
-quite
-stunning
-the
-west
28k
3/4th
4/5ths
a]n
abandone
aborbing
absolutamente
aburrido
acabamos
accomodates
aceitou
achival
achronological
acontecimentos
actorish
actory
actuación
actuada
adapted-
addessi
adorability
adventues
affirmational
ain't-
alientation
allodi
amoses
amusedly
and-
andamento
animé
anteing
apallingly
apesar
aproveitar
aqueles
aren't
arriesgado
artsploitation
artístico
assistir
atacar
atacarse
atreve
auteil
autocritique
b+
bazadona
bergmanesque
beseechingly
bibbidy
bierbichler
birot
bizzarre
bjorkness
black-&-white
blighter
blutarsky
bobbidi
bondish
bornin
bottomlessly
bruckheimeresque
brûlée
bull's
burningly
bustingly
butterfingered
c'm
cadness
cam'ron
camareras
can&#8217
cannier
captivatingly
capturou
carente
cativante
catsup--
certamente
chabrolian
character-
character]is
chopsocky
choquart
cineasts
cinemantic
cipherlike
cirulnick
class-
claustrophic
clericks
cletis
clichê

#### (c) The existence of the OOV words is one of the well-known limitations of Word2vec (or Glove). Without using any transformer-based language models (e.g., BERT, GPT, T5), what do you think is the best strategy to mitigate such limitation? Implement your solution in your source code. Show the corresponding code snippet.

In [28]:
%pip install deepl

Collecting deepl
  Downloading deepl-1.19.1-py3-none-any.whl.metadata (27 kB)
Downloading deepl-1.19.1-py3-none-any.whl (35 kB)
Installing collected packages: deepl
Successfully installed deepl-1.19.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [11]:
import deepl
from langdetect import detect, DetectorFactory
import langdetect

# Ensure consistent results with langdetect
DetectorFactory.seed = 0

# Initialize the DeepL translator with your API key
DEEPL_API_KEY = "5d44ea5a-e29f-4bf0-afe3-f3d4a7128c33:fx"  
translator = deepl.Translator(DEEPL_API_KEY)

# Function to translate words only if they are non-English
def translate_word(word):
    try:
        # Detect the language of the word
        language = detect(word)
        
        # Only translate if the language is not English ('en')
        if language != 'en':
            result = translator.translate_text(word, target_lang="EN-US")
            translated_text = result.text.lower()
            print(f"Original: '{word}' ({language}) -> Translated: '{translated_text}'")
            return translated_text
        else:
            print(f"Skipped (Already English): '{word}'")
            return word  # Return the word as is if it's already English
    except langdetect.lang_detect_exception.LangDetectException as e:
        print(f"Language detection failed for '{word}': {e}")
        return word  # Return the word unchanged if language detection fails
    except Exception as e:
        print(f"Translation failed for '{word}': {e}")
        return word  # Return the word unchanged if translation fails

# Example usage with OOV words
translated_oov_words = {word: translate_word(word) for word in oov_words}

Skipped (Already English): '--with'
Original: '-a' (tl) -> Translated: '-a'
Original: '-after' (da) -> Translated: '-after'
Original: '-doing' (pt) -> Translated: '-doing'
Skipped (Already English): '-greaseballs'
Original: '-hollywood' (cy) -> Translated: '-hollywood'
Original: '-inevitable' (sl) -> Translated: '-inevitable'
Skipped (Already English): '-of'
Original: '-quite' (fr) -> Translated: '-quite'
Original: '-stunning' (fi) -> Translated: '-stunning'
Skipped (Already English): '-the'
Original: '-west' (af) -> Translated: '-west'
Original: '28k' (sw) -> Translated: '28k'
Original: '3/4th' (vi) -> Translated: '3/4th'
Original: '4/5ths' (vi) -> Translated: '4/5ths'
Original: 'a]n' (hu) -> Translated: 'a]n'
Original: 'abandone' (id) -> Translated: 'abandone'
Original: 'aborbing' (tl) -> Translated: 'absorbing'
Original: 'absolutamente' (ca) -> Translated: 'absolutely'
Original: 'aburrido' (es) -> Translated: 'boring'
Original: 'acabamos' (es) -> Translated: 'we're done'
Original: '

In [12]:
updated_oov_words = {}

# Loop through each translated word
for original_word, translated_word in translated_oov_words.items():
    # Get the index of the original word in the embedding matrix
    original_index = word_to_index[original_word]

    # Check if the translated word has a GloVe embedding
    if translated_word in glove_embeddings:
        # Only assign if the original OOV word's embedding is still None
        if embedding_matrix[original_index] is None:
            embedding_matrix[original_index] = glove_embeddings[translated_word]
    else:
        # If still not in the vocabulary or no GloVe embedding, keep it as OOV
        updated_oov_words[original_word] = translated_word

# Update the list of OOV words for further processing
# oov_words = list(updated_oov_words.keys())

# Print out the results
print(f"Number of OOV words after translation and matching with GloVe: {len(updated_oov_words)}")

Number of OOV words after translation and matching with GloVe: 503


In [14]:
import Levenshtein

# Define the maximum allowable edit distance
EDIT_DISTANCE_THRESHOLD = 2

# Function to find the closest match in glove_embeddings using edit distance
def edit_distance(word, glove_embeddings, threshold=EDIT_DISTANCE_THRESHOLD):
    best_match = None
    best_distance = float('inf')
    
    for vocab_word in glove_embeddings:
        distance = Levenshtein.distance(word, vocab_word)
        
        # Update best match if a closer match is found
        if distance < best_distance:
            best_match = vocab_word
            best_distance = distance

    return best_distance, best_match

# List to store corrected words along with their original words
corrections = []

# Loop through each translated OOV word
for original_word, translated_word in updated_oov_words.items():
    # Find the best match using edit distance
    best_distance, best_match = edit_distance(translated_word, glove_embeddings)
    
    # If the best match is within the distance threshold, consider it a valid correction
    if best_distance <= EDIT_DISTANCE_THRESHOLD:
        corrections.append((original_word, best_match))
        
        # Assign the GloVe embedding for the corrected word if it doesn't already exist in the embedding matrix
        original_index = word_to_index[original_word]
        if embedding_matrix[original_index] is None:
            embedding_matrix[original_index] = glove_embeddings[best_match]
            print(f"Assigned embedding: '{original_word}' -> '{best_match}' (distance: {best_distance})")

# Remove corrected words from updated_oov_words
for original_word, corrected_word in corrections:
    updated_oov_words.pop(original_word, None)

# Print results after corrections
print(f"Corrected words: {corrections}")
print(f"Number of words corrected: {len(corrections)}")
print(f"Number of OOV words remaining after corrections: {len(updated_oov_words)}")
print("Remaining OOV words:", list(updated_oov_words.keys()))


Assigned embedding: '--with' -> 'with' (distance: 2)
Assigned embedding: '-a' -> 'a' (distance: 1)
Assigned embedding: '-after' -> 'after' (distance: 1)
Assigned embedding: '-doing' -> 'doing' (distance: 1)
Assigned embedding: '-greaseballs' -> 'greaseball' (distance: 2)
Assigned embedding: '-hollywood' -> 'hollywood' (distance: 1)
Assigned embedding: '-inevitable' -> 'inevitable' (distance: 1)
Assigned embedding: '-of' -> 'of' (distance: 1)
Assigned embedding: '-quite' -> 'quite' (distance: 1)
Assigned embedding: '-stunning' -> 'stunning' (distance: 1)
Assigned embedding: '-the' -> 'the' (distance: 1)
Assigned embedding: '-west' -> 'west' (distance: 1)
Assigned embedding: '28k' -> '28' (distance: 1)
Assigned embedding: '3/4th' -> '34th' (distance: 1)
Assigned embedding: '4/5ths' -> '45th' (distance: 2)
Assigned embedding: 'a]n' -> 'an' (distance: 1)
Assigned embedding: 'abandone' -> 'abandoned' (distance: 1)
Assigned embedding: 'achival' -> 'archival' (distance: 1)
Assigned embedding:

In [15]:
# Assign <UNK> embedding to remaining OOV words
for original_word in updated_oov_words:
    original_index = word_to_index[original_word]
    if embedding_matrix[original_index] is None:
        embedding_matrix[original_index] = unk_vector
        print(f"Assigned <UNK> embedding to OOV word: '{original_word}'")

print(len(embedding_matrix))

# Convert embedding_matrix to a NumPy array
embedding_matrix_np = np.array(embedding_matrix)

#save as a pickle file (includes both embeddings and word-to-index mapping)
with open("updated_embedding_matrix.pkl", "wb") as f:
    pickle.dump({"embeddings": embedding_matrix, "word_to_index": word_to_index}, f)

print("Embedding matrix saved successfully!")


Assigned <UNK> embedding to OOV word: 'acabamos'
Assigned <UNK> embedding to OOV word: 'artsploitation'
Assigned <UNK> embedding to OOV word: 'atacarse'
Assigned <UNK> embedding to OOV word: 'bergmanesque'
Assigned <UNK> embedding to OOV word: 'bierbichler'
Assigned <UNK> embedding to OOV word: 'bruckheimeresque'
Assigned <UNK> embedding to OOV word: 'can&#8217'
Assigned <UNK> embedding to OOV word: 'chabrolian'
Assigned <UNK> embedding to OOV word: 'chopsocky'
Assigned <UNK> embedding to OOV word: 'cipherlike'
Assigned <UNK> embedding to OOV word: 'copmovieland'
Assigned <UNK> embedding to OOV word: 'crowdpleaser'
Assigned <UNK> embedding to OOV word: 'dateflick'
Assigned <UNK> embedding to OOV word: 'decirles'
Assigned <UNK> embedding to OOV word: 'derivativeness'
Assigned <UNK> embedding to OOV word: 'dogwalker'
Assigned <UNK> embedding to OOV word: 'dreadfulness'
Assigned <UNK> embedding to OOV word: 'elegiacally'
Assigned <UNK> embedding to OOV word: 'enrapturing'
Assigned <UNK> e