## Install and import dependencies

In [1]:
%pip install torch gensim datasets nltk

Collecting torch
  Downloading torch-2.5.0-cp312-none-macosx_11_0_arm64.whl.metadata (28 kB)
Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (8.1 kB)
Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting filelock (from torch)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting typing-extensions>=4.8.0 (from torch)
  Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Collecting networkx (from torch)
  Downloading networkx-3.4.1-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Downloading jinja2-3.1.4-py3-none-any.whl.metadata (2.6 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting setuptools (from torch)
  Downloading setuptools-75.2.0-py3-none-any.whl.metadata (6.9 kB)
Collecting sympy==1.13.1 (from torch)
  Dow

In [24]:
import os
import nltk
#nltk.download("all")


import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

from datasets import load_dataset
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset, DataLoader
from gensim.downloader import load as load_word2vec

## Part 0. Dataset Preparation

In [25]:
from datasets import load_dataset
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset['train'] 
validation_dataset = dataset['validation']
test_dataset = dataset['test']

### Dataset Exploration

In [26]:
#Number of sentences in each set 
print(f"Size of training set: {train_dataset.num_rows} sentences")
print(f"Size of validation set: {validation_dataset.num_rows} sentences")
print(f"Size of test set: {test_dataset.num_rows} sentences")

Size of training set: 8530 sentences
Size of validation set: 1066 sentences
Size of test set: 1066 sentences


In [27]:
print(f"Sample sentence from train dataset: {test_dataset[0]['text']}")
print(f"Label: {'Positive' if test_dataset[0]['label'] == 1 else 'Negative'}")

Sample sentence from train dataset: lovingly photographed in the manner of a golden book sprung to life , stuart little 2 manages sweetness largely without stickiness .
Label: Positive


## Part 1. Preparing Word Embeddings

### Question 1 Word Embedding

#### (a) What is the size of the vocabulary formed in your training data

In [28]:
#tokenize sentences 
train_tokenized = []
for sentence in train_dataset['text']:
    train_tokenized.append(word_tokenize(sentence.lower()))

print('sample sentence:', train_tokenized[0],'\n')

#build vocabulary
vocab = {"<PAD>", "<UNK>"} #include a padding and unknown token for future processing
vocab.update(word for sentence in train_tokenized for word in sentence)

print("Number of words in the vocabulary(including padding and unknown tokens):", len(vocab))
print("Number of words in the vocabulary:" , len(vocab)-2)


sample sentence: ['the', 'rock', 'is', 'destined', 'to', 'be', 'the', '21st', 'century', "'s", 'new', '``', 'conan', '``', 'and', 'that', 'he', "'s", 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', ',', 'jean-claud', 'van', 'damme', 'or', 'steven', 'segal', '.'] 

Number of words in the vocabulary(including padding and unknown tokens): 18031
Number of words in the vocabulary: 18029


In [30]:
## create word embeddings

# Load pretrained Word2Vec model (Google News Word2Vec)

word2vec = load_word2vec('word2vec-google-news-300')

# Set embedding size 
embedding_size = 300

# Initialize the embedding matrix with zeros for padding and random values for unknown tokens
embedding_matrix = {}

# Create an <UNK> token embedding as a random vector
unk_vector = np.random.uniform(-0.25, 0.25, embedding_size)
embedding_matrix["<UNK>"] = unk_vector

# Create a <PAD> token embedding as a zero vector
pad_vector = np.zeros(embedding_size)
embedding_matrix["<PAD>"] = pad_vector

# Initialize OOV counter
oov_count = 0

# Iterate over the vocabulary
for word in vocab:
    if word in word2vec:  # If the word is in Word2Vec, add its embedding
        embedding_matrix[word] = word2vec[word]
    else:
        # If the word is OOV, assign it the <UNK> vector
        if word != "<PAD>" and word != "<UNK>":  
            oov_count += 1
        embedding_matrix[word] = unk_vector 


print(f"Number of OOV words: {oov_count}")
print(f"Percentage of OOV words: {(oov_count / len(vocab)) * 100:.2f}%")

print(f"Embedding for <PAD>: {embedding_matrix['<PAD>']}")
print(f"Embedding for <UNK>: {embedding_matrix['<UNK>']}")
print(f"Embedding for a known word (e.g., 'good'): {embedding_matrix.get('good', 'Not found')}")



Number of OOV words: 3612
Percentage of OOV words: 20.03%
Embedding for <PAD>: [-0.17272163 -0.08615165  0.12563584 -0.06587423  0.10645834 -0.01698609
 -0.09833902 -0.0439332   0.13371405 -0.21977292 -0.0988321  -0.01075339
 -0.07475643  0.08178638 -0.07455481  0.16921675 -0.20274623  0.07100154
  0.23807387  0.06484599  0.12747533  0.1569731   0.19133292 -0.02872742
 -0.24495056 -0.14136922  0.04833299 -0.11152147 -0.1121851  -0.06351838
  0.18686544 -0.2428275   0.08859587  0.067547    0.18058591  0.15446684
 -0.17593752 -0.05121276  0.09997025 -0.0745362   0.03959235  0.10560216
  0.13394498  0.21520545  0.12057554  0.23513249  0.04281824  0.07460304
 -0.13450599  0.14947394  0.04317058  0.06187691 -0.23351912  0.02909118
  0.07479826  0.00726507 -0.14327209  0.17278679  0.09071895  0.02977174
 -0.12416539 -0.16754471 -0.08320124  0.03858752 -0.15633115  0.2431455
  0.09029769 -0.0924034   0.24697118 -0.03965657  0.11723496  0.24123009
  0.10843441  0.1049156   0.00811062  0.240137

#### (b) We use OOV (out-of-vocabulary) to refer to those words appeared in the training data but not in the Word2vec (or Glove) dictionary. How many OOV words exist in your training data?

#### (c) The existence of the OOV words is one of the well-known limitations of Word2vec (or Glove). Without using any transformer-based language models (e.g., BERT, GPT, T5), what do you think is the best strategy to mitigate such limitation? Implement your solution in your source code. Show the corresponding code snippet.