## Install and import dependencies

In [1]:
%pip install torch gensim datasets nltk

Collecting torch
  Downloading torch-2.5.0-cp312-none-macosx_11_0_arm64.whl.metadata (28 kB)
Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (8.1 kB)
Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting filelock (from torch)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting typing-extensions>=4.8.0 (from torch)
  Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Collecting networkx (from torch)
  Downloading networkx-3.4.1-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Downloading jinja2-3.1.4-py3-none-any.whl.metadata (2.6 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting setuptools (from torch)
  Downloading setuptools-75.2.0-py3-none-any.whl.metadata (6.9 kB)
Collecting sympy==1.13.1 (from torch)
  Dow

In [12]:
%pip install spacy

Collecting spacy
  Downloading spacy-3.8.2-cp312-cp312-macosx_11_0_arm64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.10-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.8-cp312-cp312-macosx_11_0_arm64.whl.metadata (8.4 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.2 kB)
Collecting thinc<8.4.0,>=8.3.0 (from spacy)
  Downloading thinc-8.3.2-cp312-cp312-macosx_11_0_arm64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downlo

In [2]:
import os
import nltk
nltk.download("all")

import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import gensim.downloader as api

from datasets import load_dataset
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset, DataLoader


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/anushreearora/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/anushreearora/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/anushreearora/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /Users/anushreearora/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/anushreearora/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagg

## Part 0. Dataset Preparation

In [3]:
from datasets import load_dataset
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset['train'] 
validation_dataset = dataset['validation']
test_dataset = dataset['test']

### Dataset Exploration

In [4]:
#Number of sentences in each set 
print(f"Size of training set: {train_dataset.num_rows} sentences")
print(f"Size of validation set: {validation_dataset.num_rows} sentences")
print(f"Size of test set: {test_dataset.num_rows} sentences")

Size of training set: 8530 sentences
Size of validation set: 1066 sentences
Size of test set: 1066 sentences


In [5]:
print(f"Sample sentence from train dataset: {test_dataset[0]['text']}")
print(f"Label: {'Positive' if test_dataset[0]['label'] == 1 else 'Negative'}")

Sample sentence from train dataset: lovingly photographed in the manner of a golden book sprung to life , stuart little 2 manages sweetness largely without stickiness .
Label: Positive


## Part 1. Preparing Word Embeddings

### Question 1 Word Embedding

#### (a) What is the size of the vocabulary formed in your training data

In [14]:
import spacy

# Load SpaCy English model
nlp = spacy.load("en_core_web_sm")

def build_vocab(train_dataset):
    vocab = {"<PAD>", "<UNK>"}  # Initialize vocabulary with padding and unknown tokens
    train_dataset_pos = []  # Store POS-tagged sentences

    # Loop through each sentence in the dataset
    for sentence in train_dataset['text']:
        # Use SpaCy's tokenizer and POS tagger
        doc = nlp(sentence.lower())  # Lowercase for consistency

        # Extract tokens and POS tags
        word_list = [token.text for token in doc]  # Tokenized words
        pos_tags = [(token.text, token.pos_) for token in doc]  # POS tags

        # Add cleaned words into the vocabulary (no need to strip quotes with SpaCy)
        vocab.update(word_list)

        # Store POS tags
        train_dataset_pos.append(pos_tags)

    vocab.discard('')  # Remove any empty string from the vocabulary
    return vocab, train_dataset_pos

# Example usage with train_dataset
vocab, train_dataset_pos = build_vocab(train_dataset)

# Show the number of words in the vocabulary
print(f"Number of words in the vocabulary (including padding and unknown tokens): {len(vocab)}")
print(f"Number of words in the vocabulary (excluding padding and unknown tokens): {len(vocab) - 2}")

# Print a sample POS tagged sentence
print("Sample POS tagged sentence:", train_dataset_pos[0])



Number of words in the vocabulary (including padding and unknown tokens): 16633
Number of words in the vocabulary (excluding padding and unknown tokens): 16631
Sample POS tagged sentence: [('the', 'DET'), ('rock', 'NOUN'), ('is', 'AUX'), ('destined', 'VERB'), ('to', 'PART'), ('be', 'AUX'), ('the', 'DET'), ('21st', 'ADJ'), ('century', 'NOUN'), ("'s", 'PART'), ('new', 'ADJ'), ('"', 'PUNCT'), ('conan', 'PROPN'), ('"', 'PUNCT'), ('and', 'CCONJ'), ('that', 'SCONJ'), ('he', 'PRON'), ("'s", 'AUX'), ('going', 'VERB'), ('to', 'PART'), ('make', 'VERB'), ('a', 'DET'), ('splash', 'NOUN'), ('even', 'ADV'), ('greater', 'ADJ'), ('than', 'ADP'), ('arnold', 'ADJ'), ('schwarzenegger', 'ADJ'), (',', 'PUNCT'), ('jean', 'NOUN'), ('-', 'PUNCT'), ('claud', 'NOUN'), ('van', 'NOUN'), ('damme', 'NOUN'), ('or', 'CCONJ'), ('steven', 'NOUN'), ('segal', 'PROPN'), ('.', 'PUNCT')]


In [9]:
from nltk import pos_tag

# Make sure NLTK resources are downloaded
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Tokenize sentences and apply POS tagging
def build_vocab(train_dataset):
    vocab = {"<PAD>", "<UNK>"}  # Start vocab with <PAD> and <UNK> tokens
    train_dataset_pos = []  # To store POS tagged sentences

    # Loop through each sentence in the dataset
    for sentence in train_dataset['text']:
        # Convert to lowercase for case folding
        sentence = sentence.lower()

        # Tokenize the sentence into words
        word_list = word_tokenize(sentence)

        # Clean each word by stripping any quotes
        word_list = [word.strip("'\"") for word in word_list]

        # Add cleaned words into the vocabulary
        vocab.update(word_list)

        # Get POS tags for the words
        pos_tags = pos_tag(word_list)
        train_dataset_pos.append(pos_tags)

    vocab.discard('')  # Remove any empty string from the vocabulary
    return vocab, train_dataset_pos

# Example usage with train_dataset
vocab, train_dataset_pos = build_vocab(train_dataset)

# Show the number of words in the vocabulary
print(f"Number of words in the vocabulary (including padding and unknown tokens): {len(vocab)}")
print(f"Number of words in the vocabulary (excluding padding and unknown tokens): {len(vocab) - 2}")

# Print a sample POS tagged sentence
print("Sample POS tagged sentence:", train_dataset_pos[0])


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/anushreearora/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/anushreearora/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Number of words in the vocabulary (including padding and unknown tokens): 17843
Number of words in the vocabulary (excluding padding and unknown tokens): 17841
Sample POS tagged sentence: [('the', 'DT'), ('rock', 'NN'), ('is', 'VBZ'), ('destined', 'VBN'), ('to', 'TO'), ('be', 'VB'), ('the', 'DT'), ('21st', 'JJ'), ('century', 'NN'), ('s', 'VBD'), ('new', 'JJ'), ('``', '``'), ('conan', 'JJ'), ('``', '``'), ('and', 'CC'), ('that', 'IN'), ('he', 'PRP'), ('s', 'VBZ'), ('going', 'VBG'), ('to', 'TO'), ('make', 'VB'), ('a', 'DT'), ('splash', 'NN'), ('even', 'RB'), ('greater', 'JJR'), ('than', 'IN'), ('arnold', 'RB'), ('schwarzenegger', 'NN'), (',', ','), ('jean-claud', 'JJ'), ('van', 'NN'), ('damme', 'NN'), ('or', 'CC'), ('steven', 'JJ'), ('segal', 'NN'), ('.', '.')]


#### (b) We use OOV (out-of-vocabulary) to refer to those words appeared in the training data but not in the Word2vec (or Glove) dictionary. How many OOV words exist in your training data?

#### (c) The existence of the OOV words is one of the well-known limitations of Word2vec (or Glove). Without using any transformer-based language models (e.g., BERT, GPT, T5), what do you think is the best strategy to mitigate such limitation? Implement your solution in your source code. Show the corresponding code snippet.

In [15]:
# Load pretrained Word2Vec model (Google News Word2Vec)
word2vec = api.load('word2vec-google-news-300')

# Set embedding size
embedding_size = 300

# Initialize the embedding matrix with zeros for padding and random values for unknown tokens
embedding_matrix = {}

# Create an <UNK> token embedding as a random vector
unk_vector = np.random.uniform(-0.25, 0.25, embedding_size)
embedding_matrix["<UNK>"] = unk_vector

# Create a <PAD> token embedding as a zero vector
pad_vector = np.zeros(embedding_size)
embedding_matrix["<PAD>"] = pad_vector

# Initialize OOV counter
oov_count = 0

# Iterate over the vocabulary
for word in vocab:
    if word == "<PAD>" or word == "<UNK>":
        continue  
    
    if word in word2vec:  # If the word is in Word2Vec, add its embedding
        embedding_matrix[word] = word2vec[word]
    else:
        # If the word is OOV, assign it the <UNK> vector and count as OOV
        embedding_matrix[word] = unk_vector  # Assign OOV words the <UNK> vector
        oov_count += 1  # Increment OOV counter

# Print results for Word2Vec
print(f"Number of OOV words with Word2Vec: {oov_count}")
print(f"Embedding for <PAD>: {embedding_matrix['<PAD>']}")
print(f"Embedding for <UNK>: {embedding_matrix['<UNK>']}")


Number of OOV words with Word2Vec: 1758
Embedding for <PAD>: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Embedding for <UNK>: [-0.

In [16]:
# Load pretrained FastText model (wiki-news-300d-subword)
fasttext_model = api.load('fasttext-wiki-news-subwords-300')

# Set embedding size
embedding_size = 300

# Initialize the embedding matrix with zeros for padding and random values for unknown tokens
embedding_matrix = {}

# Create an <UNK> token embedding as a random vector
unk_vector = np.random.uniform(-0.25, 0.25, embedding_size)
embedding_matrix["<UNK>"] = unk_vector

# Create a <PAD> token embedding as a zero vector
pad_vector = np.zeros(embedding_size)
embedding_matrix["<PAD>"] = pad_vector

# Initialize OOV counter for FastText
oov_count_fasttext = 0

# Iterate over the vocabulary
for word in vocab:
    if word == "<PAD>" or word == "<UNK>":
        continue  
    
    try:
        # Try to get the word vector using FastText's subword handling
        embedding_matrix[word] = fasttext_model.get_vector(word)
    except KeyError:
        # If the word can't be processed even by FastText, assign it the <UNK> vector
        embedding_matrix[word] = unk_vector
        oov_count_fasttext += 1  # Increment OOV count

# Print results for FastText
print(f"Number of OOV words with FastText: {oov_count_fasttext}")
print(f"Embedding for <PAD>: {embedding_matrix['<PAD>']}")
print(f"Embedding for <UNK>: {embedding_matrix['<UNK>']}")

Number of OOV words with FastText: 1176
Embedding for <PAD>: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Embedding for <UNK>: [ 0.