In [5]:
import os
import pickle

import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import gensim.downloader as api
import spacy as spacy

from datasets import load_dataset
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset, DataLoader

In [1]:
from datasets import load_dataset
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset['train'] 
validation_dataset = dataset['validation']
test_dataset = dataset['test']

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#Number of sentences in each set 
print(f"Size of training set: {train_dataset.num_rows} sentences")
print(f"Size of validation set: {validation_dataset.num_rows} sentences")
print(f"Size of test set: {test_dataset.num_rows} sentences")

Size of training set: 8530 sentences
Size of validation set: 1066 sentences
Size of test set: 1066 sentences


In [3]:
print(f"Sample sentence from train dataset: {test_dataset[0]['text']}")
print(f"Label: {'Positive' if test_dataset[0]['label'] == 1 else 'Negative'}")

Sample sentence from train dataset: lovingly photographed in the manner of a golden book sprung to life , stuart little 2 manages sweetness largely without stickiness .
Label: Positive


In [6]:
# Load SpaCy English model
nlp = spacy.load("en_core_web_sm")

def build_vocab(train_dataset):
    vocab = set()
    train_tokenized = []  # Store tokenized sentences

    # Loop through each sentence in the dataset
    for sentence in train_dataset['text']:
        # Use SpaCy's tokenizer
        doc = nlp(sentence.lower())  # Lowercase for consistency

        # Extract tokens
        word_list = [token.text for token in doc]  # Tokenized words

        # Add cleaned words into the vocabulary (no need to strip quotes with SpaCy)
        vocab.update(word_list)

        # Store tokenized sentence
        train_tokenized.append(word_list)

    vocab.discard('')  # Remove any empty string from the vocabulary
    
    # Convert vocab set to a list to index it
    vocab_list = sorted(vocab)
    return vocab, train_tokenized

vocab_list, train_tokenized = build_vocab(train_dataset)

# Show the number of words in the vocabulary
print(f"Number of words in the vocabulary: {len(vocab_list)}")

# Print a sample tokenized sentence
print("Sample tokenized sentence:", train_tokenized[0])

Number of words in the vocabulary: 16631
Sample tokenized sentence: ['the', 'rock', 'is', 'destined', 'to', 'be', 'the', '21st', 'century', "'s", 'new', '"', 'conan', '"', 'and', 'that', 'he', "'s", 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', ',', 'jean', '-', 'claud', 'van', 'damme', 'or', 'steven', 'segal', '.']


In [12]:
# Initialize word_to_index with <PAD> at 0 and <UNK> at 1
word_to_index = {"<PAD>": 0, "<UNK>": 1}

# Assign indices to words in vocab_list starting from 2
for idx, word in enumerate(vocab_list, start=2):
    word_to_index[word] = idx

print("Sample word-to-index mapping:", list(word_to_index.items())[:10])  # Display first 10 mappings

Sample word-to-index mapping: [('<PAD>', 0), ('<UNK>', 1), ('fortunately', 2), ('aussie', 3), ('rap', 4), ('attempt', 5), ('iben', 6), ('research', 7), ('struggles', 8), ('getting', 9)]


In [8]:
def load_glove_embeddings(file_path, embedding_size=300):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            if len(vector) == embedding_size:
                embeddings[word] = vector
    return embeddings

# Path to GloVe embeddings file and embedding size
glove_path = '/Users/anushreearora/Downloads/glove/glove.6B.300d.txt'  
embedding_size = 300
glove_embeddings = load_glove_embeddings(glove_path, embedding_size)

In [13]:
# Initialize the embedding matrix with None values for all words
embedding_matrix = [None] * (len(vocab_list) + 2)  # +2 for <PAD> and <UNK> tokens

# Define <UNK> and <PAD> embeddings
unk_vector = np.random.uniform(-0.25, 0.25, embedding_size)  # Random vector for <UNK>
pad_vector = np.zeros(embedding_size)  # Zero vector for <PAD>

# Set <PAD> and <UNK> embeddings initially
embedding_matrix[0] = pad_vector  # <PAD> at index 0
embedding_matrix[1] = unk_vector  # <UNK> at index 1

# Populate the embedding matrix
for word, idx in word_to_index.items():
    if word in glove_embeddings:
        embedding_matrix[idx] = glove_embeddings[word]  # Assign GloVe embedding
    # If the word is not in GloVe, it remains None 

In [14]:
# Derive OOV words from the embedding matrix
oov_words = [word for word, idx in word_to_index.items() if embedding_matrix[idx] is None]

oov_words = sorted(oov_words)

# Print results
print(f"Number of OOV words: {len(oov_words)}")
for word in oov_words:
    print(word)

Number of OOV words: 690
--with
-a
-after
-doing
-greaseballs
-hollywood
-inevitable
-of
-quite
-stunning
-the
-west
28k
3/4th
4/5ths
a]n
abandone
aborbing
absolutamente
aburrido
acabamos
accomodates
aceitou
achival
achronological
acontecimentos
actorish
actory
actuación
actuada
adapted-
addessi
adorability
adventues
affirmational
ain't-
alientation
allodi
amoses
amusedly
and-
andamento
animé
anteing
apallingly
apesar
aproveitar
aqueles
aren't
arriesgado
artsploitation
artístico
assistir
atacar
atacarse
atreve
auteil
autocritique
b+
bazadona
bergmanesque
beseechingly
bibbidy
bierbichler
birot
bizzarre
bjorkness
black-&-white
blighter
blutarsky
bobbidi
bondish
bornin
bottomlessly
bruckheimeresque
brûlée
bull's
burningly
bustingly
butterfingered
c'm
cadness
cam'ron
camareras
can&#8217
cannier
captivatingly
capturou
carente
cativante
catsup--
certamente
chabrolian
character-
character]is
chopsocky
choquart
cineasts
cinemantic
cipherlike
cirulnick
class-
claustrophic
clericks
cletis
clichê

In [15]:
# Assign the <UNK> embedding to OOV words
for word in oov_words:
    idx = word_to_index[word]
    embedding_matrix[idx] = unk_vector  # Assign <UNK> vector to OOV words

In [20]:
# Convert the embedding matrix to a NumPy array
embedding_matrix_np = np.array(embedding_matrix)

# Save both the embedding matrix and word_to_index mapping as a pickle file
with open("base_embedding_matrix.pkl", "wb") as f:
    pickle.dump({"embeddings": embedding_matrix_np, "word_to_index": word_to_index}, f)