## Install and import dependencies

In [1]:
%pip install torch gensim datasets nltk


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import nltk
#nltk.download("all")


import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import gensim.downloader as api

from datasets import load_dataset
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset, DataLoader


  from .autonotebook import tqdm as notebook_tqdm


## Part 0. Dataset Preparation

In [3]:
from datasets import load_dataset
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset['train'] 
validation_dataset = dataset['validation']
test_dataset = dataset['test']

### Dataset Exploration

In [4]:
#Number of sentences in each set 
print(f"Size of training set: {train_dataset.num_rows} sentences")
print(f"Size of validation set: {validation_dataset.num_rows} sentences")
print(f"Size of test set: {test_dataset.num_rows} sentences")

Size of training set: 8530 sentences
Size of validation set: 1066 sentences
Size of test set: 1066 sentences


In [5]:
print(f"Sample sentence from train dataset: {test_dataset[0]['text']}")
print(f"Label: {'Positive' if test_dataset[0]['label'] == 1 else 'Negative'}")

Sample sentence from train dataset: lovingly photographed in the manner of a golden book sprung to life , stuart little 2 manages sweetness largely without stickiness .
Label: Positive


## Part 1. Preparing Word Embeddings

### Question 1 Word Embedding

#### (a) What is the size of the vocabulary formed in your training data

In [6]:
#tokenize sentences 
train_tokenized = []
for sentence in train_dataset['text']:
    train_tokenized.append(word_tokenize(sentence.lower()))

print('sample sentence:', train_tokenized[0],'\n')

#build vocabulary
vocab = {"<PAD>", "<UNK>"} #include a padding and unknown token for future processing
vocab.update(word for sentence in train_tokenized for word in sentence)

print("Number of words in the vocabulary(including padding and unknown tokens):", len(vocab))
print("Number of words in the vocabulary:" , len(vocab)-2)


sample sentence: ['the', 'rock', 'is', 'destined', 'to', 'be', 'the', '21st', 'century', "'s", 'new', '``', 'conan', '``', 'and', 'that', 'he', "'s", 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', ',', 'jean-claud', 'van', 'damme', 'or', 'steven', 'segal', '.'] 

Number of words in the vocabulary(including padding and unknown tokens): 18031
Number of words in the vocabulary: 18029


#### (b) We use OOV (out-of-vocabulary) to refer to those words appeared in the training data but not in the Word2vec (or Glove) dictionary. How many OOV words exist in your training data?

#### (c) The existence of the OOV words is one of the well-known limitations of Word2vec (or Glove). Without using any transformer-based language models (e.g., BERT, GPT, T5), what do you think is the best strategy to mitigate such limitation? Implement your solution in your source code. Show the corresponding code snippet.

In [7]:
# Load pretrained Word2Vec model (Google News Word2Vec)
word2vec = api.load('word2vec-google-news-300')

# Set embedding size
embedding_size = 300

# Initialize the embedding matrix with zeros for padding and random values for unknown tokens
embedding_matrix = {}

# Create an <UNK> token embedding as a random vector
unk_vector = np.random.uniform(-0.25, 0.25, embedding_size)
embedding_matrix["<UNK>"] = unk_vector

# Create a <PAD> token embedding as a zero vector
pad_vector = np.zeros(embedding_size)
embedding_matrix["<PAD>"] = pad_vector

# Initialize OOV counter
oov_count = 0

# Iterate over the vocabulary
for word in vocab:
    if word == "<PAD>" or word == "<UNK>":
        continue  
    
    if word in word2vec:  # If the word is in Word2Vec, add its embedding
        embedding_matrix[word] = word2vec[word]
    else:
        # If the word is OOV, assign it the <UNK> vector and count as OOV
        embedding_matrix[word] = unk_vector  # Assign OOV words the <UNK> vector
        oov_count += 1  # Increment OOV counter

# Print results for Word2Vec
print(f"Number of OOV words with Word2Vec: {oov_count}")
print(f"Embedding for <PAD>: {embedding_matrix['<PAD>']}")
print(f"Embedding for <UNK>: {embedding_matrix['<UNK>']}")


Number of OOV words with Word2Vec: 3612
Embedding for <PAD>: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Embedding for <UNK>: [ 0.

In [8]:
# Load pretrained FastText model (wiki-news-300d-subword)
fasttext_model = api.load('fasttext-wiki-news-subwords-300')

# Set embedding size
embedding_size = 300

# Initialize the embedding matrix with zeros for padding and random values for unknown tokens
embedding_matrix = {}

# Create an <UNK> token embedding as a random vector
unk_vector = np.random.uniform(-0.25, 0.25, embedding_size)
embedding_matrix["<UNK>"] = unk_vector

# Create a <PAD> token embedding as a zero vector
pad_vector = np.zeros(embedding_size)
embedding_matrix["<PAD>"] = pad_vector

# Initialize OOV counter for FastText
oov_count_fasttext = 0

# Iterate over the vocabulary
for word in vocab:
    if word == "<PAD>" or word == "<UNK>":
        continue  
    
    try:
        # Try to get the word vector using FastText's subword handling
        embedding_matrix[word] = fasttext_model.get_vector(word)
    except KeyError:
        # If the word can't be processed even by FastText, assign it the <UNK> vector
        embedding_matrix[word] = unk_vector
        oov_count_fasttext += 1  # Increment OOV count

# Print results for FastText
print(f"Number of OOV words with FastText: {oov_count_fasttext}")
print(f"Embedding for <PAD>: {embedding_matrix['<PAD>']}")
print(f"Embedding for <UNK>: {embedding_matrix['<UNK>']}")

Number of OOV words with FastText: 1961
Embedding for <PAD>: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Embedding for <UNK>: [-0.

In [9]:
#store the embeddings so that they can be used later
np.save("embedding_matrix.npy", embedding_matrix)


## Part 2 : Model Training & Evaluation - RNN

In [10]:
embedding_matrix=np.load("embedding_matrix.npy",allow_pickle='TRUE').item()

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score

In [13]:
# Converting the Vocab set to dictionary 
def build_vocab_dict(vocab_set):
    # Create the vocabulary dictionary without <PAD> and <UNK>
    vocab_set.discard("<PAD>")
    vocab_set.discard("<UNK>")
    vocab_dict = {word: idx for idx, word in enumerate(vocab_set, start=2)}

    # Check for <PAD> and <UNK> existence and assign them fixed indices if they are present
    if "<PAD>" not in vocab_dict:
        vocab_dict["<PAD>"] = 0  # Index for padding token
    if "<UNK>" not in vocab_dict:
        vocab_dict["<UNK>"] = 1  # Index for unknown token
    
    #add the <PAD> and <UNK> back to the vocab
    vocab_set.add("<PAD>")
    vocab_set.add("<UNK>")
    return vocab_dict


In [14]:
# Prepare Dataset for PyTorch
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, vocab, embedding_matrix, max_len=30):
        self.texts = texts
        self.labels = labels
        self.vocab = build_vocab_dict(vocab)
        self.embedding_matrix = embedding_matrix
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        tokenized_text = word_tokenize(text.lower())
        vectorized_text = self.vectorize(tokenized_text)
        return torch.tensor(vectorized_text), torch.tensor(label)

    def vectorize(self, tokens):
        vectorized = [self.vocab.get(token, self.vocab['<UNK>']) for token in tokens]

        # Check for out-of-range indices
        for index in vectorized:
            if index >= len(self.embedding_matrix):
                raise ValueError(f"Index {index} is out of range for the embedding matrix.")
                
        # Pad or truncate to max_len
        if len(vectorized) < self.max_len:
            vectorized += [self.vocab['<PAD>']] * (self.max_len - len(vectorized))
        else:
            vectorized = vectorized[:self.max_len]
        return vectorized


In [15]:
# Create the RNN Model
class RNNModel(nn.Module):
    def __init__(self, embedding_matrix, hidden_size, output_size, num_layers=2, bidirectional=False):
        super(RNNModel, self).__init__()
        vocab_size, embedding_dim = embedding_matrix.shape
        
        # Define the embedding layer with pretrained embeddings, frozen
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float32), freeze=True)
        
        # Define RNN layer 
        self.rnn = nn.RNN(embedding_dim, hidden_size, num_layers, bidirectional=bidirectional, batch_first=True)
        
        # Output layer
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Forward pass
        x = self.embedding(x)
        out, _ = self.rnn(x)
        # Using Average Pooling
        out = torch.mean(out, dim=1)
        out = self.fc(out)
        return out

In [16]:
# Prepare DataLoader
def create_data_loader(dataset, batch_size):
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

train_dataset_instance = SentimentDataset(train_dataset['text'], train_dataset['label'], vocab, embedding_matrix)
val_dataset_instance = SentimentDataset(validation_dataset['text'], validation_dataset['label'], vocab, embedding_matrix)
test_dataset_instance = SentimentDataset(test_dataset['text'], test_dataset['label'], vocab, embedding_matrix)

In [17]:
def evaluate(model, data_loader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for data, labels in data_loader:
            output = model.forward(data)
            probs = torch.sigmoid(output)  # Apply sigmoid to get probabilities
            predicted = (probs >= 0.5).long()  # Convert probabilities to binary predictions
            all_preds.extend(predicted.cpu().numpy().flatten().tolist())
            all_labels.extend(labels.cpu().numpy().tolist())
    acc = accuracy_score(all_labels, all_preds)
    return acc

In [18]:
# Train and validate function
def train_and_validate(model, train_loader, val_loader, optimizer, criterion, max_epochs=100, convergence_threshold=0.001):
    best_val_acc = 0
    epochs_without_improvement = 0
    
    for epoch in range(max_epochs):
        model.train()
        running_loss = 0
        for data, target in train_loader:
            optimizer.zero_grad()
            output = model(data).squeeze(1)
            loss = criterion(output, target.float())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        
        val_acc = evaluate(model, val_loader)
        print(f"Epoch {epoch+1}/{max_epochs}, Loss: {running_loss/len(train_loader)}, Val Accuracy: {val_acc}")
        
        # Check for improvement
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            epochs_without_improvement = 0  # Reset counter
        else:
            epochs_without_improvement += 1
            
        # Check for convergence
        if epochs_without_improvement >= 10:  # Convergence condition (no improvement for 5 epochs)
            print("Convergence reached, stopping training.")
            break
            
    return best_val_acc, epoch 


In [19]:
# Hyperparameter tuning
learning_rates = [0.001, 0.01]
batch_sizes = [32, 64]
hidden_size = 123
optimizers = ['adam', 'sgd', 'rmsprop']  

embedding_matrix_list = [embedding_matrix[word] for word in vocab]
embedding_matrix_array = np.stack(embedding_matrix_list)
vocab_size, embedding_dim = embedding_matrix_array.shape
output_size = 1  


best_val_acc = 0
best_hyperparams = {}

for lr in learning_rates:
    for bs in batch_sizes:
            for opt in optimizers:
                print("Training with the following hyperparameters:")
                print(f"Learning Rate: {lr}, Batch Size: {bs}, Hidden Size: {hidden_size}, Optimizer: {opt}")
                # Initialize model, criterion
                model = RNNModel(embedding_matrix_array, hidden_size=128, output_size=1)
                criterion = nn.BCEWithLogitsLoss()

                # Initialize optimizer based on the selected type
                if opt == 'adam':
                    optimizer = optim.Adam(model.parameters(), lr=lr)
                elif opt == 'sgd':
                    optimizer = optim.SGD(model.parameters(), lr=lr)
                elif opt == 'rmsprop':
                    optimizer = optim.RMSprop(model.parameters(), lr=lr)

                train_loader = create_data_loader(train_dataset_instance, bs)
                val_loader = create_data_loader(val_dataset_instance, bs)
                
                # Train and validate
                val_acc, epochs_used = train_and_validate(model, train_loader, val_loader, optimizer, criterion)
                print(f"Learning Rate: {lr}, Batch Size: {bs} Optimizer: {opt}, Validation Accuracy: {val_acc}")

                # Update best parameters
                if val_acc > best_val_acc:
                    best_val_acc = val_acc
                    best_hyperparams = {
                        'learning_rate': lr,
                        'batch_size': bs,
                        'optimizer': opt
                    }
                    best_epochs = epochs_used

# Print the best configuration
print(f"Best Model Configuration: {best_hyperparams} with Validation Accuracy: {best_val_acc} over {best_epochs} epochs")

Training with the following hyperparameters:
Learning Rate: 0.001, Batch Size: 32, Hidden Size: 123, Optimizer: adam
Epoch 1/100, Loss: 0.6943659646234263, Val Accuracy: 0.5412757973733584
Epoch 2/100, Loss: 0.6830613117539481, Val Accuracy: 0.549718574108818
Epoch 3/100, Loss: 0.6558597842405798, Val Accuracy: 0.6097560975609756
Epoch 4/100, Loss: 0.635233868597152, Val Accuracy: 0.6341463414634146
Epoch 5/100, Loss: 0.6168298465705543, Val Accuracy: 0.6622889305816135
Epoch 6/100, Loss: 0.6045206906866938, Val Accuracy: 0.6378986866791745
Epoch 7/100, Loss: 0.6005654070484504, Val Accuracy: 0.6388367729831145
Epoch 8/100, Loss: 0.5948330480954174, Val Accuracy: 0.6341463414634146
Epoch 9/100, Loss: 0.5995116915818903, Val Accuracy: 0.649155722326454
Epoch 10/100, Loss: 0.5876737326271971, Val Accuracy: 0.6529080675422139
Epoch 11/100, Loss: 0.590491944819354, Val Accuracy: 0.624765478424015
Epoch 12/100, Loss: 0.5891645145550203, Val Accuracy: 0.6604127579737336
Epoch 13/100, Loss: 0

In [21]:
#Trainig the model with the best hyperparameters
batch_size = 64
lr=0.001
model = RNNModel(embedding_matrix_array, hidden_size=128, output_size=1)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

train_loader = create_data_loader(train_dataset_instance, batch_size)
val_loader = create_data_loader(val_dataset_instance, batch_size)
                
# Train and validate
val_acc, epochs_used = train_and_validate(model, train_loader, val_loader, optimizer, criterion)
print(f"Validation Accuracy: {val_acc}, over {epochs_used} epochs")

Epoch 1/100, Loss: 0.6932030738289676, Val Accuracy: 0.575046904315197
Epoch 2/100, Loss: 0.6585904124067791, Val Accuracy: 0.6313320825515948
Epoch 3/100, Loss: 0.6413114764797154, Val Accuracy: 0.6069418386491557
Epoch 4/100, Loss: 0.6170370576541815, Val Accuracy: 0.6144465290806754
Epoch 5/100, Loss: 0.6095510436527765, Val Accuracy: 0.6538461538461539
Epoch 6/100, Loss: 0.6074308230360942, Val Accuracy: 0.6594746716697936
Epoch 7/100, Loss: 0.5941777089193686, Val Accuracy: 0.6632270168855535
Epoch 8/100, Loss: 0.5925311423949341, Val Accuracy: 0.6594746716697936
Epoch 9/100, Loss: 0.5892993661005106, Val Accuracy: 0.650093808630394
Epoch 10/100, Loss: 0.5830642238481721, Val Accuracy: 0.6538461538461539
Epoch 11/100, Loss: 0.584717085334792, Val Accuracy: 0.6435272045028143
Epoch 12/100, Loss: 0.5767381233510687, Val Accuracy: 0.6697936210131332
Epoch 13/100, Loss: 0.5795326408610415, Val Accuracy: 0.6707317073170732
Epoch 14/100, Loss: 0.5749036865447884, Val Accuracy: 0.6641651

In [23]:
# Step 7: Evaluate on Test Set
test_loader = create_data_loader(test_dataset_instance, batch_size)
test_acc = evaluate(model, test_loader)
print(f"Test Accuracy: {test_acc}")

# Report the configuration
print(f"Final Configuration:\nEpochs: {epochs_used}\nLearning Rate: {lr}\nOptimizer: Adam\nBatch Size: {batch_size}")

Test Accuracy: 0.6472795497185742
Final Configuration:
Epochs: 26
Learning Rate: 0.001
Optimizer: Adam
Batch Size: 64


In [1]:
# Step 8: Get a sample sentence from the test set and predict
import random
# Select a random index from the test dataset
random_index = random.randint(0, len(test_dataset) - 1)

# Get the corresponding sentence and its label from the test dataset
sample_sentence = test_dataset[random_index]['text']  # Assuming the dataset contains a 'text' field
true_label = test_dataset[random_index]['label']  # Assuming there's a label field

# Tokenize the sample sentence
sample_tokens = word_tokenize(sample_sentence.lower())

# Convert tokens to indices
sample_indices = []
for token in sample_tokens:
    if token in vocab:
        sample_indices.append(list(vocab).index(token))
    else:
        sample_indices.append(list(vocab).index("<UNK>"))
sample_tensor = torch.tensor(sample_indices).unsqueeze(0)  # Add batch dimension
# Make prediction using the model
model.eval()  # Set the model to evaluation mode
with torch.no_grad():  # No need to compute gradients during inference
    output = model(sample_tensor)  # Pass the tensor to the model
    _, predicted = torch.max(output, 1)  # Get the index of the max log-probability

# Map predicted index to sentiment label
sentiment_labels = ['negative', 'positive']  # Adjust according to your label encoding
predicted_label = sentiment_labels[predicted.item()]

# Print results
print(f"Sample Sentence: '{sample_sentence}'")
print(f"True Label: {true_label}")
print(f"Predicted Label: {predicted_label}")

NameError: name 'test_dataset' is not defined