## Install and import dependencies

In [1]:
%pip install torch gensim datasets nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import nltk
nltk.download("all")


import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import gensim.downloader as api

from datasets import load_dataset
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset, DataLoader


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/mythilimulani/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/mythilimulani/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/mythilimulani/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /Users/mythilimulani/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/mythilimulani/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagg

## Part 0. Dataset Preparation

In [3]:
from datasets import load_dataset
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset['train'] 
validation_dataset = dataset['validation']
test_dataset = dataset['test']

### Dataset Exploration

In [4]:
#Number of sentences in each set 
print(f"Size of training set: {train_dataset.num_rows} sentences")
print(f"Size of validation set: {validation_dataset.num_rows} sentences")
print(f"Size of test set: {test_dataset.num_rows} sentences")

Size of training set: 8530 sentences
Size of validation set: 1066 sentences
Size of test set: 1066 sentences


In [5]:
print(f"Sample sentence from train dataset: {test_dataset[0]['text']}")
print(f"Label: {'Positive' if test_dataset[0]['label'] == 1 else 'Negative'}")

Sample sentence from train dataset: lovingly photographed in the manner of a golden book sprung to life , stuart little 2 manages sweetness largely without stickiness .
Label: Positive


## Part 1. Preparing Word Embeddings

### Question 1 Word Embedding

#### (a) What is the size of the vocabulary formed in your training data

In [6]:
#tokenize sentences 
train_tokenized = []
for sentence in train_dataset['text']:
    train_tokenized.append(word_tokenize(sentence.lower()))

print('sample sentence:', train_tokenized[0],'\n')

#build vocabulary
vocab = {"<PAD>", "<UNK>"} #include a padding and unknown token for future processing
vocab.update(word for sentence in train_tokenized for word in sentence)

print("Number of words in the vocabulary(including padding and unknown tokens):", len(vocab))
print("Number of words in the vocabulary:" , len(vocab)-2)


sample sentence: ['the', 'rock', 'is', 'destined', 'to', 'be', 'the', '21st', 'century', "'s", 'new', '``', 'conan', '``', 'and', 'that', 'he', "'s", 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', ',', 'jean-claud', 'van', 'damme', 'or', 'steven', 'segal', '.'] 

Number of words in the vocabulary(including padding and unknown tokens): 18031
Number of words in the vocabulary: 18029


#### (b) We use OOV (out-of-vocabulary) to refer to those words appeared in the training data but not in the Word2vec (or Glove) dictionary. How many OOV words exist in your training data?

#### (c) The existence of the OOV words is one of the well-known limitations of Word2vec (or Glove). Without using any transformer-based language models (e.g., BERT, GPT, T5), what do you think is the best strategy to mitigate such limitation? Implement your solution in your source code. Show the corresponding code snippet.

In [7]:
# Load pretrained Word2Vec model (Google News Word2Vec)
word2vec = api.load('word2vec-google-news-300')

# Set embedding size
embedding_size = 300

# Initialize the embedding matrix with zeros for padding and random values for unknown tokens
embedding_matrix = {}

# Create an <UNK> token embedding as a random vector
unk_vector = np.random.uniform(-0.25, 0.25, embedding_size)
embedding_matrix["<UNK>"] = unk_vector

# Create a <PAD> token embedding as a zero vector
pad_vector = np.zeros(embedding_size)
embedding_matrix["<PAD>"] = pad_vector

# Initialize OOV counter
oov_count = 0

# Iterate over the vocabulary
for word in vocab:
    if word == "<PAD>" or word == "<UNK>":
        continue  
    
    if word in word2vec:  # If the word is in Word2Vec, add its embedding
        embedding_matrix[word] = word2vec[word]
    else:
        # If the word is OOV, assign it the <UNK> vector and count as OOV
        embedding_matrix[word] = unk_vector  # Assign OOV words the <UNK> vector
        oov_count += 1  # Increment OOV counter

# Print results for Word2Vec
print(f"Number of OOV words with Word2Vec: {oov_count}")
print(f"Embedding for <PAD>: {embedding_matrix['<PAD>']}")
print(f"Embedding for <UNK>: {embedding_matrix['<UNK>']}")


Number of OOV words with Word2Vec: 3612
Embedding for <PAD>: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Embedding for <UNK>: [ 0.

In [8]:
# Load pretrained FastText model (wiki-news-300d-subword)
fasttext_model = api.load('fasttext-wiki-news-subwords-300')

# Set embedding size
embedding_size = 300

# Initialize the embedding matrix with zeros for padding and random values for unknown tokens
embedding_matrix = {}

# Create an <UNK> token embedding as a random vector
unk_vector = np.random.uniform(-0.25, 0.25, embedding_size)
embedding_matrix["<UNK>"] = unk_vector

# Create a <PAD> token embedding as a zero vector
pad_vector = np.zeros(embedding_size)
embedding_matrix["<PAD>"] = pad_vector

# Initialize OOV counter for FastText
oov_count_fasttext = 0

# Iterate over the vocabulary
for word in vocab:
    if word == "<PAD>" or word == "<UNK>":
        continue  
    
    try:
        # Try to get the word vector using FastText's subword handling
        embedding_matrix[word] = fasttext_model.get_vector(word)
    except KeyError:
        # If the word can't be processed even by FastText, assign it the <UNK> vector
        embedding_matrix[word] = unk_vector
        oov_count_fasttext += 1  # Increment OOV count

# Print results for FastText
print(f"Number of OOV words with FastText: {oov_count_fasttext}")
print(f"Embedding for <PAD>: {embedding_matrix['<PAD>']}")
print(f"Embedding for <UNK>: {embedding_matrix['<UNK>']}")

Number of OOV words with FastText: 1961
Embedding for <PAD>: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Embedding for <UNK>: [ 1.

In [9]:
#store the embeddings so that they can be used later
np.save("embedding_matrix.npy", embedding_matrix)


## Part 3 
# 3.Keeping the above two adjustments, replace your simple RNN model in Part 2 wioth a biLSTM model and biGRU model.

# biLSTM Model

In [385]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score

In [386]:
embedding_matrix_33=np.load("embedding_matrix.npy",allow_pickle='TRUE').item()
display(embedding_matrix_33)

{'<UNK>': array([ 1.46499299e-01, -7.41628753e-02, -1.06896044e-01, -3.95719798e-02,
         1.63726991e-01, -2.22449625e-01,  5.56402646e-03, -1.05855226e-01,
        -1.82631474e-01,  1.45761976e-01,  2.05889201e-01, -1.25959589e-01,
        -1.00601347e-01,  3.36342349e-03,  6.40261757e-02, -1.22160286e-01,
        -2.33354254e-01, -7.47514722e-02,  1.80142836e-01,  9.00434538e-02,
         2.19750715e-01,  5.79411526e-02,  1.94438483e-01, -1.24174975e-01,
        -1.76645771e-01, -2.96569862e-02, -1.33702138e-01,  2.03971424e-01,
        -2.40263069e-01, -4.97999858e-02,  9.63215207e-03,  4.89669880e-02,
         3.30706948e-02,  1.30805832e-01,  9.92367302e-02, -8.16547394e-02,
        -4.90370673e-02, -1.52256222e-01,  2.06895209e-02, -2.11391683e-01,
        -1.82143324e-01,  1.96268508e-04, -1.68306934e-01, -5.48427657e-02,
        -1.84294121e-01, -6.07469084e-02, -5.37062527e-02,  2.29149777e-01,
         1.26175731e-02, -2.25514634e-02,  1.43933964e-01, -1.50017508e-01,
   

preparing the train dataset. text -> word index

In [387]:
class SentimentDataset_33(Dataset):
    def __init__(self, texts : list[str], labels : list[int], vocab : set, embedding_matrix : dict, max_len=30):
        self.texts : list[str] = texts
        self.labels : list[int] = labels
        self.vocab : dict = self.build_vocab_dict(vocab)  # function to build vocabulary
        self.embedding_matrix : dict = embedding_matrix
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        tokenized_text = word_tokenize(text.lower())
        vectorized_text = self.vectorize(tokenized_text)
        return torch.tensor(vectorized_text, dtype=torch.long), torch.tensor(label, dtype=torch.float)

    def vectorize(self, tokens):
        # Convert tokens to their corresponding index in the vocabulary
        vectorized = [self.vocab.get(token, self.vocab['<UNK>']) for token in tokens]

        # Pad or truncate to max_len
        if len(vectorized) < self.max_len:
            vectorized += [self.vocab['<PAD>']] * (self.max_len - len(vectorized))
        else:
            vectorized = vectorized[:self.max_len]
        return vectorized

    def build_vocab_dict(self, vocab : set):
        if "<PAD>" in vocab : vocab.remove("<PAD>")
        if "<UNK>" in vocab : vocab.remove("<UNK>")
        vocab_dict = {word: idx for idx, word in enumerate(vocab)}
        print(len(vocab_dict))
        vocab_dict['<PAD>'] = len(vocab_dict) # Add padding token
        print(len(vocab_dict))
        vocab_dict['<UNK>'] = len(vocab_dict) # Add unknown token
        print(len(vocab_dict))
        return vocab_dict


make the bilstm

In [388]:
class SentimentBiLSTM(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim, dropout_rate):
        super(SentimentBiLSTM, self).__init__()
        # Load pre-trained embeddings
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.lstm = nn.LSTM(embedding_matrix.size(1), hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        # print("LSTM out: ", lstm_out)
        # Get the final forward and backward hidden states
        out = torch.cat((lstm_out[:, -1, :self.lstm.hidden_size], lstm_out[:, 0, self.lstm.hidden_size:]), dim=1)
        # print(out)
        out = self.dropout(out)
        # return self.sigmoid(self.fc(out))
        return self.fc(out)

start making the rnn

training function

In [389]:

def train_33(model, iterator):
    model.train()
    epoch_loss = 0
    for X_batch, y_batch in iterator:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)  # Move to GPU
        optimizer.zero_grad()
        output = model(X_batch).squeeze(1)
        loss = criterion(output, y_batch.float())
        loss.backward()
        # for param in model.parameters():
        #     if param.grad is not None:  # Ensure the gradient is not None
        #         print(f"Gradient norm for {param.shape}: {param.grad.data.norm()}")
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)


evaluation function

In [390]:
def evaluate_33(model, iterator):
    model.eval()
    epoch_loss = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for X_batch, y_batch in iterator:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)  # Move to GPU
            output = model(X_batch).squeeze(1)
            probs = model.sigmoid(output)
            prediction = (probs >= 0.5).float()
            
            loss = criterion(output, y_batch.float())
            epoch_loss += loss.item()
            all_preds.extend(prediction.tolist())
            all_labels.extend(y_batch.tolist())
    accuracy = accuracy_score(all_labels, all_preds)
    return accuracy, epoch_loss / len(iterator)


train and validate function

In [393]:
def train_and_validate_33(num_epochs, model, train_iterator, valid_iterator):
    best_val_acc = 0
    for epoch in range(num_epochs):
        train_loss = train_33(model, train_iterator)
        accuracy , valid_loss = evaluate_33(model, valid_iterator)
        print(f'Epoch {epoch + 1}: Train Loss = {train_loss:.3f}, Accuracy = {accuracy:.3f}, Val Loss = {valid_loss:.3f}')

        if accuracy > best_val_acc:
            best_val_acc = accuracy
            epochs_without_improvement = 0  # Reset counter
        else:
            epochs_without_improvement += 1
            
        # Check for convergence
        if epochs_without_improvement >= 10:  # Convergence condition (no improvement for 5 epochs)
            print("Convergence reached, stopping training.")
            break

create model and run the train loop

In [351]:
# Prepare your datasets
# train_texts_33 : list[str] = train_dataset['text']  # List of training texts
# train_labels_33 : list[int] = train_dataset['label']  # Corresponding labels for training texts
train_texts_33 : list[str] = train_dataset['text']  # List of training texts
train_labels_33 : list[int] = train_dataset['label']  # Corresponding labels for training texts
valid_texts_33 : list[str]= validation_dataset['text']  # List of validation texts
valid_labels_33 : list[int] = validation_dataset['label']  # Corresponding labels for validation texts
vocab_33 : set = vocab  # Your vocabulary list
embedding_matrix_33 : dict[ str , np.ndarray]= np.load("embedding_matrix.npy",allow_pickle='TRUE').item()
embedding_matrix_values = np.array(list(embedding_matrix_33.values()), dtype=np.float32)
embedding_matrix_tensor = torch.tensor(embedding_matrix_values, dtype=torch.float32)
#embedding_layer = nn.Embedding.from_pretrained(embedding_matrix_tensor, freeze=True)

hidden_dim = 128  # Adjust as needed
output_dim = 1  # Binary sentiment classification


# Create dataset instances
train_dataset_33 : SentimentDataset_33 = SentimentDataset_33(train_texts_33, train_labels_33, vocab_33, embedding_matrix_33)
valid_dataset_33 : SentimentDataset_33 = SentimentDataset_33(valid_texts_33, valid_labels_33, vocab_33, embedding_matrix_33)

# Create data loaders
train_iterator_33 = DataLoader(train_dataset_33, batch_size=32, shuffle=True)
valid_iterator_33 = DataLoader(valid_dataset_33, batch_size=32, shuffle=False)


18029
18030
18031
18029
18030
18031


In [352]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

In [353]:
model_33 = SentimentBiLSTM(embedding_matrix_tensor, hidden_dim, output_dim).to(device)
criterion = nn.BCEWithLogitsLoss()


In [354]:
optimizer = optim.Adam(model_33.parameters(), lr=0.001)

lr 0.001 
Epoch 1: Train Loss = 0.689, Accuracy = 0.532, Val Loss = 0.689
Epoch 2: Train Loss = 0.681, Accuracy = 0.541, Val Loss = 0.685
Epoch 3: Train Loss = 0.673, Accuracy = 0.563, Val Loss = 0.677
Epoch 4: Train Loss = 0.656, Accuracy = 0.574, Val Loss = 0.676
Epoch 5: Train Loss = 0.638, Accuracy = 0.588, Val Loss = 0.688
lr 0.005
Epoch 1: Train Loss = 0.675, Accuracy = 0.574, Val Loss = 0.682
Epoch 2: Train Loss = 0.654, Accuracy = 0.581, Val Loss = 0.672
Epoch 3: Train Loss = 0.643, Accuracy = 0.541, Val Loss = 0.686
Epoch 4: Train Loss = 0.613, Accuracy = 0.586, Val Loss = 0.695
Epoch 5: Train Loss = 0.569, Accuracy = 0.557, Val Loss = 0.741

gpu
Epoch 1: Train Loss = 0.702, Accuracy = 0.491, Val Loss = 0.694
Epoch 2: Train Loss = 0.699, Accuracy = 0.520, Val Loss = 0.693
Epoch 3: Train Loss = 0.694, Accuracy = 0.535, Val Loss = 0.691
Epoch 4: Train Loss = 0.694, Accuracy = 0.537, Val Loss = 0.691
Epoch 5: Train Loss = 0.690, Accuracy = 0.535, Val Loss = 0.688
Epoch 6: Train Loss = 0.688, Accuracy = 0.535, Val Loss = 0.691
Epoch 7: Train Loss = 0.693, Accuracy = 0.525, Val Loss = 0.694
Epoch 8: Train Loss = 0.685, Accuracy = 0.506, Val Loss = 0.701
Epoch 9: Train Loss = 0.685, Accuracy = 0.539, Val Loss = 0.689
Epoch 10: Train Loss = 0.682, Accuracy = 0.542, Val Loss = 0.691
Epoch 11: Train Loss = 0.681, Accuracy = 0.548, Val Loss = 0.688
Epoch 12: Train Loss = 0.678, Accuracy = 0.550, Val Loss = 0.692
Epoch 13: Train Loss = 0.674, Accuracy = 0.556, Val Loss = 0.690
Epoch 14: Train Loss = 0.672, Accuracy = 0.545, Val Loss = 0.692
Epoch 15: Train Loss = 0.676, Accuracy = 0.519, Val Loss = 0.697
Epoch 16: Train Loss = 0.684, Accuracy = 0.530, Val Loss = 0.696
Epoch 17: Train Loss = 0.673, Accuracy = 0.543, Val Loss = 0.696
Epoch 18: Train Loss = 0.670, Accuracy = 0.531, Val Loss = 0.694
Epoch 19: Train Loss = 0.670, Accuracy = 0.534, Val Loss = 0.701
Epoch 20: Train Loss = 0.668, Accuracy = 0.543, Val Loss = 0.709
Epoch 21: Train Loss = 0.679, Accuracy = 0.538, Val Loss = 0.699
Epoch 22: Train Loss = 0.670, Accuracy = 0.537, Val Loss = 0.697
Epoch 23: Train Loss = 0.669, Accuracy = 0.534, Val Loss = 0.705
Epoch 24: Train Loss = 0.663, Accuracy = 0.533, Val Loss = 0.705
Epoch 25: Train Loss = 0.663, Accuracy = 0.544, Val Loss = 0.707


gpu, unfroze embeddings
Epoch 1: Train Loss = 0.707, Accuracy = 0.645, Val Loss = 0.625
Epoch 2: Train Loss = 0.427, Accuracy = 0.753, Val Loss = 0.528
Epoch 3: Train Loss = 0.136, Accuracy = 0.736, Val Loss = 0.742
Epoch 4: Train Loss = 0.046, Accuracy = 0.723, Val Loss = 1.024
Epoch 5: Train Loss = 0.020, Accuracy = 0.727, Val Loss = 1.312
Epoch 6: Train Loss = 0.013, Accuracy = 0.720, Val Loss = 1.405
Epoch 7: Train Loss = 0.020, Accuracy = 0.741, Val Loss = 1.334
Epoch 8: Train Loss = 0.009, Accuracy = 0.736, Val Loss = 1.682
Epoch 9: Train Loss = 0.009, Accuracy = 0.729, Val Loss = 1.624
Epoch 10: Train Loss = 0.005, Accuracy = 0.743, Val Loss = 1.735
Epoch 11: Train Loss = 0.003, Accuracy = 0.730, Val Loss = 1.939
Epoch 12: Train Loss = 0.011, Accuracy = 0.728, Val Loss = 1.797
Epoch 13: Train Loss = 0.006, Accuracy = 0.733, Val Loss = 1.899
Epoch 14: Train Loss = 0.003, Accuracy = 0.725, Val Loss = 2.376
Epoch 15: Train Loss = 0.006, Accuracy = 0.729, Val Loss = 2.139
Epoch 16: Train Loss = 0.003, Accuracy = 0.744, Val Loss = 2.200
Epoch 17: Train Loss = 0.001, Accuracy = 0.735, Val Loss = 2.331
Epoch 18: Train Loss = 0.008, Accuracy = 0.723, Val Loss = 2.101
Epoch 19: Train Loss = 0.011, Accuracy = 0.724, Val Loss = 2.356
Epoch 20: Train Loss = 0.016, Accuracy = 0.726, Val Loss = 2.071
Epoch 21: Train Loss = 0.010, Accuracy = 0.725, Val Loss = 2.190
Epoch 22: Train Loss = 0.011, Accuracy = 0.726, Val Loss = 2.064
Epoch 23: Train Loss = 0.005, Accuracy = 0.729, Val Loss = 2.379
Epoch 24: Train Loss = 0.003, Accuracy = 0.720, Val Loss = 2.873
Epoch 25: Train Loss = 0.001, Accuracy = 0.715, Val Loss = 2.772

same thing with lesser hiddenlayers
Epoch 1: Train Loss = 0.660, Accuracy = 0.712, Val Loss = 0.569
Epoch 2: Train Loss = 0.379, Accuracy = 0.744, Val Loss = 0.518
Epoch 3: Train Loss = 0.170, Accuracy = 0.746, Val Loss = 0.686
Epoch 4: Train Loss = 0.082, Accuracy = 0.755, Val Loss = 0.849

Training with hidden_dim=64, lr=0.001, dropout_rate=0.3, batch_size=64



In [355]:
# Now you can run your training loop
train_and_validate_33(4, model_33, train_iterator_33, valid_iterator_33)

Epoch 1: Train Loss = 0.660, Accuracy = 0.712, Val Loss = 0.569
Epoch 2: Train Loss = 0.379, Accuracy = 0.744, Val Loss = 0.518
Epoch 3: Train Loss = 0.170, Accuracy = 0.746, Val Loss = 0.686
Epoch 4: Train Loss = 0.082, Accuracy = 0.755, Val Loss = 0.849


In [356]:
# Converting the Vocab set to dictionary 
def build_vocab_dict(vocab_set):
    # Create the vocabulary dictionary without <PAD> and <UNK>
    vocab_set.discard("<PAD>")
    vocab_set.discard("<UNK>")
    vocab_dict = {word: idx for idx, word in enumerate(vocab_set, start=2)}

    # Check for <PAD> and <UNK> existence and assign them fixed indices if they are present
    if "<PAD>" not in vocab_dict:
        vocab_dict["<PAD>"] = 0  # Index for padding token
    if "<UNK>" not in vocab_dict:
        vocab_dict["<UNK>"] = 1  # Index for unknown token
    
    #add the <PAD> and <UNK> back to the vocab
    vocab_set.add("<PAD>")
    vocab_set.add("<UNK>")
    return vocab_dict


test

In [384]:
# Step 8: Get a sample sentence from the test set and predict
import random
# Select a random index from the test dataset
random_index = random.randint(0, len(test_dataset) - 1)

# Get the corresponding sentence and its label from the test dataset
sample_sentence = test_dataset[random_index]['text']  # Assuming the dataset contains a 'text' field
true_label = test_dataset[random_index]['label']  # Assuming there's a label field
# Tokenize the sample sentence
sample_tokens = word_tokenize(sample_sentence.lower())

vocab_33 = build_vocab_dict(vocab)
# Convert tokens to indices
sample_indices = []
for token in sample_tokens:
    if token in vocab:
        sample_indices.append(vocab_33[token])
    else:
        sample_indices.append(vocab_33['<UNK>'])

sample_tensor = torch.tensor(sample_indices).unsqueeze(0)  # Add batch dimension
sample_tensor = sample_tensor.to(device)  # Move to GPU if available
# Make prediction using the model
model_33.eval()  # Set the model to evaluation mode
with torch.no_grad():  # No need to compute gradients during inference
    output = model_33(sample_tensor)  # Pass the tensor to the model
    print(output)
    probs = model_33.sigmoid(output)
    predicted = (probs >= 0.5)
    print(predicted.item())

# Map predicted index to sentiment label
sentiment_labels = ['negative', 'positive']  # Adjust according to your label encoding
predicted_label = sentiment_labels[predicted]

# Print results
print(f"Sample Sentence: '{sample_sentence}'")
print(f"True Label: {sentiment_labels[true_label]}")
print(f"Predicted Label: {predicted_label}")

tensor([[3.5031]], device='mps:0')
True
Sample Sentence: 'there's something to be said for a studio-produced film that never bothers to hand viewers a suitcase full of easy answers .'
True Label: positive
Predicted Label: positive


In [394]:
import itertools

# Define the hyper-parameter grid
hidden_dims = [64, 128 ,256]
learning_rates = [0.001, 0.005]
dropout_rates = [0.3, 0.5]
batch_sizes = [32, 64]

# Iterate over all combinations of hyper-parameters
for hidden_dim, lr, dropout_rate, bs in itertools.product(hidden_dims, learning_rates, dropout_rates, batch_sizes):
    print(f'Training with hidden_dim={hidden_dim}, lr={lr}, dropout_rate={dropout_rate}, batch_size={bs}')
    
    model_33 = SentimentBiLSTM(embedding_matrix_tensor, hidden_dim, output_dim, dropout_rate).to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model_33.parameters(), lr=lr)

    train_iterator_33 = DataLoader(train_dataset_33, bs)
    valid_iterator_33 = DataLoader(valid_dataset_33, bs)

    train_and_validate_33(25, model_33, train_iterator_33, valid_iterator_33)


Training with hidden_dim=64, lr=0.001, dropout_rate=0.3, batch_size=32
Epoch 1: Train Loss = 0.435, Accuracy = 0.500, Val Loss = 1.988
Epoch 2: Train Loss = 0.758, Accuracy = 0.500, Val Loss = 1.149
Epoch 3: Train Loss = 0.732, Accuracy = 0.500, Val Loss = 0.838
Epoch 4: Train Loss = 0.525, Accuracy = 0.500, Val Loss = 2.096
Epoch 5: Train Loss = 0.455, Accuracy = 0.500, Val Loss = 1.882
Epoch 6: Train Loss = 0.604, Accuracy = 0.500, Val Loss = 1.509
Epoch 7: Train Loss = 0.536, Accuracy = 0.500, Val Loss = 1.555
Epoch 8: Train Loss = 0.452, Accuracy = 0.520, Val Loss = 1.196
Epoch 9: Train Loss = 0.317, Accuracy = 0.568, Val Loss = 1.205
Epoch 10: Train Loss = 0.231, Accuracy = 0.604, Val Loss = 1.205
Epoch 11: Train Loss = 0.176, Accuracy = 0.626, Val Loss = 1.264
Epoch 12: Train Loss = 0.135, Accuracy = 0.637, Val Loss = 1.347
Epoch 13: Train Loss = 0.101, Accuracy = 0.648, Val Loss = 1.426
Epoch 14: Train Loss = 0.078, Accuracy = 0.662, Val Loss = 1.519
Epoch 15: Train Loss = 0.061

# biGRU Model

In [399]:
class SentimentBiGRU(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim, dropout_rate):
        super(SentimentBiGRU, self).__init__()
        
        # Using pretrained embeddings
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)  # Freeze if you don't want to update
        self.hidden_dim = hidden_dim
        
        # Bidirectional GRU layer
        self.gru = nn.GRU(embedding_matrix.size(1), hidden_dim, num_layers=2, bidirectional=True, batch_first=True)
        
        # Fully connected output layer
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Output size is 2 * hidden_dim for bidirectional
        self.dropout = nn.Dropout(dropout_rate)  # Dropout layer for regularization
        self.sigmoid = nn.Sigmoid()  # Sigmoid activation for binary classification
        
        # Custom weight initialization (optional)
    #     self.init_weights()

    # def init_weights(self):
    #     for name, param in self.gru.named_parameters():
    #         if 'weight' in name:
    #             nn.init.xavier_normal_(param)  # Xavier initialization for weights
    #         elif 'bias' in name:
    #             nn.init.constant_(param, 0)  # Zero initialization for biases

    def forward(self, x):
        # Embedding layer
        embedded = self.embedding(x)  # Shape: [batch_size, sequence_length, embedding_dim]

        # Pass through GRU layer
        gru_out, _ = self.gru(embedded)  # gru_out shape: [batch_size, sequence_length, hidden_dim * 2]
        
        # Concatenate last hidden states from forward and backward GRUs
        out = torch.cat((gru_out[:, -1, :self.hidden_dim], gru_out[:, 0, self.hidden_dim:]), dim=1)  # Shape: [batch_size, hidden_dim * 2]
        
        out = self.dropout(out)  # Apply dropout
        return self.fc(out)  # Final output

In [396]:
train_texts_33_gru : list[str] = train_dataset['text']  # List of training texts
train_labels_33_gru : list[int] = train_dataset['label']  # Corresponding labels for training texts
valid_texts_33_gru : list[str]= validation_dataset['text']  # List of validation texts
valid_labels_33_gru : list[int] = validation_dataset['label']  # Corresponding labels for validation texts
vocab_33_gru : set = vocab  # Your vocabulary list
embedding_matrix_33_gru : dict[ str , np.ndarray]= np.load("embedding_matrix.npy",allow_pickle='TRUE').item()
embedding_matrix_values_gru = np.array(list(embedding_matrix_33_gru.values()), dtype=np.float32)
embedding_matrix_tensor_gru = torch.tensor(embedding_matrix_values_gru, dtype=torch.float32)
#embedding_layer = nn.Embedding.from_pretrained(embedding_matrix_tensor, freeze=True)

hidden_dim_gru = 128  # Adjust as needed
output_dim_gru = 1  # Binary sentiment classification


# Create dataset instances
train_dataset_33_gru : SentimentDataset_33 = SentimentDataset_33(train_texts_33_gru, train_labels_33_gru, vocab_33_gru, embedding_matrix_33_gru)
valid_dataset_33_gru : SentimentDataset_33 = SentimentDataset_33(valid_texts_33_gru, valid_labels_33_gru, vocab_33_gru, embedding_matrix_33_gru)

# Create data loaders
train_iterator_33_gru = DataLoader(train_dataset_33_gru, batch_size=32, shuffle=True)
valid_iterator_33_gru = DataLoader(valid_dataset_33_gru, batch_size=32, shuffle=False)


18029
18030
18031
18029
18030
18031


In [404]:
criterion = nn.BCEWithLogitsLoss()
model_33_gru = SentimentBiGRU(embedding_matrix_tensor_gru, hidden_dim_gru, output_dim_gru, dropout_rate=0.5).to(device)
optimizer_gru = optim.Adam(model_33_gru.parameters(), lr=0.001)

In [405]:
# Now you can run your training loop
train_and_validate_33(5, model_33_gru, train_iterator_33_gru, valid_iterator_33_gru)

Epoch 1: Train Loss = 0.693, Accuracy = 0.531, Val Loss = 0.693
Epoch 2: Train Loss = 0.694, Accuracy = 0.531, Val Loss = 0.693
Epoch 3: Train Loss = 0.693, Accuracy = 0.531, Val Loss = 0.693
Epoch 4: Train Loss = 0.694, Accuracy = 0.531, Val Loss = 0.693
Epoch 5: Train Loss = 0.694, Accuracy = 0.531, Val Loss = 0.693


In [None]:
import itertools

# Define the hyper-parameter grid
hidden_dims = [64, 128 ,256]
learning_rates = [0.001, 0.005]
dropout_rates = [0.3, 0.5]
batch_sizes = [32, 64]

# Iterate over all combinations of hyper-parameters
for hidden_dim, lr, dropout_rate, bs in itertools.product(hidden_dims, learning_rates, dropout_rates, batch_sizes):
    print(f'Training with hidden_dim={hidden_dim}, lr={lr}, dropout_rate={dropout_rate}, batch_size={bs}')
    
    model_33 = SentimentBiGRU(embedding_matrix_tensor, hidden_dim, output_dim, dropout_rate).to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model_33.parameters(), lr=lr)

    train_iterator_33 = DataLoader(train_dataset_33, bs)
    valid_iterator_33 = DataLoader(valid_dataset_33, bs)

    train_and_validate_33(25, model_33, train_iterator_33, valid_iterator_33)