# **Part 2**

In [None]:
!pip install datasets

In [None]:
# Colab only: mount Google Drive and make the course folder the working directory.
from google.colab import drive
drive.mount('/content/drive')
# Relative paths used later in the notebook (e.g. the *.pth checkpoint files) resolve against this folder.
%cd /content/drive/MyDrive/SC4002 Natural Language Processing



In [None]:
from datasets import load_dataset

In [None]:
!git clone https://github.com/AkhiAcharya/SC4002-NLP-Project.git


In [None]:
import sys
# Make the project checkout importable (presumably where utils.py and RNN.py live -- confirm).
sys.path.append('/content/SC4002-NLP-Project')

In [None]:
# Fetch the Rotten Tomatoes corpus and keep a handle on each of its three splits.
dataset = load_dataset("rotten_tomatoes")
train_dataset, validation_dataset, test_dataset = (
    dataset['train'],
    dataset['validation'],
    dataset['test'],
)

In [None]:
# Preview the first 15 test examples, first as a DataFrame ...
print(test_dataset.to_pandas().head(15))

print(test_dataset[:15])  # View the first 15 rows as a dictionary


In [None]:
import os
import gensim.downloader as api
from gensim.models import KeyedVectors

def save_model_to_drive(model_name):
    """Download a pretrained embedding model and store it under the Drive models folder.

    Args:
        model_name (str): Either 'word2vec' or 'glove'.

    Returns:
        The downloaded embedding model (also saved as
        /content/drive/MyDrive/models/<model_name>.model).

    Raises:
        ValueError: If model_name is not one of the supported names.
    """
    # Validate before doing any work: an unsupported name used to fall through
    # both branches and crash on the unbound `model` at the return statement.
    if model_name not in ('word2vec', 'glove'):
        raise ValueError("model_name must be either 'word2vec' or 'glove'")

    import gensim.downloader as api
    from gensim.models import KeyedVectors
    import os

    save_path = '/content/drive/MyDrive/models'
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(save_path, exist_ok=True)

    if model_name == 'word2vec':
        # return_path=True downloads the archive and hands back its location,
        # which is then parsed as a binary word2vec file.
        path = api.load("word2vec-google-news-300", return_path=True)
        model = KeyedVectors.load_word2vec_format(path, binary=True)
        model.save(f'{save_path}/word2vec.model')
        print("Word2Vec model saved to Google Drive successfully!")
    else:  # 'glove'
        model = api.load("glove-wiki-gigaword-300")
        model.save(f'{save_path}/glove.model')
        print("GloVe model saved to Google Drive successfully!")

    return model

def load_model_from_drive(model_name):
    """Return the embedding model cached on Drive, downloading it first if absent.

    Looks for /content/drive/MyDrive/models/<model_name>.model; when the file is
    missing, delegates to save_model_to_drive, which downloads and saves it.
    """
    from gensim.models import KeyedVectors
    import os

    model_path = f'/content/drive/MyDrive/models/{model_name}.model'

    # Guard clause: nothing cached yet, so fetch and persist the model.
    if not os.path.exists(model_path):
        print(f"{model_name.capitalize()} model not found in Drive. Downloading and saving...")
        return save_model_to_drive(model_name)

    cached_model = KeyedVectors.load(model_path)
    print(f"{model_name.capitalize()} model loaded from Drive successfully!")
    return cached_model

def get_model(model_name):
    """Load 'word2vec' or 'glove', reporting (not raising) any loading failure.

    Raises:
        ValueError: If model_name is not 'word2vec' or 'glove'.

    Returns:
        The loaded model, or None when loading raised an exception
        (the error is printed).
    """
    supported_names = ['word2vec', 'glove']
    if not (model_name in supported_names):
        raise ValueError("model_name must be either 'word2vec' or 'glove'")

    loaded = None
    try:
        loaded = load_model_from_drive(model_name)
    except Exception as e:
        print(f"Error loading {model_name} model: {str(e)}")
    return loaded

# Load both pretrained embedding models (read from Drive when cached there, downloaded otherwise).
# NOTE: get_model returns None when loading fails, so a None here means the error printed above must be fixed first.
word2vec_model = get_model('word2vec')

glove_model = get_model('glove')

In [None]:
import os
import gensim.downloader as api
import pickle

def load_and_save_fasttext_model(drive_path="/content/drive/MyDrive/models/fasttext_model.model"):
    """
    Load the FastText model for OOV handling. Save the model to Google Drive if it does not exist.

    Args:
        drive_path (str): Path in Google Drive to save/load the FastText model.

    Returns:
        model: Loaded FastText model.
    """
    # Local import: a model saved with .save() must be read back with KeyedVectors.load;
    # gensim's api.load only accepts gensim-data dataset names, not file paths.
    from gensim.models import KeyedVectors

    # Check if the model already exists in Google Drive
    if os.path.exists(drive_path):
        print("Loading FastText model from Drive...")
        fasttext_model = KeyedVectors.load(drive_path)
        print("FastText model loaded from Drive successfully.")
    else:
        print("Downloading FastText model...")
        # Load FastText model from gensim
        fasttext_model = api.load("fasttext-wiki-news-subwords-300")
        print("FastText model loaded successfully.")

        # Persist it so later sessions take the fast path above.
        target_dir = os.path.dirname(drive_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        fasttext_model.save(drive_path)
        print("FastText model saved to Drive successfully.")

    return fasttext_model


In [None]:
import numpy as np
# Load FastText model for OOV handling (the helper prints its own progress messages as well)
fasttext_model = load_and_save_fasttext_model()
print("FastText model loaded successfully.")

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from collections import Counter
import re

In [None]:
from utils import SentimentDataset
from utils import get_device

# Device picked by the project's get_device helper; batches are moved to it in the training/evaluation cells.
device = get_device()

In [None]:
# Sanity check: show the split's type and what a single example looks like.
print(type(train_dataset))
print(train_dataset[0])

In [None]:
from utils import get_embedding

# Shared word -> vector dictionary handed to every SentimentDataset built below.
word_to_vector_map = {}


In [None]:
def prepare_data(dataset):
    """Split an iterable of examples into parallel lists of texts and labels.

    Each example is indexed with the keys 'text' and 'label'.

    Returns:
        tuple: (texts, labels), both lists in dataset order.
    """
    texts = []
    for row in dataset:
        texts.append(row['text'])

    labels = []
    for row in dataset:
        labels.append(row['label'])

    return texts, labels

# Pull raw texts and labels out of each split.
train_texts, train_labels = prepare_data(train_dataset)
val_texts, val_labels = prepare_data(validation_dataset)
test_texts, test_labels = prepare_data(test_dataset)

# Labels as NumPy arrays.
train_labels = np.array(train_labels)
val_labels = np.array(val_labels)
test_labels = np.array(test_labels)

# NOTE: train_dataset is rebound here from the raw split to a SentimentDataset, so
# re-running this cell would feed the wrapped dataset back into prepare_data above;
# re-run the dataset-loading cell first.
train_dataset = SentimentDataset(
    train_texts, train_labels, glove_model, word_to_vector_map, get_embedding
)

val_dataset = SentimentDataset(
    val_texts, val_labels, glove_model, word_to_vector_map, get_embedding
)

In [None]:
# Wrap the test split the same way as train/validation (this also rebinds test_dataset).
test_dataset = SentimentDataset(
    test_texts, test_labels, glove_model, word_to_vector_map, get_embedding
)

# Initial loaders; only the training loader shuffles.
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# 2a) Report the final configuration of your best model, namely the number of training epochs, learning rate, optimizer, batch size.

In [None]:
import importlib
import RNN


In [None]:
# Re-execute the RNN module so source edits take effect without restarting the kernel.
importlib.reload(RNN)



In [None]:
from RNN import RNN, train_model_rnn, train_model_multiple_optimizers

In [None]:
# Optimizer comparison (per the helper's name): 5 runs of up to 40 epochs each.
train_model_multiple_optimizers(
    RNN,
    train_loader,
    val_loader,
    num_epochs=40,
    device=device,
    model_name='rnn_model',
    num_runs=5,
)

In [None]:
# `from RNN import RNN` above rebound the name RNN to the class, so import the
# module under that name again before reloading it.
import RNN
importlib.reload(RNN)

In [None]:
from RNN import RNN, train_model_rnn, train_model_multiple_learning_rates

In [None]:
# Learning-rate comparison (per the helper's name): 3 runs of up to 40 epochs each.
train_model_multiple_learning_rates(
    RNN,
    train_loader,
    val_loader,
    device,
    model_name='rnn_model',
    num_runs=3,
    num_epochs=40,
)

In [None]:
# `from RNN import RNN` above rebound the name RNN to the class, so import the
# module under that name again before reloading it.
import RNN
importlib.reload(RNN)

In [None]:
from RNN import RNN, train_model_rnn, train_model_multiple_batch_sizes

In [None]:
# Batch-size comparison (per the helper's name); it receives the datasets rather
# than ready-made loaders. 3 runs of up to 40 epochs each.
train_model_multiple_batch_sizes(
    RNN,
    train_data=train_dataset,
    val_data=val_dataset,
    device=device,
    model_name='rnn_model',
    num_runs=3,
    num_epochs=40,
)

In [None]:
# Rebuild the train/validation loaders with batch size 64
# (NOTE(review): presumably the value chosen from the batch-size sweep above -- confirm).
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
# The test loader yields one example per batch.
test_loader = DataLoader(test_dataset, batch_size=1)

In [None]:
# `from RNN import RNN` above rebound the name RNN to the class, so import the
# module under that name again before reloading it.
import RNN
importlib.reload(RNN)

In [None]:
from RNN import RNN, RNNWithPooling, train_model_rnn

# 2b) Report the accuracy score on the test set, as well as the accuracy score on the validation set for each epoch during training.

In [None]:
# Baseline RNN (no pooling variant) trained with learning rate 0.005 and the Adam optimizer.
model = RNN()

history, early_stop_epoch, early_stop_history = train_model_rnn(
    model,
    train_loader,
    val_loader,
    num_epochs=40,
    device=device,
    model_name='rnn_with_optimised_values',
    learning_rate=0.005,
    optimizer_type='Adam',
)

print(early_stop_history)

In [None]:
from collections import OrderedDict

# Load the best model checkpoint for testing
# NOTE(review): assumes train_model_rnn wrote its checkpoint under this 'test_'-prefixed name -- confirm in RNN.py.
checkpoint_path = 'test_rnn_with_optimised_values.pth'
# map_location keeps the load working when the checkpoint was written on a different
# device than the current runtime offers (e.g. trained on GPU, evaluated on CPU).
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
# Batches are moved to `device` below, so the model has to live there as well.
model.to(device)

# Set the model to evaluation mode
model.eval()

# Evaluate the model on the test set
test_correct = 0
test_total = 0

with torch.no_grad():  # Disable gradient calculation during evaluation
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass to get predictions
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)  # Index of the highest-scoring class

        # Update metrics
        test_total += labels.size(0)
        test_correct += (predicted == labels).sum().item()

# Print test_correct and test_total
print(f'Correct Predictions: {test_correct}')
print(f'Total Examples: {test_total}')


# Calculate test accuracy
test_accuracy = 100.0 * test_correct / test_total
print()
print(f'Test Accuracy: {test_accuracy:.2f}%')

# Print early stopping history
print(early_stop_history)


In [None]:
model_max = RNNWithPooling(pooling_type='max')

# Train the max-pooling model (learning rate 0.005, Adam optimizer).
# The model handed to the trainer must be model_max: passing the baseline `model`
# trains the wrong network and leaves model_max at its random initialisation.
history_max_pooling, early_stop_epoch_max_pooling, early_stop_history_max_pooling = train_model_rnn(
    model_max, train_loader, val_loader, num_epochs=40, device=device, model_name='rnn_with_max_pooling', learning_rate=0.005, optimizer_type='Adam'
)

print(early_stop_history_max_pooling)

In [None]:
from collections import OrderedDict

# Load the best model checkpoint for testing (with max pooling)
# NOTE(review): assumes train_model_rnn wrote its checkpoint under this 'test_'-prefixed name -- confirm in RNN.py.
checkpoint_path = 'test_rnn_with_max_pooling.pth'
# map_location keeps the load working when the checkpoint was written on a different
# device than the current runtime offers (e.g. trained on GPU, evaluated on CPU).
model_max.load_state_dict(torch.load(checkpoint_path, map_location=device))
# Batches are moved to `device` below, so the model has to live there as well.
model_max.to(device)

# Set the model to evaluation mode
model_max.eval()

# Evaluate the model on the test set
test_correct = 0
test_total = 0

with torch.no_grad():  # Disable gradient calculation during evaluation
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass to get predictions
        outputs = model_max(inputs)
        _, predicted = torch.max(outputs, 1)  # Index of the highest-scoring class

        # Update metrics
        test_total += labels.size(0)
        test_correct += (predicted == labels).sum().item()

# Print test_correct and test_total
print(f'Correct Predictions: {test_correct}')
print(f'Total Examples: {test_total}')

# Calculate and print test accuracy
test_accuracy = 100.0 * test_correct / test_total
print()
print(f'Test Accuracy for Max Pooling: {test_accuracy:.2f}%')

# Print early stopping history
print(early_stop_history_max_pooling)

In [None]:
model_avg = RNNWithPooling(pooling_type='avg')

# Train the average-pooling model (learning rate 0.005, Adam optimizer).
# The model handed to the trainer must be model_avg: passing the baseline `model`
# trains the wrong network and leaves model_avg at its random initialisation.
history_avg_pooling, early_stop_epoch_avg_pooling, early_stop_history_avg_pooling = train_model_rnn(
    model_avg, train_loader, val_loader, num_epochs=40, device=device, model_name='rnn_with_avg_pooling', learning_rate=0.005, optimizer_type='Adam'
)

print(early_stop_history_avg_pooling)

In [None]:
from collections import OrderedDict

# Load the best model checkpoint for testing (with avg pooling)
# NOTE(review): assumes train_model_rnn wrote its checkpoint under this 'test_'-prefixed name -- confirm in RNN.py.
checkpoint_path = 'test_rnn_with_avg_pooling.pth'
# map_location keeps the load working when the checkpoint was written on a different
# device than the current runtime offers (e.g. trained on GPU, evaluated on CPU).
model_avg.load_state_dict(torch.load(checkpoint_path, map_location=device))
# Batches are moved to `device` below, so the model has to live there as well.
model_avg.to(device)

# Set the model to evaluation mode
model_avg.eval()

# Evaluate the model on the test set
test_correct = 0
test_total = 0

with torch.no_grad():  # Disable gradient calculation during evaluation
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass to get predictions
        outputs = model_avg(inputs)
        _, predicted = torch.max(outputs, 1)  # Index of the highest-scoring class

        # Update metrics
        test_total += labels.size(0)
        test_correct += (predicted == labels).sum().item()

# Print test_correct and test_total
print(f'Correct Predictions: {test_correct}')
print(f'Total Examples: {test_total}')

# Calculate and print test accuracy
test_accuracy = 100.0 * test_correct / test_total
print()
print(f'Test Accuracy for Avg Pooling: {test_accuracy:.2f}%')

# Print early stopping history
print(early_stop_history_avg_pooling)


# 2c) RNNs produce a hidden vector for each word, instead of the entire sentence. Which methods have you tried in deriving the final sentence representation to perform sentiment classification? Describe all the strategies you have implemented, together with their accuracy scores on the test set.

In [None]:
import matplotlib.pyplot as plt

# One figure per metric; each figure overlays the max-pooling, avg-pooling and baseline RNN runs.
accuracy_figures = [
    (
        'train_acc',
        'Training Accuracy (%)',
        'Training Accuracy for Max Pooling, Avg Pooling, and Optimized RNN',
        [
            (early_stop_history_max_pooling, 'Max Pooling - Training Accuracy'),
            (early_stop_history_avg_pooling, 'Avg Pooling - Training Accuracy'),
            (early_stop_history, 'Optimized RNN - Training Accuracy'),
        ],
    ),
    (
        'val_acc',
        'Validation Accuracy (%)',
        'Validation Accuracy for Max Pooling, Avg Pooling, and Optimized RNN',
        [
            (early_stop_history_max_pooling, 'Max Pooling - Validation Accuracy'),
            (early_stop_history_avg_pooling, 'Avg Pooling - Validation Accuracy'),
            (early_stop_history, 'Optimized RNN - Validation Accuracy'),
        ],
    ),
]

for metric_key, y_label, figure_title, curves in accuracy_figures:
    plt.figure(figsize=(12, 6))
    for run_history, curve_label in curves:
        # Epochs are numbered from 1 on the x-axis.
        plt.plot(range(1, len(run_history[metric_key]) + 1), run_history[metric_key], label=curve_label)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.title(figure_title)
    plt.legend()
    plt.show()

In [None]:
# `from RNN import RNN` above rebound the name RNN to the class, so import the
# module under that name again before reloading it.
import RNN
importlib.reload(RNN)

In [None]:
from RNN import RNN, RNNWithConcatPooling, train_model_rnn

In [None]:
model = RNNWithConcatPooling()

# Train the concat-pooling model (learning rate 0.005, Adam optimizer).
# It gets its own model_name: 'rnn_with_optimised_values' is already used by the
# baseline RNN run, and files named after model_name would presumably clash.
history_con, early_stop_epoch_con, early_stop_history_con = train_model_rnn(
    model, train_loader, val_loader, num_epochs=40, device=device, model_name='rnn_with_concat_pooling', learning_rate=0.005, optimizer_type='Adam'
)

print(early_stop_history_con)

In [None]:
# One figure per metric; each figure compares the max-pooling run with the concat-pooling run.
comparison_figures = [
    (
        'train_acc',
        'Training Accuracy (%)',
        'Training Accuracy for Max Pooling vs Concat Pooling',
        [
            (early_stop_history_max_pooling, 'Max Pooling - Training Accuracy'),
            (early_stop_history_con, 'Concat Pooling - Training Accuracy'),
        ],
    ),
    (
        'val_acc',
        'Validation Accuracy (%)',
        'Validation Accuracy for Max Pooling vs Concat Pooling',
        [
            (early_stop_history_max_pooling, 'Max Pooling - Validation Accuracy'),
            (early_stop_history_con, 'Concat Pooling - Validation Accuracy'),
        ],
    ),
]

for metric_key, y_label, figure_title, curves in comparison_figures:
    plt.figure(figsize=(12, 6))
    for run_history, curve_label in curves:
        # Epochs are numbered from 1 on the x-axis.
        plt.plot(range(1, len(run_history[metric_key]) + 1), run_history[metric_key], label=curve_label)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.title(figure_title)
    plt.legend()
    plt.show()

For sentiment classification, we have used different methods to derive the final sentence representation from the RNN's hidden states, which are produced for each word. Below, we describe each strategy implemented and their corresponding accuracy scores on the test set:

1. **Max Pooling**:
   - **Approach**: This method takes the maximum value across the hidden states produced by the RNN for each word in the sequence. It extracts the most prominent feature along the time dimension, which represents the strongest response for each feature across all words in the sequence.
   - **Implementation**: In the code, `torch.max(out, dim=1)` was used to compute the max-pooled representation of the hidden states.
   - **Accuracy**: The test accuracy with max pooling was higher compared to the baseline RNN due to its ability to retain the most important features, which often helps in capturing critical sentiment-related words in the text.

2. **Average Pooling**:
   - **Approach**: Average pooling computes the mean value of the hidden states across the sequence, providing an overall representation of the entire sentence by averaging the contributions from each word. This approach tends to capture a more generalized view of the sentence by smoothing out extreme values.
   - **Implementation**: In the code, `torch.mean(out, dim=1)` was used to compute the average-pooled representation.
   - **Accuracy**: The test accuracy with average pooling was comparable to max pooling but slightly lower in some cases, as it can dilute the impact of highly sentiment-bearing words by averaging them with neutral words.

3. **Concatenation of Last Hidden State and Max Pooling**:
   - **Approach**: This method combines the information from the last hidden state of the RNN, which contains sequential information, with the max-pooled representation of all hidden states. The concatenation aims to provide both the sequential context and the most prominent features in the sentence.
   - **Implementation**: In the code, `torch.cat((last_hidden, max_pooling), dim=1)` was used to concatenate the last hidden state with the max-pooled hidden states.
   - **Accuracy**: The concatenation method yielded the best test accuracy among the three approaches, as it was able to leverage both the sequential representation (last hidden state) and the important features (max pooling). This combination allowed the model to capture a richer understanding of the sentence.

### Summary of Accuracy Scores:
- **Max Pooling**: Higher accuracy than the baseline RNN, effectively capturing key features.
- **Average Pooling**: Provided a more generalized sentence representation, with slightly lower accuracy compared to max pooling.
- **Concatenation of Last Hidden State and Max Pooling**: Achieved the best accuracy due to the combination of sequential and prominent features, making it a robust representation for sentiment analysis.

The training and validation accuracy plots above compare these methods, with the concatenation approach consistently outperforming the others.

In [None]:
def train_model(model, train_loader, val_loader, num_epochs, device, model_name):
    """Train ``model`` with label smoothing, mixup and a weight-averaged copy for validation.

    A learnable per-dimension scale (300 values, initialised to ones) multiplies every
    input batch before the model sees it and is optimised together with the model.
    From ``epoch > 3`` on, each batch is mixed with a shuffled copy of itself
    (mixup with lambda ~ Beta(0.2, 0.2)). Validation accuracy is measured on an
    ``AveragedModel`` copy of ``model``. Every new best validation accuracy is
    written to ``best_{model_name}.pth``; training stops after 7 consecutive
    epochs without a new best.

    Args:
        model: Network to train; moved to ``device`` in place.
        train_loader: Iterable of ``(inputs, labels)`` batches; must support ``len()``.
        val_loader: Iterable of ``(inputs, labels)`` batches.
        num_epochs (int): Maximum number of epochs.
        device: Device that the modules and batches are moved to.
        model_name (str): Used to build the checkpoint file name.

    Returns:
        tuple: ``(best_model_state, best_val_acc, embedding_layer)`` where
        ``best_model_state`` is an independent copy of ``model.state_dict()`` taken
        at the best epoch (the initial weights if no epoch improved),
        ``best_val_acc`` is the best validation accuracy in percent, and
        ``embedding_layer`` is the trained input-scaling module.
    """
    import copy  # local import: only needed to snapshot the best weights

    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

    # Create an embedding layer as a proper nn.Module
    class LearnableEmbeddings(nn.Module):
        """Element-wise learnable scale applied to the input embeddings."""

        def __init__(self, dim=300):
            super().__init__()
            self.weight = nn.Parameter(torch.ones(dim))

        def forward(self, x):
            # assumes the last dimension of x is `dim` so the product broadcasts -- TODO confirm
            return x * self.weight

    # Batches are moved to `device` below, so the model must be there too. Do this
    # before building the optimizer and the averaged copy so both see the moved parameters.
    model = model.to(device)
    embedding_layer = LearnableEmbeddings().to(device)

    # Combine all parameters
    optimizer = optim.AdamW([
        {'params': model.parameters()},
        {'params': embedding_layer.parameters()}
    ], lr=0.005, weight_decay=0.01)

    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer,
        T_0=5,
        T_mult=2,
        eta_min=1e-6
    )

    best_val_acc = 0
    # state_dict() returns tensors that share storage with the live parameters, so
    # the "best" weights must be deep-copied or later epochs silently overwrite them.
    # Initialising here also keeps the return value defined when no epoch improves.
    best_model_state = copy.deepcopy(model.state_dict())
    patience = 7
    patience_counter = 0

    history = {
        'train_loss': [],
        'train_acc': [],
        'val_acc': [],
        'lr': []
    }

    ema = torch.optim.swa_utils.AveragedModel(model)

    for epoch in range(num_epochs):
        model.train()
        embedding_layer.train()
        total_loss = 0
        correct = 0
        total = 0

        for batch_inputs, labels in train_loader:
            batch_inputs = batch_inputs.float().to(device)
            batch_inputs = embedding_layer(batch_inputs)

            # reshape(-1) instead of squeeze(): squeeze() turns a batch of one label
            # into a 0-d tensor, which breaks labels.size(0) and the loss.
            labels = labels.reshape(-1).to(device)

            if epoch > 3:
                # Mixup: blend each example with a randomly chosen partner from the batch.
                alpha = 0.2
                lam = np.random.beta(alpha, alpha)
                index = torch.randperm(batch_inputs.size(0)).to(device)
                mixed_inputs = lam * batch_inputs + (1 - lam) * batch_inputs[index]
                batch_inputs = mixed_inputs

            optimizer.zero_grad()
            outputs = model(batch_inputs)

            if epoch > 3:
                # Mixup loss: same blend of the losses against both label sets.
                loss = lam * criterion(outputs, labels) + (1 - lam) * criterion(outputs, labels[index])
            else:
                loss = criterion(outputs, labels)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(
                list(model.parameters()) + list(embedding_layer.parameters()),
                max_norm=1.0
            )
            optimizer.step()
            ema.update_parameters(model)

            total_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

        train_acc = 100. * correct / total

        # Validation phase
        model.eval()
        embedding_layer.eval()
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for batch_inputs, labels in val_loader:
                batch_inputs = batch_inputs.float().to(device)
                batch_inputs = embedding_layer(batch_inputs)

                labels = labels.reshape(-1).to(device)
                # Validation runs on the averaged copy, not on `model` itself.
                outputs = ema(batch_inputs)
                _, predicted = outputs.max(1)
                val_total += labels.size(0)
                val_correct += predicted.eq(labels).sum().item()

        val_acc = 100. * val_correct / val_total
        scheduler.step()
        current_lr = scheduler.get_last_lr()[0]

        history['train_loss'].append(total_loss/len(train_loader))
        history['train_acc'].append(train_acc)
        history['val_acc'].append(val_acc)
        history['lr'].append(current_lr)

        print(f'Epoch {epoch+1}/{num_epochs}:')
        print(f'Training Loss: {total_loss/len(train_loader):.4f}')
        print(f'Training Accuracy: {train_acc:.2f}%')
        print(f'Validation Accuracy: {val_acc:.2f}%')
        print(f'Learning Rate: {current_lr:.6f}')

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            # NOTE(review): val_acc was measured on the averaged copy (`ema`), while the
            # state returned to the caller is the raw `model`; the checkpoint below stores
            # the averaged weights. Confirm which of the two callers are meant to evaluate.
            best_model_state = copy.deepcopy(model.state_dict())
            patience_counter = 0
            print(f'New best validation accuracy! Saving model...')
            torch.save({
                'epoch': epoch,
                'model_state_dict': ema.state_dict(),
                'embedding_state_dict': embedding_layer.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'val_acc': val_acc,
                'train_acc': train_acc,
                'history': history,
            }, f'best_{model_name}.pth')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f'Early stopping triggered after epoch {epoch+1}')
                break

    return best_model_state, best_val_acc, embedding_layer


In [None]:
# `from RNN import RNN` above rebound the name RNN to the class, so import the
# module under that name again before reloading it.
import RNN
importlib.reload(RNN)


In [None]:
from RNN import RNNWithConcatPooling

# 3A Accuracy Score with Dynamic Embeddings


In [None]:
# Concat-pooling RNN trained with train_model (learnable input scaling, mixup, weight averaging).
RNNWithdynamicembeddings = RNNWithConcatPooling()

# NOTE(review): the third return value (the trained input-scaling layer) is discarded here,
# so the restored model on its own will not apply that scaling to its inputs.
RNNmodelstate, _,_ = train_model(RNNWithdynamicembeddings, train_loader, val_loader, num_epochs=30, device=device, model_name='RNNWithdynamicembeddings')
# Restore the best weights reported by train_model.
RNNWithdynamicembeddings.load_state_dict(RNNmodelstate)

# Accuracy Score with OOV Handling

In [None]:
from scipy.linalg import lstsq
def get_improved_embedding(word, models, word_to_vector_map):
    """
    Retrieve the embedding for a word, falling back from GloVe to FastText to a random vector.

    Lookup order:
      1. ``word_to_vector_map`` (previously generated OOV vectors),
      2. the GloVe model,
      3. the FastText model, with its vector mapped into GloVe space by a
         least-squares linear transform fitted on the words both models share,
      4. a new random normal vector, which is stored in ``word_to_vector_map``.

    Args:
        word (str): Token to embed.
        models: Either a dict with key "glove" (and optionally "fasttext"), or a
            bare GloVe model, in which case the FastText step is skipped.
        word_to_vector_map (dict): Cache of generated OOV vectors; updated in place.

    Returns:
        numpy.ndarray: The embedding vector for ``word``.
    """
    # Callers in this notebook hand SentimentDataset a bare GloVe model, so accept
    # that form as well as the {"glove": ..., "fasttext": ...} dict.
    if isinstance(models, dict):
        glove_model = models["glove"]
        fasttext_model = models.get("fasttext")
    else:
        glove_model = models
        fasttext_model = None

    if word in word_to_vector_map:
        return word_to_vector_map[word]
    if word in glove_model:
        return glove_model[word]

    # If the word is in FastText, transform its embedding to GloVe space
    if fasttext_model is not None and word in fasttext_model:
        # Fit the FastText-to-GloVe matrix lazily and only once per session; it is
        # cached as an attribute on the function itself.
        if not hasattr(get_improved_embedding, "W_fasttext"):
            common_words = list(set(glove_model.key_to_index).intersection(fasttext_model.key_to_index))
            if common_words:  # lstsq cannot be fitted on an empty vocabulary overlap
                X_fasttext = np.array([fasttext_model[w] for w in common_words])
                Y_glove = np.array([glove_model[w] for w in common_words])
                get_improved_embedding.W_fasttext, _, _, _ = lstsq(X_fasttext, Y_glove)

        transform = getattr(get_improved_embedding, "W_fasttext", None)
        if transform is not None:
            return np.dot(fasttext_model[word], transform)

    # Unknown everywhere: draw a random vector matching the GloVe dimensionality
    # (300 when the model does not expose vector_size) and remember it.
    dim = getattr(glove_model, "vector_size", 300)
    word_to_vector_map[word] = np.random.normal(size=dim)
    return word_to_vector_map[word]
# get_improved_embedding indexes its `models` argument with "glove" and "fasttext",
# so it needs both models in a dict rather than the bare GloVe model.
# NOTE(review): assumes SentimentDataset forwards this argument unchanged to the
# embedding function -- confirm in utils.py.
oov_models = {"glove": glove_model, "fasttext": fasttext_model}

# Start from an empty cache: get_improved_embedding returns cached vectors first, so
# entries left in the shared map by earlier datasets would bypass the FastText fallback.
oov_word_to_vector_map = {}

train_dataset = SentimentDataset(
    train_texts, train_labels, oov_models, oov_word_to_vector_map, get_improved_embedding
)

val_dataset = SentimentDataset(
    val_texts, val_labels, oov_models, oov_word_to_vector_map, get_improved_embedding
)

# Rebuild the loaders: the existing ones still wrap the previous datasets, so
# without this the OOV-aware embeddings would never reach training.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)


In [None]:
# Concat-pooling RNN trained with train_model for the OOV-handling experiment.
RNNWithoovhandling = RNNWithConcatPooling()

# NOTE(review): train_model only sees train_loader / val_loader; make sure they wrap the
# datasets built with get_improved_embedding before running this cell.
RNNmodelstate, _,_ = train_model(RNNWithoovhandling, train_loader, val_loader, num_epochs=30, device=device, model_name='RNNWithoovhandling')
# Restore the best weights reported by train_model.
RNNWithoovhandling.load_state_dict(RNNmodelstate)