# Homework 1
**The goal of this homework is to improve the simple classifier explained in the book starting on page 40. It uses a pretrained transformer and then the logistic regression classifier of page 44 uses only the first hidden state for each input text.**

**You should use the full hidden states to classify each input text, and use a neural network instead of a regression classifier. Then, compare the results with the bookâ€™s on page 44.**

**You have 11 days to do it, by pairs. Start working early as you may need computing time to get the results. Please submit a Jupyter notebook or similar, and include some conclusions.**

## Solution

First, we make sure to install and import the necessary libraries

In [1]:
%pip install -r requirements.txt

import torch
import torch.nn as nn
import numpy as np
from datasets import load_dataset
from transformers import AutoModel, AutoTokenizer


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


  from .autonotebook import tqdm as notebook_tqdm


Then we set up `pytorch` to take profit of the GPU, if available, and load the dataset.

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
emotions_dataset = load_dataset("emotion")

Now, we use a pretrained model to transform the text into numerical representations. The model chosen was `distilbert-base-uncased`, as in the book.

In [3]:
model_checkpoint = "distilbert-base-uncased"
model = AutoModel.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

# TODO: Add explanations for this section
A possible explanation to the more mystical parts can be simply "following the book" or something like that.

In [4]:
emotions_encoded = emotions_dataset.map(tokenize, batched=True, batch_size=None)
emotions_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])


max_length = max([emotions_encoded[subset]['input_ids'].shape[1] for subset in emotions_encoded.keys()])
print(emotions_encoded)
print(max_length)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})
87


In [5]:
def compute_hidden_states(batch):
    global max_length
    inputs = {k:v.to(device) for k,v in batch.items() if k in tokenizer.model_input_names}
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state
        
    print(f"The initial dimension is: {last_hidden_state.shape}")
    batch_size, sequence_length, hidden_dim = last_hidden_state.shape
    tensor = last_hidden_state.reshape(batch_size, sequence_length*hidden_dim)

    padding = (max_length - sequence_length) * hidden_dim
    padded = nn.functional.pad(tensor, (0, padding), "constant", 0)
    print(f"The final dimension is: {padded.shape}")
    return {"hidden_state": padded}
        
emotions_hidden = emotions_encoded.map(compute_hidden_states, batched=True)

In [None]:
X_train = torch.cat((emotions_hidden["train"]["hidden_state"], emotions_hidden["validation"]["hidden_state"])).to(device)
y_train = torch.cat((emotions_hidden["train"]["label"], emotions_hidden["validation"]["label"])).to(device)

X_test = torch.cat((emotions_hidden["test"]["hidden_state"], emotions_hidden["test"]["hidden_state"])).to(device)
y_test = torch.cat((emotions_hidden["test"]["label"], emotions_hidden["test"]["label"])).to(device)

Now we set up and train a feed-forward neural network to classify these numerical representations.

In [None]:
class SentimentClassifier(nn.Module):
    def __init__(self, input_dim, num_classes, hidden_dim=64):
        super(SentimentClassifier, self).__init__()
        self.__pipeline = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_classes)
        )
        
    def forward(self, input):
        print(f"The input of this model has size {input.shape}")
        output = self.__pipeline(input)
        print(f"The output of this model has size {output.shape}")
        return output

    def classify(self, input):
        return torch.argmax(self(input), dim=1)

For training, we'll make use of the `Dataset` class of `pytorch`, as suggested in [the docs](https://pytorch.org/tutorials/beginner/introyt/trainingyt.html).

In [14]:
input_dim = X_train.shape[1]
num_classes = len(set(y_train))
classifier = SentimentClassifier(input_dim, num_classes).to(device)


In [15]:
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(classifier.parameters(), lr=0.001, momentum=0.9)

$k$-fold cross validation is not implemented in `pytorch` by default. However, we would like to use this technique due to personal preference, so we will manually implement it.

In [18]:
import torch
import torch.nn as nn
import torch.optim
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

class KFoldTrainer():
    def __init__(self, model: nn.Module, optimizer:torch.optim.Optimizer, loss, k=10, batch_size=16):
        self.__model = model
        self.__optimizer = optimizer
        self.__loss = loss
        self.__k = k
        self.__batch_size = batch_size
        
    def __fit_epoch(self, data_loader):
        for data, target in iter(data_loader):
            self.__optimizer.zero_grad()
            output = self.__model(data)
            loss = self.__loss(output, target)
            loss.backward()
            self.__optimizer.step()

    def fit(self, data, labels, epochs=10):
        """Train the model using K-fold cross-validation."""
        kf = KFold(n_splits=self.__k, shuffle=True, random_state=42)  # You can adjust random_state

        fold_metrics = []
        for fold, (train_index, val_index) in enumerate(kf.split(data, labels)):
            print(f"Fold {fold + 1}/{self.__k}")

            X_train = data[train_index]
            y_train = labels[train_index]
            X_val = data[val_index]
            y_val = data[val_index]

            train_dataset = TensorDataset(X_train, y_train)
            val_dataset = TensorDataset(X_val, y_val)

            train_loader = DataLoader(train_dataset, batch_size=self.__batch_size, shuffle=True)
            val_loader = DataLoader(val_dataset, batch_size=self.__batch_size, shuffle=False)
            
            # Training loop
            for epoch in range(epochs):
                self.__model.train()  # Set model to training mode
                self.__fit_epoch(train_loader)

                val_loss, val_accuracy = self.__evaluate(val_loader) # Sets model to eval mode; hence why the switch is needed.
                print(f"Epoch {epoch+1}/{epochs}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

            # Store metrics for this fold
            fold_metrics.append({'fold': fold + 1, 'val_loss': val_loss, 'val_accuracy': val_accuracy})

        # Calculate and print average metrics across all folds
        avg_val_loss = np.mean([m['val_loss'] for m in fold_metrics])
        avg_val_accuracy = np.mean([m['val_accuracy'] for m in fold_metrics])
        print(f"Average Validation Loss: {avg_val_loss:.4f}, Average Validation Accuracy: {avg_val_accuracy:.4f}")
        
        return fold_metrics

    def __evaluate(self, data_loader):
        """Evaluate the model on the given data loader."""
        self.__model.eval()  # Set model to evaluation mode
        total_loss = 0
        correct = 0
        total = 0

        with torch.no_grad():  # Disable gradient calculation
            for data, target in iter(data_loader):
                output = self.__model(data)
                loss = self.__loss(output, target)
                total_loss += loss.item() * data.size(0)  # Accumulate loss

                predicted = torch.argmax(output.data, 1)  # Get predicted class
                total += target.size(0)
                correct += (predicted == target).sum().item()

        avg_loss = total_loss / len(data_loader.dataset)
        accuracy = correct / total
        return avg_loss, accuracy
    
    def test(self, test_dataset, test_labels):
        """
        Evaluate the model on unseen test data and compute average accuracy and F1-score.
        """
        self.__model.eval()          

        test_data = TensorDataset(test_dataset, test_labels)
        test_loader = DataLoader(test_data, batch_size=self.__batch_size, shuffle=False)

        all_predicted = []
        all_targets = []

        with torch.no_grad():  # Disable gradient calculation
            for data, target in test_loader:
                predicted = self.__model.classify(data) # Get predicted class

                all_predicted.extend(predicted.cpu().numpy())
                all_targets.extend(target.cpu().numpy())

        # Calculate metrics
        accuracy = accuracy_score(all_targets, all_predicted)
        f1 = f1_score(all_targets, all_predicted, average='weighted')  # Use 'weighted' for multi-class

        print(f"Test Accuracy: {accuracy:.4f}")
        print(f"Test F1-Score: {f1:.4f}")

        return accuracy, f1

Now we train the model on the dataset and 

In [19]:
trainer = KFoldTrainer(classifier, optimizer, loss)
trainer.fit(X_train, y_train)
trainer.test(X_test, y_test)

Fold 1/10
torch.Size([16, 66816])
torch.Size([16, 18000])
torch.Size([16, 66816])
torch.Size([16, 18000])
torch.Size([16, 66816])
torch.Size([16, 18000])
torch.Size([16, 66816])
torch.Size([16, 18000])
torch.Size([16, 66816])
torch.Size([16, 18000])
torch.Size([16, 66816])
torch.Size([16, 18000])
torch.Size([16, 66816])
torch.Size([16, 18000])
torch.Size([16, 66816])
torch.Size([16, 18000])
torch.Size([16, 66816])
torch.Size([16, 18000])
torch.Size([16, 66816])
torch.Size([16, 18000])
torch.Size([16, 66816])
torch.Size([16, 18000])
torch.Size([16, 66816])
torch.Size([16, 18000])
torch.Size([16, 66816])
torch.Size([16, 18000])
torch.Size([16, 66816])
torch.Size([16, 18000])
torch.Size([16, 66816])
torch.Size([16, 18000])
torch.Size([16, 66816])
torch.Size([16, 18000])
torch.Size([16, 66816])
torch.Size([16, 18000])
torch.Size([16, 66816])
torch.Size([16, 18000])
torch.Size([16, 66816])
torch.Size([16, 18000])
torch.Size([16, 66816])
torch.Size([16, 18000])
torch.Size([16, 66816])
torch.

RuntimeError: 0D or 1D target tensor expected, multi-target not supported