In [None]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from datasets import load_dataset

# Load the SetFit/sst5 dataset
dataset = load_dataset("SetFit/sst5")
# Show the available splits, then the first training example on its own line
print(dataset, dataset["train"][0], sep="\n")

Repo card metadata block was not found. Setting CardData to empty.


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 8544
    })
    validation: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 1101
    })
    test: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 2210
    })
})
{'text': 'a stirring , funny and finally transporting re-imagining of beauty and the beast and 1930s horror films', 'label': 4, 'label_text': 'very positive'}


Build vocabulary and pre-processing functions:

In [None]:
def tokenize(text: str):
    """Lower-case ``text`` and split it on runs of whitespace.

    Punctuation is not stripped; a token is whatever sits between
    whitespace characters. An empty or all-whitespace string gives ``[]``.
    """
    lowered = text.lower()
    return lowered.split()


def build_vocab(sentences: list[str]):
    """Build a token -> index mapping from an iterable of sentences.

    Every sentence is tokenized with ``tokenize`` and each distinct token
    receives one integer index in ``range(len(vocabulary))``.

    Args:
        sentences: The texts to collect tokens from.

    Returns:
        A dict mapping each distinct token to its index. Indices follow the
        sorted order of the tokens, so the same input always produces the
        same mapping.
    """
    vocab = set()
    for sentence in sentences:
        vocab.update(tokenize(sentence))
    # Number the tokens in sorted order rather than in set-iteration order:
    # string hashing is randomised per interpreter run, so iterating the set
    # directly would hand out different indices after every kernel restart
    # and make trained weights meaningless across sessions.
    return {word: idx for idx, word in enumerate(sorted(vocab))}


# Build vocabulary
# Only the training split is used, so tokens that occur solely in the
# validation or test split are out-of-vocabulary.
vocab = build_vocab(dataset["train"]["text"])
# vocab_size is read as a global by one_hot_encode_batch and is also passed
# to SentimentModel as the input width of its first layer.
vocab_size = len(vocab)

In [None]:
def map_token_to_index(token):
    """Look ``token`` up in the module-level ``vocab``.

    Returns the token's index, or the sentinel ``-1`` when the token is not
    in the vocabulary. There is no dedicated '<unk>' entry; the one-hot
    encoder skips negative indices, so unknown tokens contribute nothing.
    """
    if token in vocab:
        return vocab[token]
    return -1


def map_text_to_indices(text: str):
    """Tokenize ``text`` and return the list of vocabulary indices, one per token.

    Tokens missing from the vocabulary are represented by whatever
    ``map_token_to_index`` returns for them.
    """
    return list(map(map_token_to_index, tokenize(text)))


def prepare_dataset(dataset):
    """Return ``dataset`` with a ``"token_ids"`` field added to every example.

    The field holds the vocabulary indices of the example's ``"text"``.
    The mapping is executed with ``num_proc=1``.
    """

    def add_token_ids(example):
        # Returned dict carries only the new field
        return {"token_ids": map_text_to_indices(example["text"])}

    return dataset.map(add_token_ids, num_proc=1)

Create a function that takes a batch of sequences of token ids (list of list of ints) and converts them into one-hot encodings:

In [None]:
# One-hot encoding function for a batch of sentences
def one_hot_encode_batch(sentences: list[list[int]]):
    """One-hot encode a batch of token-id sequences.

    The sequence length is taken from the first sentence, so all sentences
    are expected to have that same length. The encoding width is the
    module-level ``vocab_size``.

    Args:
        sentences: One list of token ids per sentence. Negative ids (the
            out-of-vocabulary sentinel) are left as all-zero vectors.

    Returns:
        A float32 tensor of shape ``(batch_size, sequence_length, vocab_size)``.

    Raises:
        IndexError: If ``sentences`` is empty, or if a position or token id
            falls outside the tensor.
    """
    batch_size = len(sentences)
    sequence_length = len(sentences[0])

    # Start from all zeros; only the known tokens get switched on below
    one_hot_vectors = torch.zeros(
        batch_size, sequence_length, vocab_size, dtype=torch.float32
    )

    # (sentence, position, token id) for every in-vocabulary token
    coords = [
        (row, col, token_id)
        for row, token_ids in enumerate(sentences)
        for col, token_id in enumerate(token_ids)
        if token_id >= 0
    ]

    if coords:
        rows, cols, ids = torch.tensor(coords, dtype=torch.long).unbind(dim=1)
        # Single indexed write instead of one tensor assignment per token
        one_hot_vectors[rows, cols, ids] = 1.0

    return one_hot_vectors

In [None]:
# Add the "token_ids" field to every split of the dataset
preprocessed_dataset = prepare_dataset(dataset)
# Show the first training example, which now also carries its token ids
print(preprocessed_dataset["train"][0])

Map:   0%|          | 0/8544 [00:00<?, ? examples/s]

Map:   0%|          | 0/1101 [00:00<?, ? examples/s]

Map:   0%|          | 0/2210 [00:00<?, ? examples/s]

{'text': 'a stirring , funny and finally transporting re-imagining of beauty and the beast and 1930s horror films', 'label': 4, 'label_text': 'very positive', 'token_ids': [8286, 927, 7958, 9943, 7045, 5065, 338, 14881, 2391, 2064, 7045, 1686, 14464, 7045, 8100, 290, 11602]}


In [None]:
# Define the collate function for dynamic truncation
def collate_fn(batch):
    """Collate examples by truncating every sentence to the batch's shortest length.

    Args:
        batch: A list of examples, each with ``"token_ids"`` (list of ints)
            and ``"label"`` (int).

    Returns:
        ``(inputs, labels)`` where ``inputs`` is a list of equally long lists
        of token ids and ``labels`` is a tensor of the examples' labels.

    Note:
        Tokens beyond the shortest sentence's length are discarded for every
        example in the batch.
    """
    # Length of the shortest sentence in this batch
    shortest = min(len(example["token_ids"]) for example in batch)

    # The inputs stay plain Python lists: the model one-hot encodes them
    # (and thereby turns them into tensors) on the fly in its forward pass.
    inputs = []
    label_values = []
    for example in batch:
        inputs.append(example["token_ids"][:shortest])
        label_values.append(example["label"])

    return inputs, torch.tensor(label_values)

In [None]:
# Create DataLoaders for the train, validation and test datasets.
# Only the training loader shuffles; all three share the truncating collate_fn.
train_dataloader = DataLoader(
    preprocessed_dataset["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=collate_fn,
)
validation_dataloader = DataLoader(
    preprocessed_dataset["validation"],
    batch_size=32,
    collate_fn=collate_fn,
)
test_dataloader = DataLoader(
    preprocessed_dataset["test"],
    batch_size=32,
    collate_fn=collate_fn,
)

# Peek at one training batch to sanity-check the collate function
for batch in train_dataloader:
    inputs, labels = batch
    print(inputs, labels, sep="\n")
    break

[[11030, 8286, 2330], [11514, 11955, 339], [3289, 14111, 7045], [14933, 3289, 15372], [13186, 8957, 8447], [15702, 7212, 13484], [1686, 12701, 5937], [1795, 3289, 12136]]
tensor([2, 1, 0, 4, 3, 1, 3, 0])


In [None]:
# Define the Neural Network
class SentimentModel(nn.Module):
    """Bag-of-words classifier with one hidden layer.

    The forward pass one-hot encodes the token ids with
    ``one_hot_encode_batch`` (which reads the module-level ``vocab_size``, so
    that value must match the ``vocab_size`` given here), sums the encodings
    over the sequence dimension and feeds the result through
    ``Linear -> ReLU -> Linear``.

    Args:
        vocab_size: Input width of the hidden layer.
        hidden_dim: Output width of the hidden layer.
        num_classes: Number of output logits.
    """

    def __init__(self, vocab_size, hidden_dim, num_classes):
        super().__init__()
        self.hidden_layer = nn.Linear(in_features=vocab_size, out_features=hidden_dim)
        self.output_layer = nn.Linear(in_features=hidden_dim, out_features=num_classes)

    def forward(self, input_ids: list[list[int]]):
        """Return the raw logits, shape ``(batch_size, num_classes)``.

        ``input_ids`` is a batch of token-id lists, passed as plain Python
        lists rather than a tensor.
        """
        # One-hot encodings take the place of an embedding layer; summing them
        # over the sequence dimension yields per-token counts (bag of words).
        bag_of_words = one_hot_encode_batch(input_ids).sum(dim=1)
        hidden = torch.relu(self.hidden_layer(bag_of_words))
        # No activation on the output: the logits are passed to
        # CrossEntropyLoss, which applies softmax itself.
        return self.output_layer(hidden)


# Initialize the model
hidden_dim = 256  # width of the single hidden layer
num_classes = 5  # one logit per label; the batch printed above shows labels 0-4
model = SentimentModel(vocab_size, hidden_dim, num_classes)

# Define Loss Function and Optimizer
# CrossEntropyLoss is fed the model's raw logits together with the integer labels
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters with a learning rate of 0.01
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [None]:
# Train the Model
epochs = 10
for epoch in range(epochs):
    # Make sure the model is in training mode at the start of every epoch
    model.train()
    total_loss = 0
    for inputs, labels in train_dataloader:
        # Clear the gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass and loss
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        # Accumulate the scalar batch loss for the per-epoch average
        total_loss += loss.item()
    # Report the mean loss per batch for this epoch
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_dataloader):.4f}")

Epoch 1/10, Loss: 1.5323
Epoch 2/10, Loss: 1.2794
Epoch 3/10, Loss: 0.9363
Epoch 4/10, Loss: 0.6376
Epoch 5/10, Loss: 0.4704
Epoch 6/10, Loss: 0.3709
Epoch 7/10, Loss: 0.3100
Epoch 8/10, Loss: 0.2662
Epoch 9/10, Loss: 0.2455
Epoch 10/10, Loss: 0.2284


Bonus: Evaluate the model using accuracy (the fraction of examples whose class is predicted correctly):

In [None]:
def evaluate_model(model, dataloader):
    """Compute, print and return the accuracy of ``model`` on ``dataloader``.

    Args:
        model: Callable that maps a batch of inputs to per-class scores of
            shape ``(batch, num_classes)``; it is switched to evaluation mode
            and left in that mode.
        dataloader: Iterable of ``(inputs, labels)`` batches, where
            ``labels`` is a 1-D tensor of class indices.

    Returns:
        The fraction of examples whose highest-scoring class equals the label.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no examples.
    """
    model.eval()
    n_correct = 0
    n_seen = 0

    # Gradients are not needed for evaluation
    with torch.no_grad():
        for inputs, labels in dataloader:
            scores = model(inputs)
            # Predicted class = index of the largest score in each row
            predicted = torch.max(scores, dim=1).indices
            n_correct += (predicted == labels).sum().item()
            n_seen += labels.size(0)

    accuracy = n_correct / n_seen
    print(f"Accuracy: {accuracy:.4f}")
    return accuracy

In [None]:
# Evaluate on the test set
# NOTE(review): the test loader's collate_fn truncates each batch of 32 to its
# shortest sentence, so the model may see only a few tokens per example here;
# this presumably contributes to the gap between training loss and test accuracy.
test_accuracy = evaluate_model(model, test_dataloader)

Accuracy: 0.3086
