# Predicting Sentiment Using a Transformer

This notebook provides you with a complete code example that predicts the sentiment of movie reviews using a transformer encoder network.

## Using the IMDB Dataset

Start by downloading the Large Movie Review Dataset (often referred to as the IMDB dataset; it is available at https://huggingface.co/datasets/imdb). It contains 50,000 movie reviews, each labeled as positive or negative. The dataset is divided into 25,000 reviews for training and 25,000 reviews for testing.

Download the IMDB dataset ...

In [None]:
from datasets import load_dataset

# Fetch the IMDB movie-review dataset by its dataset identifier.
dataset = load_dataset(path="imdb")

... split the training data into training and validation datasets ...

In [None]:
# Hold out 20% of the labeled training reviews for validation. Stratifying
# on "label" keeps the class proportions the same in both parts, and the
# fixed seed makes the split reproducible.
split = dataset["train"].train_test_split(
    seed=42,
    stratify_by_column="label",
    test_size=0.2,
)
train_dataset = split["train"]
val_dataset = split["test"]

... and print some example reviews.

In [None]:
import numpy as np
import pandas as pd

# Pick three random row indices (drawn independently, so repeats are
# possible) and collect the corresponding reviews in a table.
sample_ids = np.random.randint(0, len(train_dataset), 3)
examples = train_dataset.select(sample_ids)
df = pd.DataFrame({"Text": examples["text"], "Label": examples["label"]})

# Left-align the cell contents and center the column headers.
header_style = [{"selector": "th", "props": [("text-align", "center")]}]
styled_df = (
    df.style
    .set_properties(**{"text-align": "left"})
    .set_table_styles(header_style)
)

# Lift the column-width limit so the full review text is shown.
with pd.option_context("display.max_colwidth", None):
    display(styled_df)

### Preprocessing the Reviews

Implement a function to tokenize a sentence ...

In [None]:
import contractions
import re
from torchtext.data.utils import get_tokenizer

# torchtext's "basic_english" tokenizer does the actual splitting into tokens.
tokenizer = get_tokenizer("basic_english")


def tokenize(text):
    """Turn a raw review string into a list of word-like tokens.

    The text is first passed through ``contractions.fix``, then typographic
    quote characters are replaced by their plain ASCII counterparts, and the
    result is split with the module-level ``tokenizer``. Only tokens that
    consist of alphanumeric runs (optionally joined by hyphens, optionally
    followed by underscore-joined runs) are kept; everything else, such as
    punctuation, is discarded.
    """
    normalized = contractions.fix(text)

    # Map typographic quotes to plain ones, applied in dictionary order.
    quote_map = {"’": "'", "‘": "'", "“": '"', "”": '"', " ́": "'", " ́ ́": '"'}
    for fancy, plain in quote_map.items():
        normalized = normalized.replace(fancy, plain)

    word_pattern = re.compile(
        r"^[a-zA-Z0-9]+(-[a-zA-Z0-9]+)*(_[a-zA-Z0-9]+)*$"
    )
    return [
        candidate
        for candidate in tokenizer(normalized)
        if word_pattern.match(candidate)
    ]

... create a vocabulary ...

In [None]:
from torchtext.vocab import build_vocab_from_iterator

def imdb_iterator(dataset):
    """Lazily yield the token list of every review in the IMDB dataset.

    Each element of ``dataset`` must provide the review under the key
    ``"text"``; the text is tokenized with ``tokenize`` only when the
    generator is advanced.
    """
    yield from (tokenize(record["text"]) for record in dataset)

# Build the vocabulary from the training reviews only, reserving "<unk>"
# as a special token.
training_tokens = imdb_iterator(train_dataset)
vocab = build_vocab_from_iterator(training_tokens, specials=["<unk>"])

# Tokens that are not in the vocabulary are mapped to the "<unk>" index.
unk_index = vocab["<unk>"]
vocab.set_default_index(unk_index)

... and preprocess the training, validation, and testing datasets.

In [None]:
def preprocessing(sample):
    """Attach the vocabulary indices of a review to the sample.

    The review in ``sample["text"]`` is tokenized, each token is looked up
    in ``vocab``, and the resulting list of indices is stored under the key
    ``"x"``. The same (mutated) sample is returned.
    """
    sample["x"] = vocab(tokenize(sample["text"]))
    return sample

# Apply the same preprocessing to the training, validation and test splits
# (in that order).
train_dataset, val_dataset, test_dataset = (
    subset.map(preprocessing)
    for subset in (train_dataset, val_dataset, dataset["test"])
)

## Building a Transformer Encoder Layer

Prepare a class to implement a multi-head attention layer ...

In [None]:
import deeplay as dl
import torch
import torch.nn as nn

class MultiHeadAttentionLayer(dl.DeeplayModule):
    """Multi-head self-attention restricted to tokens of the same sequence.

    The input is used as query, key and value at once. An index vector
    tells which sequence each row belongs to, and rows with different
    indices are masked out so that they cannot attend to each other.
    """

    def __init__(self, features, num_heads):
        """Store the configuration and wrap ``nn.MultiheadAttention``."""
        super().__init__()
        self.features = features
        self.num_heads = num_heads
        self.layer = dl.Layer(nn.MultiheadAttention, features, num_heads)

    def forward(self, x, batch_indices):
        """Return the attention output for ``x``, discarding the weights."""
        mask = self._fetch_attn_mask(batch_indices)
        outputs = self.layer(x, x, x, attn_mask=mask)
        return outputs[0]

    def _fetch_attn_mask(self, batch_indices):
        """Build a boolean mask that is True where two rows differ in index.

        True entries mark pairs that must NOT attend to each other.
        """
        row_ids = batch_indices.unsqueeze(1)
        col_ids = batch_indices.unsqueeze(0)
        return row_ids != col_ids

... and a class to implement a transformer encoder layer ...

In [None]:
from torch_geometric.nn.norm import LayerNorm

class TransformerEncoderLayer(dl.DeeplayModule):
    """One encoder block: self-attention followed by a feedforward network.

    Each of the two sub-layers is followed by dropout, a residual addition
    with its own input, and a layer normalization that also receives the
    batch index.
    """

    def __init__(self, d_model, num_heads, feedforward_dim, dropout_p=0.0):
        """Create the attention and feedforward sub-layers.

        Parameters
        ----------
        d_model
            Feature size of the input and output.
        num_heads
            Number of attention heads.
        feedforward_dim
            Hidden size of the feedforward network.
        dropout_p
            Dropout probability used after both sub-layers.
        """
        super().__init__()

        self.d_model = d_model
        self.num_heads = num_heads
        self.feedforward_dim = feedforward_dim
        self.dropout_p = dropout_p

        # Attention sub-layer.
        self.self_attn = MultiHeadAttentionLayer(d_model, num_heads)
        self.attn_dropout = dl.Layer(nn.Dropout, dropout_p)
        self.attn_skip = dl.Add()
        self.attn_norm = dl.Layer(LayerNorm, d_model, eps=1e-6)

        # Feedforward sub-layer: d_model -> feedforward_dim -> d_model.
        expand = dl.Layer(nn.Linear, d_model, feedforward_dim)
        activation = dl.Layer(nn.ReLU)
        contract = dl.Layer(nn.Linear, feedforward_dim, d_model)
        self.feedforward = dl.Sequential(expand, activation, contract)
        self.feedforward_dropout = dl.Layer(nn.Dropout, dropout_p)
        self.feedforward_skip = dl.Add()
        self.feedforward_norm = dl.Layer(LayerNorm, d_model, eps=1e-6)

    def forward(self, x, batch_index):
        """Apply the attention block and then the feedforward block."""
        attended = self.attn_dropout(self.self_attn(x, batch_index))
        hidden = self.attn_norm(self.attn_skip(x, attended), batch_index)

        transformed = self.feedforward_dropout(self.feedforward(hidden))
        residual = self.feedforward_skip(hidden, transformed)
        return self.feedforward_norm(residual, batch_index)

## Building a Transformer Encoder Model

Build a class to implement a transformer encoder model ...

In [None]:
class TransformerEncoderModel(dl.DeeplayModule):
    """Transformer encoder model.

    Token indices are embedded, combined with positional information,
    passed through a stack of ``TransformerEncoderLayer`` blocks, averaged
    per sequence, and mapped by a small dense head ending in a sigmoid.
    """

    def __init__(self, vocab_size, d_model, num_heads, feedforward_dim,
                 num_layers, out_dim, dropout_p=0.0):
        """Initialize transformer encoder model.

        Parameters
        ----------
        vocab_size
            Number of rows of the embedding table.
        d_model
            Feature size used throughout the encoder.
        num_heads
            Number of attention heads in every encoder layer.
        feedforward_dim
            Hidden size of the feedforward network in every encoder layer.
        num_layers
            Number of stacked encoder layers.
        out_dim
            Number of outputs of the dense head.
        dropout_p
            Dropout probability used in the positional embedding, in the
            encoder layers, and before the dense head.
        """
        super().__init__()

        self.d_model = d_model
        self.num_heads = num_heads
        self.feedforward_dim = feedforward_dim
        self.num_layers = num_layers
        self.dropout_p = dropout_p
        self.out_dim = out_dim

        self.embedding = dl.Layer(nn.Embedding, vocab_size, d_model)

        self.pos_encoder = dl.IndexedPositionalEmbedding(d_model)
        self.pos_encoder.dropout.configure(p=dropout_p)

        self.blocks = dl.LayerList()
        for _ in range(num_layers):
            self.blocks.append(
                TransformerEncoderLayer(
                    d_model, num_heads, feedforward_dim, dropout_p=dropout_p
                )
            )

        self.out = dl.Sequential(
            dl.Layer(nn.Dropout, dropout_p),
            dl.Layer(nn.Linear, d_model, d_model // 2),
            dl.Layer(nn.ReLU),
            dl.Layer(nn.Linear, d_model // 2, out_dim),
            dl.Layer(nn.Sigmoid),
        )

    def forward(self, seq):
        """Calculate forward pass.

        Parameters
        ----------
        seq
            Mapping with the entries ``"x"`` (token indices of all
            sequences, concatenated) and ``"batch_indices"`` (for every
            token, the index of the sequence it belongs to). The number of
            sequences is taken to be ``max(batch_indices) + 1``.

        Returns
        -------
        Tensor
            Sigmoid outputs with one row per sequence. A trailing dimension
            of size one is removed, so the shape is ``(batch_size,)`` when
            ``out_dim == 1`` and ``(batch_size, out_dim)`` otherwise.
        """
        batch_indices = seq["batch_indices"]

        # Scale the embeddings by sqrt(d_model) before adding positions.
        h = self.embedding(seq["x"]) * self.d_model ** 0.5
        h = self.pos_encoder(h, batch_indices)

        for layer in self.blocks:
            h = layer(h, batch_indices)

        # Mean-pool the token features of every sequence: sum them with
        # scatter_add and divide by the number of tokens per sequence.
        batch_size = int(torch.max(batch_indices)) + 1
        # Match h's dtype so that scatter_add also works when the encoder
        # does not run in the default floating point type.
        g = torch.zeros(
            batch_size, self.d_model, device=h.device, dtype=h.dtype
        )
        g = g.scatter_add(0, batch_indices[:, None].expand_as(h), h)
        counts = torch.bincount(batch_indices, minlength=batch_size)
        # A sequence without tokens keeps a zero vector instead of 0/0 = NaN.
        g = g / counts.clamp(min=1)[:, None]

        # Only drop the trailing output dimension. A bare squeeze() would
        # also remove the batch dimension for a batch of one and return a
        # 0-dim tensor that no longer matches a target of shape (1,).
        return self.out(g).squeeze(-1)

... instantiate the transformer encoder model ...

In [None]:
# Hyperparameters of the encoder. The embedding table has one row per
# vocabulary entry, and the head produces a single output per review.
model_config = {
    "vocab_size": len(vocab),
    "d_model": 300,
    "num_heads": 12,
    "feedforward_dim": 512,
    "num_layers": 4,
    "out_dim": 1,
    "dropout_p": 0.1,
}
model = TransformerEncoderModel(**model_config).create()

print(model)

... and add pretrained embeddings.

In [None]:
from torchtext.vocab import GloVe

# Load the 300-dimensional GloVe "42B" vectors, caching them locally.
glove = GloVe(name="42B", dim=300, cache="glove_embeddings_dataset")

# Look up one vector per vocabulary token (in vocabulary index order),
# retrying with the lowercased token when the exact form is not found.
vocab_tokens = vocab.get_itos()
pretrained_vectors = glove.get_vecs_by_tokens(
    vocab_tokens, lower_case_backup=True
)

# Overwrite the embedding table and freeze it during training.
model.embedding.weight.data = pretrained_vectors
model.embedding.weight.requires_grad = False

## Defining the Data Loaders

In [None]:
from torch.utils.data import DataLoader
from torch_geometric.data import Data

def collate(batch):
    """Combine data into a single batch that the model can process.

    The token indices of all samples are concatenated into one flat
    tensor, accompanied by a tensor of the same length that stores, for
    every token, the position of its sample within the batch.

    Parameters
    ----------
    batch
        Sequence of samples, each providing the token indices under
        ``"x"`` and the class label under ``"label"``.

    Returns
    -------
    Data
        Object with ``x`` (concatenated token indices, long),
        ``batch_indices`` (sample position of every token, long) and
        ``y`` (one float label per sample).
    """
    xs, batch_indices = [], []
    for i, sample in enumerate(batch):
        # Force an integer dtype: an empty token list would otherwise
        # become a float tensor and promote the whole concatenated batch.
        x = torch.tensor(sample["x"], dtype=torch.long)
        xs.append(x)
        batch_indices.append(torch.full_like(x, i))

    labels = torch.tensor(
        [sample["label"] for sample in batch], dtype=torch.float
    )
    return Data(
        x=torch.cat(xs),
        batch_indices=torch.cat(batch_indices),
        y=labels,
    )

# All loaders share the batch size and collate function; only the training
# loader shuffles its samples.
loader_kwargs = {"batch_size": 8, "collate_fn": collate}

train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_dataloader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
test_dataloader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)


## Training the Model

Compile the model ...

In [None]:
class AdamW(dl.Optimizer):
    """Deeplay optimizer that wraps ``torch.optim.AdamW``."""

    def __pre_init__(self, **optimizer_kwargs):
        """Forward the keyword arguments together with the optimizer class.

        A ``"classtype"`` entry among the keyword arguments is dropped,
        because the class is passed explicitly as ``torch.optim.AdamW``.
        """
        forwarded = {
            key: value
            for key, value in optimizer_kwargs.items()
            if key != "classtype"
        }
        super().__pre_init__(torch.optim.AdamW, **forwarded)

# Wrap the encoder in a binary classifier trained with AdamW.
optimizer = AdamW(lr=1e-4)
classifier = dl.BinaryClassifier(model=model, optimizer=optimizer).create()

... and train it.

In [None]:
from lightning.pytorch.callbacks import ModelCheckpoint

# Save checkpoints to "models", tracking the validation accuracy and
# treating higher values as better.
checkpoint_callback = ModelCheckpoint(
    dirpath="models",
    filename="ATT-model{epoch:02d}-val_accuracy{valBinaryAccuracy:.2f}",
    auto_insert_metric_name=False,
    monitor="valBinaryAccuracy",
    mode="max",
)

# Train for five epochs, validating with the validation loader.
trainer = dl.Trainer(callbacks=[checkpoint_callback], max_epochs=5)
trainer.fit(classifier, train_dataloader, val_dataloader)

## Evaluating the Trained Model

Load the best model ...

In [None]:
import glob, os

# Prefer the checkpoint that the callback itself recorded as the best one
# for the monitored metric. The newest file on disk is not necessarily the
# best, for example when the directory holds checkpoints of earlier runs.
best_model = checkpoint_callback.best_model_path
if not best_model:
    # No path was recorded: fall back to the most recently created
    # checkpoint file, and fail with a clear message if there is none.
    candidates = glob.glob("./models/ATT-model*")
    if not candidates:
        raise FileNotFoundError(
            "No checkpoint matching './models/ATT-model*' was found; "
            "train the model before loading it."
        )
    best_model = max(candidates, key=os.path.getctime)

best_classifier = dl.BinaryClassifier.load_from_checkpoint(
    best_model, model=model
).create()

... test the trained model ...

In [None]:
# Evaluate the classifier restored from the checkpoint on the test loader
# and keep whatever the trainer returns for later inspection.
test_results = trainer.test(best_classifier, test_dataloader)

... and display the model’s prediction on some reviews.

In [None]:
import pandas as pd
import random

# Switch the restored model to evaluation mode before predicting.
best_model_net = best_classifier.model
best_model_net.eval()
# Inputs are created on the device that holds the model's parameters.
device = next(best_model_net.parameters()).device

texts, labels, predictions = [], [], []
for idx in random.sample(range(len(test_dataset)), 3):
    sample = test_dataset[idx]

    input_tensor = torch.tensor(
        vocab(tokenize(sample["text"])), dtype=torch.long, device=device
    )
    test_input = {
        "x": input_tensor,
        # A single review: every token belongs to sequence number 0.
        "batch_indices": torch.zeros_like(input_tensor),
    }

    # Predict with the model loaded from the checkpoint (the one that was
    # put in evaluation mode above), without tracking gradients.
    with torch.no_grad():
        probability = best_model_net(test_input)
    pred = probability > 0.5

    texts.append(sample["text"])
    labels.append(sample["label"])
    predictions.append(int(pred.item()))

df = pd.DataFrame({"text": texts, "label": labels, "prediction": predictions})

# Left-align the cell contents and center the column headers.
styled_df = df.style.set_properties(**{"text-align": "left"}).set_table_styles(
    [{"selector": "th", "props": [("text-align", "center")]}]
)

# Lift the column-width limit so the full review text is shown.
with pd.option_context("display.max_colwidth", None):
    display(styled_df)