#ENGR 8990 - Deep Learning & Engineering Applications
## Assignment 4 - Transformer for Sentiment Classification 
Assignment: Code a transformer model for sentiment classification.

1.   Construct a transformer encoder (you could use the one in NB13) as the backbone and add a linear classifier for sentiment classification using the IMDB dataset (note: the vocab for IMDB is different from the NMT dataset used in NB13).

2.   Train the model and display the proggess showing both training and validation metrics.

2.   Evaluate the trained model on the test dataset.

In [169]:
!pip install d2l==0.17.0



In [1]:
import torch 
from torch import nn
from d2l import torch as d2l
import math
import torch.optim as optim

In [76]:
batch_size = 64
train_iter, test_iter, vocab = d2l.load_data_imdb(batch_size)

In [77]:
len(train_iter), len(test_iter)

(391, 391)

In [78]:
print("The length of the vocab is", len(vocab))

The length of the vocab is 49346


In [79]:
for batch in train_iter:
  print (batch[0][0])
  # print (batch[1])
  break

tensor([   48,    20,     6,   131,    32,    36,    82,  9020,  1356,     2,
        10607, 22968, 12030,    49,    25,   367,    10,   432,   143,    48,
           20,   702,    35,     2,  6280,     4,  2909, 18748, 11110, 12638,
           15,  5274, 26051,  4223,     0,     8,    97,    10,     6, 30218,
          127,   209,   129,     3,    43,    80,  1164,  1582,  9167,    38,
          207,    71, 12150,    15, 14452,    52,  8852,     3,    32,     0,
         2314,  9829,  4120,    27,    35,    43,  1271,    42,    25,    70,
           34,   428,  1037,    35,     0,  9167,    38,   401,    96,    43,
         2484,     7,     1,  1151,  2182,     3,    75,    26,  2259,   239,
          109,   537,    24,   220,     3,  8258,   257,     6,    31,     4,
            1,    80,  1032,  3686,    44,   960,   122,   839,    78,    34,
           49,    24,   199,     4,  9167,    14,   137,     2,  3809,  5530,
            0,   257,     6,  2185,   831,     1,  2923,    20, 

# Code starts here

### Define the Transformer encoder as the backbone

In [94]:
class PositionWiseFFN(nn.Module):
    """Positionwise feed-forward network."""
    def __init__(self, ffn_num_input, ffn_num_hiddens, ffn_num_outputs,
                 **kwargs):
        super(PositionWiseFFN, self).__init__(**kwargs)
        self.dense1 = nn.Linear(ffn_num_input, ffn_num_hiddens)
        self.relu = nn.ReLU()
        self.dense2 = nn.Linear(ffn_num_hiddens, ffn_num_outputs)

    def forward(self, X):
        return self.dense2(self.relu(self.dense1(X)))
    
## Residual Connection and Layer Normalization

ln = nn.LayerNorm(2)
bn = nn.BatchNorm1d(2)

class AddNorm(nn.Module):
    """Residual connection followed by layer normalization."""
    def __init__(self, normalized_shape, dropout, **kwargs):
        super(AddNorm, self).__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)
        self.ln = nn.LayerNorm(normalized_shape)

    def forward(self, X, Y):
        # print("test add")
        return self.ln(self.dropout(Y) + X)
    



## Encoder

class EncoderBlock(nn.Module):
    """Transformer encoder block."""
    def __init__(self, key_size, query_size, value_size, num_hiddens,
                 norm_shape, ffn_num_input, ffn_num_hiddens, num_heads,
                 dropout, use_bias=False, **kwargs):
        super(EncoderBlock, self).__init__(**kwargs)
        self.attention = d2l.MultiHeadAttention(key_size, query_size,
                                                value_size, num_hiddens,
                                                num_heads, dropout, use_bias)
        self.addnorm1 = AddNorm(norm_shape, dropout)
        self.ffn = PositionWiseFFN(ffn_num_input, ffn_num_hiddens,
                                   num_hiddens)
        self.addnorm2 = AddNorm(norm_shape, dropout)

    def forward(self, X, valid_lens):
        Y = self.addnorm1(X, self.attention(X, X, X, valid_lens))
        # print('test')
        return self.addnorm2(Y, self.ffn(Y))
    



class TransformerEncoder(d2l.Encoder):
    """Transformer encoder."""
    def __init__(self, vocab_size, key_size, query_size, value_size,
                 num_hiddens, norm_shape, ffn_num_input, ffn_num_hiddens,
                 num_heads, num_layers, dropout, use_bias=False, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.embedding = nn.Embedding(vocab_size, num_hiddens)
        self.pos_encoding = d2l.PositionalEncoding(num_hiddens, dropout)
        self.blks = nn.Sequential()
        for i in range(num_layers):
            self.blks.add_module(
                "block" + str(i),
                EncoderBlock(key_size, query_size, value_size, num_hiddens,
                             norm_shape, ffn_num_input, ffn_num_hiddens,
                             num_heads, dropout, use_bias))

    def forward(self, X, valid_lens, *args):
        # Since positional encoding values are between -1 and 1, the embedding
        # values are multiplied by the square root of the embedding dimension
        # to rescale before they are summed up
        X = self.pos_encoding(self.embedding(X) * math.sqrt(self.num_hiddens))
        self.attention_weights = [None] * len(self.blks)
        for i, blk in enumerate(self.blks):
            X = blk(X, valid_lens)
            self.attention_weights[
                i] = blk.attention.attention.attention_weights
        # print('encoder_passed')
        return X
    

### Linear Classifier

In [95]:
class SentimentClassifier(nn.Module):
    """Linear classifier for sentiment classification."""
    def __init__(self, num_hiddens, num_outputs):
        super(SentimentClassifier, self).__init__()
        self.dense = nn.Linear(num_hiddens, num_outputs)

    def forward(self, X):
        # X shape: (batch_size, seq_length, num_hiddens)
        # Average pooling over the sequence length
        X = X.mean(dim=1)
        # Apply the linear layer
        return self.dense(X)

### Add the linear classifier on top of the Transformer encoder

In [96]:
num_hiddens, num_layers, dropout, batch_size = 32, 2, 0.1, 64
lr, num_epochs, device = 0.005, 10, d2l.try_gpu()
ffn_num_input, ffn_num_hiddens, num_heads = 32, 64, 2
key_size, query_size, value_size = 32, 32, 32
norm_shape = [32]
num_outputs = 2

encoder = TransformerEncoder(len(vocab), key_size, query_size, value_size,
                             num_hiddens, norm_shape, ffn_num_input,
                             ffn_num_hiddens, num_heads, num_layers, dropout)

classifier = SentimentClassifier(num_hiddens, num_outputs)

net = nn.Sequential(encoder, classifier)

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [97]:
# Move the model to the appropriate device
net.to(device)

Sequential(
  (0): TransformerEncoder(
    (embedding): Embedding(49346, 32)
    (pos_encoding): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (blks): Sequential(
      (block0): EncoderBlock(
        (attention): MultiHeadAttention(
          (attention): DotProductAttention(
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (W_q): Linear(in_features=32, out_features=32, bias=False)
          (W_k): Linear(in_features=32, out_features=32, bias=False)
          (W_v): Linear(in_features=32, out_features=32, bias=False)
          (W_o): Linear(in_features=32, out_features=32, bias=False)
        )
        (addnorm1): AddNorm(
          (dropout): Dropout(p=0.1, inplace=False)
          (ln): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        )
        (ffn): PositionWiseFFN(
          (dense1): Linear(in_features=32, out_features=64, bias=True)
          (relu): ReLU()
          (dense2): Linear(in_features=64, out_featur

### Training

In [98]:
# Define loss function and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.005)

In [99]:
torch.cuda.empty_cache()

In [101]:
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    net.train()
    for batch_idx, (data, label) in enumerate(train_iter):
        # Move data to the appropriate device
        # Calculate valid_lens for each batch
        max_len = 100  # Assuming max_len is the maximum sequence length you choose
        actual_lens = [min(len(batch), max_len) for batch in data]
        valid_lens = torch.tensor(actual_lens).to(device)

        data, valid_lens, label = data.to(device), valid_lens.to(device), label.to(device)

        optimizer.zero_grad()
        output_tr = net[0](data,valid_lens)
        # print(output_tr.shape)
        output = net[1](output_tr)
        loss = loss_fn(output, label)
        loss.backward()
        optimizer.step()
        
        # Print training statistics
        if batch_idx % 100 == 0:
            print('Epoch {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_iter.dataset),
                100. * batch_idx / len(train_iter), loss.item()))
    



In [103]:
# Validation

net.eval()  # Set the model to evaluation mode
total_correct = 0
total_samples = 0
with torch.no_grad():  # Disable gradient computation during validation
    for data, label in test_iter:
        # Move data to the appropriate device
        data, label = data.to(device), label.to(device)

        # Calculate valid_lens for each batch
        max_len = 100  # Assuming max_len is the maximum sequence length you choose
        actual_lens = [min(len(batch), max_len) for batch in data]
        valid_lens = torch.tensor(actual_lens).to(device)

        # Forward pass
        output_tr = net[0](data, valid_lens)
        output = net[1](output_tr)
        loss = loss_fn(output, label)

        # Compute predictions
        _, predicted = torch.max(output, 1)

        # Update total samples and total correct predictions
        total_samples += label.size(0)
        total_correct += (predicted == label).sum().item()

# Compute accuracy
accuracy = total_correct / total_samples
print('Validation Accuracy: {:.2f}%'.format(accuracy * 100))

Validation Accuracy: 80.00%
