In [None]:
import time
import random
import numpy as np

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import torchvision as tv

In [None]:
"""
Create dummy dataset
"""
# Hyper-parameters shared by the cells below.
BATCH_SIZE = 32
NUM_DATA = BATCH_SIZE * 10
NUM_CLASSES = 10
DIM = 100
LR = 1e-2
VOCAB_SIZE = 100     # index 0 is reserved for PAD
MIN_L = 10
MAX_L = 50

# `torch.cuda.is_available` has to be *called*: the bare function object is
# always truthy, which would select "cuda:0" even on CPU-only machines.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

# dummy sequence data generation
# data format = [
#  [5, 21, 5, 9, 47, 8, 38, 1, 2],
#  [9, 5, 3, 1, 4, 6, 7, 24, 44, 12, 2, 1, 3],
#  [5, 1, 6, 4],
#  ...
# ]
# Token ids are drawn from [1, VOCAB_SIZE - 1] so that 0 never appears in real data;
# each sequence has a random length in [MIN_L, MAX_L].
dummy_data = [
    [random.randint(1, VOCAB_SIZE - 1) for _ in range(random.randint(MIN_L, MAX_L))]
    for _ in range(NUM_DATA)
]

# One integer class id in [0, NUM_CLASSES) per sequence (upper bound is exclusive).
dummy_label = np.random.randint(0, NUM_CLASSES, size=(NUM_DATA,))

print("max length: ", max(len(d) for d in dummy_data))
print("min length: ", min(len(d) for d in dummy_data))
print("max vocab index", max(max(d) for d in dummy_data))

print(len(dummy_data))
print(dummy_label.shape)


max length:  50
min length:  10
max vocab index 99
320
(320,)


In [None]:
"""
Data Module
""" 

class MyDataset(torch.utils.data.Dataset):
    """Index-aligned view over several parallel fields (for example inputs and labels).

    Args:
        data: list of fields such as ``[xs, ys]``; item ``i`` of the dataset is
            built from element ``i`` of every field.
        transformations: one entry per field; a callable applied to that field's
            element, or a falsy value (e.g. ``None``) to return the element unchanged.

    Raises:
        AssertionError: if ``data`` and ``transformations`` differ in length.
    """

    def __init__(self, data, transformations):
        # Every field needs exactly one (possibly empty) transformation slot.
        assert len(data) == len(transformations)

        self.data = data
        self.transformations = transformations

    def __getitem__(self, index):
        """Return a tuple holding element ``index`` of every field, transformed where requested."""
        sample = []
        for field, transform in zip(self.data, self.transformations):
            value = field[index]
            sample.append(transform(value) if transform else value)
        return tuple(sample)

    def __len__(self):
        """Number of samples, taken from the first field."""
        first_field = self.data[0]
        return len(first_field)

def getData_dataloader(x, y, batchSize):
    """Build a shuffling ``DataLoader`` that pads variable-length sequences per batch.

    Args:
        x: indexable collection of token-id sequences.
        y: indexable collection of labels aligned with ``x``.
        batchSize: number of samples per batch.

    Returns:
        A ``torch.utils.data.DataLoader`` whose batches are
        ``(padded_data, labels, lengths, mask)`` tuples.
    """
    # Neither field is transformed; MyDataset returns untransformed elements as they are.
    dataset = MyDataset([x, y], [None, None])

    def custom_collate_func(batch):
        # `batch` is a list of (sequence, label) pairs; regroup it into two tuples.
        sequences, targets = zip(*batch)

        # Original (unpadded) length of every sequence.
        seq_lengths = torch.LongTensor([len(seq) for seq in sequences])

        # Right-pad every sequence to the longest one in this batch.
        padded = torch.nn.utils.rnn.pad_sequence(
            [torch.LongTensor(seq) for seq in sequences], batch_first=True
        )

        # True wherever the padded tensor holds a non-zero token id.
        pad_mask = padded != 0

        return padded, torch.LongTensor(targets), seq_lengths, pad_mask

    return torch.utils.data.DataLoader(
        dataset,
        batch_size=batchSize,
        shuffle=True,
        num_workers=2,
        collate_fn=custom_collate_func,
    )




# 1. Implementation for Attention Block in Transformer's Encoder

The output shape should be torch.Size([32, L, 512])



In [None]:
import math
import torch
import torch.nn as nn

class TransformerAttention(nn.Module):
    """Single-head scaled dot-product self-attention followed by an output projection.

    Args:
        input_dim: size of the last dimension of the input features.
        output_dim: size of the last dimension of the returned features.
        q_dim: size of the query projection; keys use the same size so that
            query/key dot products are defined.
        v_dim: size of the value projection.
    """

    def __init__(self, input_dim, output_dim, q_dim, v_dim):
        super(TransformerAttention, self).__init__()
        # the dimension of q and k should be the same
        self.q_dim = q_dim
        self.k_dim = q_dim
        self.v_dim = v_dim

        self.inf = 1e8 # use in masks

        # Projection layers for query, key and value.
        self.query_proj = nn.Linear(input_dim, self.q_dim)
        self.key_proj = nn.Linear(input_dim, self.k_dim)
        self.value_proj = nn.Linear(input_dim, self.v_dim)

        self.linear = nn.Linear(v_dim, output_dim)

    def forward(self, data, lengths, weight_mask):
        """Apply self-attention to ``data``.

        Args:
            data: float tensor whose last two dimensions are (sequence, input_dim).
            lengths: unused here; kept so every block shares one call signature.
            weight_mask: tensor broadcastable to the (sequence, sequence) score
                matrix, zero where a query/key pair must be ignored; ``None``
                disables masking.

        Returns:
            Tensor shaped like ``data`` with the last dimension replaced by ``output_dim``.
        """
        # 1. Linear transformations for query, key and value.
        query = self.query_proj(data)
        key = self.key_proj(data)
        value = self.value_proj(data)

        # 2. Scaled dot-product scores between every query and every key.
        scores = query.matmul(key.transpose(-2, -1)) / math.sqrt(self.k_dim)

        blocked = None
        if weight_mask is not None:
            blocked = weight_mask == 0
            # A large finite negative (instead of -inf) keeps softmax free of NaNs
            # for rows in which every position is masked.
            scores = scores.masked_fill(blocked, -self.inf)

        weights = torch.softmax(scores, dim=-1)
        if blocked is not None:
            # Fully masked rows come out of softmax uniform; zero them so that
            # masked positions contribute nothing.
            weights = weights.masked_fill(blocked, 0.0)

        # 3. Weighted sum of the values.
        output = weights.matmul(value)

        return self.linear(output)

class TransformerBlock(nn.Module):
    """Encoder block made of a single ``TransformerAttention`` layer.

    ``num_layers`` and ``num_heads`` are accepted but not used by this block.
    """

    def __init__(self, model_dim=512, q_dim=64, v_dim=64, num_layers=2, num_heads=3):
        super().__init__()
        # Attention keeps the feature size: model_dim in, model_dim out.
        self.attention = TransformerAttention(model_dim, model_dim, q_dim, v_dim)

    def forward(self, data, lengths, weight_mask):
        """Return the attention layer's output for ``data``."""
        return self.attention(data, lengths, weight_mask)

class TransformerEncoder(nn.Module):
    """Token embedding followed by a stack of ``num_layers`` ``TransformerBlock`` layers."""

    def __init__(self, vocab_dim, model_dim=512, q_dim=64, v_dim=64, num_layers=2, num_heads=3):
        super().__init__()
        # Lookup table from token ids to model_dim-sized vectors.
        self.embedding = nn.Embedding(vocab_dim, model_dim)

        # nn.ModuleList registers the blocks as sub-modules of the encoder.
        blocks = [
            TransformerBlock(model_dim=model_dim, q_dim=q_dim, v_dim=v_dim,
                             num_layers=num_layers, num_heads=num_heads)
            for _ in range(num_layers)
        ]
        self.transformer_layers = nn.ModuleList(blocks)

    def forward(self, data, lengths, masks):
        """Embed ``data`` and run it through every block in order.

        ``masks`` is a 2-D (batch, sequence) tensor; it is turned into a
        (batch, sequence, sequence) weight mask handed to each block.
        """
        hidden = self.embedding(data)

        # Outer product of the mask with itself: entry (b, i, j) is the product
        # of mask entries (b, i) and (b, j).
        valid = masks.float()
        weight_mask = torch.bmm(valid.unsqueeze(-1), valid.unsqueeze(1))

        for layer in self.transformer_layers:
            hidden = layer(hidden, lengths, weight_mask)

        return hidden
######## 
# test #
########

def test(train_data):
    """Smoke test: push one batch through a fresh encoder and print the tensor shapes.

    ``train_data`` is a two-element sequence ``[sequences, labels]``.
    """
    model = TransformerEncoder(VOCAB_SIZE, 512)
    loader = getData_dataloader(train_data[0], train_data[1], BATCH_SIZE)
    for data, labels, lengths, masks in loader:
        # a single batch is enough to check the shapes
        print(data.shape, labels.shape, lengths.shape, masks.shape)
        print(model(data, lengths, masks).shape)
        break


test([dummy_data, dummy_label])

torch.Size([32, 48]) torch.Size([32]) torch.Size([32]) torch.Size([32, 48])
torch.Size([32, 48, 512])


# 2. Implementation for Multi-Head Attention Block in Transformer's Encoder

The output shape should be torch.Size([32, L, 512])

In [None]:
import math
import torch
import torch.nn as nn

class TransformerMultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention with a final output projection.

    Args:
        input_dim: size of the last dimension of the input features.
        output_dim: size of the last dimension of the returned features.
        q_dim: per-head query size; keys use the same size.
        v_dim: per-head value size.
        num_heads: number of attention heads computed in parallel.
    """

    def __init__(self, input_dim, output_dim, q_dim, v_dim, num_heads=3):
        super(TransformerMultiHeadAttention, self).__init__()
        # the dimension of q and k should be the same
        self.q_dim = q_dim
        self.k_dim = q_dim
        self.v_dim = v_dim
        self.num_heads = num_heads

        self.inf = 1e8  # magnitude used to suppress masked scores

        # One fused projection per role; its output is later split into heads.
        self.query_proj = nn.Linear(input_dim, num_heads * self.q_dim)
        self.key_proj = nn.Linear(input_dim, num_heads * self.k_dim)
        self.value_proj = nn.Linear(input_dim, num_heads * self.v_dim)

        self.linear = nn.Linear(num_heads * v_dim, output_dim)

    def _split_heads(self, projected, head_dim):
        """Reshape (batch, seq, heads * head_dim) into (batch, heads, seq, head_dim)."""
        batch_size, seq_len, _ = projected.shape
        return projected.view(batch_size, seq_len, self.num_heads, head_dim).transpose(1, 2)

    def forward(self, data, lengths, weight_mask):
        """Apply multi-head self-attention to ``data``.

        Args:
            data: float tensor of shape (batch, sequence, input_dim).
            lengths: unused here; kept so every block shares one call signature.
            weight_mask: (batch, sequence, sequence) tensor, zero where a
                query/key pair must be ignored; ``None`` disables masking.

        Returns:
            Tensor of shape (batch, sequence, output_dim).
        """
        batch_size, seq_len, _ = data.shape

        query = self._split_heads(self.query_proj(data), self.q_dim)
        key = self._split_heads(self.key_proj(data), self.k_dim)
        value = self._split_heads(self.value_proj(data), self.v_dim)

        # (batch, heads, seq, seq) attention scores.
        scores = query.matmul(key.transpose(-2, -1)) / math.sqrt(self.k_dim)

        blocked = None
        if weight_mask is not None:
            # Add a head axis so that one mask is shared by all heads.
            blocked = (weight_mask == 0).unsqueeze(1)
            # A large finite negative keeps softmax NaN-free on fully masked rows.
            scores = scores.masked_fill(blocked, -self.inf)

        weights = torch.softmax(scores, dim=-1)
        if blocked is not None:
            # Fully masked rows come out uniform; zero them explicitly.
            weights = weights.masked_fill(blocked, 0.0)

        # Weighted sum per head, then concatenate the heads along the feature axis.
        context = weights.matmul(value)
        output = context.transpose(1, 2).reshape(batch_size, seq_len, self.num_heads * self.v_dim)

        return self.linear(output)


class TransformerBlock(nn.Module):
    """Encoder block made of a single ``TransformerMultiHeadAttention`` layer.

    Args:
        model_dim: feature size entering and leaving the block.
        q_dim: per-head query/key size passed to the attention layer.
        v_dim: per-head value size passed to the attention layer.
        num_layers: accepted for signature compatibility; not used by this block.
        num_heads: number of attention heads.
    """

    def __init__(self, model_dim=512, q_dim=64, v_dim=64, num_layers=2, num_heads=3):
        super(TransformerBlock, self).__init__()
        # num_heads must be forwarded explicitly; otherwise the attention layer
        # silently falls back to its own default.
        self.attention = TransformerMultiHeadAttention(model_dim, model_dim, q_dim, v_dim, num_heads)

    def forward(self, data, lengths, weight_mask):
        """Return the attention layer's output for ``data``."""
        output = self.attention(data, lengths, weight_mask)
        return output

class TransformerEncoder(nn.Module):
    """Token embedding followed by a stack of ``num_layers`` ``TransformerBlock`` layers."""

    def __init__(self, vocab_dim, model_dim=512, q_dim=64, v_dim=64, num_layers=2, num_heads=3):
        super().__init__()
        # Lookup table from token ids to model_dim-sized vectors.
        self.embedding = nn.Embedding(vocab_dim, model_dim)

        # nn.ModuleList registers the blocks as sub-modules of the encoder.
        blocks = [
            TransformerBlock(model_dim=model_dim, q_dim=q_dim, v_dim=v_dim,
                             num_layers=num_layers, num_heads=num_heads)
            for _ in range(num_layers)
        ]
        self.transformer_layers = nn.ModuleList(blocks)

    def forward(self, data, lengths, masks):
        """Embed ``data`` and run it through every block in order.

        ``masks`` is a 2-D (batch, sequence) tensor; it is turned into a
        (batch, sequence, sequence) weight mask handed to each block.
        """
        hidden = self.embedding(data)

        # Outer product of the mask with itself: entry (b, i, j) is the product
        # of mask entries (b, i) and (b, j).
        valid = masks.float()
        weight_mask = torch.bmm(valid.unsqueeze(-1), valid.unsqueeze(1))

        for layer in self.transformer_layers:
            hidden = layer(hidden, lengths, weight_mask)

        return hidden
######## 
# test #
########

def test(train_data):
    """Smoke test: push one batch through a fresh encoder and print the tensor shapes.

    ``train_data`` is a two-element sequence ``[sequences, labels]``.
    """
    model = TransformerEncoder(VOCAB_SIZE, 512)
    loader = getData_dataloader(train_data[0], train_data[1], BATCH_SIZE)
    for data, labels, lengths, masks in loader:
        # a single batch is enough to check the shapes
        print(data.shape, labels.shape, lengths.shape, masks.shape)
        print(model(data, lengths, masks).shape)
        break


test([dummy_data, dummy_label])

torch.Size([32, 50]) torch.Size([32]) torch.Size([32]) torch.Size([32, 50])
torch.Size([32, 50, 512])


# 3. Implementation for Feed Forward Network and Layer Normalization in Transformer's Encoder

The output shape should be torch.Size([32, L, 512])

In [None]:
import math
import torch
import torch.nn as nn

class TransformerFFN(nn.Module):
    """
    Position-wise feed forward network: Linear -> ReLU -> Linear.

    Args:
        input_dim: size of the last dimension of the input.
        hidden_dim: size of the intermediate representation.
        output_dim: size of the last dimension of the output.
    """
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(TransformerFFN, self).__init__()
        self.expand = nn.Linear(input_dim, hidden_dim)
        self.activation = nn.ReLU()
        self.contract = nn.Linear(hidden_dim, output_dim)

    def forward(self, data):
        """Apply the two-layer network to the last dimension of ``data``."""
        hidden = self.activation(self.expand(data))
        return self.contract(hidden)

class TransformerAddNorm(nn.Module):
    """
    Add and layer normalization module.

    Args:
        input_shape: size (or shape) of the trailing dimension(s) to normalise,
            passed straight to ``nn.LayerNorm``.
    """
    def __init__(self, input_shape):
        super(TransformerAddNorm, self).__init__()
        self.norm = nn.LayerNorm(input_shape)

    def forward(self, data, residual=None):
        """Layer-normalise ``data``, optionally after adding ``residual``.

        Args:
            data: tensor whose trailing dimension(s) match ``input_shape``.
            residual: optional tensor added to ``data`` before normalisation;
                when omitted, ``data`` is normalised as given.

        Returns:
            Normalised tensor with the same shape as ``data``.
        """
        if residual is not None:
            data = data + residual
        return self.norm(data)

class TransformerBlock(nn.Module):
    """Encoder block: multi-head attention and a feed forward network, each
    wrapped in a residual connection followed by layer normalisation.

    Args:
        model_dim: feature size entering and leaving the block.
        q_dim: per-head query/key size passed to the attention layer.
        v_dim: per-head value size passed to the attention layer.
        num_layers: accepted for signature compatibility; not used by this block.
        num_heads: number of attention heads.
    """

    def __init__(self, model_dim=512, q_dim=64, v_dim=64, num_layers=2, num_heads=3):
        super(TransformerBlock, self).__init__()
        self.attention = TransformerMultiHeadAttention(model_dim, model_dim, q_dim, v_dim, num_heads)
        self.ffn = TransformerFFN(model_dim, model_dim*4, model_dim)

        self.addnorm_1 = TransformerAddNorm(model_dim)
        self.addnorm_2 = TransformerAddNorm(model_dim)

    def forward(self, data, lengths, weight_mask):
        """Run attention and the feed forward network, each as norm(x + sublayer(x))."""
        # The residual sum is formed here, so the add-norm modules only need
        # to accept a single tensor.
        attended = self.addnorm_1(data + self.attention(data, lengths, weight_mask))
        output = self.addnorm_2(attended + self.ffn(attended))
        return output

class TransformerEncoder(nn.Module):
    """Token embedding followed by a stack of ``num_layers`` ``TransformerBlock`` layers."""

    def __init__(self, vocab_dim, model_dim=512, q_dim=64, v_dim=64, num_layers=2, num_heads=3):
        super().__init__()
        # Lookup table from token ids to model_dim-sized vectors.
        self.embedding = nn.Embedding(vocab_dim, model_dim)

        # nn.ModuleList registers the blocks as sub-modules of the encoder.
        blocks = [
            TransformerBlock(model_dim=model_dim, q_dim=q_dim, v_dim=v_dim,
                             num_layers=num_layers, num_heads=num_heads)
            for _ in range(num_layers)
        ]
        self.transformer_layers = nn.ModuleList(blocks)

    def forward(self, data, lengths, masks):
        """Embed ``data`` and run it through every block in order.

        ``masks`` is a 2-D (batch, sequence) tensor; it is turned into a
        (batch, sequence, sequence) weight mask handed to each block.
        """
        hidden = self.embedding(data)

        # Outer product of the mask with itself: entry (b, i, j) is the product
        # of mask entries (b, i) and (b, j).
        valid = masks.float()
        weight_mask = torch.bmm(valid.unsqueeze(-1), valid.unsqueeze(1))

        for layer in self.transformer_layers:
            hidden = layer(hidden, lengths, weight_mask)

        return hidden
######## 
# test #
########

def test(train_data):
    """Smoke test: push one batch through a fresh encoder and print the tensor shapes.

    ``train_data`` is a two-element sequence ``[sequences, labels]``.
    """
    model = TransformerEncoder(VOCAB_SIZE, 512)
    loader = getData_dataloader(train_data[0], train_data[1], BATCH_SIZE)
    for data, labels, lengths, masks in loader:
        # a single batch is enough to check the shapes
        print(data.shape, labels.shape, lengths.shape, masks.shape)
        print(model(data, lengths, masks).shape)
        break


test([dummy_data, dummy_label])

torch.Size([32, 50]) torch.Size([32]) torch.Size([32]) torch.Size([32, 50])
torch.Size([32, 50, 512])


# 4. Positional Encoding in Transformer

The output shape should be torch.Size([32, L, 512])

In [None]:
class PositionEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    For position ``p`` and channel ``c`` the added value is
    ``sin(p / 10000 ** (c / model_dim))`` on even channels and
    ``cos(p / 10000 ** ((c - 1) / model_dim))`` on odd channels.

    Args:
        model_dim: size of the last dimension of the embeddings.
    """

    def __init__(self, model_dim):
        super(PositionEncoding, self).__init__()
        self.model_dim = model_dim

    def forward(self, data):
        """Return ``data`` plus the position encoding.

        Args:
            data: tensor whose last two dimensions are (sequence, model_dim).

        Returns:
            Tensor with the same shape and dtype as ``data``.

        Raises:
            ValueError: if the last dimension of ``data`` is not ``model_dim``.
        """
        if data.size(-1) != self.model_dim:
            raise ValueError(
                f"expected last dimension {self.model_dim}, got {data.size(-1)}"
            )

        seq_len = data.size(-2)
        device = data.device

        # Column vector of positions 0 .. seq_len - 1.
        positions = torch.arange(seq_len, device=device, dtype=torch.float32).unsqueeze(1)

        # Channels 2i and 2i + 1 share the frequency index 2i.
        channels = torch.arange(self.model_dim, device=device)
        paired = (channels - channels % 2).to(torch.float32)
        angles = positions / torch.pow(10000.0, paired / self.model_dim)

        # sin on even channels, cos on odd channels; also valid for odd model_dim.
        encoding = torch.where(channels % 2 == 0, torch.sin(angles), torch.cos(angles))

        # Broadcasts over any leading batch dimensions.
        return data + encoding.to(data.dtype)



class TransformerEncoder(nn.Module):
    """Token embedding plus position encoding, followed by ``num_layers`` ``TransformerBlock`` layers."""

    def __init__(self, vocab_dim, model_dim=512, q_dim=64, v_dim=64, num_layers=2, num_heads=3):
        super().__init__()
        # Lookup table from token ids to model_dim-sized vectors.
        self.embedding = nn.Embedding(vocab_dim, model_dim)
        # Applied to the embeddings before the first block.
        self.pos_embedding = PositionEncoding(model_dim)

        # nn.ModuleList registers the blocks as sub-modules of the encoder.
        blocks = [
            TransformerBlock(model_dim=model_dim, q_dim=q_dim, v_dim=v_dim,
                             num_layers=num_layers, num_heads=num_heads)
            for _ in range(num_layers)
        ]
        self.transformer_layers = nn.ModuleList(blocks)

    def forward(self, data, lengths, masks):
        """Embed ``data``, apply the position encoding and run every block in order.

        ``masks`` is a 2-D (batch, sequence) tensor; it is turned into a
        (batch, sequence, sequence) weight mask handed to each block.
        """
        hidden = self.pos_embedding(self.embedding(data))

        # Outer product of the mask with itself: entry (b, i, j) is the product
        # of mask entries (b, i) and (b, j).
        valid = masks.float()
        weight_mask = torch.bmm(valid.unsqueeze(-1), valid.unsqueeze(1))

        for layer in self.transformer_layers:
            hidden = layer(hidden, lengths, weight_mask)

        return hidden
######## 
# test #
########

def test(train_data):
    """Smoke test: push one batch through a fresh encoder and print the tensor shapes.

    ``train_data`` is a two-element sequence ``[sequences, labels]``.
    """
    model = TransformerEncoder(VOCAB_SIZE, 512)
    loader = getData_dataloader(train_data[0], train_data[1], BATCH_SIZE)
    for data, labels, lengths, masks in loader:
        # a single batch is enough to check the shapes
        print(data.shape, labels.shape, lengths.shape, masks.shape)
        print(model(data, lengths, masks).shape)
        break


test([dummy_data, dummy_label])

torch.Size([32, 46]) torch.Size([32]) torch.Size([32]) torch.Size([32, 46])
tensor([[[  0.,   0.,   1.,  ..., 254., 255., 255.]],

        [[  0.,   0.,   1.,  ..., 254., 255., 255.]],

        [[  0.,   0.,   1.,  ..., 254., 255., 255.]],

        ...,

        [[  0.,   0.,   1.,  ..., 254., 255., 255.]],

        [[  0.,   0.,   1.,  ..., 254., 255., 255.]],

        [[  0.,   0.,   1.,  ..., 254., 255., 255.]]])
tensor([[[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [1.0000e+00, 1.0000e+00, 9.6466e-01,  ..., 1.0746e-04,
          1.0366e-04, 1.0366e-04],
         [2.0000e+00, 2.0000e+00, 1.9293e+00,  ..., 2.1492e-04,
          2.0733e-04, 2.0733e-04],
         ...,
         [4.3000e+01, 4.3000e+01, 4.1480e+01,  ..., 4.6209e-03,
          4.4576e-03, 4.4576e-03],
         [4.4000e+01, 4.4000e+01, 4.2445e+01,  ..., 4.7283e-03,
          4.5612e-03, 4.5612e-03],
         [4.5000e+01, 4.5000e+01, 4.3410e+01,  ..., 4.8358e-03,
       

# 5. Test Back Propagation of Your Transformer

In [None]:
import time 
class TransformerClassification(nn.Module):
    """Sequence classifier: a ``TransformerEncoder`` followed by a linear layer
    applied to the encoder output at the first sequence position.

    Args:
        vocab_dim: vocabulary size passed to the encoder.
        model_dim: feature size of the encoder output and of the classifier input.
        num_classes: number of output logits.
        q_dim, v_dim, num_layers, num_heads: forwarded to the encoder.
    """

    def __init__(self, vocab_dim, model_dim=512, num_classes=2, q_dim=64, v_dim=64, num_layers=2, num_heads=3):
        super(TransformerClassification, self).__init__()
        # Forward the constructor arguments instead of hard-coded values, so that
        # the encoder width always matches the input size of output_layer.
        self.encoder = TransformerEncoder(vocab_dim, model_dim=model_dim, q_dim=q_dim, v_dim=v_dim,
                                          num_layers=num_layers, num_heads=num_heads)
        self.output_layer = nn.Linear(model_dim, num_classes)

    def forward(self, data, lengths, masks):
        """Return class logits computed from index 0 along dimension 1 of the encoder output."""
        encoded = self.encoder(data, lengths, masks)
        return self.output_layer(encoded[:, 0])
def test(train_data):
    """Train the classifier on ``train_data`` for 20 epochs.

    ``train_data`` is a two-element sequence ``[sequences, labels]``. After each
    epoch the elapsed wall-clock time and the mean batch loss are printed.
    """
    model = TransformerClassification(VOCAB_SIZE, 512, num_classes=NUM_CLASSES).to(DEVICE)
    model.train()
    opt = torch.optim.Adam(model.parameters(), lr=0.0001)
    loss_function = nn.CrossEntropyLoss()

    inputs, targets = train_data[0], train_data[1]
    for epoch in range(20):
        losses = []
        started = time.time()
        # A fresh loader is built for every epoch.
        for batch in getData_dataloader(inputs, targets, BATCH_SIZE):
            # Move every tensor of the batch to the training device.
            data, labels, lengths, masks = (tensor.to(DEVICE) for tensor in batch)
            loss = loss_function(model(data, lengths, masks), labels)

            opt.zero_grad()
            loss.backward()
            opt.step()

            losses.append(loss.item())
        print(time.time() - started)
        print(f"epoch {epoch}, loss: {np.mean(losses):.3f}")


test([dummy_data, dummy_label])

0.5074501037597656
epoch 0, loss: 2.746
0.5081014633178711
epoch 1, loss: 2.385
0.49962329864501953
epoch 2, loss: 2.325
0.5067362785339355
epoch 3, loss: 2.248
0.5004696846008301
epoch 4, loss: 2.114
0.5036993026733398
epoch 5, loss: 1.933
0.5031535625457764
epoch 6, loss: 1.654
0.5001282691955566
epoch 7, loss: 1.433
0.5062453746795654
epoch 8, loss: 1.113
0.5050177574157715
epoch 9, loss: 0.598
0.5157895088195801
epoch 10, loss: 0.315
0.4961247444152832
epoch 11, loss: 0.229
0.4985818862915039
epoch 12, loss: 0.197
0.5039937496185303
epoch 13, loss: 0.206
0.5086092948913574
epoch 14, loss: 0.158
0.5034294128417969
epoch 15, loss: 0.102
0.506798267364502
epoch 16, loss: 0.067
0.5046491622924805
epoch 17, loss: 0.054
0.5069537162780762
epoch 18, loss: 0.031
0.501335620880127
epoch 19, loss: 0.025


# Transformer Decoder

In [None]:
# TODO
# Finish this class
class TransformerDecoder(nn.Module):
    """Placeholder for the Transformer decoder (TODO: not implemented yet).

    The constructor only records its hyper-parameters so that ``forward`` can
    validate its inputs; no layers are created.
    """

    def __init__(self, vocab_dim, model_dim=512, q_dim=64, v_dim=64, num_layers=2, num_heads=3):
        super(TransformerDecoder, self).__init__()
        # forward() reads num_layers, so it has to be stored here.
        self.vocab_dim = vocab_dim
        self.model_dim = model_dim
        self.q_dim = q_dim
        self.v_dim = v_dim
        self.num_layers = num_layers
        self.num_heads = num_heads

    def forward(self, data, max_length, encoder_outputs):
        """Validate the inputs; decoding itself is still to be written.

        Raises:
            AssertionError: if ``len(encoder_outputs)`` differs from ``num_layers``.
            NotImplementedError: always, once validation passes.
        """
        assert len(encoder_outputs) == self.num_layers, "The number of encoding layers should be the same as decoding layers"
        # Fail loudly instead of silently returning None, like the other
        # unfinished modules in this notebook.
        raise NotImplementedError("TransformerDecoder.forward is not implemented yet")

# TODO
# Finish the class
class Transformer(nn.Module):
    """Encoder-decoder wrapper (TODO: the decoder side is still unfinished).

    Args:
        vocab_dim: vocabulary size passed to the encoder and the decoder.
        model_dim, q_dim, v_dim, num_layers, num_heads: forwarded unchanged to
            both the encoder and the decoder.
        max_gen_length: value handed to the decoder as its ``max_length`` argument.
    """

    def __init__(self, vocab_dim, model_dim=512, q_dim=64, v_dim=64, num_layers=2, num_heads=3, max_gen_length=50):
        super(Transformer, self).__init__()
        self.max_gen_length = max_gen_length
        # Forward the constructor arguments rather than hard-coded values so
        # that non-default configurations take effect.
        shared = dict(model_dim=model_dim, q_dim=q_dim, v_dim=v_dim, num_layers=num_layers, num_heads=num_heads)
        self.encoder = TransformerEncoder(vocab_dim, **shared)
        self.decoder = TransformerDecoder(vocab_dim, **shared)

    def forward(self, data, lengths, masks):
        """Encode ``data`` and return whatever the decoder produces from it."""
        e_outputs = self.encoder(data, lengths, masks)

        # NOTE(review): passing the source `data` as the decoder input is an
        # assumption — confirm which tensor the decoder is supposed to consume.
        d_output = self.decoder(data, self.max_gen_length, e_outputs)

        return d_output


######## 
# test #
########

def test(train_data):
    """Smoke test: run one batch through a fresh ``Transformer`` and print the result.

    ``train_data`` is a two-element sequence ``[sequences, labels]``.
    """
    model = Transformer(VOCAB_SIZE, 512)
    loader = getData_dataloader(train_data[0], train_data[1], BATCH_SIZE)
    for data, labels, lengths, masks in loader:
        # a single batch is enough for this check
        print(data.shape, labels.shape, lengths.shape, masks.shape)
        print(model(data, lengths, masks))
        break


test([dummy_data, dummy_label])