In [1]:
%matplotlib inline

https://pytorch.org/tutorials/beginner/transformer_tutorial.html

# Define the model




In this tutorial, we train a ``nn.TransformerEncoder`` model on a
language modeling task. The language modeling task is to assign a
probability for the likelihood of a given word (or a sequence of words)
to follow a sequence of words. A sequence of tokens is passed to the embedding
layer first, followed by a positional encoding layer to account for the order
of the words (see the next paragraph for more details). The
``nn.TransformerEncoder`` consists of multiple layers of
[nn.TransformerEncoderLayer](https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoderLayer.html).
Along with the input sequence, a square attention mask is required because the
self-attention layers in ``nn.TransformerEncoder`` are only allowed to attend
to the earlier positions in the sequence. For the language modeling task, any
tokens at future positions should be masked. To score the output words, the
output of the ``nn.TransformerEncoder`` model is passed through a linear layer
that produces unnormalized logits. No log-softmax is applied inside the model,
because the ``CrossEntropyLoss`` used for training below expects raw logits.




In [9]:
import math
from typing import Tuple

import torch
from torch import nn, Tensor
import torch.nn.functional as F


class TransformerModel(nn.Module):
    """Language model: embedding -> positional encoding -> Transformer encoder -> linear head.

    Args:
        ntoken: size of vocabulary
        d_model: embedding dimension
        nhead: number of heads in nn.MultiheadAttention
        d_hid: dimension of the feedforward network model in nn.TransformerEncoder
        nlayers: number of nn.TransformerEncoderLayer in nn.TransformerEncoder
        dropout: dropout probability
    """

    def __init__(
            self,
            ntoken: int,
            d_model: int,
            nhead: int,
            d_hid: int,
            nlayers: int,
            dropout: float = 0.5
        ):
        super().__init__()
        self.model_type = 'Transformer'
        # Sub-modules are created in a fixed order so that seeded
        # initialisation stays reproducible.
        self.embedding = nn.Embedding(ntoken, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        single_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=d_hid, dropout=dropout
        )
        self.transformer_encoder = nn.TransformerEncoder(
            single_layer, num_layers=nlayers
        )
        self.d_model = d_model
        self.decoder = nn.Linear(d_model, ntoken)

        self.init_weights()

    def init_weights(self) -> None:
        """Re-initialise the embedding and output layer uniformly in [-0.1, 0.1]; zero the output bias."""
        bound = 0.1
        nn.init.uniform_(self.embedding.weight, -bound, bound)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -bound, bound)

    def forward(self, sequence: Tensor, src_mask: Tensor) -> Tensor:
        """Map token ids to per-position scores over the vocabulary.

        Args:
            sequence: Tensor, shape [seq_len, batch_size]
            src_mask: Tensor, shape [seq_len, seq_len]

        Returns:
            output Tensor of shape [seq_len, batch_size, ntoken]
        """
        # Embeddings are scaled by sqrt(d_model) before the positional
        # encoding is added.
        scaled = self.embedding(sequence) * math.sqrt(self.d_model)
        encoded = self.transformer_encoder(self.pos_encoder(scaled), src_mask)
        return self.decoder(encoded)


def generate_square_subsequent_mask(sz: int) -> Tensor:
    """Return the [sz, sz] causal mask: -inf strictly above the diagonal, 0 elsewhere.

    Row ``i`` therefore leaves positions ``0..i`` unmasked and masks every
    later position.
    """
    all_blocked = torch.full((sz, sz), float('-inf'))
    # Keep -inf only strictly above the main diagonal; everything else is 0.
    return all_blocked.triu(diagonal=1)

``PositionalEncoding`` module injects some information about the
relative or absolute position of the tokens in the sequence. The
positional encodings have the same dimension as the embeddings so that
the two can be summed. Here, we use ``sine`` and ``cosine`` functions of
different frequencies.




In [3]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to embeddings, then applies dropout.

    Even embedding channels hold ``sin(position * freq)`` and odd channels
    hold ``cos(position * freq)``. The table is precomputed for ``max_len``
    positions and stored as the non-trainable buffer ``pe`` of shape
    [max_len, 1, d_model].

    Args:
        d_model: embedding dimension (even or odd)
        dropout: dropout probability applied after adding the encoding
        max_len: number of positions precomputed in the table
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # An odd d_model has one fewer odd-indexed channel than even-indexed
        # ones, so drop the surplus frequency column to avoid a shape mismatch.
        pe[:, 0, 1::2] = torch.cos(angles[:, :d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]

        Returns:
            Tensor of the same shape as ``x``.
        """
        # pe[:seq_len] has shape [seq_len, 1, d_model] and broadcasts over the batch.
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

# Load and batch data




This tutorial uses ``torchtext`` to load the WikiText-2 dataset.
To access the torchtext datasets, install ``torchdata`` following the instructions at https://github.com/pytorch/data.

The vocab object is built based on the train dataset and is used to numericalize
tokens into tensors. Wikitext-2 represents rare tokens as `<unk>`.

Given a 1-D vector of sequential data, ``batchify()`` arranges the data
into ``batch_size`` columns. If the data does not divide evenly into
``batch_size`` columns, then the data is trimmed to fit. For instance, with
the alphabet as the data (total length of 26) and ``batch_size=4``, we would
divide the alphabet into 4 sequences of length 6:

$$
\begin{align}\begin{bmatrix}
  \text{A} & \text{B} & \text{C} & \ldots & \text{X} & \text{Y} & \text{Z}
  \end{bmatrix}
  \Rightarrow
  \begin{bmatrix}
  \begin{bmatrix}\text{A} \\ \text{B} \\ \text{C} \\ \text{D} \\ \text{E} \\ \text{F}\end{bmatrix} &
  \begin{bmatrix}\text{G} \\ \text{H} \\ \text{I} \\ \text{J} \\ \text{K} \\ \text{L}\end{bmatrix} &
  \begin{bmatrix}\text{M} \\ \text{N} \\ \text{O} \\ \text{P} \\ \text{Q} \\ \text{R}\end{bmatrix} &
  \begin{bmatrix}\text{S} \\ \text{T} \\ \text{U} \\ \text{V} \\ \text{W} \\ \text{X}\end{bmatrix}
  \end{bmatrix}\end{align}
$$

Batching enables more parallelizable processing. However, batching means that
the model treats each column independently; for example, the dependence of
``G`` on ``F`` cannot be learned in the example above.




In [5]:
import ao

# Root folder of the recordings, resolved by the project-local `ao` helper.
# NOTE(review): the meaning of `env='PRIMITIVE_EXPERIMENT'` is defined inside
# `ao` and is not visible in this notebook — confirm how it is looked up.
DATA_FOLDER = ao.dataset.utils.get_data_folder(env='PRIMITIVE_EXPERIMENT')

In [15]:
import ao
import numpy as np

from pathlib import Path
from torch.utils.data import IterableDataset, DataLoader

class PrimitiveExperimentDataset(IterableDataset):
    """Streams ``(features, speed)`` pairs, one per fixed-length audio frame.

    Every folder listed by ``ao.dataset.utils.list_data`` contributes the
    frames of its ``audio0crop.wav`` file. The sample rate is read once, from
    the first listed folder, and determines the frame length in samples.

    Args:
        data_folder: folder handed to ``ao.dataset.utils.list_data``
        frame_ms: frame duration in milliseconds
        num_features: feature count passed to the Gammatone filterbank extractor
    """

    def __init__(
        self,
        data_folder: Path,
        frame_ms: int = 500,
        num_features: int = 64,
        # TODO similar call signature to `plot_features`
        ):
        # TODO read config from yaml file
        self.data, self.naming = ao.dataset.utils.list_data(data_folder)
        # TODO split data in train and test
        first_folder = list(self.data.keys())[0]
        _, self.sample_rate = ao.io.wave_read(first_folder / 'audio0crop.wav')
        # Samples per frame, rounded up to a whole sample.
        samples_per_frame = np.ceil(frame_ms / 1000 * self.sample_rate)
        self.frame_samples = int(samples_per_frame)
        self.extract = ao.extractor.GammatoneFilterbank(
            self.frame_samples, num_features, self.sample_rate
        )

    def __iter__(self):
        """Yield ``(features, speed)`` for every complete frame of every recording."""
        for folder, parameters in self.data.items():
            audio, _ = ao.io.wave_read(folder / 'audio0crop.wav')
            # TODO do not average over channels
            mono = audio.mean(axis=1)
            # TODO get from wheel test bed
            speed = parameters['contact'] * parameters['w'] * 0.10
            # Only whole frames are used; a trailing partial frame is dropped.
            usable_samples = (mono.size // self.frame_samples) * self.frame_samples
            for frame_start in range(0, usable_samples, self.frame_samples):
                frame = mono[frame_start:frame_start + self.frame_samples]
                # TODO use speed label
                yield self.extract(frame), speed

In [18]:
# Build the audio dataset and a loader that groups 6 consecutive items per batch.
# NOTE(review): `train_data` is rebound to a WikiText-2 tensor later in this
# notebook; a distinct name for this loader would avoid the clash.
train_dataset = PrimitiveExperimentDataset(DATA_FOLDER)
train_data = DataLoader(train_dataset, batch_size=6)
# Smoke test: pull and show a single batch.
print(next(iter(train_data)))

[[tensor([11.1160, 14.1649, 12.4298, 15.5246, 11.4729, 12.1453],
       dtype=torch.float64), tensor([6.3744, 8.3411, 8.5589, 8.3561, 6.6650, 6.9198], dtype=torch.float64), tensor([4.5254, 5.6748, 6.4274, 5.0484, 4.4056, 4.1557], dtype=torch.float64), tensor([4.4976, 4.3082, 4.5307, 3.7558, 3.5565, 3.3138], dtype=torch.float64), tensor([5.1825, 4.2550, 4.2107, 3.8852, 3.4271, 2.9380], dtype=torch.float64), tensor([4.6273, 3.7570, 3.6136, 3.6034, 3.2606, 2.9658], dtype=torch.float64), tensor([3.4330, 2.9436, 3.0607, 3.0571, 2.8368, 2.6423], dtype=torch.float64), tensor([2.5493, 2.3039, 2.5361, 2.6235, 2.3857, 2.3147], dtype=torch.float64), tensor([2.3478, 1.7183, 2.1957, 2.2753, 1.8550, 2.1896], dtype=torch.float64), tensor([2.7531, 1.9256, 2.6044, 2.1041, 1.8589, 2.2956], dtype=torch.float64), tensor([3.1850, 2.4154, 2.8171, 2.2503, 2.2507, 2.5817], dtype=torch.float64), tensor([2.4670, 2.2888, 2.3313, 2.1812, 2.3155, 2.4717], dtype=torch.float64), tensor([1.9361, 2.1633, 2.0781, 2.037

In [8]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [6]:
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Build the vocabulary from the tokenized training split only.
train_iter = WikiText2(split='train')
tokenizer = get_tokenizer('basic_english')
# '<unk>' is registered as a special token so it always has an index.
vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
# Tokens missing from the vocabulary map to the index of '<unk>'.
vocab.set_default_index(vocab['<unk>']) 

def data_process(raw_text_iter: IterableDataset) -> Tensor:
    """Converts raw text into a flat Tensor.

    Each item is tokenized with the module-level ``tokenizer``, mapped to ids
    with the module-level ``vocab``, and the per-item id tensors are
    concatenated. Items that produce no tokens are skipped.

    Args:
        raw_text_iter: iterable yielding raw text items

    Returns:
        1-D ``torch.long`` Tensor holding all token ids in order.
    """
    # The annotation uses ``IterableDataset`` as imported from
    # torch.utils.data earlier in the notebook; the previous
    # ``dataset.IterableDataset`` referred to a name that is never imported.
    encoded = (
        torch.tensor(vocab(tokenizer(item)), dtype=torch.long)
        for item in raw_text_iter
    )
    return torch.cat([ids for ids in encoded if ids.numel() > 0])

# train_iter was "consumed" by the process of building the vocab,
# so we have to create it again
train_iter, val_iter, test_iter = WikiText2()
# Turn each split into one flat tensor of token ids.
# NOTE(review): `train_data` was bound to a DataLoader earlier in this
# notebook and is overwritten here.
train_data = data_process(train_iter)
val_data = data_process(val_iter)
test_data = data_process(test_iter)

# Prefer the GPU when CUDA is available; otherwise run on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def batchify(data: Tensor, bsz: int) -> Tensor:
    """Lay a flat sequence out as ``bsz`` parallel columns.

    Trailing elements that do not fill a complete row are dropped. Column
    ``j`` holds the ``j``-th contiguous chunk of the trimmed data. The result
    is moved to the module-level ``device``.

    Args:
        data: Tensor, shape [N]
        bsz: int, batch size

    Returns:
        Tensor of shape [N // bsz, bsz]
    """
    rows = data.size(0) // bsz
    trimmed = data[:rows * bsz]
    # Split into bsz chunks (one per row), then transpose so each chunk is a column.
    columns = trimmed.view(bsz, rows).transpose(0, 1).contiguous()
    return columns.to(device)

# Number of parallel columns for training and for evaluation.
batch_size = 20
eval_batch_size = 10
# NOTE(review): these lines rebind their own inputs, so re-running the cell
# would batchify already-batched tensors — run once per kernel or rename.
train_data = batchify(train_data, batch_size)  # shape [seq_len, batch_size]
val_data = batchify(val_data, eval_batch_size)
test_data = batchify(test_data, eval_batch_size)

### Functions to generate input and target sequences




``get_batch()`` generates a pair of input-target sequences for
the transformer model. It subdivides the source data into chunks of
length ``bptt``. For the language modeling task, the targets are the
input words shifted by one position, i.e. the words that follow. For example,
with a ``bptt`` value of 2, we’d get the following two tensors for ``i`` = 0:

![](../_static/img/transformer_input_target.png)


It should be noted that the chunks are along dimension 0, consistent
with the ``S`` dimension in the Transformer model. The batch dimension
``N`` is along dimension 1.




In [7]:
bptt = 35
def get_batch(source: Tensor, i: int) -> Tuple[Tensor, Tensor]:
    """Slice one input chunk and its next-token targets out of ``source``.

    The chunk starts at row ``i`` and is at most ``bptt`` rows long; it is
    shortened near the end so that every input row still has a following row
    to serve as its target.

    Args:
        source: Tensor, shape [full_seq_len, batch_size]
        i: int

    Returns:
        tuple (data, target), where data has shape [seq_len, batch_size] and
        target has shape [seq_len * batch_size]
    """
    rows_left = len(source) - 1 - i
    seq_len = bptt if bptt < rows_left else rows_left
    stop = i + seq_len
    inputs = source[i:stop]
    # Targets are the same window shifted down by one row, flattened.
    shifted = source[i + 1:stop + 1]
    return inputs, shifted.reshape(-1)

Initiate an instance
--------------------




The model hyperparameters are defined below. The vocab size is
equal to the length of the vocab object.




In [9]:
# Model hyperparameters. nn.MultiheadAttention requires the embedding
# dimension (emsize) to be divisible by the number of heads (nhead).
ntokens = len(vocab)  # size of vocabulary
emsize = 200  # embedding dimension
d_hid = 200  # dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2  # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2  # number of heads in nn.MultiheadAttention
dropout = 0.2  # dropout probability
# Build the model and move its parameters to the selected device.
model = TransformerModel(ntokens, emsize, nhead, d_hid, nlayers, dropout).to(device)

Run the model
-------------




We use [CrossEntropyLoss](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html)
with the [SGD](https://pytorch.org/docs/stable/generated/torch.optim.SGD.html)
(stochastic gradient descent) optimizer. The learning rate is initially set to
5.0 and follows a [StepLR](https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.StepLR.html)
schedule. During training, we use [nn.utils.clip_grad_norm_](https://pytorch.org/docs/stable/generated/torch.nn.utils.clip_grad_norm_.html)
to prevent gradients from exploding.




In [10]:
import copy
import time

# CrossEntropyLoss takes the raw (unnormalized) scores produced by the model.
criterion = nn.CrossEntropyLoss()
lr = 5.0  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
# Multiply the learning rate by 0.95 on every scheduler.step() (step_size of 1).
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

def train(model: nn.Module) -> None:
    """Run one pass over ``train_data``, updating ``model`` in place.

    Uses the module-level ``train_data``, ``bptt``, ``criterion``,
    ``optimizer``, ``scheduler``, ``ntokens`` and ``device``. The progress
    line also reads the module-level ``epoch``, so this function is meant to
    be called from the epoch loop. Progress is printed every 200 batches.
    """
    model.train()  # turn on train mode
    log_interval = 200
    total_loss = 0.
    start_time = time.time()
    full_mask = generate_square_subsequent_mask(bptt).to(device)

    num_batches = len(train_data) // bptt
    chunk_starts = range(0, train_data.size(0) - 1, bptt)
    for batch, i in enumerate(chunk_starts):
        data, targets = get_batch(train_data, i)
        seq_len = data.size(0)
        # Only the final chunk can be shorter than bptt; crop the mask to fit.
        src_mask = full_mask if seq_len == bptt else full_mask[:seq_len, :seq_len]
        scores = model(data, src_mask)
        loss = criterion(scores.view(-1, ntokens), targets)

        optimizer.zero_grad()
        loss.backward()
        # Clip the gradient norm to 0.5 to keep updates bounded.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        if batch == 0 or batch % log_interval != 0:
            continue

        # Report running statistics for the window that just ended.
        lr = scheduler.get_last_lr()[0]
        ms_per_batch = (time.time() - start_time) * 1000 / log_interval
        cur_loss = total_loss / log_interval
        ppl = math.exp(cur_loss)
        print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '
              f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
              f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
        total_loss = 0
        start_time = time.time()

def evaluate(model: nn.Module, eval_data: Tensor) -> float:
    """Return the mean loss of ``model`` over ``eval_data``.

    Each chunk's loss is weighted by its number of rows and the sum is
    divided by the number of rows that have a target, ``len(eval_data) - 1``.
    Uses the module-level ``bptt``, ``criterion``, ``ntokens`` and ``device``.

    Args:
        model: the model to score; it is switched to evaluation mode
        eval_data: Tensor, shape [full_seq_len, batch_size]
    """
    model.eval()  # turn on evaluation mode
    full_mask = generate_square_subsequent_mask(bptt).to(device)
    total_loss = 0.
    with torch.no_grad():
        for i in range(0, eval_data.size(0) - 1, bptt):
            data, targets = get_batch(eval_data, i)
            seq_len = data.size(0)
            # Only the final chunk can be shorter than bptt; crop the mask to fit.
            src_mask = full_mask if seq_len == bptt else full_mask[:seq_len, :seq_len]
            scores_flat = model(data, src_mask).view(-1, ntokens)
            total_loss += seq_len * criterion(scores_flat, targets).item()
    return total_loss / (len(eval_data) - 1)

Loop over epochs. Save the model if the validation loss is the best
we've seen so far. Adjust the learning rate after each epoch.



In [11]:
# Track the lowest validation loss seen so far and a snapshot of that model.
best_val_loss = float('inf')
epochs = 3
best_model = None

# `epoch` is also read as a global inside train() for its progress line.
for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(model)
    val_loss = evaluate(model, val_data)
    val_ppl = math.exp(val_loss)
    elapsed = time.time() - epoch_start_time
    print('-' * 89)
    print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
          f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
    print('-' * 89)

    # Keep an independent copy so later epochs cannot modify the best model.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = copy.deepcopy(model)

    # Decay the learning rate once per epoch.
    scheduler.step()

| epoch   1 |   200/ 2928 batches | lr 5.00 | ms/batch 401.02 | loss  8.08 | ppl  3244.57
| epoch   1 |   400/ 2928 batches | lr 5.00 | ms/batch 388.76 | loss  6.87 | ppl   964.44
| epoch   1 |   600/ 2928 batches | lr 5.00 | ms/batch 387.75 | loss  6.44 | ppl   623.78
| epoch   1 |   800/ 2928 batches | lr 5.00 | ms/batch 397.25 | loss  6.30 | ppl   543.00
| epoch   1 |  1000/ 2928 batches | lr 5.00 | ms/batch 389.22 | loss  6.18 | ppl   480.99
| epoch   1 |  1200/ 2928 batches | lr 5.00 | ms/batch 395.26 | loss  6.15 | ppl   467.25
| epoch   1 |  1400/ 2928 batches | lr 5.00 | ms/batch 392.69 | loss  6.11 | ppl   450.01
| epoch   1 |  1600/ 2928 batches | lr 5.00 | ms/batch 412.98 | loss  6.10 | ppl   447.81
| epoch   1 |  1800/ 2928 batches | lr 5.00 | ms/batch 388.14 | loss  6.02 | ppl   411.47
| epoch   1 |  2000/ 2928 batches | lr 5.00 | ms/batch 388.48 | loss  6.01 | ppl   408.59
| epoch   1 |  2200/ 2928 batches | lr 5.00 | ms/batch 406.52 | loss  5.88 | ppl   359.01
| epoch   

KeyboardInterrupt: 

Evaluate the best model on the test dataset
-------------------------------------------




In [None]:
# Score the best checkpoint on the held-out test split.
# NOTE(review): `best_model` is still None if the epoch loop was interrupted
# before finishing its first epoch; evaluate() would then fail.
test_loss = evaluate(best_model, test_data)
test_ppl = math.exp(test_loss)
print('=' * 89)
print(f'| End of training | test loss {test_loss:5.2f} | '
      f'test ppl {test_ppl:8.2f}')
print('=' * 89)