In [1]:
import sys
from time import time
from typing import Callable, Iterator, List, Optional

import torch
from torch import nn
from torch.nn import functional as F

# Make the repository root importable before pulling in the local package.
sys.path.append("..")

from transformer.model import EncoderDecoder, make_model
from transformer.modules import Generator
from transformer.utils.loss import NoamOpt, LabelSmoothing

from transformer.datasets import IWSLTIterator, get_datasets_and_vocab, batch_size_fn, rebatch, Batch

## Parallel Training on GPUs
I have 2 GPUs. To make the best use of both, we train in a data-parallel setting, where we
* replicate - create a copy of the same model on both (read: all) GPUs.
* scatter - split each data batch across the different GPUs.
* parallel_apply - apply the model replicas to their batch shards on the GPUs.
* gather - gather the losses from all GPUs onto one device and apply the gradients.

In [2]:
# Report how many CUDA devices PyTorch can see.
visible_gpu_count = torch.cuda.device_count()
print(visible_gpu_count)

2


In [3]:
def run_epoch(data_iter: Iterator["Batch"], model: nn.Module,
              loss_compute: Callable[..., float]):
    """Run one pass over ``data_iter`` and return the average loss per token.

    Args:
        data_iter: Iterator of batches. Each batch must expose ``src``, ``trg``,
            ``src_mask``, ``trg_mask``, ``trg_y`` and ``ntokens``.
        model: Module called as ``model(src=..., tgt=..., src_mask=..., tgt_mask=...)``.
            In this notebook it is an ``nn.DataParallel`` wrapper, hence the
            generic ``nn.Module`` annotation.
        loss_compute: Callable invoked as ``loss_compute(out, batch.trg_y, batch.ntokens)``.
            It returns the batch loss, which is accumulated here and divided by
            the token count. Any optimizer step is the callable's responsibility;
            this function never touches an optimizer.

    Returns:
        ``total_loss / total_tokens`` over the whole iterator, or ``0.0`` when the
        iterator yields no tokens (instead of raising ``ZeroDivisionError``).
    """
    start = time()
    total_tokens = 0
    total_loss = 0
    tokens = 0  # tokens seen since the last progress line
    for i, batch in enumerate(data_iter):
        # Call the module itself rather than ``.forward`` so registered hooks run.
        out = model(
            src=batch.src, tgt=batch.trg,
            src_mask=batch.src_mask,
            tgt_mask=batch.trg_mask,
        )
        loss = loss_compute(out, batch.trg_y, batch.ntokens)
        total_loss += loss
        total_tokens += batch.ntokens
        tokens += batch.ntokens

        if i % 50 == 1:
            # Clamp so a zero timer delta cannot divide by zero.
            elapsed = max(time() - start, 1e-9)
            print(
                f"Epoch step: {i} Loss: {loss / batch.ntokens} "
                f"Tokens per sec: {tokens/elapsed}"
            )
            start = time()
            tokens = 0

    # An empty iterator (or one with only zero-token batches) has no average.
    if total_tokens == 0:
        return 0.0
    return total_loss / total_tokens

In [17]:
class MultiGPULossCompute:
    """Compute the generator + criterion loss across several GPUs, in chunks.

    The criterion is replicated onto ``devices`` once at construction; the
    generator is re-replicated on every call. Each call scatters the model
    output and the targets across the devices, then walks dimension 1 in slices
    of ``chunk_size``, running generator and criterion on each slice. When an
    optimizer wrapper is supplied, the call also backpropagates through ``out``
    and performs one optimizer step.
    """

    def __init__(self, generator: Generator,
                 criterion: nn.Module,
                 devices: List[int],
                 opt: Optional[NoamOpt] = None,
                 chunk_size: int = 5) -> None:
        """
        Args:
            generator: Module applied to each slice of the model output.
            criterion: Loss module called as ``criterion(prediction, target)``.
                It is replicated with ``nn.parallel.replicate``, so it must be
                an ``nn.Module`` (not an optimizer).
            devices: CUDA device indices to spread the work over.
            opt: Optimizer wrapper exposing ``step()`` and ``optimizer.zero_grad()``.
                ``None`` means loss evaluation only (no backward pass).
            chunk_size: Number of positions along dimension 1 handled per slice.
        """
        self.generator = generator
        self.criterion = nn.parallel.replicate(criterion, devices=devices)
        self.opt = opt
        self.devices = devices
        self.chunk_size = chunk_size

    def __call__(self, out, targets, norm):
        """Return the total loss for one batch, optionally stepping the optimizer.

        Args:
            out: Model output; scattered across devices and sliced along dimension 1.
            targets: Target tensor; scattered and sliced the same way as ``out``.
            norm: Normalizer each chunk loss is divided by before ``backward``.

        Returns:
            ``total * norm``, where ``total`` is the sum of the normalized chunk
            losses, i.e. the un-normalized loss of the batch.
        """
        training = self.opt is not None
        total = 0.0

        out_scatter = nn.parallel.scatter(out, target_gpus=self.devices)
        targets = nn.parallel.scatter(targets, target_gpus=self.devices)

        # A batch smaller than the device count scatters into fewer shards than
        # devices; ``parallel_apply`` needs one module per input, so only use as
        # many replicas as there are shards (as ``nn.DataParallel`` does).
        n_shards = len(out_scatter)
        generator = nn.parallel.replicate(self.generator, devices=self.devices[:n_shards])
        criterion = self.criterion[:n_shards]

        # One list of per-chunk gradients for every shard.
        out_grad = [[] for _ in out_scatter]

        for start in range(0, out_scatter[0].size(1), self.chunk_size):
            stop = start + self.chunk_size

            # Detached leaf copies: gradients stop here and are collected manually,
            # so only one chunk's generator/criterion graph is alive at a time.
            out_column = [
                [o[:, start:stop].detach().clone().requires_grad_(training)]
                for o in out_scatter
            ]
            gen = nn.parallel.parallel_apply(generator, out_column)

            # Flatten predictions to (-1, last_dim) and targets to (-1,) per shard.
            y = [(g.contiguous().view(-1, g.size(-1)),
                  t[:, start:stop].contiguous().view(-1))
                 for g, t in zip(gen, targets)]

            loss = nn.parallel.parallel_apply(criterion, y)

            chunk_loss = nn.parallel.gather(loss, target_device=self.devices[0])
            chunk_loss = chunk_loss.sum() / norm

            total += chunk_loss.item()

            if training:
                chunk_loss.backward()
                for shard_grads, column in zip(out_grad, out_column):
                    shard_grads.append(column[0].grad.detach().clone())

        if training:
            # Stitch the chunk gradients back together along dimension 1, gather
            # the shards, and push the result through the rest of the model.
            out_grad = [torch.cat(og, dim=1) for og in out_grad]
            gathered_grad = nn.parallel.gather(out_grad, target_device=self.devices[0])

            out.backward(gradient=gathered_grad)
            self.opt.step()
            self.opt.optimizer.zero_grad()

        return total * norm



In [5]:
# Use every CUDA device PyTorch can see, identified by index.
devices = list(range(torch.cuda.device_count()))
print(devices)



[0, 1]


In [6]:
%%time
SRC, TGT, train, val, test = get_datasets_and_vocab(
    dataset_path="../transformer/datasets/.data"
)

CPU times: user 48.6 s, sys: 412 ms, total: 49 s
Wall time: 49 s


In [22]:
BATCH_SIZE = 1200  # used together with batch_size_fn; presumably a token budget — TODO confirm
pad_idx = TGT.vocab.stoi["<blank>"]

# Model and loss both live on the GPU.
model = make_model(len(SRC.vocab), len(TGT.vocab), N=6)
model.cuda()

criterion = LabelSmoothing(size=len(TGT.vocab), padding_idx=pad_idx, smoothing=0.1)
criterion.cuda()

# Settings shared by the training and validation iterators; batches land on GPU 0.
iterator_kwargs = dict(
    batch_size=BATCH_SIZE,
    device=torch.device(0),
    repeat=False,
    sort_key=lambda x: (len(x.src), len(x.trg)),
    batch_size_fn=batch_size_fn,
)
train_iter = IWSLTIterator(train, train=True, **iterator_kwargs)
valid_iter = IWSLTIterator(val, train=False, **iterator_kwargs)

# Wrap the model so the forward pass is spread over all listed devices.
model_par = nn.DataParallel(model, device_ids=devices)

In [23]:
# Adam starts at lr=0; the NoamOpt wrapper is expected to drive the learning rate — TODO confirm.
adam = torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
model_opt = NoamOpt(model.d_model, 1, 2000, adam)

In [21]:
# Build the loss computers once: each constructor replicates the criterion onto
# every device, so rebuilding them each epoch only repeats that work.
train_loss_compute = MultiGPULossCompute(
    model.generator, criterion, devices=devices, opt=model_opt)
# opt=None: evaluate the loss only, with no backward pass or optimizer step.
# (The original passed the undefined name `device` here, a NameError.)
valid_loss_compute = MultiGPULossCompute(
    model.generator, criterion, devices=devices, opt=None)

for epoch in range(10):
    model_par.train()
    run_epoch(
        (rebatch(pad_idx, b) for b in train_iter),
        model_par,
        train_loss_compute,
    )

    model_par.eval()
    # Validation needs no gradients; disabling autograd avoids holding the
    # forward graph in GPU memory.
    with torch.no_grad():
        loss = run_epoch(
            (rebatch(pad_idx, b) for b in valid_iter),
            model_par,
            valid_loss_compute,
        )

    print(loss)

RuntimeError: CUDA out of memory. Tried to allocate 14.00 MiB (GPU 1; 7.80 GiB total capacity; 6.74 GiB already allocated; 10.31 MiB free; 6.85 GiB reserved in total by PyTorch)