**Translation - English to Swedish using Transformers**

**Installing necessary packages**

In [None]:
!pip install torch
!pip install torchtext
!pip install tqdm
!pip install datasets
!pip install pathlib
!pip install tokenizers
!pip install wandb
!pip install torchmetrics

**Transformer model**

In [None]:
import torch
import torch.nn as nn
import math

class LayerNormalization(nn.Module):
    """Normalise the last dimension of the input, then apply a learnable scale and shift.

    Args:
        features: Size of the last dimension; one ``alpha`` and one ``bias`` value per feature.
        eps: Small constant added to the standard deviation to avoid dividing by (near) zero.
    """

    def __init__(self, features: int, eps:float=10**-6) -> None:
        super().__init__()
        self.eps = eps
        # Learnable multiplicative gain, initialised to ones.
        self.alpha = nn.Parameter(torch.ones(features))
        # Learnable additive offset, initialised to zeros.
        self.bias = nn.Parameter(torch.zeros(features))

    def forward(self, x):
        """Return ``alpha * (x - mean) / (std + eps) + bias`` computed over the last dimension.

        ``mean`` and ``std`` keep the reduced dimension (size 1) so they broadcast against ``x``.
        """
        centered = x - x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True)
        return self.alpha * centered / (spread + self.eps) + self.bias

class FeedForwardBlock(nn.Module):
    """Two linear layers with a ReLU and dropout in between.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size between the two linear layers.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        # Expansion layer (w1, b1): d_model -> d_ff
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        # Contraction layer (w2, b2): d_ff -> d_model
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Map ``(..., d_model)`` to ``(..., d_ff)`` and back to ``(..., d_model)``."""
        hidden = torch.relu(self.linear_1(x))
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)

class InputEmbeddings(nn.Module):
    """Token embedding lookup whose output is scaled by ``sqrt(d_model)``.

    Args:
        d_model: Size of each embedding vector.
        vocab_size: Number of rows in the embedding table.
    """

    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        """Look up the embeddings of the ids in ``x`` and multiply them by ``sqrt(d_model)``.

        The scaling follows the original Transformer paper.
        """
        scale = math.sqrt(self.d_model)
        return self.embedding(x) * scale

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    The table ``pe`` has shape ``(1, seq_len, d_model)`` and is registered as a buffer,
    so it moves with the module across devices but is not a trainable parameter.

    Args:
        d_model: Feature size of the input. Odd values are supported: the last (even)
            index then only receives a sine component.
        seq_len: Number of positions precomputed in the table.
        dropout: Dropout probability applied after adding the encodings.
    """

    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)
        # Table of shape (seq_len, d_model), filled in below
        pe = torch.zeros(seq_len, d_model)
        # Column vector of positions: (seq_len, 1)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        # 1 / 10000 ** (2i / d_model) for every even index 2i, computed in log space: (ceil(d_model / 2),)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (seq_len, ceil(d_model / 2))
        # Even indices: sin(position / 10000 ** (2i / d_model))
        pe[:, 0::2] = torch.sin(angles)
        # Odd indices: cos(position / 10000 ** (2i / d_model)).
        # With an odd d_model there is one fewer odd index than even index, so trim the angles.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Add a batch dimension: (1, seq_len, d_model)
        pe = pe.unsqueeze(0)
        # Register as a buffer: saved in the state dict, never updated by the optimizer
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encodings of the first ``x.shape[1]`` positions to ``x`` and apply dropout."""
        # The buffer does not require grad, so the slice can be added directly.
        x = x + self.pe[:, :x.shape[1], :]  # (batch, seq_len, d_model)
        return self.dropout(x)

class ResidualConnection(nn.Module):
    """Skip connection around a sublayer: ``x + dropout(sublayer(norm(x)))``.

    Args:
        features: Feature size handed to the internal ``LayerNormalization``.
        dropout: Dropout probability applied to the sublayer output.
    """

    def __init__(self, features: int, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization(features)

    def forward(self, x, sublayer):
        """Normalise ``x``, run ``sublayer`` on it, apply dropout and add the result back to ``x``.

        ``sublayer`` is any callable taking the normalised tensor and returning a tensor
        that can be added to ``x``.
        """
        normed = self.norm(x)
        update = self.dropout(sublayer(normed))
        return x + update

class MultiHeadAttentionBlock(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        d_model: Embedding vector size; must be divisible by ``h``.
        h: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.

    After each forward pass the attention weights of that pass are available as
    ``self.attention_scores`` (e.g. for visualisation).
    """

    def __init__(self, d_model: int, h: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.h = h
        # Each head works on an equal slice of the embedding, so the split must be exact
        assert d_model % h == 0, "d_model is not divisible by h"

        # Size of the slice seen by each head
        self.d_k = d_model // h
        # Bias-free projections for queries, keys, values and the merged output
        self.w_q = nn.Linear(d_model, d_model, bias=False)
        self.w_k = nn.Linear(d_model, d_model, bias=False)
        self.w_v = nn.Linear(d_model, d_model, bias=False)
        self.w_o = nn.Linear(d_model, d_model, bias=False)
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        """Compute ``softmax(Q K^T / sqrt(d_k)) V``.

        Positions where ``mask == 0`` get a score of ``-1e9`` before the softmax.
        ``mask`` and ``dropout`` may each be ``None`` to skip that step.

        Returns:
            A tuple ``(output, weights)``: the weighted values and the attention weights
            (after softmax and dropout).
        """
        head_dim = query.shape[-1]
        # (batch, h, seq_len, d_k) @ (batch, h, d_k, seq_len) --> (batch, h, seq_len, seq_len)
        weights = (query @ key.transpose(-2, -1)) / math.sqrt(head_dim)
        if mask is not None:
            # A very low score stands in for -inf so masked positions vanish in the softmax
            weights.masked_fill_(mask == 0, -1e9)
        weights = weights.softmax(dim=-1)
        if dropout is not None:
            weights = dropout(weights)
        # (batch, h, seq_len, seq_len) @ (batch, h, seq_len, d_k) --> (batch, h, seq_len, d_k)
        return (weights @ value), weights

    def forward(self, q, k, v, mask):
        """Project ``q``, ``k``, ``v``, attend per head, merge the heads and apply ``w_o``."""

        def split_heads(projected):
            # (batch, seq_len, d_model) --> (batch, seq_len, h, d_k) --> (batch, h, seq_len, d_k)
            batch, length = projected.shape[0], projected.shape[1]
            return projected.view(batch, length, self.h, self.d_k).transpose(1, 2)

        query = split_heads(self.w_q(q))
        key = split_heads(self.w_k(k))
        value = split_heads(self.w_v(v))

        per_head, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)

        # (batch, h, seq_len, d_k) --> (batch, seq_len, h, d_k) --> (batch, seq_len, d_model)
        batch = per_head.shape[0]
        merged = per_head.transpose(1, 2).contiguous().view(batch, -1, self.h * self.d_k)

        # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        return self.w_o(merged)

class EncoderBlock(nn.Module):
    """One encoder layer: self-attention then feed-forward, each inside a residual connection.

    Args:
        features: Feature size passed to the two ``ResidualConnection`` wrappers.
        self_attention_block: Attention module called with the same tensor as query, key and value.
        feed_forward_block: Module applied after the attention step.
        dropout: Dropout probability used by the residual connections.
    """

    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        # Index 0 wraps the attention step, index 1 wraps the feed-forward step
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(2)])

    def forward(self, x, src_mask):
        """Run ``x`` through self-attention (masked with ``src_mask``) and the feed-forward block."""

        def self_attention(normed):
            return self.self_attention_block(normed, normed, normed, src_mask)

        attention_residual, feed_forward_residual = self.residual_connections
        x = attention_residual(x, self_attention)
        return feed_forward_residual(x, self.feed_forward_block)

class Encoder(nn.Module):
    """Stack of encoder layers followed by a final layer normalisation.

    Args:
        features: Feature size handed to the final ``LayerNormalization``.
        layers: The layers to apply in order; each is called as ``layer(x, mask)``.
    """

    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, mask):
        """Feed ``x`` through every layer in turn, then normalise the result."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

class DecoderBlock(nn.Module):
    """One decoder layer: self-attention, cross-attention and feed-forward, each with a residual.

    Args:
        features: Feature size passed to the three ``ResidualConnection`` wrappers.
        self_attention_block: Attention over the decoder input itself.
        cross_attention_block: Attention with decoder queries and encoder-output keys/values.
        feed_forward_block: Module applied after both attention steps.
        dropout: Dropout probability used by the residual connections.
    """

    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, cross_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        # Index 0: self-attention, index 1: cross-attention, index 2: feed-forward
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(3)])

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Apply the three sublayers in order.

        Self-attention uses ``tgt_mask``; cross-attention reads ``encoder_output`` as keys and
        values and uses ``src_mask``.
        """

        def self_attention(normed):
            return self.self_attention_block(normed, normed, normed, tgt_mask)

        def cross_attention(normed):
            return self.cross_attention_block(normed, encoder_output, encoder_output, src_mask)

        self_residual, cross_residual, feed_forward_residual = self.residual_connections
        x = self_residual(x, self_attention)
        x = cross_residual(x, cross_attention)
        return feed_forward_residual(x, self.feed_forward_block)

class Decoder(nn.Module):
    """Stack of decoder layers followed by a final layer normalisation.

    Args:
        features: Feature size handed to the final ``LayerNormalization``.
        layers: The layers to apply in order; each is called as
            ``layer(x, encoder_output, src_mask, tgt_mask)``.
    """

    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Feed ``x`` through every layer in turn, then normalise the result."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, encoder_output, src_mask, tgt_mask)
        return self.norm(hidden)

class ProjectionLayer(nn.Module):
    """Linear map from model features to vocabulary logits.

    Args:
        d_model: Size of the incoming feature vectors.
        vocab_size: Number of output logits per position.
    """

    def __init__(self, d_model, vocab_size) -> None:
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x) -> torch.Tensor:
        """Project ``x`` from ``(batch, seq_len, d_model)`` to ``(batch, seq_len, vocab_size)``."""
        return self.proj(x)

class Transformer(nn.Module):
    """Encoder-decoder model assembled from prebuilt components.

    The module exposes three separate steps (``encode``, ``decode``, ``project``) instead of a
    single ``forward``, so the encoder output can be computed once and reused.

    Args:
        encoder: Encoder stack.
        decoder: Decoder stack.
        src_embed: Embedding for source token ids.
        tgt_embed: Embedding for target token ids.
        src_pos: Positional encoding applied to the source embeddings.
        tgt_pos: Positional encoding applied to the target embeddings.
        projection_layer: Maps decoder output to vocabulary logits.
    """

    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: InputEmbeddings, tgt_embed: InputEmbeddings, src_pos: PositionalEncoding, tgt_pos: PositionalEncoding, projection_layer: ProjectionLayer) -> None:
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):
        """Embed the source ids, add positions and run the encoder: (batch, seq_len, d_model)."""
        embedded = self.src_pos(self.src_embed(src))
        return self.encoder(embedded, src_mask)

    def decode(self, encoder_output: torch.Tensor, src_mask: torch.Tensor, tgt: torch.Tensor, tgt_mask: torch.Tensor):
        """Embed the target ids, add positions and run the decoder: (batch, seq_len, d_model)."""
        embedded = self.tgt_pos(self.tgt_embed(tgt))
        return self.decoder(embedded, encoder_output, src_mask, tgt_mask)

    def project(self, x):
        """Turn decoder output into vocabulary logits: (batch, seq_len, vocab_size)."""
        return self.projection_layer(x)

def build_transformer(src_vocab_size: int, tgt_vocab_size: int, src_seq_len: int, tgt_seq_len: int, d_model: int=512, N: int=6, h: int=8, dropout: float=0.1, d_ff: int=2048) -> Transformer:
    """Assemble a ``Transformer`` and Xavier-initialise its weight matrices.

    Args:
        src_vocab_size: Size of the source vocabulary.
        tgt_vocab_size: Size of the target vocabulary (also the projection output size).
        src_seq_len: Number of positions precomputed for the source positional encoding.
        tgt_seq_len: Number of positions precomputed for the target positional encoding.
        d_model: Embedding / feature size.
        N: Number of encoder blocks and of decoder blocks.
        h: Number of attention heads per attention block.
        dropout: Dropout probability used throughout.
        d_ff: Hidden size of the feed-forward blocks.

    Returns:
        The initialised ``Transformer``.
    """
    # Embeddings and positional encodings, source side first
    src_embed = InputEmbeddings(d_model, src_vocab_size)
    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)
    src_pos = PositionalEncoding(d_model, src_seq_len, dropout)
    tgt_pos = PositionalEncoding(d_model, tgt_seq_len, dropout)

    # N encoder blocks: self-attention + feed-forward
    encoder_blocks = [
        EncoderBlock(
            d_model,
            MultiHeadAttentionBlock(d_model, h, dropout),
            FeedForwardBlock(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]

    # N decoder blocks: self-attention + cross-attention + feed-forward
    decoder_blocks = [
        DecoderBlock(
            d_model,
            MultiHeadAttentionBlock(d_model, h, dropout),
            MultiHeadAttentionBlock(d_model, h, dropout),
            FeedForwardBlock(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]

    encoder = Encoder(d_model, nn.ModuleList(encoder_blocks))
    decoder = Decoder(d_model, nn.ModuleList(decoder_blocks))
    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)

    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer)

    # Xavier-uniform init for every parameter with more than one dimension (weight matrices);
    # one-dimensional parameters such as biases and norm gains keep their defaults
    for param in transformer.parameters():
        if param.dim() > 1:
            nn.init.xavier_uniform_(param)

    return transformer

**Dataset**

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset

class BilingualDataset(Dataset):
    """Turn raw translation pairs into fixed-length tensors for encoder-decoder training.

    Each element of ``ds`` is read as ``item['translation'][lang]``.

    Args:
        ds: Indexable collection of translation pairs (must support ``len`` and ``[]``).
        tokenizer_src: Tokenizer for the source language (``encode(text).ids``, ``token_to_id``).
        tokenizer_tgt: Tokenizer for the target language.
        src_lang: Key of the source sentence inside ``item['translation']``.
        tgt_lang: Key of the target sentence inside ``item['translation']``.
        seq_len: Length every returned sequence is padded to.

    Attributes ``sos_token``, ``eos_token`` and ``pad_token`` hold the target tokenizer's
    special-token ids; the ``src_*`` counterparts hold the source tokenizer's ids and are
    used for the encoder input and its padding mask.
    """

    def __init__(self, ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len):
        super().__init__()
        self.seq_len = seq_len

        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        # Target-side special tokens (decoder input and label)
        self.sos_token = torch.tensor([tokenizer_tgt.token_to_id("[SOS]")], dtype=torch.int64)
        self.eos_token = torch.tensor([tokenizer_tgt.token_to_id("[EOS]")], dtype=torch.int64)
        self.pad_token = torch.tensor([tokenizer_tgt.token_to_id("[PAD]")], dtype=torch.int64)

        # Source-side special tokens (encoder input). The two vocabularies are independent,
        # so their ids are not guaranteed to coincide with the target ones.
        self.src_sos_token = self._source_special(tokenizer_src, "[SOS]", self.sos_token)
        self.src_eos_token = self._source_special(tokenizer_src, "[EOS]", self.eos_token)
        self.src_pad_token = self._source_special(tokenizer_src, "[PAD]", self.pad_token)

    @staticmethod
    def _source_special(tokenizer_src, token, fallback):
        """Return the source id of ``token`` as a 1-element tensor, or ``fallback`` if it is unknown."""
        token_id = tokenizer_src.token_to_id(token)
        if token_id is None:
            # Keep the previous behaviour for source tokenizers without this special token
            return fallback
        return torch.tensor([token_id], dtype=torch.int64)

    def __len__(self):
        return len(self.ds)

    def __getitem__(self, idx):
        """Build the training example for pair ``idx``.

        Returns:
            A dict with ``encoder_input``, ``decoder_input`` and ``label`` (each of length
            ``seq_len``), ``encoder_mask`` of shape (1, 1, seq_len), ``decoder_mask`` of shape
            (1, seq_len, seq_len), and the raw ``src_text`` / ``tgt_text``.

        Raises:
            ValueError: If either sentence does not fit into ``seq_len`` together with its
                special tokens.
        """
        src_target_pair = self.ds[idx]
        src_text = src_target_pair['translation'][self.src_lang]
        tgt_text = src_target_pair['translation'][self.tgt_lang]

        # Transform the text into token ids
        enc_input_tokens = self.tokenizer_src.encode(src_text).ids
        dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids

        # The encoder input carries both [SOS] and [EOS]
        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2
        # The decoder input carries only [SOS]; the label carries only [EOS]
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1

        # A negative padding count means the sentence does not fit into seq_len
        if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
            raise ValueError("Sentence is too long")

        enc_padding = torch.full((enc_num_padding_tokens,), self.src_pad_token.item(), dtype=torch.int64)
        dec_padding = torch.full((dec_num_padding_tokens,), self.pad_token.item(), dtype=torch.int64)

        # [SOS] tokens [EOS] [PAD]...
        encoder_input = torch.cat(
            [
                self.src_sos_token,
                torch.tensor(enc_input_tokens, dtype=torch.int64),
                self.src_eos_token,
                enc_padding,
            ],
            dim=0,
        )

        # [SOS] tokens [PAD]...
        decoder_input = torch.cat(
            [
                self.sos_token,
                torch.tensor(dec_input_tokens, dtype=torch.int64),
                dec_padding,
            ],
            dim=0,
        )

        # tokens [EOS] [PAD]...
        label = torch.cat(
            [
                torch.tensor(dec_input_tokens, dtype=torch.int64),
                self.eos_token,
                dec_padding,
            ],
            dim=0,
        )

        # Double check the size of the tensors to make sure they are all seq_len long
        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        return {
            "encoder_input": encoder_input,  # (seq_len)
            "decoder_input": decoder_input,  # (seq_len)
            # Hide padding from the encoder: (1, 1, seq_len)
            "encoder_mask": (encoder_input != self.src_pad_token).unsqueeze(0).unsqueeze(0).int(),
            # Hide padding and future positions from the decoder: (1, seq_len) & (1, seq_len, seq_len)
            "decoder_mask": (decoder_input != self.pad_token).unsqueeze(0).int() & causal_mask(decoder_input.size(0)),
            "label": label,  # (seq_len)
            "src_text": src_text,
            "tgt_text": tgt_text,
        }

def causal_mask(size):
    """Return a boolean mask of shape (1, size, size) that is True on and below the diagonal.

    Position ``[0, i, j]`` is True exactly when ``j <= i``, i.e. step ``i`` may look at
    itself and earlier steps but not at later ones.
    """
    # 1 strictly above the diagonal marks the "future" positions
    future_positions = torch.triu(torch.ones((1, size, size)), diagonal=1).type(torch.int)
    return ~future_positions.bool()

**Configuration**

In [None]:
from pathlib import Path

def get_config():
    """Return a fresh dictionary with the default training configuration.

    A new dict is built on every call, so callers may modify the result freely.
    """
    config = dict(
        # Optimisation
        batch_size=8,
        num_epochs=20,
        lr=10**-4,
        # Model / sequence sizes
        seq_len=350,
        d_model=512,
        # Data
        datasource="opus_books",
        lang_src="en",
        lang_tgt="sv",
        # Checkpoints: files live in "<datasource>_<model_folder>/<model_basename><epoch>.pt"
        model_folder="weights",
        model_basename="tmodel_",
        preload="latest",
        # "{0}" is replaced by the language code
        tokenizer_file="tokenizer_{0}.json",
        experiment_name="runs/tmodel",
    )
    return config

def get_weights_file_path(config, epoch: str):
    """Return the checkpoint path for ``epoch`` and make sure its folder exists.

    The folder is ``"<datasource>_<model_folder>"`` (relative to the working directory) and
    the file name is ``"<model_basename><epoch>.pt"``. The folder is created as a side effect.
    """
    weights_dir = Path(f"{config['datasource']}_{config['model_folder']}")
    weights_dir.mkdir(parents=True, exist_ok=True)
    checkpoint_name = f"{config['model_basename']}{epoch}.pt"
    return str(Path('.') / weights_dir / checkpoint_name)

def latest_weights_file_path(config):
    """Find the latest weights file in the weights folder.

    Looks in ``"<datasource>_<model_folder>"`` for entries whose name starts with
    ``model_basename`` and returns the one that sorts last, as a string. Returns ``None``
    when nothing matches.
    """
    weights_dir = Path(f"{config['datasource']}_{config['model_folder']}")
    candidates = sorted(weights_dir.glob(f"{config['model_basename']}*"))
    if not candidates:
        return None
    return str(candidates[-1])

**Training**

In [None]:
#from model import build_transformer
#from dataset import BilingualDataset, causal_mask
#from config import get_config, get_weights_file_path

#import torchtext.datasets as datasets
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim.lr_scheduler import LambdaLR

import warnings
from tqdm import tqdm
import os
from pathlib import Path

# Huggingface datasets and tokenizers
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

import wandb

import torchmetrics

def greedy_decode(model, source, source_mask, tokenizer_src, tokenizer_tgt, max_len, device):
    """Generate a target sequence for a single source sentence, one best token at a time.

    Decoding starts from ``[SOS]`` and stops as soon as ``[EOS]`` is produced or the sequence
    holds ``max_len`` tokens. ``tokenizer_src`` is accepted but not used here.

    Returns:
        1-D tensor of generated token ids, starting with the ``[SOS]`` id.
    """
    sos_idx = tokenizer_tgt.token_to_id('[SOS]')
    eos_idx = tokenizer_tgt.token_to_id('[EOS]')

    # The encoder runs once; its output is reused at every decoding step
    encoder_output = model.encode(source, source_mask)

    # Sequence generated so far, shape (1, current_len); begins with [SOS]
    generated = torch.empty(1, 1).fill_(sos_idx).type_as(source).to(device)

    while generated.size(1) != max_len:
        # Causal mask sized to the tokens produced so far
        step_mask = causal_mask(generated.size(1)).type_as(source_mask).to(device)

        hidden = model.decode(encoder_output, source_mask, generated, step_mask)

        # Only the last position is needed to predict the next token
        logits = model.project(hidden[:, -1])
        next_word = torch.max(logits, dim=1)[1]

        next_token = torch.empty(1, 1).type_as(source).fill_(next_word.item()).to(device)
        generated = torch.cat([generated, next_token], dim=1)

        if next_word == eos_idx:
            break

    return generated.squeeze(0)


def run_validation(model, validation_ds, tokenizer_src, tokenizer_tgt, max_len, device, print_msg, global_step, num_examples=2):
    """Translate a few validation sentences, print them and log text metrics to wandb.

    Args:
        model: Model exposing ``encode`` / ``decode`` / ``project``; switched to eval mode here.
        validation_ds: Iterable of batches of size 1 with ``encoder_input``, ``encoder_mask``,
            ``src_text`` and ``tgt_text``.
        tokenizer_src: Source tokenizer (forwarded to ``greedy_decode``).
        tokenizer_tgt: Target tokenizer, used to decode the predicted ids to text.
        max_len: Maximum number of tokens to generate per sentence.
        device: Device the batch tensors are moved to.
        print_msg: Callable used for all console output.
        global_step: Step value logged alongside the metrics.
        num_examples: Number of batches to evaluate before stopping.
    """
    # Local import: only this function needs it
    import shutil

    model.eval()
    count = 0

    source_texts = []
    expected = []
    predicted = []

    # Width of the separator lines; falls back to 80 columns when no terminal is attached
    # (e.g. inside a notebook). Portable replacement for shelling out to `stty size`.
    console_width = shutil.get_terminal_size(fallback=(80, 24)).columns

    with torch.no_grad():
        for batch in validation_ds:
            count += 1
            encoder_input = batch["encoder_input"].to(device) # (b, seq_len)
            encoder_mask = batch["encoder_mask"].to(device) # (b, 1, 1, seq_len)

            # greedy_decode handles one sentence at a time
            assert encoder_input.size(
                0) == 1, "Batch size must be 1 for validation"

            model_out = greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)

            source_text = batch["src_text"][0]
            target_text = batch["tgt_text"][0]
            model_out_text = tokenizer_tgt.decode(model_out.detach().cpu().numpy())

            source_texts.append(source_text)
            expected.append(target_text)
            predicted.append(model_out_text)

            # Print the source, target and model output
            print_msg('-'*console_width)
            print_msg(f"{'SOURCE: ':>12}{source_text}")
            print_msg(f"{'TARGET: ':>12}{target_text}")
            print_msg(f"{'PREDICTED: ':>12}{model_out_text}")

            if count == num_examples:
                print_msg('-'*console_width)
                break

    # Character error rate
    metric = torchmetrics.CharErrorRate()
    cer = metric(predicted, expected)
    wandb.log({'validation/cer': cer, 'global_step': global_step})

    # Word error rate
    metric = torchmetrics.WordErrorRate()
    wer = metric(predicted, expected)
    wandb.log({'validation/wer': wer, 'global_step': global_step})

    # BLEU score
    metric = torchmetrics.BLEUScore()
    bleu = metric(predicted, expected)
    wandb.log({'validation/BLEU': bleu, 'global_step': global_step})

def get_all_sentences(ds, lang):
    """Lazily yield the ``lang`` sentence of every item in ``ds``.

    Each item is read as ``item['translation'][lang]``.
    """
    yield from (record['translation'][lang] for record in ds)

def get_or_build_tokenizer(config, ds, lang):
    """Load the word-level tokenizer for ``lang`` from disk, training and saving it first if needed.

    The file location is ``config['tokenizer_file']`` formatted with ``lang``. When the file
    is missing, a tokenizer is trained on every ``lang`` sentence of ``ds`` and written there.
    """
    tokenizer_path = Path(config['tokenizer_file'].format(lang))

    # Reuse a previously trained tokenizer when one exists
    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    # Most code taken from: https://huggingface.co/docs/tokenizers/quicktour
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    # Words seen fewer than twice are left out of the vocabulary
    trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"], min_frequency=2)
    tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)
    tokenizer.save(str(tokenizer_path))
    return tokenizer

def get_ds(config):
    """Load the translation dataset, build both tokenizers and return the data loaders.

    Reads ``datasource`` (defaults to ``'opus_books'`` when absent), ``lang_src``, ``lang_tgt``,
    ``seq_len`` and ``batch_size`` from ``config``.

    Returns:
        ``(train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt)``. The validation
        loader uses a batch size of 1.
    """
    # The dataset only has the train split, so we divide it ourselves
    ds_raw = load_dataset(config.get('datasource', 'opus_books'), f"{config['lang_src']}-{config['lang_tgt']}", split='train')

    # Build tokenizers
    tokenizer_src = get_or_build_tokenizer(config, ds_raw, config['lang_src'])
    tokenizer_tgt = get_or_build_tokenizer(config, ds_raw, config['lang_tgt'])

    # Keep 90% for training, 10% for validation
    train_ds_size = int(0.9 * len(ds_raw))
    val_ds_size = len(ds_raw) - train_ds_size
    train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size])

    train_ds = BilingualDataset(train_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])
    val_ds = BilingualDataset(val_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])

    # Report the longest tokenised sentence on each side (useful when choosing seq_len)
    max_len_src = 0
    max_len_tgt = 0

    for item in ds_raw:
        src_ids = tokenizer_src.encode(item['translation'][config['lang_src']]).ids
        tgt_ids = tokenizer_tgt.encode(item['translation'][config['lang_tgt']]).ids
        max_len_src = max(max_len_src, len(src_ids))
        max_len_tgt = max(max_len_tgt, len(tgt_ids))

    print(f'Max length of source sentence: {max_len_src}')
    print(f'Max length of target sentence: {max_len_tgt}')


    train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
    # Validation decodes one sentence at a time
    val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)

    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt

def get_model(config, vocab_src_len, vocab_tgt_len):
    """Build the transformer for the given vocabulary sizes.

    ``config['seq_len']`` is used for both the source and the target sequence length and
    ``config['d_model']`` as the model size; all other settings keep their defaults.
    """
    seq_len = config['seq_len']
    return build_transformer(vocab_src_len, vocab_tgt_len, seq_len, seq_len, d_model=config['d_model'])

def train_model(config):
    """Train the translation model, validating and checkpointing after every epoch.

    ``config['preload']`` selects how training starts:
      * ``'latest'``  - resume from the newest checkpoint in the weights folder, if any;
      * other truthy value - resume from the checkpoint with that epoch suffix;
      * falsy (``None``) - start from scratch.
    When no checkpoint is found for ``'latest'``, training also starts from scratch.
    """
    # Define the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)
    torch.cuda.empty_cache()
    # Make sure the folder the checkpoints are written to exists
    Path(f"{config['datasource']}_{config['model_folder']}").mkdir(parents=True, exist_ok=True)

    train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)
    model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)

    # If the user specified a model to preload before training, load it
    initial_epoch = 0
    global_step = 0
    preload = config['preload']
    if preload == 'latest':
        model_filename = latest_weights_file_path(config)
    elif preload:
        model_filename = get_weights_file_path(config, preload)
    else:
        model_filename = None

    if model_filename:
        print(f'Preloading model {model_filename}')
        # map_location lets a checkpoint saved on GPU be resumed on CPU and vice versa
        state = torch.load(model_filename, map_location=device)
        model.load_state_dict(state['model_state_dict'])
        initial_epoch = state['epoch'] + 1
        optimizer.load_state_dict(state['optimizer_state_dict'])
        global_step = state['global_step']
        del state
    else:
        print('No model to preload, starting from scratch')

    # Labels are target-language ids, so the padding id to ignore comes from the target tokenizer
    loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_tgt.token_to_id('[PAD]'), label_smoothing=0.1).to(device)

    # define our custom x axis metric
    wandb.define_metric("global_step")
    # define which metrics will be plotted against it
    wandb.define_metric("validation/*", step_metric="global_step")
    wandb.define_metric("train/*", step_metric="global_step")

    for epoch in range(initial_epoch, config['num_epochs']):
        torch.cuda.empty_cache()
        # run_validation switches to eval mode, so re-enable training mode each epoch
        model.train()
        batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")
        for batch in batch_iterator:

            encoder_input = batch['encoder_input'].to(device) # (b, seq_len)
            decoder_input = batch['decoder_input'].to(device) # (B, seq_len)
            encoder_mask = batch['encoder_mask'].to(device) # (B, 1, 1, seq_len)
            decoder_mask = batch['decoder_mask'].to(device) # (B, 1, seq_len, seq_len)

            # Run the tensors through the encoder, decoder and the projection layer
            encoder_output = model.encode(encoder_input, encoder_mask) # (B, seq_len, d_model)
            decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) # (B, seq_len, d_model)
            proj_output = model.project(decoder_output) # (B, seq_len, vocab_size)

            # Compare the output with the label
            label = batch['label'].to(device) # (B, seq_len)

            # Compute the loss using a simple cross entropy
            loss = loss_fn(proj_output.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1))
            batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

            # Log the loss
            wandb.log({'train/loss': loss.item(), 'global_step': global_step})

            # Backpropagate the loss
            loss.backward()

            # Update the weights
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)

            global_step += 1

        # Run validation at the end of every epoch
        run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device, lambda msg: batch_iterator.write(msg), global_step)

        # Save the model at the end of every epoch
        model_filename = get_weights_file_path(config, f"{epoch:02d}")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'global_step': global_step
        }, model_filename)


if __name__ == '__main__':
    # Keep the training log free of library warnings
    warnings.filterwarnings("ignore")

    # Start from the defaults, but train for 30 epochs and never resume from a checkpoint
    config = get_config()
    config.update(num_epochs=30, preload=None)

    # Log this run to the "pytorch-transformer" wandb project, recording the
    # configuration as the run's hyperparameters
    wandb.init(project="pytorch-transformer", config=config)

    train_model(config)

Using device: cuda


README.md:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/516k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3095 [00:00<?, ? examples/s]

Max length of source sentence: 280
Max length of target sentence: 269


Processing Epoch 00: 100%|██████████| 349/349 [02:01<00:00,  2.87it/s, loss=5.792]


--------------------------------------------------------------------------------
    SOURCE: NERVOUS OLD LADY NEAR THE FIRE BEGINS TO CRY, AND HAS TO BE LED OUT.]
    TARGET: En nervöst lagd äldre dam vid eldstaden börjar gråta och måste föras ut därifrån.]
 PREDICTED: 
--------------------------------------------------------------------------------
    SOURCE: Rich old couples, with no one to leave their money to, die childless.
    TARGET: Rika gamlingar, som saknar arvingar, dör barnlösa.
 PREDICTED: Vi , att , att , att , att .
--------------------------------------------------------------------------------


Processing Epoch 01: 100%|██████████| 349/349 [02:05<00:00,  2.79it/s, loss=3.870]


--------------------------------------------------------------------------------
    SOURCE: We began to understand the sufferings of the Babes in the Wood.
    TARGET: Vi började nu begripa the Babes in the Wood lidanden.
 PREDICTED: Det var .
--------------------------------------------------------------------------------
    SOURCE: And we walked miles upon miles out Birmingham way; but it was no use, the country was steeped in oil.
    TARGET: Och vi vandrade flera miles längs vägen mot Birmingham; men det var fåfängt, landsbygden var indränkt i fotogen.
 PREDICTED: Vi sade , att vi var vi var , att vi var det var det var det var det var det var vi var vi var vi var .
--------------------------------------------------------------------------------


Processing Epoch 02: 100%|██████████| 349/349 [02:04<00:00,  2.80it/s, loss=5.685]


--------------------------------------------------------------------------------
    SOURCE: "They'd hardly have taken the pie too," said George.
    TARGET: ”De skulle väl knappast ha tagit pajen också, i så fall”, sade George.
 PREDICTED: ” , det var det var det .
--------------------------------------------------------------------------------
    SOURCE: We got to chatting about our rowing experiences this morning, and to recounting stories of our first efforts in the art of oarsmanship.
    TARGET: Vi kom att prata om våra tidiga roddarerfarenheter denna morgon och återupplivade historier om våra första försök i roddens ädla konst.
 PREDICTED: Vi hade oss i och vi hade .
--------------------------------------------------------------------------------


Processing Epoch 03: 100%|██████████| 349/349 [02:04<00:00,  2.80it/s, loss=4.821]


--------------------------------------------------------------------------------
    SOURCE: We played MORCEAUX from the old German masters.
    TARGET: Vi lyssnade till Morceaux av de gamla tyska mästarna.
 PREDICTED: Vi .
--------------------------------------------------------------------------------
    SOURCE: By clinging like grim death to the gunwale, we just managed to keep inside the boat, but it was exhausting work.
    TARGET: Genom att hänga oss fast för glatta livet i fribordet, lyckades vi nätt och jämt stanna kvar ombord, men det var hårt arbete.
 PREDICTED: Och vi skulle vi ha , men vi skulle vi ha ha , men vi skulle vi ha , men vi ha .
--------------------------------------------------------------------------------


Processing Epoch 04: 100%|██████████| 349/349 [02:04<00:00,  2.79it/s, loss=4.857]


--------------------------------------------------------------------------------
    SOURCE: The first thing was that they thought the boat was not clean.
    TARGET: För det första ansåg de, att båten inte var ren.
 PREDICTED: var inte inte inte inte , att det var inte inte .
--------------------------------------------------------------------------------
    SOURCE: And, strange as it may appear, those clumps on the head often cured me - for the time being.
    TARGET: Och hur underligt det än kan förefalla, sa bo-tade dessa örfilar mig — för stunden.
 PREDICTED: Och så , så , som , som jag på den .
--------------------------------------------------------------------------------


Processing Epoch 05: 100%|██████████| 349/349 [02:04<00:00,  2.79it/s, loss=5.009]


--------------------------------------------------------------------------------
    SOURCE: "Oh, all right, I'll tell `em.
    TARGET: ”Å, jasså! Det skall jag säga till dem.
 PREDICTED: ” Å , jag är jag att jag er .
--------------------------------------------------------------------------------
    SOURCE: We tried to get away from it at Marlow.
    TARGET: Vi försökte undkomma den i Marlow.
 PREDICTED: Vi tog på den i .
--------------------------------------------------------------------------------


Processing Epoch 06: 100%|██████████| 349/349 [02:04<00:00,  2.79it/s, loss=4.847]


--------------------------------------------------------------------------------
    SOURCE: When Montmorency meets a cat, the whole street knows about it; and there is enough bad language wasted in ten seconds to last an ordinarily respectable man all his life, with care.
    TARGET: Då Montmorency träffar på en katt, får hela gatan reda på det; och han använder lika mycket fula ord under tio sekunder som skulle räcka åt en någorlunda anständig karl under dennes livstid, med råge.
 PREDICTED: Då en av dem , och det är mycket ; och det är att sig i att sig i .
--------------------------------------------------------------------------------
    SOURCE: By-and-by a small boat came in sight, towed through the water at a tremendous pace by a powerful barge horse, on which sat a very small boy.
    TARGET: Då och då kom en liten båt inom synhåll, dragen genom vattnet i oerhörd hastighet av en kraftfull arbetshäst, på vilken en mycket liten pojke satt.
 PREDICTED: och en av en av en , i en a

Processing Epoch 07: 100%|██████████| 349/349 [02:04<00:00,  2.79it/s, loss=3.916]


--------------------------------------------------------------------------------
    SOURCE: The hired up-river boat very soon puts a stop to any nonsense of that sort on the part of its occupants.
    TARGET: Hyrbåtarna uppströms befriar snart sina hyresgästers sinnen från alla dylika dumheter.
 PREDICTED: Det verkade vara ett ställe för mycket ställe att få sig på en av av av .
--------------------------------------------------------------------------------
    SOURCE: I motioned him away with silent dignity, but he still advanced, screeching out the while:
    TARGET: Jag försökte avfärda honom med stilla värdighet, men han fortsatte mot mig, under det att han ropade:
 PREDICTED: Jag honom om honom , men han , men han tog med oss :
--------------------------------------------------------------------------------


Processing Epoch 08: 100%|██████████| 349/349 [02:04<00:00,  2.79it/s, loss=4.113]


--------------------------------------------------------------------------------
    SOURCE: It is, I suppose, Boulter's not even excepted, the busiest lock on the river.
    TARGET: Jag tror att den är, Boulter’s inte undantagen, den mest livligt trafikerade slussen längs hela floden.
 PREDICTED: Det är , som jag inte är något på floden , på floden .
--------------------------------------------------------------------------------
    SOURCE: Then the second man climbs out of the boat and comes to help him, and they get in each other's way, and hinder one another.
    TARGET: Så klättrar den andre mannen ur båten och kommer honom till hjälp och de går i vägen för varandra och hindrar varandra.
 PREDICTED: Då man den andra och sig i båten , och de tog sig för att få på och de kommer att få sig på att göra .
--------------------------------------------------------------------------------


Processing Epoch 09: 100%|██████████| 349/349 [02:04<00:00,  2.80it/s, loss=4.252]


--------------------------------------------------------------------------------
    SOURCE: For drink, we took some wonderful sticky concoction of Harris's, which you mixed with water and called lemonade, plenty of tea, and a bottle of whisky, in case, as George said, we got upset.
    TARGET: Till måltidsdryck valde vi något underbart, klistrigt hopkok som Harris ägde, vilken man blandade med vatten och kallade lemonad, gott om te och en flaska whisky, för den händelse vi skulle kantra.
 PREDICTED: För att vi en , Harris , Harris och med ett , sade att det var ett , att George sade , att George , , sade att George .
--------------------------------------------------------------------------------
    SOURCE: And so, with sentinel in each dark street, and twinkling watch-fires on each height around, the night has worn away, and over this fair valley of old Thame has broken the morning of the great day that is to close so big with the fate of ages yet unborn.
    TARGET: Och så, med vak

Processing Epoch 10: 100%|██████████| 349/349 [02:04<00:00,  2.79it/s, loss=2.558]


--------------------------------------------------------------------------------
    SOURCE: HARRIS (continuing):
    TARGET: Harris: [Fortsätter]
 PREDICTED: Harris : [ ?
--------------------------------------------------------------------------------
    SOURCE: You cannot give me too much work; to accumulate work has almost become a passion with me: my study is so full of it now, that there is hardly an inch of room for any more.
    TARGET: Man kan inte ge mig för många arbetsuppgifter; att samla på mig arbete har sånär blivit min passion: Mitt arbetsrum är nu så fullt av det, att det knappt får plats en gnutta till.
 PREDICTED: Man kan inte mer än att mig ; så , som är så med en , som är det mer än vad som är jag kan att mig om .
--------------------------------------------------------------------------------


Processing Epoch 11: 100%|██████████| 349/349 [02:04<00:00,  2.80it/s, loss=3.017]


--------------------------------------------------------------------------------
    SOURCE: Nobody seemed quite sure what it was exactly, but they all agreed that it sounded Scotch.
    TARGET: Ingen verkade helt säker på vad det faktiskt var han spelade, men alla höll med om, att det lät som någonting skotskt.
 PREDICTED: Det verkade som om den första , men de var helt enkelt , men det gjorde det i grytan gjorde jag .
--------------------------------------------------------------------------------
    SOURCE: John Edward says, "Oh!" he hadn't noticed it; and Emily says that papa does not like the gas lit in the afternoon.
    TARGET: John Edward säger ”Åh”, det har han inte lagt märke till; och Emily säger att Pappa inte tycker om att man tänder gasen om eftermiddagarna.
 PREDICTED: Har de säger , ” han !” säger han , att det inte kan få sig ; och , som om någon sig i hela världen med hela världen .
--------------------------------------------------------------------------------


Processing Epoch 12: 100%|██████████| 349/349 [02:05<00:00,  2.79it/s, loss=4.122]


--------------------------------------------------------------------------------
    SOURCE: Perhaps, from the casement, standing hand-in-hand, they were watching the calm moonlight on the river, while from the distant halls the boisterous revelry floated in broken bursts of faint-heard din and tumult.
    TARGET: Kanske stående hand i hand i de franska fönstren, betraktade de det stilla månskenet över floden, under det att de från de avlägsna festsalarna hörde det larmande bullret komma flytande i brutna skurar av otydliga vrål och tumult.
 PREDICTED: , från de av de två små , där de två män vid floden , där de två män , under det att de i och som stod .
--------------------------------------------------------------------------------
    SOURCE: Men came with poles and ropes, and tried to separate the dogs, and the police were sent for.
    TARGET: Karlar kom dit med pålar och rep, för att försöka sära på hundarna och man skickade efter polisen.
 PREDICTED: I med sin hustru och försök

Processing Epoch 13: 100%|██████████| 349/349 [02:04<00:00,  2.79it/s, loss=2.415]


--------------------------------------------------------------------------------
    SOURCE: We could see ourselves at supper there, pecking away at cold meat, and passing each other chunks of bread; we could hear the cheery clatter of our knives, the laughing voices, filling all the space, and overflowing through the opening out into the night.
    TARGET: Vi kunde se oss äta kvällsmat där, huggande in på kallskuret och räckande varandra brödbitar; vi kunde höra våra knivars glada klapprande, våra skrattande röster som fyllde hela kapellet och strömmade ut i natten.
 PREDICTED: Vi kunde se oss på vi åter i våra , samt att vi skulle komma med ; och vi kunde våra kläder som av våra kläder som vi kunde av våra kläder och av våra kläder .
--------------------------------------------------------------------------------
    SOURCE: But all their heads were, by this time, in such a confused whirl that they were incapable of grasping anything, and so the man told them to stop where they were,

Processing Epoch 14: 100%|██████████| 349/349 [02:04<00:00,  2.79it/s, loss=3.332]


--------------------------------------------------------------------------------
    SOURCE: "Why, we can't steer, if you keep stopping.
    TARGET: ”Jo, för om ni stannar till, så kan vi inte styra.
 PREDICTED: ” Men , vi kan inte se något annat , om man ville .
--------------------------------------------------------------------------------
    SOURCE: Then our porter said he thought that must be it on the high-level platform; said he thought he knew the train.
    TARGET: Så sade vår bärare att det måste finnas vid den övre plattformen; han sade sig känna till det tåget.
 PREDICTED: Så , sade att det trodde att det måste vara den saken ; den saken så han sade att han trodde att han trodde att han inte hade .
--------------------------------------------------------------------------------


Processing Epoch 15: 100%|██████████| 349/349 [02:04<00:00,  2.80it/s, loss=2.063]


--------------------------------------------------------------------------------
    SOURCE: I had been told to stand where I was, and wait till the canvas came to me, and Montmorency and I stood there and waited, both as good as gold.
    TARGET: De hade sagt åt mig att stå där jag stod och att vänta tills segelduken kom till mig och Montmorency och jag själv stod där och väntade, trogna som guld.
 PREDICTED: Jag hade varit där där där där , var jag lade in och lade mig för mig och lade mig som jag lade i strömfåran och väntade .
--------------------------------------------------------------------------------
    SOURCE: George anathematized Mrs. G. for a lazy old woman, and thought it was very strange that people could not get up at a decent, respectable time, unlocked and unbolted the door, and ran out.
    TARGET: George öste sitt anatema över Mrs. G. och kallade henne en lat kärring, samt ansåg det vara mycket egendomligt att folk inte kunde stiga upp vid en kristlig tidpunkt som 

Processing Epoch 16: 100%|██████████| 349/349 [02:04<00:00,  2.79it/s, loss=2.464]


--------------------------------------------------------------------------------
    SOURCE: To those who do contemplate making Oxford their starting-place, I would say, take your own boat - unless, of course, you can take someone else's without any possible danger of being found out.
    TARGET: Till dem som överväger att låta Oxford utgöra startplats, vill jag säga: Ta med er egen båt — såvida ni inte, naturligtvis, kan ta någon annans båt utan uppenbar risk att bli ertappade.
 PREDICTED: För att de hos deras båt , jag kan få det att säga mig , att man kan göra något emot att ni inte har något emot en enda arbete .
--------------------------------------------------------------------------------
    SOURCE: "Oh, no, it's simple enough.
    TARGET: ”Nej-nej, det är mycket enkelt.
 PREDICTED: ” Å , ingen fara ”, mumlade mycket .
--------------------------------------------------------------------------------


Processing Epoch 17: 100%|██████████| 349/349 [02:04<00:00,  2.80it/s, loss=2.516]


--------------------------------------------------------------------------------
    SOURCE: They did scrape it out at last, and put it down on a chair, and Harris sat on it, and it stuck to him, and they went looking for it all over the room.
    TARGET: De skrapade fram det till sist och lade smöret på en stol och Harris satte sig på det och det fastnade på honom och de gick runt och letade efter det i hela rummet.
 PREDICTED: Det gjorde det dock att börja med , satte den på sig och Harris satte sig och satte sig på den , satte de upp sin , för att därefter gå till och de höll upp den .
--------------------------------------------------------------------------------
    SOURCE: There are a certain number of riverside roughs who make quite an income, during the summer, by slouching about the banks and blackmailing weak-minded noodles in this way.
    TARGET: Det finns ett antal skojare som tjänar en rätt duktig hacka under sommarhalvåret, genom att driva runt längs kanalbankarna och u

Processing Epoch 18: 100%|██████████| 349/349 [02:04<00:00,  2.79it/s, loss=2.587]


--------------------------------------------------------------------------------
    SOURCE: And one would open the door and mount the steps, and stagger back into the arms of the man behind him; and they would all come and have a sniff, and then droop off and squeeze into other carriages, or pay the difference and go first.
    TARGET: Och de öppnade dörren och klev uppför trappstegen och ryggade tillbaka i famnen på den som stod bakom dem; och så drog de ett andetag, för att därpå slinka undan och tränga in sig i andra vagnar, eller betala mellanskillnaden för att få åka i första klass.
 PREDICTED: Och en annan man kom fram och , i vattnet och upp alla de stora mörka ; och de brukade sig alla , till ett ljud och alla från dem och alla andra sidan och alla .
--------------------------------------------------------------------------------
    SOURCE: There is too much odour about cheese.
    TARGET: Ost luktar helt enkelt för mycket.
 PREDICTED: Det är alltför för egen del .
----------

Processing Epoch 19: 100%|██████████| 349/349 [02:04<00:00,  2.80it/s, loss=2.000]


--------------------------------------------------------------------------------
    SOURCE: I heard a man, going up a mountain in Switzerland, once say he would give worlds for a glass of beer, and, when he came to a little shanty where they kept it, he kicked up a most fearful row because they charged him five francs for a bottle of Bass.
    TARGET: Jag hörde en man som var på väg uppför ett berg i Schweiz säga, att han skulle ge bort hela världen för ett glas öl och då vi kom fram till ett litet skjul är de förvarade ölen, skrek han som besatt, då han fick reda på att de tog fem francs för en flaska Bass.
 PREDICTED: Jag kände en gång en annan man vid , vid han säger han en kvarts mile från och då han kom fram till en stund , där han såg på med en annan båt , som skulle komma att försöka få reda på för honom att kunna komma att betala en annan annan båt .
--------------------------------------------------------------------------------
    SOURCE: We took up the hoops, and began to 

Processing Epoch 20: 100%|██████████| 349/349 [02:04<00:00,  2.80it/s, loss=1.925]


--------------------------------------------------------------------------------
    SOURCE: It was awful gloomy before."
    TARGET: Det var mycket dystert innan.”
 PREDICTED: Det var verkligen en gång .”
--------------------------------------------------------------------------------
    SOURCE: There is a tomb in Shepperton churchyard, however, with a poem on it, and I was nervous lest Harris should want to get out and fool round it.
    TARGET: Det finns dock en grav på kyrkogården i Shepperton, som har en dikt inristad på stenen och jag var orolig att Harris skulle vilja gå i land och larva runt kring den.
 PREDICTED: Det finns en gott och i ordning med en stund , vilket var det att Harris och jag skulle ta fram sin säng .
--------------------------------------------------------------------------------


Processing Epoch 21: 100%|██████████| 349/349 [02:04<00:00,  2.80it/s, loss=1.409]


--------------------------------------------------------------------------------
    SOURCE: I had looped it round slowly and cautiously, and tied it up in the middle, and folded it in two, and laid it down gently at the bottom of the boat.
    TARGET: Jag hade vindat upp den sakta och omsorgsfullt, och knutit ihop den mittpå och vikit den dubbel, samt lagt ner den försiktigt på båtens durk.
 PREDICTED: Jag hade den där gott om den , och satt upp i stolen , och så försöker den ut det ut den där satt vi båten tillbaka .
--------------------------------------------------------------------------------
    SOURCE: I began to think it must be all a dream, and that I was really asleep in bed, and should wake up in a minute, and be told it was past ten.
    TARGET: Jag började tro att det hela var en dröm, att jag låg och sov i min säng, att jag skulle komma att vakna upp om någon minut, och att man skulle säga mig, att klockan var över tio.
 PREDICTED: Jag började tro att det måste vara någo

Processing Epoch 22: 100%|██████████| 349/349 [02:04<00:00,  2.80it/s, loss=1.698]


--------------------------------------------------------------------------------
    SOURCE: It was a noble sight to see them suffering thus in silence, but it unnerved me altogether.
    TARGET: Det var en upplyftande syn att se dem lida så under tystnad, men det gjorde mig inte mindre nervös.
 PREDICTED: Det var en stund som såg mig komma att komma ihåg vilken det , men det gick mig .
--------------------------------------------------------------------------------
    SOURCE: Far down the road a little cloud of dust has risen, and draws nearer and grows larger, and the pattering of many hoofs grows louder, and in and out between the scattered groups of drawn-up men, there pushes on its way a brilliant cavalcade of gay-dressed lords and knights.
    TARGET: Långt nere längs vägen har ett litet dammoln bildats, vilket nu närmar sig och växer i storlek, och klapprandet från många hovar växer i styrka och in och ut mellan de utspridda grupperna av samlade härmän, tränger det fram en lysa

Processing Epoch 23: 100%|██████████| 349/349 [02:04<00:00,  2.79it/s, loss=1.619]


--------------------------------------------------------------------------------
    SOURCE: Harris and I appeared to be struck by it at the same instant.
    TARGET: Harris och jag verkade slås av den i ett och samma ögonblick.
 PREDICTED: Harris och jag började tro att vid detta var ett ögonblick .
--------------------------------------------------------------------------------
    SOURCE: Then our porter said he thought that must be it on the high-level platform; said he thought he knew the train.
    TARGET: Så sade vår bärare att det måste finnas vid den övre plattformen; han sade sig känna till det tåget.
 PREDICTED: Då så , sade han , att den trodde att han måste vara framme ; den sortens namn sade han , även han trodde att helt enkelt skulle bli .
--------------------------------------------------------------------------------


Processing Epoch 24: 100%|██████████| 349/349 [02:04<00:00,  2.79it/s, loss=2.138]


--------------------------------------------------------------------------------
    SOURCE: We began to understand the sufferings of the Babes in the Wood.
    TARGET: Vi började nu begripa the Babes in the Wood lidanden.
 PREDICTED: Vi började sova , vilka i .
--------------------------------------------------------------------------------
    SOURCE: Harris and I would go down in the morning, and take the boat up to Chertsey, and George, who would not be able to get away from the City till the afternoon (George goes to sleep at a bank from ten to four each day, except Saturdays, when they wake him up and put him outside at two), would meet us there.
    TARGET: Harris och jag skulle resa ner på morgonen och ta båten upp till Chertsey och George, som inte skulle kunna komma iväg från City förrän om eftermiddagen (George sover på en bank mellan tio och fyra varje dag, utom lördagar, då de väcker honom och kastar ut honom vid två), skulle möta oss där.
 PREDICTED: Harris och jag brukad

Processing Epoch 25: 100%|██████████| 349/349 [02:04<00:00,  2.79it/s, loss=1.690]


--------------------------------------------------------------------------------
    SOURCE: The hired up-river boat very soon puts a stop to any nonsense of that sort on the part of its occupants.
    TARGET: Hyrbåtarna uppströms befriar snart sina hyresgästers sinnen från alla dylika dumheter.
 PREDICTED: Floden är en liten från floden , för att tala om att tala med den saken , som man såg på marken , av att man skall fa tag .
--------------------------------------------------------------------------------
    SOURCE: It was not a beautiful face; it was too prematurely aged-looking, too thin and drawn, to be that; but it was a gentle, lovable face, in spite of its stamp of pinch and poverty, and upon it was that look of restful peace that comes to the faces of the sick sometimes when at last the pain has left them.
    TARGET: Det var inget vackert ansikte; det var alltför präglat av sorger och svårigheter, alltför tunt och fårat för att vara det; men det var ett vänligt, kärleksfull

Processing Epoch 26: 100%|██████████| 349/349 [02:05<00:00,  2.79it/s, loss=1.788]


--------------------------------------------------------------------------------
    SOURCE: They finally rested upon a dusty old glass-case, fixed very high up above the chimney-piece, and containing a trout.
    TARGET: Till sist kom de att vila på ett dammigt vitrinskåp, uppsatt mycket högt ovanför rökgången, innehållande en forell.
 PREDICTED: En del av en ångslup , endast en liten av som upp längs floden och , med ett par åror som svar .
--------------------------------------------------------------------------------
    SOURCE: I expect that machine must have been referring to the following spring.
    TARGET: Jag antar att maskinen måste ha avsett den följande våren.
 PREDICTED: Jag tror att det måste ha som till .
--------------------------------------------------------------------------------


Processing Epoch 27: 100%|██████████| 349/349 [02:04<00:00,  2.79it/s, loss=1.547]


--------------------------------------------------------------------------------
    SOURCE: A dense crowd watched the entertainment from Kew Bridge with much interest, and everybody shouted out to them different directions.
    TARGET: En stor folkskara betraktade nöjet från Kew Bridge med stort intresse och alla skrek olika goda råd åt dem.
 PREDICTED: En från grupp till Walton och med en stilla så skedde och de brukade hålla dem .
--------------------------------------------------------------------------------
    SOURCE: After beefsteak and porter, it says, "Sleep!"
    TARGET: Efter biffstek och porter säger den: ”Sov!”
 PREDICTED: Efter varma och säger den säger den : ” Var !”
--------------------------------------------------------------------------------


Processing Epoch 28: 100%|██████████| 349/349 [02:04<00:00,  2.79it/s, loss=1.451]


--------------------------------------------------------------------------------
    SOURCE: I tried to do so once.
    TARGET: Det försökte jag en gång.
 PREDICTED: Jag försökte att säga att en gång gång .
--------------------------------------------------------------------------------
    SOURCE: I woke Harris, and told him.
    TARGET: Jag väckte Harris och berättade det för honom.
 PREDICTED: Jag blev vild och Harris , och sade honom .
--------------------------------------------------------------------------------


Processing Epoch 29: 100%|██████████| 349/349 [02:04<00:00,  2.80it/s, loss=1.568]


--------------------------------------------------------------------------------
    SOURCE: "Why, our boat's gone off!" they replied in an indignant tone. "We just got out to disentangle the tow-line, and when we looked round, it was gone!"
    TARGET: ”Jo, vår båt har åkt sin väg!” svarade de, indignerat. ”Vi klev ur för att trassla upp draglinan och då vi såg os om, var den försvunnen!”
 PREDICTED: ” Men , i vår båt !” har de ställt till med ett par gånger om , ” vi tar in vid årorna och pekade den på väg mot tiden , då vi var , var det endast !”
--------------------------------------------------------------------------------
    SOURCE: Nobody spoke.
    TARGET: Ingen sade något.
 PREDICTED: Ingen talade .
--------------------------------------------------------------------------------


**Translation**

In [None]:
from pathlib import Path
#from config import get_config, latest_weights_file_path
#from model import build_transformer
from tokenizers import Tokenizer
from datasets import load_dataset
#from dataset import BilingualDataset
import torch
import sys

def translate(sentence: str):
    """Greedily translate one source sentence with the latest saved checkpoint.

    The tokenizers, model and weights are (re)loaded on every call from the
    paths described by ``get_config()``. Each predicted token is printed as
    soon as it is chosen, and the full decoded string is returned at the end.

    Args:
        sentence: Text to translate. If it is an ``int`` or a string made only
            of digits, it is used as an index into the dataset instead: the
            sample's source text is translated and its reference translation
            is printed next to the prediction.

    Returns:
        The predicted token ids decoded back to text by the target tokenizer.

    Raises:
        ValueError: If the tokenized sentence plus [SOS]/[EOS] does not fit in
            ``config['seq_len']`` positions.
    """
    # Define the device, tokenizers, and model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)
    config = get_config()
    tokenizer_src = Tokenizer.from_file(str(Path(config['tokenizer_file'].format(config['lang_src']))))
    tokenizer_tgt = Tokenizer.from_file(str(Path(config['tokenizer_file'].format(config['lang_tgt']))))
    model = build_transformer(tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size(), config["seq_len"], config['seq_len'], d_model=config['d_model']).to(device)

    # Load the pretrained weights. map_location lets a checkpoint that was
    # saved on a GPU be loaded on a CPU-only machine as well.
    model_filename = latest_weights_file_path(config)
    state = torch.load(model_filename, map_location=device)
    model.load_state_dict(state['model_state_dict'])

    # if the sentence is a number use it as an index to the test set
    label = ""
    sample_id = None
    if isinstance(sentence, int) or sentence.isdigit():
        sample_id = int(sentence)
        ds = load_dataset(f"{config['datasource']}", f"{config['lang_src']}-{config['lang_tgt']}", split='all')
        ds = BilingualDataset(ds, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])
        sentence = ds[sample_id]['src_text']
        label = ds[sample_id]["tgt_text"]
    seq_len = config['seq_len']

    # Tokenize up front so that an over-long input fails with a clear message
    # instead of silently producing a source longer than seq_len.
    source_ids = tokenizer_src.encode(sentence).ids
    num_padding = seq_len - len(source_ids) - 2  # -2 for [SOS] and [EOS]
    if num_padding < 0:
        raise ValueError(
            f"Sentence is too long: {len(source_ids)} tokens do not fit in seq_len={seq_len}"
        )

    src_sos_id = tokenizer_src.token_to_id('[SOS]')
    src_eos_id = tokenizer_src.token_to_id('[EOS]')
    src_pad_id = tokenizer_src.token_to_id('[PAD]')
    tgt_sos_id = tokenizer_tgt.token_to_id('[SOS]')
    tgt_eos_id = tokenizer_tgt.token_to_id('[EOS]')

    # translate the sentence
    model.eval()
    with torch.no_grad():
        # Precompute the encoder output and reuse it for every generation step
        source = torch.tensor(
            [src_sos_id] + source_ids + [src_eos_id] + [src_pad_id] * num_padding,
            dtype=torch.int64,
            device=device,
        )
        # 1 marks a real token, 0 marks padding; shape (1, 1, seq_len)
        source_mask = (source != src_pad_id).unsqueeze(0).unsqueeze(0).int()
        # Feed the encoder an explicit batch of one: (1, seq_len)
        encoder_output = model.encode(source.unsqueeze(0), source_mask)

        # Initialize the decoder input with the sos token
        decoder_input = torch.full((1, 1), tgt_sos_id, dtype=source.dtype, device=device)

        # Print the source sentence and target start prompt
        if label != "": print(f"{'ID: ':>12}{sample_id}")
        print(f"{'SOURCE: ':>12}{sentence}")
        if label != "": print(f"{'TARGET: ':>12}{label}")
        print(f"{'PREDICTED: ':>12}", end='')

        # Generate the translation word by word
        while decoder_input.size(1) < seq_len:
            # Causal mask for the tokens generated so far. It follows the same
            # convention as source_mask (1 = position may be attended to), so
            # it must be LOWER triangular: each position sees itself and the
            # past. The previous upper-triangular mask marked only the future
            # positions as visible, which is the opposite of what is intended.
            cur_len = decoder_input.size(1)
            decoder_mask = torch.tril(
                torch.ones((1, cur_len, cur_len), dtype=source_mask.dtype, device=device)
            )
            out = model.decode(encoder_output, source_mask, decoder_input, decoder_mask)

            # project next token (greedy: take the highest-scoring entry)
            prob = model.project(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            next_id = next_word.item()
            decoder_input = torch.cat(
                [decoder_input, torch.full((1, 1), next_id, dtype=source.dtype, device=device)],
                dim=1,
            )

            # print the translated word
            print(f"{tokenizer_tgt.decode([next_id])}", end=' ')

            # break if we predict the end of sentence token
            if next_id == tgt_eos_id:
                break

        # Terminate the streamed PREDICTED line
        print()

    # convert ids to tokens
    return tokenizer_tgt.decode(decoder_input[0].tolist())

# Read the sentence from the command line when this code runs as a script.
# Inside a Jupyter/Colab kernel sys.argv[1] is the kernel's own "-f" flag, not
# user input, so the demo sentence is used there instead.
running_in_kernel = "ipykernel" in sys.modules
cli_sentence = sys.argv[1] if len(sys.argv) > 1 and not running_in_kernel else None
translate(cli_sentence if cli_sentence is not None else "I am not a very good a student.")

Using device: cuda
    SOURCE: -f
 PREDICTED:        

''

In [None]:
from pathlib import Path
import torch
import torch.nn as nn
#from config import get_config, latest_weights_file_path
#from train import get_model, get_ds, run_validation
#from translate import translate
# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Rebuild the data loaders, tokenizers and an untrained model from the config
config = get_config()
train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)
model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)

# Load the pretrained weights. map_location lets a checkpoint that was saved
# on a GPU be loaded on a CPU-only machine as well.
model_filename = latest_weights_file_path(config)
state = torch.load(model_filename, map_location=device)
model.load_state_dict(state['model_state_dict'])

# Print validation examples with the restored weights; messages go to stdout.
run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device, print, 0, 10)


Using device: cuda
Max length of source sentence: 280
Max length of target sentence: 269
--------------------------------------------------------------------------------
    SOURCE: And out he went, and left us alone.
    TARGET: Och så gick han och lämnade oss ensamma.
 PREDICTED: Och så gick han och lämnade oss ensamma .
--------------------------------------------------------------------------------
    SOURCE: "If I am guilty," said the Earl, "may this bread choke me when I eat it!"
    TARGET: ”Om jag är skyldig”, sade earlen, ”må då detta brödstycke kväva mig, då jag äter det!”
 PREDICTED: ” Om jag är skyldig ”, sade , ” må då detta mig , då jag äter det !”
--------------------------------------------------------------------------------
    SOURCE: I never saw such a thing as potato-scraping for making a fellow in a mess.
    TARGET: Jag har aldrig varit med om något värre än att borsta potatis i fråga om att ställa till en karl.
 PREDICTED: Jag har aldrig varit med om något värr

In [None]:
# Translate the same sentence as the first validation example printed above,
# so the output of translate() can be compared with the validation prediction.
t = translate("And out he went, and left us alone.")

Using device: cuda
    SOURCE: And out he went, and left us alone.
 PREDICTED: Och så gick han och lämnade ensamma oss ensamma .  

In [None]:
# An integer argument is used as a dataset index: translate() looks up that
# sample's source text and also prints its reference translation (TARGET).
t = translate(34)

Using device: cuda
        ID: 34
    SOURCE: Then I wondered how long I had to live.
    TARGET: Sa undrade jag över hur länge jag hade kvar att leva.
 PREDICTED: Sa undrade jag hur jag hade jag att leva att leva .  