In [5]:
%pip uninstall numpy

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Would remove:
    /usr/local/bin/f2py
    /usr/local/lib/python3.11/dist-packages/numpy-1.26.4.dist-info/*
    /usr/local/lib/python3.11/dist-packages/numpy.libs/libgfortran-040039e1.so.5.0.0
    /usr/local/lib/python3.11/dist-packages/numpy.libs/libopenblas64_p-r0-0cf96a72.3.23.dev.so
    /usr/local/lib/python3.11/dist-packages/numpy.libs/libquadmath-96973f99.so.0.0.0
    /usr/local/lib/python3.11/dist-packages/numpy/*
Proceed (Y/n)? y
  Successfully uninstalled numpy-1.26.4


In [1]:
%pip install "numpy<2"



In [3]:
# Use Python 3.9 environment (ensure you're running Python 3.9)
%pip install torch==2.0.1
%pip install torchvision==0.15.2
%pip install torchaudio==2.0.2
%pip install torchtext==0.15.2
%pip install datasets==2.15.0
%pip install tokenizers==0.21.1  # Fixing tokenizers version issue
%pip install torchmetrics==1.0.3
%pip install tensorboard==2.18.0  # Fixing TensorBoard version issue
%pip install altair==5.1.1
%pip install wandb==0.15.9
%pip install numpy==1.26.4  # Ensure NumPy is installed properly

[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
Collecting tokenizers==0.21.1
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.21.1
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use u

In [1]:
import torch
import numpy
print(torch.tensor([1, 2, 3]).numpy())  # Should not give an error

[1 2 3]


In [2]:
numpy.__version__

'1.26.4'

In [7]:
from pathlib import Path

def get_config():
    return {
        "batch_size": 8,
        "num_epochs": 50,
        "lr": 10**-4,
        "seq_len": 500,
        "d_model": 512,
        "datasource": 'opus_books',
        "lang_src": "en",
        "lang_tgt": "fr",
        "model_folder": "weights",
        "model_basename": "tmodel_",
        "preload": "latest",
        "tokenizer_file": "tokenizer_{0}.json",
        "experiment_name": "runs/tmodel"
    }

def get_weights_file_path(config, epoch: str):
    model_folder = f"{config['datasource']}_{config['model_folder']}"
    model_filename = f"{config['model_basename']}{epoch}.pt"
    return str(Path('.') / model_folder / model_filename)

# Find the latest weights file in the weights folder
def latest_weights_file_path(config):
    model_folder = f"{config['datasource']}_{config['model_folder']}"
    model_filename = f"{config['model_basename']}*"
    weights_files = list(Path(model_folder).glob(model_filename))
    if len(weights_files) == 0:
        return None
    weights_files.sort()
    return str(weights_files[-1])

In [8]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset

class BilingualDataset(Dataset):

    def __init__(self, ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len):
        super().__init__()
        self.seq_len = seq_len

        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        self.sos_token = torch.tensor([tokenizer_tgt.token_to_id("[SOS]")], dtype=torch.int64)
        self.eos_token = torch.tensor([tokenizer_tgt.token_to_id("[EOS]")], dtype=torch.int64)
        self.pad_token = torch.tensor([tokenizer_tgt.token_to_id("[PAD]")], dtype=torch.int64)

    def __len__(self):
        return len(self.ds)

    def __getitem__(self, idx):
        src_target_pair = self.ds[idx]
        src_text = src_target_pair['translation'][self.src_lang]
        tgt_text = src_target_pair['translation'][self.tgt_lang]

        # Transform the text into tokens
        enc_input_tokens = self.tokenizer_src.encode(src_text).ids
        dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids

        # Add sos, eos and padding to each sentence
        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2  # We will add <s> and </s>
        # We will only add <s>, and </s> only on the label
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1

        # Make sure the number of padding tokens is not negative. If it is, the sentence is too long
        if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
            raise ValueError("Sentence is too long")

        # Add <s> and </s> token
        encoder_input = torch.cat(
            [
                self.sos_token,
                torch.tensor(enc_input_tokens, dtype=torch.int64),
                self.eos_token,
                torch.tensor([self.pad_token] * enc_num_padding_tokens, dtype=torch.int64),
            ],
            dim=0,
        )

        # Add only <s> token
        decoder_input = torch.cat(
            [
                self.sos_token,
                torch.tensor(dec_input_tokens, dtype=torch.int64),
                torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype=torch.int64),
            ],
            dim=0,
        )

        # Add only </s> token
        label = torch.cat(
            [
                torch.tensor(dec_input_tokens, dtype=torch.int64),
                self.eos_token,
                torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype=torch.int64),
            ],
            dim=0,
        )

        # Double check the size of the tensors to make sure they are all seq_len long
        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        return {
            "encoder_input": encoder_input,  # (seq_len)
            "decoder_input": decoder_input,  # (seq_len)
            "encoder_mask": (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(), # (1, 1, seq_len)
            "decoder_mask": (decoder_input != self.pad_token).unsqueeze(0).int() & causal_mask(decoder_input.size(0)), # (1, seq_len) & (1, seq_len, seq_len),
            "label": label,  # (seq_len)
            "src_text": src_text,
            "tgt_text": tgt_text,
        }

def causal_mask(size):
    mask = torch.triu(torch.ones((1, size, size)), diagonal=1).type(torch.int)
    return mask == 0

In [9]:
import torch
import torch.nn as nn
import math

class LayerNormalization(nn.Module):

    def __init__(self, features: int, eps:float=10**-6) -> None:
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(features)) # alpha is a learnable parameter
        self.bias = nn.Parameter(torch.zeros(features)) # bias is a learnable parameter

    def forward(self, x):
        # x: (batch, seq_len, hidden_size)
         # Keep the dimension for broadcasting
        mean = x.mean(dim = -1, keepdim = True) # (batch, seq_len, 1)
        # Keep the dimension for broadcasting
        std = x.std(dim = -1, keepdim = True) # (batch, seq_len, 1)
        # eps is to prevent dividing by zero or when std is very small
        return self.alpha * (x - mean) / (std + self.eps) + self.bias

class FeedForwardBlock(nn.Module):

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff) # w1 and b1
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model) # w2 and b2

    def forward(self, x):
        # (batch, seq_len, d_model) --> (batch, seq_len, d_ff) --> (batch, seq_len, d_model)
        return self.linear_2(self.dropout(torch.relu(self.linear_1(x))))

class InputEmbeddings(nn.Module):

    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        # (batch, seq_len) --> (batch, seq_len, d_model)
        # Multiply by sqrt(d_model) to scale the embeddings according to the paper
        return self.embedding(x) * math.sqrt(self.d_model)

class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)
        # Create a matrix of shape (seq_len, d_model)
        pe = torch.zeros(seq_len, d_model)
        # Create a vector of shape (seq_len)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1) # (seq_len, 1)
        # Create a vector of shape (d_model)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) # (d_model / 2)
        # Apply sine to even indices
        pe[:, 0::2] = torch.sin(position * div_term) # sin(position * (10000 ** (2i / d_model))
        # Apply cosine to odd indices
        pe[:, 1::2] = torch.cos(position * div_term) # cos(position * (10000 ** (2i / d_model))
        # Add a batch dimension to the positional encoding
        pe = pe.unsqueeze(0) # (1, seq_len, d_model)
        # Register the positional encoding as a buffer
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + (self.pe[:, :x.shape[1], :]).requires_grad_(False) # (batch, seq_len, d_model)
        return self.dropout(x)

class ResidualConnection(nn.Module):

        def __init__(self, features: int, dropout: float) -> None:
            super().__init__()
            self.dropout = nn.Dropout(dropout)
            self.norm = LayerNormalization(features)

        def forward(self, x, sublayer):
            return x + self.dropout(sublayer(self.norm(x)))

class MultiHeadAttentionBlock(nn.Module):

    def __init__(self, d_model: int, h: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model # Embedding vector size
        self.h = h # Number of heads
        # Make sure d_model is divisible by h
        assert d_model % h == 0, "d_model is not divisible by h"

        self.d_k = d_model // h # Dimension of vector seen by each head
        self.w_q = nn.Linear(d_model, d_model, bias=False) # Wq
        self.w_k = nn.Linear(d_model, d_model, bias=False) # Wk
        self.w_v = nn.Linear(d_model, d_model, bias=False) # Wv
        self.w_o = nn.Linear(d_model, d_model, bias=False) # Wo
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        d_k = query.shape[-1]
        # Just apply the formula from the paper
        # (batch, h, seq_len, d_k) --> (batch, h, seq_len, seq_len)
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            # Write a very low value (indicating -inf) to the positions where mask == 0
            attention_scores.masked_fill_(mask == 0, -1e9)
        attention_scores = attention_scores.softmax(dim=-1) # (batch, h, seq_len, seq_len) # Apply softmax
        if dropout is not None:
            attention_scores = dropout(attention_scores)
        # (batch, h, seq_len, seq_len) --> (batch, h, seq_len, d_k)
        # return attention scores which can be used for visualization
        return (attention_scores @ value), attention_scores

    def forward(self, q, k, v, mask):
        query = self.w_q(q) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        key = self.w_k(k) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        value = self.w_v(v) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)

        # (batch, seq_len, d_model) --> (batch, seq_len, h, d_k) --> (batch, h, seq_len, d_k)
        query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1, 2)
        value = value.view(value.shape[0], value.shape[1], self.h, self.d_k).transpose(1, 2)

        # Calculate attention
        x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)

        # Combine all the heads together
        # (batch, h, seq_len, d_k) --> (batch, seq_len, h, d_k) --> (batch, seq_len, d_model)
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)

        # Multiply by Wo
        # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        return self.w_o(x)

class EncoderBlock(nn.Module):

    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(2)])

    def forward(self, x, src_mask):
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, src_mask))
        x = self.residual_connections[1](x, self.feed_forward_block)
        return x

class Encoder(nn.Module):

    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

class DecoderBlock(nn.Module):

    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, cross_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(3)])

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, tgt_mask))
        x = self.residual_connections[1](x, lambda x: self.cross_attention_block(x, encoder_output, encoder_output, src_mask))
        x = self.residual_connections[2](x, self.feed_forward_block)
        return x

class Decoder(nn.Module):

    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, encoder_output, src_mask, tgt_mask)
        return self.norm(x)

class ProjectionLayer(nn.Module):

    def __init__(self, d_model, vocab_size) -> None:
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x) -> None:
        # (batch, seq_len, d_model) --> (batch, seq_len, vocab_size)
        return self.proj(x)

class Transformer(nn.Module):

    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: InputEmbeddings, tgt_embed: InputEmbeddings, src_pos: PositionalEncoding, tgt_pos: PositionalEncoding, projection_layer: ProjectionLayer) -> None:
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):
        # (batch, seq_len, d_model)
        src = self.src_embed(src)
        src = self.src_pos(src)
        return self.encoder(src, src_mask)

    def decode(self, encoder_output: torch.Tensor, src_mask: torch.Tensor, tgt: torch.Tensor, tgt_mask: torch.Tensor):
        # (batch, seq_len, d_model)
        tgt = self.tgt_embed(tgt)
        tgt = self.tgt_pos(tgt)
        return self.decoder(tgt, encoder_output, src_mask, tgt_mask)

    def project(self, x):
        # (batch, seq_len, vocab_size)
        return self.projection_layer(x)

def build_transformer(src_vocab_size: int, tgt_vocab_size: int, src_seq_len: int, tgt_seq_len: int, d_model: int=512, N: int=6, h: int=8, dropout: float=0.1, d_ff: int=2048) -> Transformer:
    # Create the embedding layers
    src_embed = InputEmbeddings(d_model, src_vocab_size)
    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)

    # Create the positional encoding layers
    src_pos = PositionalEncoding(d_model, src_seq_len, dropout)
    tgt_pos = PositionalEncoding(d_model, tgt_seq_len, dropout)

    # Create the encoder blocks
    encoder_blocks = []
    for _ in range(N):
        encoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        encoder_block = EncoderBlock(d_model, encoder_self_attention_block, feed_forward_block, dropout)
        encoder_blocks.append(encoder_block)

    # Create the decoder blocks
    decoder_blocks = []
    for _ in range(N):
        decoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        decoder_cross_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        decoder_block = DecoderBlock(d_model, decoder_self_attention_block, decoder_cross_attention_block, feed_forward_block, dropout)
        decoder_blocks.append(decoder_block)

    # Create the encoder and decoder
    encoder = Encoder(d_model, nn.ModuleList(encoder_blocks))
    decoder = Decoder(d_model, nn.ModuleList(decoder_blocks))

    # Create the projection layer
    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)

    # Create the transformer
    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer)

    # Initialize the parameters
    for p in transformer.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    return transformer

In [None]:
# from model import build_transformer
# from dataset import BilingualDataset, causal_mask
# from config import get_config, get_weights_file_path, latest_weights_file_path

import torchtext.datasets as datasets
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim.lr_scheduler import LambdaLR

import warnings
from tqdm import tqdm
import os
from pathlib import Path
import numpy

# Huggingface datasets and tokenizers
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

import torchmetrics
from torch.utils.tensorboard import SummaryWriter

def greedy_decode(model, source, source_mask, tokenizer_src, tokenizer_tgt, max_len, device):
    sos_idx = tokenizer_tgt.token_to_id('[SOS]')
    eos_idx = tokenizer_tgt.token_to_id('[EOS]')

    # Precompute the encoder output and reuse it for every step
    encoder_output = model.encode(source, source_mask)
    # Initialize the decoder input with the sos token
    decoder_input = torch.empty(1, 1).fill_(sos_idx).type_as(source).to(device)
    while True:
        if decoder_input.size(1) == max_len:
            break

        # build mask for target
        decoder_mask = causal_mask(decoder_input.size(1)).type_as(source_mask).to(device)

        # calculate output
        out = model.decode(encoder_output, source_mask, decoder_input, decoder_mask)

        # get next token
        prob = model.project(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        decoder_input = torch.cat(
            [decoder_input, torch.empty(1, 1).type_as(source).fill_(next_word.item()).to(device)], dim=1
        )

        if next_word == eos_idx:
            break

    return decoder_input.squeeze(0)


def run_validation(model, validation_ds, tokenizer_src, tokenizer_tgt, max_len, device, print_msg, global_step, writer, num_examples=2):
    model.eval()
    count = 0

    source_texts = []
    expected = []
    predicted = []

    try:
        # get the console window width
        with os.popen('stty size', 'r') as console:
            _, console_width = console.read().split()
            console_width = int(console_width)
    except:
        # If we can't get the console width, use 80 as default
        console_width = 80

    with torch.no_grad():
        for batch in validation_ds:
            count += 1
            encoder_input = batch["encoder_input"].to(device) # (b, seq_len)
            encoder_mask = batch["encoder_mask"].to(device) # (b, 1, 1, seq_len)

            # check that the batch size is 1
            assert encoder_input.size(
                0) == 1, "Batch size must be 1 for validation"

            model_out = greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)

            source_text = batch["src_text"][0]
            target_text = batch["tgt_text"][0]
            model_out_text = tokenizer_tgt.decode(model_out.detach().cpu().numpy())

            source_texts.append(source_text)
            expected.append(target_text)
            predicted.append(model_out_text)

            # Print the source, target and model output
            print_msg('-'*console_width)
            print_msg(f"{f'SOURCE: ':>12}{source_text}")
            print_msg(f"{f'TARGET: ':>12}{target_text}")
            print_msg(f"{f'PREDICTED: ':>12}{model_out_text}")

            if count == num_examples:
                print_msg('-'*console_width)
                break

    if writer:
        # Evaluate the character error rate
        # Compute the char error rate
        metric = torchmetrics.CharErrorRate()
        cer = metric(predicted, expected)
        writer.add_scalar('validation cer', cer, global_step)
        writer.flush()

        # Compute the word error rate
        metric = torchmetrics.WordErrorRate()
        wer = metric(predicted, expected)
        writer.add_scalar('validation wer', wer, global_step)
        writer.flush()

        # Compute the BLEU metric
        metric = torchmetrics.BLEUScore()
        bleu = metric(predicted, expected)
        writer.add_scalar('validation BLEU', bleu, global_step)
        writer.flush()

def get_all_sentences(ds, lang):
    for item in ds:
        yield item['translation'][lang]

def get_or_build_tokenizer(config, ds, lang):
    tokenizer_path = Path(config['tokenizer_file'].format(lang))
    if not Path.exists(tokenizer_path):
        # Most code taken from: https://huggingface.co/docs/tokenizers/quicktour
        tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
        tokenizer.pre_tokenizer = Whitespace()
        trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"], min_frequency=2)
        tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)
        tokenizer.save(str(tokenizer_path))
    else:
        tokenizer = Tokenizer.from_file(str(tokenizer_path))
    return tokenizer

def get_ds(config):
    # It only has the train split, so we divide it overselves
    ds_raw = load_dataset(f"{config['datasource']}", f"{config['lang_src']}-{config['lang_tgt']}", split='train')

    # Build tokenizers
    tokenizer_src = get_or_build_tokenizer(config, ds_raw, config['lang_src'])
    tokenizer_tgt = get_or_build_tokenizer(config, ds_raw, config['lang_tgt'])

    # Keep 90% for training, 10% for validation
    train_ds_size = int(0.9 * len(ds_raw))
    val_ds_size = len(ds_raw) - train_ds_size
    train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size])

    train_ds = BilingualDataset(train_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])
    val_ds = BilingualDataset(val_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])

    # Find the maximum length of each sentence in the source and target sentence
    max_len_src = 0
    max_len_tgt = 0

    for item in ds_raw:
        src_ids = tokenizer_src.encode(item['translation'][config['lang_src']]).ids
        tgt_ids = tokenizer_tgt.encode(item['translation'][config['lang_tgt']]).ids
        max_len_src = max(max_len_src, len(src_ids))
        max_len_tgt = max(max_len_tgt, len(tgt_ids))

    print(f'Max length of source sentence: {max_len_src}')
    print(f'Max length of target sentence: {max_len_tgt}')


    train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
    val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)

    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt

def get_model(config, vocab_src_len, vocab_tgt_len):
    model = build_transformer(vocab_src_len, vocab_tgt_len, config["seq_len"], config['seq_len'], d_model=config['d_model'])
    return model

def train_model(config):
    # Define the device
    device = "cuda" if torch.cuda.is_available() else "mps" if torch.has_mps or torch.backends.mps.is_available() else "cpu"
    print("Using device:", device)
    if (device == 'cuda'):
        print(f"Device name: {torch.cuda.get_device_name(device.index)}")
        print(f"Device memory: {torch.cuda.get_device_properties(device.index).total_memory / 1024 ** 3} GB")
    elif (device == 'mps'):
        print(f"Device name: <mps>")
    else:
        print("NOTE: If you have a GPU, consider using it for training.")
        print("      On a Windows machine with NVidia GPU, check this video: https://www.youtube.com/watch?v=GMSjDTU8Zlc")
        print("      On a Mac machine, run: pip3 install --pre torch torchvision torchaudio torchtext --index-url https://download.pytorch.org/whl/nightly/cpu")
    device = torch.device(device)

    # Make sure the weights folder exists
    Path(f"{config['datasource']}_{config['model_folder']}").mkdir(parents=True, exist_ok=True)

    train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)
    model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)
    # Tensorboard
    writer = SummaryWriter(config['experiment_name'])

    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)

    # If the user specified a model to preload before training, load it
    initial_epoch = 0
    global_step = 0
    preload = config['preload']
    model_filename = latest_weights_file_path(config) if preload == 'latest' else get_weights_file_path(config, preload) if preload else None
    if model_filename:
        print(f'Preloading model {model_filename}')
        state = torch.load(model_filename)
        model.load_state_dict(state['model_state_dict'])
        initial_epoch = state['epoch'] + 1
        optimizer.load_state_dict(state['optimizer_state_dict'])
        global_step = state['global_step']
    else:
        print('No model to preload, starting from scratch')

    loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_src.token_to_id('[PAD]'), label_smoothing=0.1).to(device)

    for epoch in range(initial_epoch, config['num_epochs']):
        torch.cuda.empty_cache()
        model.train()
        batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")
        for batch in batch_iterator:

            encoder_input = batch['encoder_input'].to(device) # (b, seq_len)
            decoder_input = batch['decoder_input'].to(device) # (B, seq_len)
            encoder_mask = batch['encoder_mask'].to(device) # (B, 1, 1, seq_len)
            decoder_mask = batch['decoder_mask'].to(device) # (B, 1, seq_len, seq_len)

            # Run the tensors through the encoder, decoder and the projection layer
            encoder_output = model.encode(encoder_input, encoder_mask) # (B, seq_len, d_model)
            decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) # (B, seq_len, d_model)
            proj_output = model.project(decoder_output) # (B, seq_len, vocab_size)

            # Compare the output with the label
            label = batch['label'].to(device) # (B, seq_len)

            # Compute the loss using a simple cross entropy
            loss = loss_fn(proj_output.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1))
            batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

            # Log the loss
            writer.add_scalar('train loss', loss.item(), global_step)
            writer.flush()

            # Backpropagate the loss
            loss.backward()

            # Update the weights
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)

            global_step += 1

        # Run validation at the end of every epoch
        run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device, lambda msg: batch_iterator.write(msg), global_step, writer)

        # Save the model at the end of every epoch
        model_filename = get_weights_file_path(config, f"{epoch:02d}")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'global_step': global_step
        }, model_filename)


if __name__ == '__main__':
    warnings.filterwarnings("ignore")
    config = get_config()
    train_model(config)

Using device: cuda
Device name: NVIDIA GeForce RTX 4090
Device memory: 23.64971923828125 GB
Max length of source sentence: 471
Max length of target sentence: 482
No model to preload, starting from scratch


Processing Epoch 00: 100%|██████████| 14297/14297 [25:32<00:00,  9.33it/s, loss=4.784]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: – Pas la première parole ; j’ai quitté Madame à six heures, joyeuse et contente.
    TARGET: "Not a word; I left the Signora at six o'clock, happy and content."
 PREDICTED: " This the ; I shall have been a six or , or , and we shall have been .
--------------------------------------------------------------------------------
    SOURCE: They owed Lestiboudois for so many days. Then the child grew cold and asked for her mother.
    TARGET: Puis l’enfant avait froid et demandait sa mère.
 PREDICTED: Ils étaient plus tard , si la pauvre fille était encore , et le jeune garçon lui demanda .
--------------------------------------------------------------------------------


Processing Epoch 01: 100%|██████████| 14297/14297 [25:33<00:00,  9.32it/s, loss=4.147]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: By God it is!"
    TARGET: Nom de Dieu de nom de Dieu!
 PREDICTED: À Dieu ! c ' est toi !
--------------------------------------------------------------------------------
    SOURCE: "Ha!
    TARGET: – Ah !
 PREDICTED: -- Ah !
--------------------------------------------------------------------------------


Processing Epoch 02: 100%|██████████| 14297/14297 [25:32<00:00,  9.33it/s, loss=3.551]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: But Camille insisted on these Sunday outings, which gave him the satisfaction of showing off his wife. When he met a colleague, particularly one of his chiefs, he felt quite proud to exchange bows with him, in the company of Madame.
    TARGET: Mais Camille tenait bon; il aimait à montrer sa femme; lorsqu'il rencontrait un de ses collègues, un de ses chefs surtout, il était tout fier d'échanger un salut avec lui, en compagnie de madame.
 PREDICTED: Mais Camille insista sur ces , qui lui donna la satisfaction de faire suivre sa femme ; et , comme il vit un voleur , il sentait tout fier à lui , à l ' avance , à la compagnie de madame .
--------------------------------------------------------------------------------
    SOURCE: He, on the contrary is yearning to take you by the hand, and talk to you.
    TARGET: Lui, au contraire, est désireux de vous serrer la main et de causer avec vous.
 PREDIC

Processing Epoch 03: 100%|██████████| 14297/14297 [25:32<00:00,  9.33it/s, loss=3.781]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: Then she went on to tell me how she very luckily fell into a good family, where, behaving herself well, and her mistress dying, her master married her, by whom she had my husband and his sister, and that by her diligence and good management after her husband's death, she had improved the plantations to such a degree as they then were, so that most of the estate was of her getting, not her husband's, for she had been a widow upwards of sixteen years.
    TARGET: Puis elle continua à me raconter comment elle était tombée entre les mains d'une bonne famille, où, par sa bonne conduite, sa maîtresse étant morte, son maître l'avait épousée, et c'est de lui qu'elle avait eu mon mari et ma soeur; et comment, par sa diligence et son bon gouvernement, après la mort de son mari, elle avait amélioré les plantations à un point qu'elles n'avaient pas atteint jusque-là, si bien que la plus grande partie des t

Processing Epoch 04: 100%|██████████| 14297/14297 [25:34<00:00,  9.32it/s, loss=3.373]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: "But, first of all, it was necessary that I should make arrangements for the wife and the son, of whose existence you and my other friends were ignorant.
    TARGET: «Mais il fallait avant tout prendre des mesures au sujet de ma femme et de mon fils dont vous et mes autres amis ignoriez l'existence.
 PREDICTED: -- Mais , d ' abord , il me fallait que je pour la femme et le fils de l ' existence dont vous et mes amis ont été .
--------------------------------------------------------------------------------
    SOURCE: Il lui croyait beaucoup d’esprit et il était furieux de ce qu’elle s’obstinait évidemment à ne pas ouvrir un avis.
    TARGET: he credited her with great intelligence, and was furious at her evident refusal to offer him any advice.
 PREDICTED: He thought he felt great mind , and he was a fury of this which she was to herself , not a advice .
----------------------------------------

Processing Epoch 05: 100%|██████████| 14297/14297 [25:33<00:00,  9.32it/s, loss=3.045]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: "But if you saw him so seldom and wrote so seldom, how did he know enough about your affairs to be able to help you, as you say that he has done?"
    TARGET: – Mais si vous l’avez vu et lui avez écrit si rarement, comment en savait-il assez sur vos affaires pour vous aider ? »
 PREDICTED: -- Mais si vous l ' avez vu tant de fois , et il a écrit si rarement , comment a - t - il su que vous alliez vous aider , comme vous dites qu ' il a fait ?
--------------------------------------------------------------------------------
    SOURCE: Do you know what I did that evening, after the Opera Comique?"
    TARGET: Savez-vous ce que j'ai fait le soir de l'Opéra-Comique?
 PREDICTED: Savez - vous ce que j ' ai fait ce soir , après l ' Opéra - Comique ?
--------------------------------------------------------------------------------


Processing Epoch 06: 100%|██████████| 14297/14297 [25:34<00:00,  9.32it/s, loss=3.088]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: "To-morrow work begins again at the Voreux.
    TARGET: —C'est demain que le travail reprend au Voreux.
 PREDICTED: — A demain , le Voreux commence a nouveau au Voreux .
--------------------------------------------------------------------------------
    SOURCE: A dim line of ancestors, in every variety of dress, from the Elizabethan knight to the buck of the Regency, stared down upon us and daunted us by their silent company.
    TARGET: Toute une rangée d’ancêtres, dans une bizarre variété de costumes, depuis le chevalier élisabéthain jusqu’au dandy de la Régence, plongeaient leurs regards fixes sur nous et nous impressionnaient par leur présence silencieuse.
 PREDICTED: Une ligne de ancêtres , dans toutes les de la , du chevalier du , nous regardait et nous par leurs cris de compagnie .
--------------------------------------------------------------------------------


Processing Epoch 07: 100%|██████████| 14297/14297 [25:35<00:00,  9.31it/s, loss=2.804]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: The river still measured from sixty to seventy feet in breadth, and its bed from five to six feet in depth.
    TARGET: La rivière mesurait encore soixante à soixante-dix pieds de large, et son lit cinq à six pieds de profondeur.
 PREDICTED: La Tamise mesurait encore soixante - dix pieds de largeur , et son lit de cinq à six pieds de profondeur .
--------------------------------------------------------------------------------
    SOURCE: The two men were separated, while Chaval, who was quite calm, only repeated:
    TARGET: On se précipita entre les deux hommes, tandis que Chaval, tres calme, répétait:
 PREDICTED: Les deux hommes se séparaient , tandis que Chaval , tres calme , répétait :
--------------------------------------------------------------------------------


Processing Epoch 08: 100%|██████████| 14297/14297 [25:34<00:00,  9.32it/s, loss=2.975]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: There was in the two ways in which "Master Jacques" was pronounced on the one hand, and the "master" by preeminence on the other, the difference between monseigneur and monsieur, between ~domine~ and ~domne~.
    TARGET: Il y avait dans les deux manières dont fut prononcé d’une part ce maître Jacques, de l’autre ce maître par excellence, la différence du monseigneur au monsieur, du domine au domne.
 PREDICTED: Il y avait dans les deux façons que maître Jacques Jacques , la main , et le maître de , l ’ autre , la différence entre Monseigneur et M . le , entre et .
--------------------------------------------------------------------------------
    SOURCE: I want you to tell me whether you love me."
    TARGET: Je veux que vous me disiez si vous m’aimez.
 PREDICTED: Je veux que vous me dise si vous m ' aimez .
--------------------------------------------------------------------------------


Processing Epoch 09: 100%|██████████| 14297/14297 [25:34<00:00,  9.32it/s, loss=3.116]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: Ce soir-là Fabrice entendait fort distinctement un grand nombre d’hommes passer sur le pont en fer, dit le pont de l’esclave, parce que jadis un esclave dalmate avait réussi à se sauver, en précipitant le gardien du pont dans la cour.
    TARGET: That evening Fabrizio could hear quite distinctly a considerable number of men cross the iron bridge, known as the Slave's Bridge, because once a Dalmatian slave had succeeded in escaping, by throwing the guardian of the bridge down into the court below.
 PREDICTED: This evening Fabrizio heard quite plainly a number of men who were passing on the bridge , like iron , said the bridge of the Raversi , because a actor had had managed to save himself , in the of the bridge in court .
--------------------------------------------------------------------------------
    SOURCE: Everything contributed to my joy, from the paltry pleasure of awaiting the Thursda

Processing Epoch 10: 100%|██████████| 14297/14297 [25:31<00:00,  9.33it/s, loss=2.712]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: I found the black tor upon which I had seen the solitary watcher, and from its craggy summit I looked out myself across the melancholy downs.
    TARGET: Je retrouvais le pic noir sur lequel j’avais vu le guetteur solitaire, je l’escaladai et de son sommet tourmenté je contemplai la mélancolie du paysage.
 PREDICTED: Je trouvai le pic noir sur lequel j ' avais vu le solitaire , et de sa crête je me trouvai à travers les dunes sombres .
--------------------------------------------------------------------------------
    SOURCE: Along the narrow walls of this passageway, I saw only brilliant streaks, hard lines, fiery furrows, all scrawled by our speeding electric light.
    TARGET: Sur les murailles étroites du passage, je ne voyais plus que des raies éclatantes, des lignes droites, des sillons de feu tracés par la vitesse sous l'éclat de l'électricité.
 PREDICTED: Sur les étroites parois de ce 

Processing Epoch 11: 100%|██████████| 14297/14297 [25:31<00:00,  9.34it/s, loss=2.945]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: Therefore the carpenters lost not a moment.
    TARGET: Aussi les charpentiers ne perdirent-ils pas un moment.
 PREDICTED: Donc les charpentiers ne se perdaient pas un instant .
--------------------------------------------------------------------------------
    SOURCE: CHAPTER XIV
    TARGET: CHAPITRE XIV
 PREDICTED: XIV
--------------------------------------------------------------------------------


Processing Epoch 12: 100%|██████████| 14297/14297 [25:32<00:00,  9.33it/s, loss=2.349]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: I would gladly have sent my husband away to Caroline with all our goods, and have come after myself, but this was impracticable; he would never stir without me, being himself perfectly unacquainted with the country, and with the methods of settling there or anywhere else.
    TARGET: J'aurai aimé à envoyer mon mari en Caroline pour le suivre ensuite moi-même, mais c'était impraticable, parce qu'il ne voulait pas bouger sans moi, ne connaissant nullement le pays ni la manière de s'établir en lieu que ce fut.
 PREDICTED: J ’ aurais volontiers envoyé mon mari en Caroline avec toutes nos marchandises , et j ’ aurais suivi moi - même ; mais il était impraticable ; il ne se jamais sans que je pusse en avoir connaissance avec la campagne , et avec les moyens de s ’ établir ou ailleurs .
--------------------------------------------------------------------------------
    SOURCE: "I might have known it.

Processing Epoch 13: 100%|██████████| 14297/14297 [25:32<00:00,  9.33it/s, loss=2.303]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: And why does he sign himself 'him whom you knew as Jim Harrison?'
    TARGET: Et pourquoi signe-t-il celui que vous connaissiez sous le nom de James Harrison?
 PREDICTED: Et pourquoi se fait - il signer de lui que vous connaissiez comme Jim ?
--------------------------------------------------------------------------------
    SOURCE: 'Was I not speaking the truth?' thought Julien; 'why does the love that I felt for that madwoman torment me still?'
    TARGET: Que n’ai-je dit vrai ? pensait Julien, pourquoi l’amour que j’avais pour cette folle me tourmente-t-il encore ?
 PREDICTED: N ’ étais - je pas la vérité ? pensa Julien ; pourquoi me fait - il pour cette folle supplice que je me sentais encore ?
--------------------------------------------------------------------------------


Processing Epoch 14: 100%|██████████| 14297/14297 [25:33<00:00,  9.33it/s, loss=2.927]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: "Leave the window open on his side, Carter; there is no wind--good-bye, Dick."
    TARGET: -- Laissez la fenêtre ouverte de son côté, Carter; il n'y a pas de vent. Adieu, Dick.
 PREDICTED: -- Laissons la fenêtre ouverte , Carter ; il n ' y a pas de vent , adieu , Dick , Dick .
--------------------------------------------------------------------------------
    SOURCE: Pleased with the preference of one, and offended by the neglect of the other, on the very beginning of our acquaintance, I have courted prepossession and ignorance, and driven reason away, where either were concerned. Till this moment I never knew myself."
    TARGET: Flattée de la préférence de l’un, froissée du manque d’égards de l’autre, je me suis abandonnée des le début a mes préventions et j’ai jugé l’un et l’autre en dépit du bon sens.
 PREDICTED: avec la préférence d ' un et offensé par la négliger de l ' autre , au commen

Processing Epoch 15: 100%|██████████| 14297/14297 [25:30<00:00,  9.34it/s, loss=2.455]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: '"If I were not afraid of spoiling the finest bass voice I have ever heard, I should lock you up on bread and water for a fortnight, you scoundrel."
    TARGET: – Si je ne craignais pas de gâter la plus belle voix de basse que j’aie jamais entendue, je te mettrais en prison au pain et à l’eau pour quinze jours, polisson.
 PREDICTED: Si je n ’ avais pas peur de gâter la plus belle voix que j ’ aie jamais entendue , je t ’ aurais ouvert sur le pain et l ’ eau pendant une quinzaine , drôle .
--------------------------------------------------------------------------------
    SOURCE: 'That author is most immoral,' Julien said to Madame Valenod; 'in one of his Fables on Messire Jean Chouart, he has ventured to heap ridicule on all that is most venerable.
    TARGET: – Cet auteur est bien immoral, dit Julien à Mme Valenod, certaine fable, sur messire Jean Chouart, ose déverser le ridicule sur ce qu’i

Processing Epoch 16: 100%|██████████| 14297/14297 [25:28<00:00,  9.35it/s, loss=2.371]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: Sometimes Herbert accompanied them, but never Pencroft, who could not bear to look upon the prospect of the island now so utterly devastated.
    TARGET: Quelquefois Harbert les accompagnait, jamais Pencroff, qui ne voulait pas voir sous son aspect nouveau l'île si profondément dévastée!
 PREDICTED: Parfois , Harbert les accompagnait , mais jamais Pencroff , qui ne pouvait tenir en place de la perspective de l ' île si mortellement dévastés .
--------------------------------------------------------------------------------
    SOURCE: Then she would knowand understand everything.
    TARGET: Alors elle saurait et comprendrait tout!
 PREDICTED: Alors elle tout .
--------------------------------------------------------------------------------


Processing Epoch 17: 100%|██████████| 14297/14297 [25:28<00:00,  9.36it/s, loss=2.441]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: Ceci fait, trouvant sans doute qu’il avait achevé sa tâche, il remit son metre et sa loupe dans sa poche.
    TARGET: This done, he appeared to be satisfied, for he replaced his tape and his glass in his pocket.
 PREDICTED: All this means , not being the doubt that he had his task , he took his metre and his glass in his pocket .
--------------------------------------------------------------------------------
    SOURCE: "Heaven forbid! _That_ would be the greatest misfortune of all!
    TARGET: – Le ciel m’en préserve.
 PREDICTED: – Dieu me garde ! ce serait le plus grand malheur !
--------------------------------------------------------------------------------


Processing Epoch 18: 100%|██████████| 14297/14297 [25:28<00:00,  9.35it/s, loss=2.479]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: And the engine-man's eyes went from the young girl to her companion, while he stepped back with a sudden, relinquishing movement.
    TARGET: Et les yeux du machineur allerent de la jeune fille au camarade; tandis qu'il reculait d'un pas, avec un geste de brusque abandon.
 PREDICTED: Et les yeux de la machine , d ' apres son compagnon , sortaient de sa voisine , il s ' éloigna soudain , jouant des mouvements brusques .
--------------------------------------------------------------------------------
    SOURCE: I wanted no arguments.
    TARGET: Je ne voulais pas discuter.
 PREDICTED: Je ne voulus pas m ' en douter .
--------------------------------------------------------------------------------


Processing Epoch 19: 100%|██████████| 14297/14297 [25:29<00:00,  9.35it/s, loss=2.195]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: Certain indigenous plants were discovered, and those fit for immediate use contributed to vary the vegetable stores of Granite House.
    TARGET: Certaines plantes indigènes furent encore découvertes, et, si elles n'avaient pas une utilité immédiate, elles contribuèrent à varier les réserves végétales de Granite-House.
 PREDICTED: Certains plantes furent découvertes , et l ' avantage de naviguer immédiatement la matière végétale de Granite - House .
--------------------------------------------------------------------------------
    SOURCE: We are not on duty, and we believed that not being on duty we were at liberty to dispose of our time as we pleased.
    TARGET: Nous ne sommes pas de service, et nous avons cru que, n'étant pas de service, nous pouvions disposer de notre temps comme bon nous semblait.
 PREDICTED: Nous ne sommes pas en devoir , et nous sommes qu ’ il ne fallait pas avoir la l

Processing Epoch 20: 100%|██████████| 14297/14297 [25:31<00:00,  9.34it/s, loss=2.182]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: D’Artagnan blushed up to the whites of his eyes.
    TARGET: D'Artagnan rougit jusqu'au blanc des yeux.
 PREDICTED: D ' Artagnan rougit jusqu ' aux yeux .
--------------------------------------------------------------------------------
    SOURCE: Often his mistress's sincere admiration, and her transports of passion made him forget the fatuous theory that had kept him so restrained and almost ridiculous in the first moments of their intimacy.
    TARGET: Souvent la sincère admiration et les transports de sa maîtresse lui faisaient oublier la vaine théorie qui l’avait rendu si compassé et presque si ridicule dans les premiers moments de cette liaison.
 PREDICTED: Souvent la sincère admiration de sa maîtresse , ses transports de passion lui faisaient oublier la théorie bête bête qui l ’ avait si mal gardée et presque ridicule aux premières moments de leur intimité .
-----------------------------

Processing Epoch 21: 100%|██████████| 14297/14297 [25:30<00:00,  9.34it/s, loss=2.332]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: Ludovic rentra chargé d’un paquet.
    TARGET: Lodovico returned, carrying a packet.
 PREDICTED: Lodovico returned to a .
--------------------------------------------------------------------------------
    SOURCE: The camera was aimed at the scenery on the ocean floor, and in a few seconds we had a perfect negative.
    TARGET: L'instrument fut braqué sur ces sites du fond océanique, et en quelques secondes, nous avions obtenu un négatif d'une extrême pureté.
 PREDICTED: Les furent disposées sur le fond de l ' Océan , et dans quelques instants nous eûmes un véritable négatif .
--------------------------------------------------------------------------------


Processing Epoch 22: 100%|██████████| 14297/14297 [25:30<00:00,  9.34it/s, loss=2.326]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: "Well, but Pierre.
    TARGET: --Oui, mais Pierre?
 PREDICTED: -- Mais Pierre …
--------------------------------------------------------------------------------
    SOURCE: Aramis received a ball which passed through his shoulder, and Mousqueton another ball which lodged in the fleshy part which prolongs the lower portion of the loins.
    TARGET: Aramis reçut une balle qui lui traversa l'épaule, et Mousqueton une autre balle qui se logea dans les parties charnues qui prolongent le bas des reins.
 PREDICTED: Aramis reçut un bal qui lui passait par l ' épaule , et Mousqueton , qui se dans la partie charnues qui la partie inférieure des reins .
--------------------------------------------------------------------------------


Processing Epoch 23: 100%|██████████| 14297/14297 [25:31<00:00,  9.34it/s, loss=2.129]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: It was Fix, who, bowing, addressed Mr. Fogg: "Were you not, like me, sir, a passenger by the Rangoon, which arrived yesterday?"
    TARGET: C'était l'inspecteur Fix, qui le salua et lui dit : « N'êtes-vous pas comme moi, monsieur, un des passagers du Rangoon, arrivé hier ?
 PREDICTED: C ' était Fix , qui , s ' adressant à Mr . Fogg : -- Vous n ' étiez pas comme moi , monsieur , un passager , arrivé à bord du Rangoon , qui est arrivé hier ?
--------------------------------------------------------------------------------
    SOURCE: Si vous me donnez à Naples une place dans une loge à San Carlo et un cheval, je suis plus que satisfait ; ce ne sera jamais le plus ou moins de luxe qui nous donnera un rang à vous et à moi, c’est le plaisir que les gens d’esprit du pays pourront trouver peut-être à venir prendre une tasse de thé chez vous.
    TARGET: If you give me, at Naples, a seat in a box at San

Processing Epoch 24: 100%|██████████| 14297/14297 [25:29<00:00,  9.35it/s, loss=2.189]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: 'There, that's settled.'
    TARGET: Allons, voilà qui est décidé.
 PREDICTED: – Là , c ’ est convenu .
--------------------------------------------------------------------------------
    SOURCE: The portions were handed round; those who liked took a draught of the water, the mug being common to all.
    TARGET: Les parts furent distribuées aux élèves, et celles qui avaient soif prirent un peu d'eau dans le gobelet qui servait à toutes.
 PREDICTED: Les portions étaient remplies , on le fit passer par un verre d ' eau , et le vase était commun , au point de tomber .
--------------------------------------------------------------------------------


Processing Epoch 25: 100%|██████████| 14297/14297 [25:29<00:00,  9.35it/s, loss=2.169]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: He does not care for that: when my time came to die, he would resign me, in all serenity and sanctity, to the God who gave me.
    TARGET: Eh bien! après? peu lui importe à lui; quand l'heure de mourir sera venue, il me rendra avec un visage serein au Dieu qui m'aura donnée à lui.
 PREDICTED: Il n ' a garde de cela ; quand mon temps viendrait mourir , il me en sérénité , dans toute sainteté , au Dieu qui m ' a donné .
--------------------------------------------------------------------------------
    SOURCE: Leon walked up and down the room; it seemed strange to him to see this beautiful woman in her nankeen dress in the midst of all this poverty.
    TARGET: Léon se promenait dans la chambre; il lui semblait étrange de voir cette belle dame en robe de nankin, tout au milieu de cette misère.
 PREDICTED: Léon se promena dans la salle , il lui semblait étrange de voir cette belle femme en robe d

Processing Epoch 26: 100%|██████████| 14297/14297 [25:29<00:00,  9.35it/s, loss=2.104]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: The distance, increased by detours and obstacles which could not be surmounted directly, was long.
    TARGET: La route, accrue par des détours et des obstacles qui ne pouvaient être franchis directement, était longue.
 PREDICTED: La distance , par des détours et des obstacles qui ne pouvaient être directement , était longue .
--------------------------------------------------------------------------------
    SOURCE: She turned red, then pale, and began to tremble like a culprit before the captain, who gazed at her with a smile of satisfaction and amazement.
    TARGET: Elle devint rouge, puis pâle, et se mit à trembler comme une coupable devant le capitaine, qui la regardait avec un sourire de satisfaction et d’étonnement.
 PREDICTED: Elle pâlit , et commença à trembler comme un coupable devant le capitaine , qui la regardait avec un sourire d ' animation et de surprise .
--------------------

Processing Epoch 27: 100%|██████████| 14297/14297 [25:30<00:00,  9.34it/s, loss=2.079]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: Diving easily, these reptiles can remain a good while underwater by closing the fleshy valves located at the external openings of their nasal passages.
    TARGET: Ces reptiles, qui plongent facilement, peuvent se maintenir longtemps sous l'eau en fermant la soupape charnue située à l'orifice externe de leur canal nasal.
 PREDICTED: Ces reptiles , les reptiles peuvent rester longtemps dans les entrailles du rivage , à leur suite de ces cavernes du .
--------------------------------------------------------------------------------
    SOURCE: "If he has committed any crime, he has most fearfully expiated it, and in our eyes he is absolved."
    TARGET: S'il a commis quelque faute, il l'a cruellement expiée, et, à nos yeux, il est absous.»
 PREDICTED: -- S ' il a commis quelque crime , il l ' a en moi très horriblement , et dans les yeux il est désarmé .
-------------------------------------------

Processing Epoch 28: 100%|██████████| 14297/14297 [25:37<00:00,  9.30it/s, loss=2.523]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: I experienced a strange feeling as the key grated in the lock, and the sound of his retreating step ceased to be heard.
    TARGET: J'éprouvai une étrange sensation lorsque la clef cria dans la serrure et que je n'entendis plus le bruit de ses pas.
 PREDICTED: J ' éprouvais un étrange sentiment à la clef dans la serrure et le bruit de son pas qui cessa d ' être entendu .
--------------------------------------------------------------------------------
    SOURCE: We left the house, and started on our return to Paris, talking over the new plan.
    TARGET: Nous quittâmes la maison et reprîmes la route de Paris tout en causant de cette nouvelle résolution.
 PREDICTED: Nous quittâmes la maison , et nous partîmes en route pour Paris , en causant du nouveau plan .
--------------------------------------------------------------------------------


Processing Epoch 29: 100%|██████████| 14297/14297 [25:42<00:00,  9.27it/s, loss=1.885] 
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: "The matter? the matter is that I have just broken a tooth!" replied the sailor.
    TARGET: -- Il y a... il y a... que je viens de me casser une dent! répondit le marin.
 PREDICTED: -- Eh ! l ' affaire est que je viens de briser une dent !» répondit le marin .
--------------------------------------------------------------------------------
    SOURCE: He stood still and counted his pulse.
    TARGET: Il s’arreta pour se tâter le pouls.
 PREDICTED: Il resta immobile et comptait son pouls .
--------------------------------------------------------------------------------


Processing Epoch 30: 100%|██████████| 14297/14297 [25:45<00:00,  9.25it/s, loss=1.992]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: That leather bed on which so many unhappy wretches had writhed, frightened her.
    TARGET: Ce lit de cuir, où s’étaient tordus tant de misérables, l’épouvantait.
 PREDICTED: Cette sur laquelle tant de misérables misérables s ’ étaient , l ’ effraya .
--------------------------------------------------------------------------------
    SOURCE: But the Marquise made him sit facing herself, talked to him continuously, and prevented his saying a word to her daughter.
    TARGET: Mais la marquise le fit placer vis-à-vis d’elle, lui parla constamment et empêcha qu’il ne pût dire un mot à sa fille.
 PREDICTED: Mais la marquise le fit asseoir en face , lui parla continuellement et l ’ empêcha de dire un mot à sa fille .
--------------------------------------------------------------------------------


Processing Epoch 39:  30%|███       | 4327/14297 [08:01<17:47,  9.34it/s, loss=1.784]  IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Processing Epoch 39:  49%|████▉     | 6970/14297 [12:44<13:01,  9.38it/s, loss=1.934]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Processing Epoch 39:  68%|██████▊   | 9713/14297 [17:38<08:08,  9.39it/s, loss=1.931]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order

--------------------------------------------------------------------------------
    SOURCE: "So it's an issue of . . . ?"
    TARGET: -- Alors, il s'agit de... ?
 PREDICTED: -- Il s ' agit donc ... ?
--------------------------------------------------------------------------------
    SOURCE: It was eleven o'clock, and if Captain Nemo found conditions favorable for taking his sights, I wanted to be present at the operation.
    TARGET: Il était onze heures, et si le capitaine Nemo se trouvait dans des conditions favorables pour observer, je voulais être présent à son opération.
 PREDICTED: Il était onze heures , et si le capitaine Nemo trouvait les conditions favorables à faire connaître , je désirais assister à l ' opération .
--------------------------------------------------------------------------------


Processing Epoch 40:   6%|▌         | 839/14297 [01:29<23:53,  9.39it/s, loss=1.804]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Processing Epoch 40:  25%|██▌       | 3627/14297 [06:26<18:51,  9.43it/s, loss=1.934]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Processing Epoch 40:  45%|████▍     | 6371/14297 [11:19<14:03,  9.39it/s, loss=1.761]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to

--------------------------------------------------------------------------------
    SOURCE: "By Jove! they go in for more than that," exclaimed the druggist.
    TARGET: -- Parbleu! ils en font bien d’autres! exclama l’apothicaire.
 PREDICTED: -- Eh ! ils y vont plus qu ’ ils n ’ en sont pas , s ’ écria l ’ apothicaire .
--------------------------------------------------------------------------------
    SOURCE: The old gentleman was fond of money, and anxious to keep the family estate together.
    TARGET: Le vieux M. Rochester et M. Rowland s'entendirent, et, afin d'enrichir M. Édouard, ils l'entraînèrent dans une position douloureuse.
 PREDICTED: Le vieux monsieur aimait beaucoup d ’ argent , et désirait garder les domaines ensemble de famille .
--------------------------------------------------------------------------------


Processing Epoch 41:   3%|▎         | 488/14297 [00:52<24:36,  9.35it/s, loss=1.706]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Processing Epoch 41:  23%|██▎       | 3261/14297 [05:47<19:31,  9.42it/s, loss=1.977]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Processing Epoch 41:  43%|████▎     | 6079/14297 [10:48<14:36,  9.38it/s, loss=1.950]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to

--------------------------------------------------------------------------------
    SOURCE: Now, Cartwright, there are the names of twenty-three hotels here, all in the immediate neighbourhood of Charing Cross.
    TARGET: À présent, Cartwright, voici les noms de vingt-trois hôtels, tous dans les environs immédiats de Charing Cross.
 PREDICTED: Cartwright , y a - t - il dans ce voisinage vingt - trois hôtels , tout est ici dans la région immédiate de Charing - Cross .
--------------------------------------------------------------------------------
    SOURCE: They all offered themselves, throwing coarse chaff at her.
    TARGET: Tous s'offrirent, la chaufferent de gros mots.
 PREDICTED: Tous s ’ offrirent , en lui jetant des plaisanteries nombreuses .
--------------------------------------------------------------------------------


Processing Epoch 42:   7%|▋         | 979/14297 [01:44<23:44,  9.35it/s, loss=1.812]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Processing Epoch 42:  27%|██▋       | 3824/14297 [06:48<18:43,  9.32it/s, loss=1.735]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Processing Epoch 42:  50%|█████     | 7154/14297 [12:44<12:43,  9.35it/s, loss=1.902]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to

In [None]:
from google.colab import drive
drive.mount('/content/drive')