In [9]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
pip install torch==2.0.1 torchtext --upgrade

Collecting torchtext
  Using cached torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl.metadata (7.9 kB)
INFO: pip is looking at multiple versions of torchtext to determine which version is compatible with other requirements. This could take a while.
  Using cached torchtext-0.17.2-cp310-cp310-manylinux1_x86_64.whl.metadata (7.9 kB)
  Using cached torchtext-0.17.1-cp310-cp310-manylinux1_x86_64.whl.metadata (7.6 kB)
  Using cached torchtext-0.17.0-cp310-cp310-manylinux1_x86_64.whl.metadata (7.6 kB)
  Using cached torchtext-0.16.2-cp310-cp310-manylinux1_x86_64.whl.metadata (7.5 kB)
  Using cached torchtext-0.16.1-cp310-cp310-manylinux1_x86_64.whl.metadata (7.5 kB)
  Using cached torchtext-0.16.0-cp310-cp310-manylinux1_x86_64.whl.metadata (7.5 kB)


In [10]:
import os

# Run from the project folder on the mounted Drive; the `model`, `dataset` and
# `config` modules imported further down are presumably located here.
PROJECT_DIR = '/content/drive/MyDrive/Neural_Machine_Translation'
os.chdir(PROJECT_DIR)

In [13]:
pip install torchmetrics



In [19]:
from model import build_transformer
from dataset import BilingualDataset, causal_mask
from config import get_config, get_weights_file_path, latest_weights_file_path

import torchtext.datasets as datasets
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim.lr_scheduler import LambdaLR

import warnings
from tqdm import tqdm
import os
from pathlib import Path

# Huggingface datasets and tokenizers
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

import torchmetrics
from torch.utils.tensorboard import SummaryWriter

def greedy_decode(model, source, source_mask, tokenizer_src, tokenizer_tgt, max_len, device):
    """Generate a target sequence one token at a time, always taking the top-scoring token.

    The source is encoded once; the decoder is then re-run on the growing
    target prefix until either ``[EOS]`` is produced or the prefix reaches
    ``max_len`` tokens. The prefix starts as a (1, 1) tensor, so a single
    sequence is decoded per call.

    Args:
        model: Object exposing ``encode``, ``decode`` and ``project``.
        source: Encoder input tensor.
        source_mask: Mask passed to both the encoder and the decoder.
        tokenizer_src: Unused here; kept for interface compatibility.
        tokenizer_tgt: Target tokenizer providing the ``[SOS]``/``[EOS]`` ids.
        max_len: Maximum number of tokens in the returned sequence.
        device: Device on which the decoder tensors are placed.

    Returns:
        1-D tensor of generated token ids, starting with ``[SOS]``.
    """
    start_id = tokenizer_tgt.token_to_id('[SOS]')
    stop_id = tokenizer_tgt.token_to_id('[EOS]')

    # The encoder output does not depend on the decoding step: compute it once.
    memory = model.encode(source, source_mask)

    # The running target prefix begins with the start-of-sentence token only.
    generated = torch.empty(1, 1).fill_(start_id).type_as(source).to(device)

    while generated.size(1) != max_len:
        # Causal mask sized to the current prefix length.
        step_mask = causal_mask(generated.size(1)).type_as(source_mask).to(device)

        hidden = model.decode(memory, source_mask, generated, step_mask)

        # Only the last position is needed to choose the next token.
        scores = model.project(hidden[:, -1])
        next_word = torch.max(scores, dim=1)[1]
        next_id = next_word.item()

        appended = torch.empty(1, 1).type_as(source).fill_(next_id).to(device)
        generated = torch.cat([generated, appended], dim=1)

        if next_id == stop_id:
            break

    return generated.squeeze(0)


def run_validation(model, validation_ds, tokenizer_src, tokenizer_tgt, max_len, device, print_msg, global_step, writer, num_examples=2):
    """Greedy-decode a few validation examples, print them, and log CER/WER/BLEU.

    Args:
        model: Model handed to ``greedy_decode``; switched to eval mode here and
            NOT switched back, so the caller must call ``model.train()`` again.
        validation_ds: Iterable of batches. Each batch is indexed with the keys
            "encoder_input", "encoder_mask", "src_text" and "tgt_text" and must
            contain exactly one example.
        tokenizer_src: Source tokenizer, forwarded to ``greedy_decode``.
        tokenizer_tgt: Target tokenizer, used to decode predicted ids to text.
        max_len: Maximum decoded length, forwarded to ``greedy_decode``.
        device: Device the batch tensors are moved to.
        print_msg: Callable taking one string; used for all console output.
        global_step: Step value attached to the logged scalars.
        writer: Object with ``add_scalar``/``flush`` (e.g. a SummaryWriter), or a
            falsy value to skip metric logging.
        num_examples: Number of validation examples to decode and print.

    Raises:
        AssertionError: If a batch holds more than one example.
    """
    # Local import: only the console-width lookup needs it.
    import shutil

    model.eval()
    count = 0

    expected = []
    predicted = []

    # Width of the separator line. get_terminal_size falls back to 80 columns when
    # no terminal is attached (e.g. in a notebook), without spawning `stty` and
    # without a bare `except:` that would also swallow KeyboardInterrupt.
    console_width = shutil.get_terminal_size(fallback=(80, 24)).columns

    with torch.no_grad():
        for batch in validation_ds:
            count += 1
            encoder_input = batch["encoder_input"].to(device) # (b, seq_len)
            encoder_mask = batch["encoder_mask"].to(device) # (b, 1, 1, seq_len)

            # greedy_decode handles one sequence at a time
            assert encoder_input.size(
                0) == 1, "Batch size must be 1 for validation"

            model_out = greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)

            source_text = batch["src_text"][0]
            target_text = batch["tgt_text"][0]
            model_out_text = tokenizer_tgt.decode(model_out.detach().cpu().numpy())

            expected.append(target_text)
            predicted.append(model_out_text)

            # Print the source, target and model output
            print_msg('-'*console_width)
            print_msg(f"{f'SOURCE: ':>12}{source_text}")
            print_msg(f"{f'TARGET: ':>12}{target_text}")
            print_msg(f"{f'PREDICTED: ':>12}{model_out_text}")

            if count == num_examples:
                print_msg('-'*console_width)
                break

    # Metrics over zero predictions are meaningless, so skip logging in that case.
    if writer and predicted:
        # Character error rate
        cer = torchmetrics.CharErrorRate()(predicted, expected)
        writer.add_scalar('validation cer', cer, global_step)

        # Word error rate
        wer = torchmetrics.WordErrorRate()(predicted, expected)
        writer.add_scalar('validation wer', wer, global_step)

        # BLEU score
        bleu = torchmetrics.BLEUScore()(predicted, expected)
        writer.add_scalar('validation BLEU', bleu, global_step)

        # One flush after all three scalars are queued is sufficient.
        writer.flush()

def get_all_sentences(ds, lang):
    """Lazily yield the ``lang`` sentence of every item in ``ds``.

    Each item is indexed as ``item['translation'][lang]``.
    """
    yield from (example['translation'][lang] for example in ds)

def get_or_build_tokenizer(config, ds, lang):
    """Return the word-level tokenizer for ``lang``, training and saving it if absent.

    The tokenizer file path is ``config['tokenizer_file']`` formatted with
    ``lang``. If that file exists it is loaded; otherwise a new tokenizer is
    trained on the ``lang`` sentences of ``ds`` and written to that path.
    """
    tokenizer_path = Path(config['tokenizer_file'].format(lang))

    # Fast path: reuse a tokenizer that was already trained and saved.
    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    # Most code taken from: https://huggingface.co/docs/tokenizers/quicktour
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = WordLevelTrainer(
        special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"],
        min_frequency=2,
    )
    tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)
    tokenizer.save(str(tokenizer_path))
    return tokenizer

def get_ds(config):
    """Load the parallel corpus, build tokenizers, and return train/validation loaders.

    Reads ``datasource``, ``lang_src``, ``lang_tgt``, ``seq_len`` and
    ``batch_size`` from ``config``.

    Returns:
        Tuple ``(train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt)``.
        The validation loader uses a batch size of 1.
    """
    src_lang = config['lang_src']
    tgt_lang = config['lang_tgt']

    # Only the 'train' split is loaded, so we divide it ourselves below
    ds_raw = load_dataset(f"{config['datasource']}", f"{src_lang}-{tgt_lang}", split='train')

    # One tokenizer per language, both built from the full raw dataset
    tokenizer_src = get_or_build_tokenizer(config, ds_raw, src_lang)
    tokenizer_tgt = get_or_build_tokenizer(config, ds_raw, tgt_lang)

    # 90% of the pairs go to training, the remainder to validation.
    # NOTE(review): the split is unseeded, so it differs between runs
    # (including runs resumed from a checkpoint) — confirm this is acceptable.
    total_pairs = len(ds_raw)
    n_train = int(0.9 * total_pairs)
    raw_splits = random_split(ds_raw, [n_train, total_pairs - n_train])

    train_ds, val_ds = (
        BilingualDataset(part, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, config['seq_len'])
        for part in raw_splits
    )

    # Report the longest tokenized sentence on each side (informational only)
    longest_src = 0
    longest_tgt = 0
    for pair in ds_raw:
        sentences = pair['translation']
        longest_src = max(longest_src, len(tokenizer_src.encode(sentences[src_lang]).ids))
        longest_tgt = max(longest_tgt, len(tokenizer_tgt.encode(sentences[tgt_lang]).ids))

    print(f'Max length of source sentence: {longest_src}')
    print(f'Max length of target sentence: {longest_tgt}')

    train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
    val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)

    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt

def get_model(config, vocab_src_len, vocab_tgt_len):
    """Build the transformer for the given vocabulary sizes.

    ``config['seq_len']`` is passed for both sequence-length arguments of
    ``build_transformer`` and ``config['d_model']`` as its ``d_model``.
    """
    seq_len = config['seq_len']
    return build_transformer(vocab_src_len, vocab_tgt_len, seq_len, seq_len, d_model=config['d_model'])

def train_model(config):
    """Train the translation model described by ``config``.

    Selects a device, builds the data loaders and model, optionally resumes
    from a checkpoint (``config['preload']``: ``'latest'``, a checkpoint name,
    or a falsy value for a fresh start), then trains from the resume epoch up to
    ``config['num_epochs']``. After every epoch it runs validation and saves a
    checkpoint containing the model state, optimizer state, epoch and global
    step. Training loss and validation metrics are written to a SummaryWriter
    created from ``config['experiment_name']``.
    """
    # Define the device. `torch.backends.mps.is_available()` is the supported
    # check; the deprecated `torch.has_mps` attribute is no longer consulted.
    if torch.cuda.is_available():
        device_name = "cuda"
    elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
        device_name = "mps"
    else:
        device_name = "cpu"
    device = torch.device(device_name)

    print("Using device:", device_name)
    if device.type == 'cuda':
        # Pass the torch.device itself. The previous code evaluated `.index` on a
        # plain string, which is the bound method `str.index`, and only worked
        # because torch silently fell back to the current device.
        print(f"Device name: {torch.cuda.get_device_name(device)}")
        print(f"Device memory: {torch.cuda.get_device_properties(device).total_memory / 1024 ** 3} GB")
    elif device.type == 'mps':
        print("Device name: <mps>")
    else:
        print("NOTE: If you have a GPU, consider using it for training.")
        print("      On a Windows machine with NVidia GPU, check this video: https://www.youtube.com/watch?v=GMSjDTU8Zlc")
        print("      On a Mac machine, run: pip3 install --pre torch torchvision torchaudio torchtext --index-url https://download.pytorch.org/whl/nightly/cpu")

    # Make sure the weights folder exists
    Path(f"{config['datasource']}_{config['model_folder']}").mkdir(parents=True, exist_ok=True)

    train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)
    model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)
    # Tensorboard
    writer = SummaryWriter(config['experiment_name'])

    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)

    # If the user specified a model to preload before training, load it
    initial_epoch = 0
    global_step = 0
    preload = config['preload']
    if preload == 'latest':
        model_filename = latest_weights_file_path(config)
    elif preload:
        model_filename = get_weights_file_path(config, preload)
    else:
        model_filename = None

    if model_filename:
        print(f'Preloading model {model_filename}')
        # map_location lets a checkpoint saved on one device type (e.g. GPU)
        # be restored on another (e.g. CPU).
        state = torch.load(model_filename, map_location=device)
        model.load_state_dict(state['model_state_dict'])
        initial_epoch = state['epoch'] + 1
        optimizer.load_state_dict(state['optimizer_state_dict'])
        global_step = state['global_step']
    else:
        print('No model to preload, starting from scratch')

    # The labels are target-vocabulary ids, so the padding id to ignore must come
    # from the target tokenizer.
    loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_tgt.token_to_id('[PAD]'), label_smoothing=0.1).to(device)

    for epoch in range(initial_epoch, config['num_epochs']):
        torch.cuda.empty_cache()
        # run_validation leaves the model in eval mode, so re-enable training mode each epoch
        model.train()
        batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")
        for batch in batch_iterator:

            encoder_input = batch['encoder_input'].to(device) # (b, seq_len)
            decoder_input = batch['decoder_input'].to(device) # (B, seq_len)
            encoder_mask = batch['encoder_mask'].to(device) # (B, 1, 1, seq_len)
            decoder_mask = batch['decoder_mask'].to(device) # (B, 1, seq_len, seq_len)

            # Run the tensors through the encoder, decoder and the projection layer
            encoder_output = model.encode(encoder_input, encoder_mask) # (B, seq_len, d_model)
            decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) # (B, seq_len, d_model)
            proj_output = model.project(decoder_output) # (B, seq_len, vocab_size)

            # Compare the output with the label
            label = batch['label'].to(device) # (B, seq_len)

            # Compute the loss using a simple cross entropy
            loss = loss_fn(proj_output.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1))
            batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

            # Log the loss
            writer.add_scalar('train loss', loss.item(), global_step)
            writer.flush()

            # Backpropagate the loss
            loss.backward()

            # Update the weights
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)

            global_step += 1

        # Run validation at the end of every epoch; messages go through tqdm's
        # write() so they do not corrupt the progress bar.
        run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device, batch_iterator.write, global_step, writer)

        # Save the model at the end of every epoch
        model_filename = get_weights_file_path(config, f"{epoch:02d}")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'global_step': global_step
        }, model_filename)

    # Release the event-file handle once training is finished.
    writer.close()


if __name__ == '__main__':
    # Silence all Python warnings for the rest of the session.
    warnings.filterwarnings("ignore")
    # Load the project configuration via get_config (imported from config) and start training.
    config = get_config()
    train_model(config)

Using device: cuda
Device name: Tesla T4
Device memory: 14.74810791015625 GB
Max length of source sentence: 309
Max length of target sentence: 274
Preloading model opus_books_weights/tmodel_01.pt


Processing Epoch 02: 100%|██████████| 3638/3638 [11:38<00:00,  5.21it/s, loss=5.209]


--------------------------------------------------------------------------------
    SOURCE: They would have considered it unworthy of themselves to deceive that child.
    TARGET: Sentivano come un’offesa a se stessi ingannare quel fanciullo.
 PREDICTED: che si a lui , che la donna si .
--------------------------------------------------------------------------------
    SOURCE: I felt an inexpressible relief, a soothing conviction of protection and security, when I knew that there was a stranger in the room, an individual not belonging to Gateshead, and not related to Mrs. Reed.
    TARGET: Provai un indicibile sollievo, un senso di protezione e di sicurezza, quando mi accorsi che un estraneo era in camera mia, un individuo che non apparteneva a Gateshead nè alla famiglia della signora Reed.
 PREDICTED: Era un uomo che aveva una volta in una volta , quando era un giorno , quando era un momento in una sala di notte , non era stata la signora Fairfax .
----------------------------------

Processing Epoch 03: 100%|██████████| 3638/3638 [11:37<00:00,  5.21it/s, loss=5.218]


--------------------------------------------------------------------------------
    SOURCE: Do you want anything?' Karenin answered reluctantly.
    TARGET: Ti occorre qualcosa? — rispose di malavoglia Aleksej Aleksandrovic.
 PREDICTED: Voi non ti sei bisogno di nulla ? — rispose Aleksej Aleksandrovic .
--------------------------------------------------------------------------------
    SOURCE: Noticing the stranger's confusion, Kapitonich himself came out, admitted her, and inquired what she wanted.
    TARGET: Avendo notato la confusione della sconosciuta, lo stesso Kapitonyc le venne incontro, la lasciò passare dalla porta e domandò cosa desiderasse.
 PREDICTED: la figura di lei , che si era messo in fretta , e domandò che cosa avrebbe voluto .
--------------------------------------------------------------------------------


Processing Epoch 04: 100%|██████████| 3638/3638 [11:39<00:00,  5.20it/s, loss=4.900]


--------------------------------------------------------------------------------
    SOURCE: A lanky official, going down with a portfolio, stopped, with a disapproving look at the feet of the man running upstairs, and then glanced inquiringly at Oblonsky, who was standing at the top of the stairs.
    TARGET: Uno di quelli che scendevano, un impiegato magrolino con una cartella sotto il braccio, fermatosi, guardò con riprovazione le gambe di colui che correva e fissò interrogativamente Oblonskij.
 PREDICTED: Un vecchietto , andò a sedere con un ’ occhiata , sorridendo , si fermò sulla scala , e , guardando Oblonskij , guardò Stepan Arkad ’ ic , guardando Oblonskij , e guardò Oblonskij Oblonskij Oblonskij , che stava in piedi in piedi .
--------------------------------------------------------------------------------
    SOURCE: The master, though vigilant in the business of preserving the ship, yet as he went in and out of his cabin by me, I could hear him softly to himself say, severa

Processing Epoch 05: 100%|██████████| 3638/3638 [11:38<00:00,  5.21it/s, loss=3.893]


--------------------------------------------------------------------------------
    SOURCE: Karenin rose, and though he tried to be cautious he caught against the table.
    TARGET: Aleksej Aleksandrovic si alzò, voleva proceder cauto, ma inciampò nella tavola, si avvicinò e mise la mano nella mano del francese.
 PREDICTED: Aleksej Aleksandrovic si alzò , e , sebbene egli si avvicinava al tavolo .
--------------------------------------------------------------------------------
    SOURCE: 'Not a bit of it!
    TARGET: — Ma che dici!
 PREDICTED: — Non un po ’ di più !
--------------------------------------------------------------------------------


Processing Epoch 06: 100%|██████████| 3638/3638 [11:38<00:00,  5.21it/s, loss=4.672]


--------------------------------------------------------------------------------
    SOURCE: Three women: an old lady a young lady, and a tradesman's wife; and three gentlemen: one a German banker with a ring on his finger, another a bearded merchant, and the third an irate official in uniform with an order hanging from his neck, had evidently long been waiting.
    TARGET: Tre signore, una anziana, una giovane e la moglie di un mercante; tre signori: il primo, un banchiere tedesco con un anello al dito, il secondo, un commerciante con la barba, il terzo, un impiegato rabbioso in piccola tenuta con una decorazione al collo, aspettavano già da tempo.
 PREDICTED: Tre donne : una vecchia signora , e una donna di donna , e un di pastore ; un il tedesco con un dito al suo dito , un altro che aveva con un dito , e un altro che un vecchio in uniforme , evidentemente , evidentemente , evidentemente , evidentemente molto tempo .
------------------------------------------------------------------

Processing Epoch 07: 100%|██████████| 3638/3638 [11:37<00:00,  5.21it/s, loss=2.121]


--------------------------------------------------------------------------------
    SOURCE: My refusals were forgotten--my fears overcome--my wrestlings paralysed.
    TARGET: I timori svanivano, la volontà era paralizzata.
 PREDICTED: " Ero dimenticato , i miei timori erano , i miei timori .
--------------------------------------------------------------------------------
    SOURCE: Moscow, despite its cafés chantants and its omnibuses, was still a stagnant pool.
    TARGET: Mosca, malgrado i suoi cafés chantants e gli omnibus, era pur sempre una palude stagnante.
 PREDICTED: La sua mente , malgrado i e i suoi erano .
--------------------------------------------------------------------------------


Processing Epoch 08: 100%|██████████| 3638/3638 [11:38<00:00,  5.21it/s, loss=3.015]


--------------------------------------------------------------------------------
    SOURCE: Time will pass, and I shall become indifferent.'
    TARGET: Passerà il tempo, e diverrò indifferente anche a questo».
 PREDICTED: — , e io farò tutto indifferente .
--------------------------------------------------------------------------------
    SOURCE: But why prove it to me?
    TARGET: Ma perché dimostrarmelo?
 PREDICTED: Ma perché mi dà questo ?
--------------------------------------------------------------------------------


Processing Epoch 09: 100%|██████████| 3638/3638 [11:38<00:00,  5.21it/s, loss=2.961]


--------------------------------------------------------------------------------
    SOURCE: 'Mama! She often comes to see me, and when she comes...' he began, but stopped, noticing that his nurse was whispering something in his mother's ear and that a look of fear and of something like shame, that did not at all suit her face, appeared there.
    TARGET: — Mamma, lei viene spesso da me, e quando verrà... — cominciò a dire, ma si fermò avendo notato che la njanja aveva sussurrato qualcosa alla madre e che sul viso della madre s’erano espressi lo spavento e qualcosa di simile alla vergogna che le stava così male.
 PREDICTED: — Mamma , ha spesso spesso , mi ha parlato , e quando ... — cominciò , dopo aver notato che la njanja si qualcosa alla madre , e che un ’ espressione di spavento , di spavento , come la vergogna , non le si del viso , le apparve sul viso .
--------------------------------------------------------------------------------
    SOURCE: We went a goodish way without comin