In [None]:
import torch
import torch.optim as optim
import numpy as np
from transformer import Transformer
from metrics import Evaluator
from datasets import load_dataset
from utils import *
from train import *
from inference import *
from tokenizer import *
from config import *

# Report which compute device the run will use. `device` is not defined in this
# notebook; it is presumably exported by one of the wildcard imports above
# (likely `config`) -- TODO confirm.
print(f"Using {device}")

In [None]:
# Load the "train" and "test" splits of the "de-en" configuration of the
# "opus100" dataset through `load_dataset`.
traindict = load_dataset("opus100", "de-en", split="train")
testdict = load_dataset("opus100", "de-en", split="test")

In [None]:
# Split each raw split into a pair of sequences (inputs, labels) and wrap each
# pair in a Dataset. "en" and "de" are passed in that order, so English appears
# to be the input side and German the label side -- confirm against get_split.
# train_size / test_size are passed as `size`; presumably a cap on the number of
# sentence pairs kept (TODO confirm).
# Note: `Dataset` is not imported from `datasets` above (only load_dataset is),
# so it must come from one of the project wildcard imports.
train_inputs, train_labels = get_split(traindict, "en", "de", size=train_size)
test_inputs, test_labels = get_split(testdict, "en", "de", size=test_size)
trainset = Dataset(train_inputs, train_labels)
testset = Dataset(test_inputs, test_labels)

In [None]:
# Convert the training set to a dataframe (presumably a pandas DataFrame --
# TODO confirm in Dataset.dataframe) and display the result of .head() as the
# cell output.
trainframe = trainset.dataframe()
trainframe.head()

In [None]:
# Convert the test set to a dataframe (presumably a pandas DataFrame --
# TODO confirm in Dataset.dataframe) and display the result of .head() as the
# cell output.
testframe = testset.dataframe()
testframe.head()

In [None]:
# Sanity check on the training frame: print whether any entry is null, then
# display describe() as the cell output. Assumes `trainframe` follows the
# pandas DataFrame API (isnull / values / describe).
print(trainframe.isnull().values.any())
trainframe.describe()

In [None]:
# Sanity check on the test frame: print whether any entry is null, then
# display describe() as the cell output. Assumes `testframe` follows the
# pandas DataFrame API (isnull / values / describe).
print(testframe.isnull().values.any())
testframe.describe()

In [None]:
# Show a sample from the training set for eyeballing. What exactly is shown
# (and whether it is random) is defined by Dataset.sample, which is not visible
# in this notebook.
trainset.sample()

In [None]:
# Show a sample from the test set for eyeballing. What exactly is shown
# (and whether it is random) is defined by Dataset.sample, which is not visible
# in this notebook.
testset.sample()

In [None]:
# Build one corpus per language by concatenating the "inputs" (English) and
# "labels" (German) corpora of the train and test sets.
# NOTE(review): the test-set text is included in the data the tokenizers are
# trained on. Confirm this is intended (e.g. to avoid unknown tokens at
# evaluation time) rather than accidental train/test leakage.
corpus_en = trainset.corpus(data="inputs") + testset.corpus(data="inputs")
corpus_de = trainset.corpus(data="labels") + testset.corpus(data="labels")
# One independent tokenizer per language, each trained with its own `size`
# (vocab_size_english / vocab_size_german come from a wildcard import).
tokenizer_en = Nerdimizer()
tokenizer_de = Nerdimizer()
tokenizer_en.train(corpus_en, size=vocab_size_english)
tokenizer_de.train(corpus_de, size=vocab_size_german)
# Pair the two tokenizers in a Translator and persist it under the name
# "translator" (file location/format is decided by save_tokenizer).
translator = Translator(tokenizer_en, tokenizer_de)
save_tokenizer(translator, "translator")

In [None]:
# Vocabulary sizes reported by the translator, unpacked as (English, German).
en_vocab, de_vocab = translator.vocab_size()
# Look up the three special-token entries in the same order as before
# ([S], [E], [P]) and bind them to start / end / pad.
start, end, pad = (translator[marker] for marker in ("[S]", "[E]", "[P]"))
print(f"Number of input tokens: {en_vocab}\nNumber of output tokens: {de_vocab}")

In [None]:
# Configure the translator before tokenizing. Judging by the names, padon /
# truncon switch on padding and truncation to `maxlen` (with end-token handling
# via end=True and `pad` as the padding id) -- TODO confirm in the tokenizer
# module. Both calls mutate `translator`, so they must run before tokenized().
translator.padon(maxlen, end=True, pad_id=pad)
translator.truncon(maxlen, end=True)
# Tokenize the training set with the configured translator and wrap it in a
# batched loader; drop_last=False is passed, so a final smaller batch is
# presumably kept (confirm in the dataloader implementation).
tokenized_trainset = trainset.tokenized(translator, model=True)
dataloader = tokenized_trainset.dataloader(batch_size=batch_size, drop_last=False)
print(f"Maxlen: {maxlen}")

In [None]:
# Assemble everything the training loop needs: model, optimizer, LR scheduler,
# beam-search decoder, evaluator, timer and checkpoint manager. The
# hyperparameters used here (dm, nhead, lr, betas, ...) are not defined in this
# notebook; they come from the wildcard imports at the top (presumably `config`).
model = Transformer(en_vocab, de_vocab, maxlen, pad_id=pad, dm=dm, nhead=nhead, layers=layers, dff=dff,
                    bias=bias, dropout=dropout, eps=eps)

# Move the model to its device *before* building the optimizer and the other
# objects that capture a reference to it. The torch.optim documentation
# recommends this ordering so that the optimized parameters already live on
# their final device when the optimizer is constructed.
# NOTE(review): this ordering also matters if Checkpoint restores optimizer
# state on construction (not visible here) -- confirm.
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=lr, betas=betas, eps=adam_eps)
# Scheduler that lowers the learning rate by `factor` once the monitored metric
# has stopped improving for `patience` scheduler steps.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=factor, patience=patience)

# Decoder used by the evaluator; it receives the same `device` the model lives on.
search = Beam(model, start, end, maxlen, beam_width=beam_width, breadth=max_breadth,
              mode=search_mode, alpha=alpha, device=device)
# Evaluation runs against the held-out test set using the beam search above.
evaluator = Evaluator(testset, translator, search, sample=sample_size,
                      ngrams=ngrams, bleu_goal=bleu_goal, mode="geometric")
clock = Clock()
# Checkpoint manager bound to every stateful training object; `save_every` is
# passed as `epochs` and `overwrite` controls handling of the "checkpoint" path
# (exact semantics live in the Checkpoint class).
checkpoint = Checkpoint(dataloader, model, optimizer, scheduler, evaluator, clock,
                        epochs=save_every, path="checkpoint", overwrite=overwrite)
print(f"Number of Trainable Paramaters: {parameter_count(model):.1f}M\nSize of Model: {model_size(model):.1f}MB")

In [None]:
# Launch training with the objects built in the previous cells. `epochs`,
# `warmups` and `verbose` are not defined in this notebook and come from the
# wildcard imports (presumably `config`). The call's return value, if any, is
# displayed as the cell output because it is the last expression.
train(dataloader, model, optimizer, scheduler, evaluator, checkpoint, clock,
    epochs=epochs, warmups=warmups, verbose=verbose, device=device)