In [None]:
import torch
import torch.optim as optim
from utils import Dataset, load_model, save_model
from training import train, predict, prompt
from datasets import load_dataset
from tokenizer import Nerdimizer, save_tokenizer, load_tokenizer
from transformer import Transformer

# Choose the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device}")

In [None]:
# Load the "train" split of the "europa_eac_tm" dataset for the English/German
# language pair. Later cells read the "translation" column from this object.
datadict = load_dataset("europa_eac_tm", language_pair=("en", "de"), split="train")

In [None]:
# Fetch the "translation" column once and reuse it for both lists; the original
# looked it up separately for each comprehension.
# Each entry is indexed by language code ("en" / "de").
translation_pairs = datadict["translation"]

# English sentences are the model inputs, German sentences are the targets.
inputs = [pair["en"] for pair in translation_pairs]
labels = [pair["de"] for pair in translation_pairs]

# Wrap the parallel sentence lists in the project's Dataset helper (from utils).
dataset = Dataset(inputs, labels)

In [None]:
# Get a tabular view of the dataset and preview it.
# NOTE(review): presumably a pandas DataFrame, so head(5) shows the first five
# rows — confirm against utils.Dataset.dataframe.
df = dataset.dataframe()
df.head(5)

In [None]:
# Sanity check: print whether any entry of the table is null, then display the
# table's summary statistics as the cell output.
# NOTE(review): assumes df follows the pandas DataFrame API — confirm.
print(df.isnull().values.any())
df.describe()

In [None]:
# Display whatever Dataset.sample() returns as the cell output.
# NOTE(review): presumably a random input/label example — confirm in utils.
dataset.sample()

In [None]:
# --- Train the tokenizer on the text corpus exposed by the dataset ---
corpus = dataset.corpus()
tokenizer = Nerdimizer()
tokenizer.train(corpus)
vocab_size = len(tokenizer)

# --- Sequence length limit, derived from the dataset's tokenized lengths ---
# NOTE(review): presumably factor * average tokenized length — confirm in utils.
maxlen = dataset.avg_tokenized_len(tokenizer, factor=3)

# --- Ids of the special tokens, looked up by their string form ---
start = tokenizer["[S]"]
end = tokenizer["[E]"]
pad = tokenizer["[P]"]

# --- Configure padding (with the pad id) and truncation, both using maxlen ---
tokenizer.padon(maxlen, pad_id=pad)
tokenizer.truncon(maxlen)

print(f"Number of word piece tokens: {len(tokenizer)}")

In [None]:
# Tokenize the dataset with the configured tokenizer, then build a loader that
# yields batches of 64.
# NOTE(review): drop_last=False presumably keeps the final, smaller batch
# (torch DataLoader semantics) — confirm in utils.
tokenized_dataset = dataset.tokenized(tokenizer)
dataloader = tokenized_dataset.dataloader(batch_size=64, drop_last=False)


In [None]:
# Build the Transformer. The vocabulary size, maximum sequence length and pad id
# come from the tokenizer cell; the remaining values are the model's
# width/heads/depth/feed-forward hyperparameters.
model = Transformer(vocab_size=len(tokenizer), maxlen=maxlen, pad_id=pad,
                    dm=512, nhead=8, layers=6, dff=2048)

# Adam with betas=(0.9, 0.98). The epsilon was previously written as `10e-9`,
# which evaluates to 1e-8 (Adam's default) rather than the 1e-9 that goes with
# these betas in the original Transformer training setup; corrected to 1e-9.
optimizer = optim.Adam(model.parameters(), lr=1e-5, betas=(0.9, 0.98), eps=1e-9)

# Multiply the learning rate by 0.9 after 10 epochs without improvement of the
# metric passed to scheduler.step().
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.9, patience=10)

# Move the parameters to the selected device; the trailing `;` suppresses the
# module repr in the cell output.
model.to(device);

In [None]:
# Run the project's training loop (training.train) for 1000 epochs on the
# selected device with verbose output.
# NOTE(review): `warmups=100` is presumably the number of learning-rate warm-up
# steps/epochs before the scheduler takes over — confirm in training.py.
train(model, optimizer, scheduler, dataloader, epochs=1000, warmups=100, verbose=True, device=device)

In [None]:
# Persist the trained model and the tokenizer under the given names.
# NOTE(review): the third argument is presumably the target directory
# ("saves/") — confirm in utils.save_model / tokenizer.save_tokenizer.
save_model(model, "Transformer-Base", "saves/")
save_tokenizer(tokenizer, "Tokenizer-en-de", "saves/")

In [None]:
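# Optional: reload previously saved artifacts instead of retraining (uncomment to use).
# `load_model` takes an existing `model` instance, so the model cell must be run first.
# NOTE(review): the save cell writes under "saves/", but these paths point at
# "models/" — confirm which directory is correct before uncommenting.
# tokenizer = load_tokenizer("models/Tokenizer-en-de")
# model = load_model(model, "models/Transformer-Base", device)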