In [None]:
import torch
import torch.optim as optim
import numpy as np
from transformer import Transformer
from datasets import load_dataset
from utils import *
from training import *
from tokenizer import *

# Pick the compute backend: "cuda" when torch reports CUDA as available, "cpu" otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device}")

In [None]:
# Load the "de-en" configuration of "opus100": train split first, then test split.
traindict = load_dataset(path="opus100", name="de-en", split="train")
testdict = load_dataset(path="opus100", name="de-en", split="test")

In [None]:
# Split the training data into inputs/labels using the "en" and "de" fields
# (presumably English source -> German target; confirm against get_split),
# then wrap both sequences in the project's Dataset container.
inputs, labels = get_split(
    traindict,
    "en",
    "de",
    size=100_000,
)
dataset = Dataset(inputs, labels)

In [None]:
# Build a tabular view of the dataset (presumably a pandas DataFrame — confirm in
# Dataset.dataframe) and show head(5) as the cell output.
df = dataset.dataframe()
df.head(5)

In [None]:
# Sanity check: print whether any entry of df is null, then show df.describe()
# as the cell output.
print(df.isnull().values.any())
df.describe()

In [None]:
# Display whatever Dataset.sample() returns (presumably an example pair — its
# behaviour is defined outside this notebook; confirm there).
dataset.sample()

In [None]:
# Train a fresh Nerdimizer tokenizer on the corpus extracted from the dataset.
# NOTE(review): size=50000 is presumably the target vocabulary size — confirm in
# Nerdimizer.train.
corpus = dataset.corpus()
tokenizer = Nerdimizer()
tokenizer.train(corpus, size=50000)

In [None]:
# Sizes handed to the model later: tokenizer length and the length returned by
# dataset.avglen(..., factor=15).
vocab_size = len(tokenizer)
maxlen = dataset.avglen(tokenizer, factor=15)

# Look up "[S]", "[E]" and "[P]" in the tokenizer, in that order
# (presumably the start / end / padding token ids — pad is passed as pad_id below).
start, end, pad = (tokenizer[token] for token in ("[S]", "[E]", "[P]"))

# Configure the tokenizer with maxlen: padon first, then truncon
# (presumably enabling padding and truncation — confirm in Nerdimizer).
tokenizer.padon(maxlen, pad_id=pad, end=True)
tokenizer.truncon(maxlen, end=True)

print(f"Number of word piece tokens: {vocab_size}\nMaxlen: {maxlen}")

In [None]:
# Tokenize the dataset with the configured tokenizer, then build a loader over it
# with batch_size=128.
# NOTE(review): drop_last=False presumably keeps a final partial batch — confirm in
# the project's dataloader() implementation.
tokenized_dataset = dataset.tokenized(tokenizer)
dataloader = tokenized_dataset.dataloader(batch_size=128, drop_last=False)

In [None]:
# Model: dm=512, nhead=8, layers=6, dff=2048 match the "base" Transformer
# configuration of Vaswani et al. (2017).
model = Transformer(vocab_size, maxlen, pad_id=pad, dm=512, nhead=8, layers=6, dff=2048, bias=False)

# Move the model to its device BEFORE the optimizer and checkpoint are built, so
# everything constructed from model.parameters() sees the parameters on their
# final device (the order recommended by the torch.optim documentation).
model.to(device)

# Adam with the paper's betas and eps. eps was previously written as 10e-9, which
# evaluates to 1e-8 (Adam's default) rather than the intended 1e-9.
# NOTE(review): lr=0.1 is high for Adam — presumably train()'s warmup logic
# rescales it; confirm in training.py.
optimizer = optim.Adam(model.parameters(), lr=0.1, betas=(0.9, 0.98), eps=1e-9)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.9, patience=10)

# NOTE(review): epochs=100 here differs from epochs=1000 passed to train() below —
# confirm what Checkpoint's `epochs` argument means and that both values are intended.
checkpoint = Checkpoint(model, optimizer, scheduler, epochs=100, path="models/checkpoint", overwrite=False)

In [None]:
# Run the training loop on the selected device with warmups=100.
# NOTE(review): epochs=1000 here differs from epochs=100 given to Checkpoint —
# confirm both values are intended.
train(dataloader, model, optimizer, scheduler, checkpoint, epochs=1000, warmups=100, device=device)

In [None]:
# save_model(model, "Transformer-Base", "models/")
# save_tokenizer(tokenizer, "Tokenizer-en-de", "models/")

In [None]:
# tokenizer = load_tokenizer("models/Tokenizer-en-de")
# model = load_model(model, "models/Transformer-Base", device)