In [1]:
import torch
import torch.optim as optim
from utils import Dataset, load
from training import train, predict, prompt
from transformer import Transformer
from tokenizer import WordPieceTokenizer

# Pick the compute device once, up front: the GPU when CUDA is usable,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the training split of the "europa_eac_tm" corpus for the
# English/German language pair via the project's `load` helper.
datadict = load(
    "europa_eac_tm",
    language_pair=("en", "de"),
    split="train",
)

Using custom data configuration en2de-176cd02372067e72
Found cached dataset europa_eac_tm (/Users/tonimo/.cache/huggingface/datasets/europa_eac_tm/en2de-176cd02372067e72/0.0.0/955b2501a836c2ea49cfe3e719aec65dcbbc3356bbbe53cf46f08406eb77386a)


In [3]:
# Split every translation record into its English source sentence and its
# German target sentence, keeping the two lists index-aligned.
inputs, labels = [], []
for pair in datadict["translation"]:
    inputs.append(pair["en"])
    labels.append(pair["de"])

# Wrap the aligned sentence lists in the project's Dataset container.
dataset = Dataset(inputs, labels)

In [4]:
# View the dataset as a table (the rendered output shows an index plus
# "inputs" and "labels" columns) and preview its first five rows.
df = dataset.dataframe()
df.head(5)

Unnamed: 0,inputs,labels
0,Nr. teachers/trainers,Anzahl Lehrer(innen)/Trainer
1,APPLICANT,ANTRAGSTELLERIN/ANTRAGSTELLER
2,The grant application will be processed by com...,Der Förderantrag wird elektronisch verarbeitet...
3,To be signed by the person legally authorised ...,"Unterschrift der Person, die rechtsverbindlich..."
4,DATE OF BIRTH,GEBURTSDATUM


In [5]:
# Sanity check: report whether any cell in the table is missing.
null_mask = df.isnull()
print(null_mask.values.any())

# Summary statistics (count / unique / top / freq) for both text columns.
df.describe()

False


Unnamed: 0,inputs,labels
count,4473,4473
unique,4452,4312
top,Event,x
freq,2,29


In [6]:
# Spot-check a single example; the recorded output is an (English, German)
# tuple of strings. Presumably drawn at random — confirm in
# utils.Dataset.sample.
dataset.sample()

('British Indian Ocean Territory', 'Britisches Territorium im Indischen Ozean')

In [7]:
# Sequence-length limit; also handed to the Transformer constructor later on.
maxlen = 200

# Train a WordPiece tokenizer on the dataset's corpus.
corpus = dataset.corpus()
tokenizer = WordPieceTokenizer()
tokenizer.train(corpus)

# NOTE(review): `pruncate` presumably configures padding/truncation to
# `maxlen` — confirm against the tokenizer module.
tokenizer.pruncate(maxlen)

# Ids of the special tokens used as start-of-sequence, end-of-sequence and
# padding markers.
sos = tokenizer["[CLS]"]
eos = tokenizer["[SEP]"]
pad = tokenizer["[PAD]"]

In [8]:
# Encode the dataset with the trained tokenizer, then batch it for training.
tokenized_dataset = dataset.tokenized(tokenizer)
dataloader = tokenized_dataset.dataloader(
    batch_size=32,
    # NOTE(review): presumably keeps the final, smaller batch (as with
    # torch's DataLoader) — confirm in utils.
    drop_last=False,
)

In [9]:
# Build the Transformer: vocabulary size comes from the trained tokenizer,
# and the padding id is passed so the model knows which token is [PAD].
model = Transformer(
    vocab_size=len(tokenizer),
    maxlen=maxlen,
    pad_id=pad,
)
# Adam configured with the hyper-parameters from "Attention Is All You Need"
# (beta1 = 0.9, beta2 = 0.98, epsilon = 10^-9).
# The epsilon literal must be written 1e-9: the previous `10e-9` evaluates to
# 1e-8 (ten times larger, and identical to PyTorch's default), which is not
# the intended 10^-9.
optimizer = optim.Adam(
    model.parameters(),
    lr=1e-6,
    betas=(0.9, 0.98),
    eps=1e-9,
)

In [10]:
# Run the project's training loop over the batched dataset. `device` is the
# "cuda"/"cpu" string chosen in the first cell; verbose=True presumably
# enables progress reporting — confirm in training.train.
train(model, optimizer, dataloader, verbose=True, device=device)