In [2]:
import torch
import torch.optim as optim
from utils import Dataset, load_model, save_model
from training import train, predict, prompt
from datasets import load_dataset
from tokenizer import Nerdimizer, save_tokenizer, load_tokenizer
from transformer import Transformer

# Select the compute device: use the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device}")

Using cpu


In [3]:
# Load the "train" split of the europa_eac_tm dataset for the (en, de) language pair.
datadict = load_dataset(
    "europa_eac_tm",
    language_pair=("en", "de"),
    split="train",
)

Using custom data configuration en2de-176cd02372067e72
Found cached dataset europa_eac_tm (/Users/tonimo/.cache/huggingface/datasets/europa_eac_tm/en2de-176cd02372067e72/0.0.0/955b2501a836c2ea49cfe3e719aec65dcbbc3356bbbe53cf46f08406eb77386a)


In [4]:
# Split every translation pair into its English source and German target text.
translations = datadict["translation"]
inputs = [pair["en"] for pair in translations]
labels = [pair["de"] for pair in translations]

# Wrap the two parallel lists in the project's Dataset helper.
dataset = Dataset(inputs, labels)

In [5]:
# Tabular view of the dataset; preview the first five rows.
df = dataset.dataframe()
df.head(n=5)

Unnamed: 0,inputs,labels
0,Nr. teachers/trainers,Anzahl Lehrer(innen)/Trainer
1,APPLICANT,ANTRAGSTELLERIN/ANTRAGSTELLER
2,The grant application will be processed by com...,Der Förderantrag wird elektronisch verarbeitet...
3,To be signed by the person legally authorised ...,"Unterschrift der Person, die rechtsverbindlich..."
4,DATE OF BIRTH,GEBURTSDATUM


In [6]:
# Report whether any cell of the frame is null, then display summary statistics.
has_missing = df.isnull().values.any()
print(has_missing)
df.describe()

False


Unnamed: 0,inputs,labels
count,4473,4473
unique,4452,4312
top,Event,x
freq,2,29


In [17]:
# Display one (inputs, labels) pair from the dataset as a spot check.
# NOTE(review): separate sample() calls in this notebook returned different pairs,
# so this output is presumably not reproducible across runs — confirm whether
# sample() can be seeded.
dataset.sample()

(['Other financial activities'],
 ['Mit Finanz-  und Versicherungsdienstleistungen verbundene Tätigkeiten'])

In [11]:
# Train the tokenizer on the text of the dataset.
corpus = dataset.corpus()
tokenizer = Nerdimizer()
tokenizer.train(corpus)
vocab_size = len(tokenizer)

# Sequence-length budget, obtained from the dataset's tokenized lengths with factor=3.
maxlen = dataset.avg_tokenized_len(tokenizer, factor=3)

# Look up the special tokens (presumably their ids — `pad` is passed on as pad_id below).
start = tokenizer["[S]"]
end = tokenizer["[E]"]
pad = tokenizer["[P]"]

# Switch on padding and truncation, both bounded by maxlen.
tokenizer.padon(maxlen, pad_id=pad)
tokenizer.truncon(maxlen)

print(f"Number of word piece tokens: {len(tokenizer)}")

Number of word piece tokens: 15693


In [28]:
# Draw one pair and print the tokenizer output for each side
# (index 0 = inputs, index 1 = labels).
sample = dataset.sample()
for side in (0, 1):
    print(tokenizer(sample[side]))

[['[S]', 'arts', '(', 'others', ')', '[E]', '[P]', '[P]', '[P]', '[P]', '[P]', '[P]', '[P]', '[P]', '[P]', '[P]', '[P]', '[P]', '[P]', '[P]', '[P]', '[P]']]
[['[S]', 'kunst', '(', 'andere', ')', '[E]', '[P]', '[P]', '[P]', '[P]', '[P]', '[P]', '[P]', '[P]', '[P]', '[P]', '[P]', '[P]', '[P]', '[P]', '[P]', '[P]']]


In [16]:
# Tokenize the whole dataset and wrap it in a loader that yields batches of 32,
# keeping the final (possibly smaller) batch.
tokenized_dataset = dataset.tokenized(tokenizer)
dataloader = tokenized_dataset.dataloader(batch_size=32, drop_last=False)

# Peek at element 1 of the first batch as a sanity check.
first_batch = next(iter(dataloader))
first_batch[1]

tensor([[    0,   673,  1560,    11,   778,  2317,  4529,     1,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3],
        [    0,  1628,    18,   862,     1,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3],
        [    0,   179,  3835,   685,  3411,  6536,    17,  1213,  9398,  1554,
            11,   517,  3888,    15, 15259,    15,  7049,   779,   826,   479,
          3146,     1],
        [    0,  2155,   179,   381,    15,   185,  7148,   278,   185,  4153,
           623, 12341,  3379,    17,     1,     3,     3,     3,     3,     3,
             3,     3],
        [    0, 14810,     1,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3],
        [    0,    21,    12,  9417,   181,  5336,   278,   473, 14892,     1,
           

In [10]:
# Build the model, its optimizer and the learning-rate scheduler.
# This cell needs `tokenizer`, `maxlen` and `pad` to exist already, so the
# tokenizer-setup cell must be executed before it.
model = Transformer(vocab_size=len(tokenizer), maxlen=maxlen, pad_id=pad,
                    dm=512, nhead=8, layers=6, dff=2048)

# eps was previously written as 10e-9, which is 1e-8 (Adam's default value).
# Together with betas=(0.9, 0.98) these settings follow "Attention Is All You
# Need", where epsilon is 10^-9, i.e. 1e-9.
optimizer = optim.Adam(model.parameters(), lr=1e-5, betas=(0.9, 0.98), eps=1e-9)

# Scale the learning rate by 0.9 once the monitored metric has not improved
# for more than 10 consecutive scheduler steps.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.9, patience=10)

In [None]:
# Train the model with the optimizer, scheduler and data loader defined in earlier cells.
train(
    model,
    optimizer,
    scheduler,
    dataloader,
    epochs=1000,
    warmups=100,
    verbose=True,
    device=device,
)

In [None]:
# Save the trained model and its tokenizer into the same output directory.
output_dir = "models/"
save_model(model, "Transformer-Base", output_dir)
save_tokenizer(tokenizer, "Tokenizer-en-de", output_dir)