In [1]:
import torch
import torch.optim as optim
import numpy as np
from transformer import Transformer
from metrics import Evaluator
from datasets import load_dataset
# NOTE(review): the wildcard imports below are the only possible source of the
# helper names and hyperparameters used by later cells (e.g. `device`,
# `get_split`, `Dataset`, `Nerdimizer`, `Beam`, `maxlen`). Which module defines
# which name cannot be told from this notebook, and a later wildcard import can
# silently shadow a name from an earlier one; consider explicit imports.
from utils import *
from train import *
from inference import *
from tokenizer import *
from config import *

# NOTE(review): no RNG seed is set in the visible cells. If the project modules
# do not seed torch/numpy/random themselves, sampling and training results
# will presumably differ between runs; confirm.
# Report the device the rest of the notebook will use.
print(f"Using {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using cpu


In [2]:
# Load the German-English pair of OPUS-100: the train split first, then the
# test split, unpacked into the two names the following cells expect.
traindict, testdict = (
    load_dataset("opus100", "de-en", split=split_name)
    for split_name in ("train", "test")
)

Found cached dataset opus100 (/Users/tonimo/.cache/huggingface/datasets/opus100/de-en/0.0.0/256f3196b69901fb0c79810ef468e2c4ed84fbd563719920b1ff1fdc750f7704)
Found cached dataset opus100 (/Users/tonimo/.cache/huggingface/datasets/opus100/de-en/0.0.0/256f3196b69901fb0c79810ef468e2c4ed84fbd563719920b1ff1fdc750f7704)


In [3]:
# Take 1000 training pairs and 100 test pairs with English as the input side
# and German as the label side, then wrap each split in a Dataset.
# NOTE(review): the previews recorded further down show inputs and labels that
# are not translations of each other on the same row (e.g. "APPROVAL GRANTED"
# next to "05:35"). It looks as if `get_split` selects or shuffles the two
# sides independently; verify that it keeps each (input, label) pair together,
# otherwise the model is trained on random pairings.
train_inputs, train_labels = get_split(traindict, "en", "de", size=1000)
test_inputs, test_labels = get_split(testdict, "en", "de", size=100)
trainset = Dataset(train_inputs, train_labels)
testset = Dataset(test_inputs, test_labels)

In [4]:
# Turn the training set into a table with `inputs` and `labels` columns and
# preview the first rows.
# NOTE(review): in the recorded preview the German label on each row is not a
# translation of the English input (e.g. "APPROVAL GRANTED" / "05:35"), so the
# pairs appear misaligned; check how the split was built.
trainframe = trainset.dataframe()
trainframe.head()

Unnamed: 0,inputs,labels
0,In that case Article 2 applies.,Marktanteil
1,Push! Push!,Komm rein.
2,APPROVAL GRANTED,05:35
3,Having regard to the proposal from the Commiss...,Artikel 58
4,-Come on.,"Wenn du einen Gegner hast, musst du dir eines ..."


In [5]:
# Turn the test set into a table with `inputs` and `labels` columns and preview
# the first rows.
# NOTE(review): the recorded preview shows unrelated sentences side by side and
# even an English sentence in the German `labels` column (row 1), so the test
# pairs appear misaligned and/or noisy; verify before trusting BLEU scores.
testframe = testset.dataframe()
testframe.head()

Unnamed: 0,inputs,labels
0,"I mean, most people, they see another person w...",- Nein. Ich bin niemandes Mäuschen.
1,if ever someone's bones are found 20 feet out ...,This new page was generated in 0.010168 seconds.
2,"Osteoarthritis, rheumatoid arthritis and chron...",Man kann nicht immer gewinnen.
3,That motherfucker who did talk had a strange a...,1Co 15:43 gesät wird in Unehre und auferweckt ...
4,I just wish that you would give me more than e...,Echogerät ist bei mir eine Dolby-Revox mit reg...


In [6]:
# Sanity check of the training table: the printed flag is True if any cell is null.
print(trainframe.isnull().values.any())
# Per-column summary (count / unique / most frequent value), shown as the cell output.
trainframe.describe()

False


Unnamed: 0,inputs,labels
count,1000,1000
unique,1000,1000
top,In that case Article 2 applies.,Marktanteil
freq,1,1


In [7]:
# Sanity check of the test table: the printed flag is True if any cell is null.
print(testframe.isnull().values.any())
# Per-column summary (count / unique / most frequent value), shown as the cell output.
testframe.describe()

False


Unnamed: 0,inputs,labels
count,100,100
unique,100,100
top,"I mean, most people, they see another person w...",- Nein. Ich bin niemandes Mäuschen.
freq,1,1


In [8]:
# Show a sample from the training set; the recorded output is a list holding one
# (input, label) tuple. Presumably drawn at random; confirm in Dataset.sample.
trainset.sample()

[('Exactly what you said!', 'Du kennst sie.')]

In [9]:
# Show a sample from the test set; the recorded output is a list holding one
# (input, label) tuple.
# NOTE(review): in the recorded output the German label talks about bones found
# 6 m from a bedroom, which looks like the translation of test input row 1
# ("if ever someone's bones are found 20 feet out ..."), yet it is paired with
# an unrelated English sentence. Inputs and labels appear to be permuted
# independently; verify the split/dataset construction.
testset.sample()

[('Best practice consulting for software globalization - we help you make sure that your applications and products support the cultural, legal and technical requirements of regional markets',
  'Sollten jemals Knochen 6 m von meinem Schlafzimmer gefunden werden, mache ich mir echt Sorgen.')]

In [10]:
# --- English side (model inputs) ---------------------------------------------
# NOTE(review): the corpus is built from the training AND the test sentences, so
# the tokenizer sees test text while being fitted; confirm this is intended.
corpus_en = trainset.corpus(data="inputs") + testset.corpus(data="inputs")
tokenizer_en = Nerdimizer()
tokenizer_en.train(corpus_en, size=vocab_size_english)

# --- German side (model labels) ----------------------------------------------
corpus_de = trainset.corpus(data="labels") + testset.corpus(data="labels")
tokenizer_de = Nerdimizer()
tokenizer_de.train(corpus_de, size=vocab_size_german)

# Bundle both tokenizers into one translator object and persist it under the
# name "translator".
translator = Translator(tokenizer_en, tokenizer_de)
save_tokenizer(translator, "translator")

Tokenizer saved


In [11]:
# Vocabulary sizes of the fitted tokenizers: input (English) first, output (German) second.
en_vocab, de_vocab = translator.vocab_size()

# Ids of the special start, end and padding tokens, looked up by their symbols.
start = translator["[S]"]
end = translator["[E]"]
pad = translator["[P]"]

print(f"Number of input tokens: {en_vocab}\nNumber of output tokens: {de_vocab}")

Number of input tokens: 4498
Number of output tokens: 5659


In [12]:
# Configure the translator's padding and truncation with the shared `maxlen`,
# then tokenize the training set and batch it.
# NOTE(review): what `end=True` and `model=True` toggle is defined in the
# project's tokenizer/dataset code, not in this notebook; presumably "account
# for the end token" and "produce model-ready tensors". Confirm.
translator.padon(maxlen, end=True, pad_id=pad)
translator.truncon(maxlen, end=True)
tokenized_trainset = trainset.tokenized(translator, model=True)
# Batches of 16. drop_last=False presumably keeps the final, smaller batch (as
# with torch's DataLoader); confirm against the project's dataloader().
dataloader = tokenized_trainset.dataloader(batch_size=16, drop_last=False)
print(f"Maxlen: {maxlen}")

Maxlen: 10


In [13]:
# Build the model and move it to the target device BEFORE the optimizer is
# created. PyTorch recommends that the optimized parameters already live on
# their final device when the optimizer is constructed; the original cell only
# called model.to(device) after the optimizer, scheduler and search existed.
# dm / layers / dff are fixed here; the remaining hyperparameters come from the
# wildcard-imported project configuration.
model = Transformer(en_vocab, de_vocab, maxlen, pad_id=pad, dm=256, nhead=nhead, layers=2, dff=1024,
                    bias=bias, dropout=dropout, eps=eps)
model.to(device)

# Adam over all model parameters, with a scheduler that reduces the learning
# rate by `factor` once the monitored metric has stopped improving for
# `patience` scheduler steps.
optimizer = optim.Adam(model.parameters(), lr=lr, betas=betas, eps=adam_eps)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=factor, patience=patience)

# Beam-search decoder bound to the model and the start/end token ids.
search = Beam(model, start, end, maxlen, beam_width=beam_width, breadth=max_breadth,
              mode=search_mode, alpha=alpha, device=device)

# Evaluator that scores translations of the test set produced by `search`
# (the training log reports its result as "BLEU Score").
evaluator = Evaluator(testset, translator, search, sample=sample_size,
                      ngrams=ngrams, goal_bleu=goal_bleu, mode="geometric")

# Timing and checkpointing helpers handed to the training loop.
clock = Clock()
checkpoint = Checkpoint(dataloader, model, optimizer, scheduler, evaluator, clock,
                        epochs=save_every, path="checkpoint", overwrite=overwrite)

print(f"Number of Trainable Paramaters: {parameter_count(model):.1f}M\nSize of Model: {model_size(model):.1f}MB")

Number of Trainable Paramaters: 6.3M
Size of Model: 24.0MB


In [14]:
# Run the training loop for 5 epochs (hard-coded here, unlike the other
# settings, which come from the project configuration).
# NOTE(review): the recorded output stops with a KeyboardInterrupt after epoch 2,
# so the saved run never completed; re-run to the end before sharing results.
train(
    dataloader,
    model,
    optimizer,
    scheduler,
    evaluator,
    checkpoint,
    clock,
    epochs=5,
    warmups=warmups,
    verbose=verbose,
    device=device,
)

Training Started
-------------------------------------------------------------------------------
Epoch 1 Complete | Epoch Duration: 00:00:09 | Elapsed Training Time: 00:00:09 |
Metrics | Epoch Loss: 8.3027 | BLEU Score: 0.0 | 
Other Info | Scheduler Warmup Step: True | Checkpoint Saved: False |
-------------------------------------------------------------------------------
Epoch 2 Complete | Epoch Duration: 00:00:07 | Elapsed Training Time: 00:00:15 |
Metrics | Epoch Loss: 7.8187 | BLEU Score: 0.0 | 
Other Info | Scheduler Warmup Step: True | Checkpoint Saved: False |


KeyboardInterrupt: 