In [1]:
import torch
import torch.optim as optim
import numpy as np
from transformer import Transformer
from metrics import Evaluator
from datasets import load_dataset
from utils import *
from training import *
from tokenizer import *

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using cpu


In [2]:
# Load the "train" and then the "test" split of the "opus100" dataset, "de-en" configuration.
traindict, testdict = (
    load_dataset("opus100", "de-en", split=split_name)
    for split_name in ("train", "test")
)

Found cached dataset opus100 (/Users/tonimo/.cache/huggingface/datasets/opus100/de-en/0.0.0/256f3196b69901fb0c79810ef468e2c4ed84fbd563719920b1ff1fdc750f7704)
Found cached dataset opus100 (/Users/tonimo/.cache/huggingface/datasets/opus100/de-en/0.0.0/256f3196b69901fb0c79810ef468e2c4ed84fbd563719920b1ff1fdc750f7704)


In [3]:
# Split each raw dataset into inputs and labels, with "en" passed as the input side and
# "de" as the label side, then wrap every split in a Dataset.
# `get_split` and `Dataset` are not defined in this notebook; presumably they come from
# one of the star-imported project modules. `size` presumably caps the number of
# sentence pairs taken — TODO confirm.
# NOTE(review): the recorded describe() outputs further down report 100 / 10 rows rather
# than 100000 / 1000, so they appear to come from a run with smaller `size` values.
# NOTE(review): in the recorded head() outputs the input and label on the same row do not
# look like translations of each other (e.g. test row 3's English "moodle:bg-bab:
# Calendar: Day view ..." matches the German label shown on row 0) — verify that
# get_split / Dataset keep each input aligned with its own label.
train_inputs, train_labels = get_split(traindict, "en", "de", size=100000)
test_inputs, test_labels = get_split(testdict, "en", "de", size=1000)
trainset = Dataset(train_inputs, train_labels)
testset = Dataset(test_inputs, test_labels)

In [4]:
# Get the training set as a dataframe and preview its first rows.
# The bare last expression is what the notebook renders as the cell output.
trainframe = trainset.dataframe()
trainframe.head()

Unnamed: 0,inputs,labels
0,It will have done more harm if Turkey's expect...,- Sicher.
1,- That's true.,Mindestens 90 %; sekundäre Komponenten 4-6 % G...
2,"Slow, slow, quick, quick.","Nein, ehrlich gesagt finde ich, dass du heraus..."
3,What do you say Marylin?,48:02
4,We could be thrown anywhere in the galaxy.,"Pink für dich, Blau für Luke."


In [5]:
# Get the test set as a dataframe and preview its first rows.
# The bare last expression is what the notebook renders as the cell output.
testframe = testset.dataframe()
testframe.head()

Unnamed: 0,inputs,labels
0,So I have to withdraw as Bobby's attorney.,moodle:bg-bab: Kalender: Tagesansicht: Freitag...
1,The more pronounced rate of decline was record...,"Inhalt: 88 S., 14 Abb., 6 Taf., 1 Beil."
2,(iii) The degree of substitution of different ...,""" In den ersten fünf Monaten des Jahres 2011 h..."
3,"moodle:bg-bab: Calendar: Day view: Friday, 25 ...","Ich meine, die meisten Leuten sehen eine ander..."
4,"Content: 32 S., 17 Abb., 14 Taf.",Die Verfahren auf der Grundlage des RhE Modell...


In [6]:
# Sanity check: print whether any cell of the training frame is null (True/False),
# then display the frame's summary statistics as the cell output.
print(trainframe.isnull().values.any())
trainframe.describe()

False


Unnamed: 0,inputs,labels
count,100,100
unique,100,100
top,It will have done more harm if Turkey's expect...,- Sicher.
freq,1,1


In [7]:
# Sanity check: print whether any cell of the test frame is null (True/False),
# then display the frame's summary statistics as the cell output.
print(testframe.isnull().values.any())
testframe.describe()

False


Unnamed: 0,inputs,labels
count,10,10
unique,10,10
top,So I have to withdraw as Bobby's attorney.,moodle:bg-bab: Kalender: Tagesansicht: Freitag...
freq,1,1


In [8]:
# Display a sample from the training set. In the recorded output this is a list holding
# one (input, label) tuple; presumably the pair is drawn at random — TODO confirm.
trainset.sample()

[('Thank you for your patience.', 'Weißt du, wer Remy Danton ist?')]

In [9]:
# Display a sample from the test set. In the recorded output this is a list holding
# one (input, label) tuple; presumably the pair is drawn at random — TODO confirm.
testset.sample()

[('(iii) The degree of substitution of different factors in pro duction.',
  '" In den ersten fünf Monaten des Jahres 2011 haben sich die Sarden drei Häfen abgewickelt insgesamt 781.193 Passagiere, ein Rückgang von 175.954 Einheiten (-18,4%) gegenüber dem gleichen Zeitraum des Vorjahres. Je ausgeprägter der Rückgang wurde aus dem Hafen von Olbia mit -21,9% verzeichnet, während in Golfo Aranci Verkehr sank um 15,4% und 9,8% von Porto Torres.')]

In [10]:
# One corpus per side: the training corpus followed by the test corpus.
# "inputs" is the English side and "labels" the German side of the datasets.
# NOTE(review): test sentences are part of the text the tokenizers are trained on —
# confirm this is intended.
corpus_en, corpus_de = (
    trainset.corpus(data=side) + testset.corpus(data=side)
    for side in ("inputs", "labels")
)

# A separate tokenizer per language, each trained on its own corpus with size=25000
# (presumably the target vocabulary size — TODO confirm).
tokenizer_en, tokenizer_de = Nerdimizer(), Nerdimizer()
tokenizer_en.train(corpus_en, size=25000)
tokenizer_de.train(corpus_de, size=25000)

# Bundle both tokenizers; the English (input-side) tokenizer is passed first.
translator = Translator(tokenizer_en, tokenizer_de)

In [11]:
# Vocabulary sizes as reported by the translator (input side first, label side second).
en_vocab, de_vocab = translator.vocab_size()

# Per-dataset length limit under this translator; presumably the longest tokenized
# sequence in each dataset — TODO confirm.
maxlen_train = trainset.maxlen(translator)
maxlen_test = testset.maxlen(translator)

# Working sequence length: the smallest of the two dataset values and a cap of 256.
# NOTE(review): because min() spans both datasets, the smaller of the two values wins —
# confirm this is intended rather than capping the larger of the two at 256.
maxlen = min((maxlen_train, maxlen_test, 256))

# Entries for the "[S]", "[E]" and "[P]" special tokens, looked up in the English tokenizer.
start, end, pad = (tokenizer_en[marker] for marker in ("[S]", "[E]", "[P]"))

print(f"Number of input tokens: {len(tokenizer_en)}\nNumber of output tokens: {len(tokenizer_de)}")

Number of input tokens: 845
Number of output tokens: 1019


In [12]:
# Configure the translator with the working length: `padon` presumably switches on padding
# to `maxlen` using `pad` as the padding id, and `truncon` presumably switches on
# truncation to `maxlen` — TODO confirm against the project's Translator.
# The meaning of `end=True` is defined there as well (not visible from this notebook).
translator.padon(maxlen, end=True, pad_id=pad)
translator.truncon(maxlen, end=True)
print(f"Maxlen: {maxlen}")

Maxlen: 10


In [13]:
# Build the model and move it to the target device straight away, so that the optimizer
# below is constructed over parameters that already live where training will run.
model = Transformer(en_vocab, de_vocab, maxlen, pad_id=pad, dm=512, nhead=8, layers=6, dff=2048)
model.to(device)

# Adam with betas=(0.9, 0.98) and eps=1e-9. Note that the literal `10e-9` evaluates to
# 1e-8, ten times larger than the 1e-9 that normally accompanies these betas, so the
# value is written as 1e-9 explicitly.
optimizer = optim.Adam(model.parameters(), lr=1e-5, betas=(0.9, 0.98), eps=1e-9)

# Plateau scheduler: scales the learning rate by 0.9 once the monitored metric has failed
# to improve for more than `patience` scheduler steps (the steps are driven by `train`).
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.9, patience=10)

# Evaluation helper over the test set; "[S]" / "[E]" are the start / end token strings.
# The remaining keyword arguments are interpreted by the project's Evaluator
# (presumably BLEU settings, given the "BLEU Score" column in the training log).
evaluator = Evaluator(
    testset, translator, "[S]", "[E]", maxlen,
    sample=50, ngrams=4, threshold=25, mode="geometric", device=device,
)

# Training clock and checkpoint manager writing under "english-german".
# `epochs=100` is interpreted by the project's Checkpoint — TODO confirm its meaning.
clock = Clock()
checkpoint = Checkpoint(
    model, optimizer, scheduler, evaluator, clock,
    epochs=100, path="english-german", overwrite=True,
)

print(f"Number of Trainable Paramaters: {parameter_count(model)}\nSize of Model: {model_size(model):.1f}MB")

Number of Trainable Paramaters: 45065216
Size of Model: 171.9MB


In [14]:
# Tokenize the training set with the translator, then build a dataloader over it with
# batches of 128. `drop_last=False` presumably keeps the final, possibly smaller batch
# (as in torch's DataLoader) — TODO confirm.
# `model=True` is interpreted by the project's Dataset.tokenized; its meaning is not
# visible from this notebook.
tokenized_trainset = trainset.tokenized(translator, model=True)
dataloader = tokenized_trainset.dataloader(batch_size=128, drop_last=False)

In [15]:
# Launch training with epochs=1000 and verbose per-epoch logging on `device`.
# `warmups=100` presumably sets the number of scheduler warmup steps (the recorded log
# reports "Scheduler Warmup Step: True" for the first epochs) — TODO confirm.
# NOTE(review): the Checkpoint object was created with epochs=100 while this call uses
# epochs=1000 — confirm the two values are meant to differ.
train(dataloader, model, optimizer, scheduler, evaluator, checkpoint, clock,
    epochs=1000, warmups=100, verbose=True, device=device)

Training Started
-------------------------------------------------------------------------------
Epoch 1 Complete | Epoch Duration: 00:00:05 | Elapsed Training Time: 00:00:05 |
Metrics | Epoch Loss: 6.4136 | BLEU Score: 0.0 | 
Other Info | Scheduler Warmup Step: True | Checkpoint Saved: False |
-------------------------------------------------------------------------------
Epoch 2 Complete | Epoch Duration: 00:00:06 | Elapsed Training Time: 00:00:11 |
Metrics | Epoch Loss: 5.8127 | BLEU Score: 0.0 | 
Other Info | Scheduler Warmup Step: True | Checkpoint Saved: False |
-------------------------------------------------------------------------------
Epoch 3 Complete | Epoch Duration: 00:00:04 | Elapsed Training Time: 00:00:14 |
Metrics | Epoch Loss: 5.5664 | BLEU Score: 0.0 | 
Other Info | Scheduler Warmup Step: True | Checkpoint Saved: False |
-------------------------------------------------------------------------------
Epoch 4 Complete | Epoch Duration: 00:00:04 | Elapsed Training Tim