In [17]:
import torch
import torch.optim as optim
import numpy as np
from transformer import Transformer
from metrics import Evaluator
from datasets import load_dataset
from utils import *
from training import *
from tokenizer import *

# Run on the GPU when CUDA is usable; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device}")

Using cpu


In [18]:
# Fetch the "de-en" configuration of OPUS-100 once per split, in the order
# train -> test. The generator's loop variable does not leak into the
# notebook namespace.
traindict, testdict = (
    load_dataset("opus100", "de-en", split=split_name)
    for split_name in ("train", "test")
)

Found cached dataset opus100 (/Users/tonimo/.cache/huggingface/datasets/opus100/de-en/0.0.0/256f3196b69901fb0c79810ef468e2c4ed84fbd563719920b1ff1fdc750f7704)


In [None]:
# Pull parallel sentence lists out of each split ("en" is passed as the first
# language, "de" as the second) and wrap every pair of lists in a Dataset.
# NOTE(review): `size` presumably caps how many pairs are taken - confirm in get_split.
train_inputs, train_labels = get_split(traindict, "en", "de", size=100_000)
test_inputs, test_labels = get_split(testdict, "en", "de", size=1_000)
trainset, testset = (
    Dataset(train_inputs, train_labels),
    Dataset(test_inputs, test_labels),
)

In [None]:
# Tabular view of the training pairs; the trailing expression renders the
# first five rows as the cell output.
trainframe = trainset.dataframe()
trainframe.head(n=5)

Unnamed: 0,inputs,labels
0,It's greed that it's gonna be the death of you...,Deine Habgier wird noch dein Tod sein.
1,Vega.,- Vega.
2,Just say when.,Sagen Sie einfach stopp.
3,- Wait.,- Warte.
4,I don't wanna be here.,Ich will nicht hier sein.


In [None]:
# Tabular view of the held-out pairs; the trailing expression renders the
# first five rows as the cell output.
testframe = testset.dataframe()
testframe.head(n=5)

Unnamed: 0,inputs,labels
0,"By clicking on 'Save profile', you the user ag...",Die Nutzungsbedingungen werden durch das Klick...
1,I wanted to show you something first.,Ich wollte dir erst noch etwas zeigen.
2,You have suffered because of Shinkichi.,Du musstest wegen Shinkichi leiden.
3,"moodle:bg-bab: Calendar: Day view: Friday, 25 ...",moodle:bg-bab: Kalender: Tagesansicht: Freitag...
4,"I mean, most people, they see another person w...","Ich meine, die meisten Leuten sehen eine ander..."


In [None]:
# Print whether any cell of the training frame is null, then show the
# per-column summary as the cell output.
train_has_nulls = trainframe.isna().values.any()
print(train_has_nulls)
trainframe.describe()

False


Unnamed: 0,inputs,labels
count,100000,100000
unique,96385,96634
top,Whereas:,in Erwägung nachstehender Gründe:
freq,49,48


In [None]:
# Print whether any cell of the test frame is null, then show the
# per-column summary as the cell output.
test_has_nulls = testframe.isna().values.any()
print(test_has_nulls)
testframe.describe()

False


Unnamed: 0,inputs,labels
count,1000,1000
unique,999,998
top,Yes.,Ja.
freq,2,3


In [None]:
# Display what `sample()` returns with its default arguments; in the recorded
# output this is a one-element list holding an (English, German) sentence pair.
# NOTE(review): presumably drawn at random - confirm in Dataset.sample.
trainset.sample()

[('I declare, based on my own judgement and on the information at my disposal, including, inter alia, the results of work of the internal audit service, that:',
  'Ich erkläre aufgrund meiner Einschätzung und aufgrund der mir zur Verfügung stehenden Informationen, zu denen u. a. die Ergebnisse der Arbeit des internen Revisionsdienstes gehören, Folgendes:')]

In [None]:
# Display what `sample()` returns for the test set with its default arguments;
# in the recorded output this is a one-element list holding an
# (English, German) sentence pair.
# NOTE(review): presumably drawn at random - confirm in Dataset.sample.
testset.sample()

[('A lot of you are probably wondering... ..how we divide the work.',
  'Sie fragen sich vermutlich, wie wir unsere Arbeit aufteilen.')]

In [None]:
# Build one corpus per language by concatenating the train and test corpora.
# `split=0` feeds the English tokenizer and `split=1` the German one.
# NOTE(review): this means the tokenizers also see the test sentences.
corpus_en, corpus_de = (
    trainset.corpus(split=column) + testset.corpus(split=column)
    for column in (0, 1)
)

# One independent tokenizer per language, each trained with size=25000.
tokenizer_en, tokenizer_de = Nerdimizer(), Nerdimizer()
tokenizer_en.train(corpus_en, size=25_000)
tokenizer_de.train(corpus_de, size=25_000)

# The translator pairs the English tokenizer with the German tokenizer.
translator = Translator(tokenizer_en, tokenizer_de)

In [None]:
# Vocabulary sizes for both sides, as reported by the translator.
en_vocab, de_vocab = translator.vocab_size()

# Longest sequence length reported for each split under this translator.
maxlen_train = trainset.maxlen(translator)
maxlen_test = testset.maxlen(translator)

# Size the sequence window to the longest example in EITHER split, capped at
# 256. Taking the minimum of the two splits (as before) would shrink the
# window to the shorter split's maximum and needlessly truncate the other
# split even when it fits under the 256 cap.
maxlen = min(max(maxlen_train, maxlen_test), 256)

# Ids of the start / end / padding markers, looked up in the English tokenizer.
# NOTE(review): `pad` is later used for the whole translator and the model, so
# this assumes the German tokenizer assigns the same ids - confirm.
start, end, pad = tokenizer_en["[S]"], tokenizer_en["[E]"], tokenizer_en["[P]"]
print(f"Number of input tokens: {len(tokenizer_en)}\nNumber of output tokens: {len(tokenizer_de)}")

Number of input tokens: 20000
Number of output tokens: 20000


In [None]:
# Configure the translator with the shared sequence length and padding id.
# NOTE(review): `padon` / `truncon` presumably switch on padding and
# truncation to `maxlen`, with `end=True` selecting the end of the sequence -
# confirm against the Translator implementation.
translator.padon(maxlen, end=True, pad_id=pad)
translator.truncon(maxlen, end=True)
# Echo the sequence length that was applied.
print(f"Maxlen: {maxlen}")

Maxlen: 256


In [None]:
# Transformer built from the vocabulary sizes and sequence length computed
# earlier: dm=512, 8 heads, 6 layers, dff=2048.
model = Transformer(en_vocab, de_vocab, maxlen, pad_id=pad, dm=512, nhead=8, layers=6, dff=2048)
# Place the model on the target device before any optimizer is built around
# its parameters (the order recommended by the PyTorch optimizer docs).
model.to(device)

# Adam with betas (0.9, 0.98) and eps 1e-9 (i.e. 10^-9). The previous literal
# `10e-9` evaluates to 1e-8, ten times larger than the intended epsilon.
optimizer = optim.Adam(model.parameters(), lr=1e-5, betas=(0.9, 0.98), eps=1e-9)
# Multiply the learning rate by 0.9 after 10 evaluations without improvement.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.9, patience=10)

# Evaluation runs against the test set, using the "[S]" / "[E]" marker tokens.
# NOTE(review): sample=100, ngrams=4, threshold=25 and mode="geometric" look
# like BLEU-style scoring settings - confirm in metrics.Evaluator.
evaluator = Evaluator(testset, translator, "[S]", "[E]", maxlen, sample=100, ngrams=4, threshold=25,
                    mode="geometric", device=device)
clock = Clock()
# Checkpointing under the "english-german" path; overwrite=True is passed, so
# an existing checkpoint there is presumably replaced - confirm in Checkpoint.
checkpoint = Checkpoint(model, optimizer, scheduler, evaluator, clock, epochs=100,
                    path="english-german", overwrite=True)
print(f"Number of Trainable Paramaters: {parameter_count(model)}\nSize of Model: {model_size(model):.1f}MB")

Number of Trainable Paramaters: 64590848
Size of Model: 247.4MB


In [None]:
# tokenized_trainset = trainset.tokenized(translator, model=True)
# dataloader = tokenized_trainset.dataloader(batch_size=128, drop_last=False)

In [None]:
# train(dataloader, model, optimizer, scheduler, evaluator, checkpoint, clock,
    # epochs=1000, warmups=100, verbose=True, device=device)