In [1]:
import torch
import torch.optim as optim
import numpy as np
from transformer import Transformer
from metrics import Evaluator
from datasets import load_dataset
from utils import *
from train import *
from inference import *
from tokenizer import *
from config import *

# Report which device the run will use. `device` is not defined in this cell; it is
# pulled in by one of the star imports above (presumably `config` — TODO confirm).
print(f"Using {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using cpu


In [2]:
# Load the "train" and "test" splits of the "de-en" configuration of opus100,
# in that order, and bind them to traindict / testdict.
traindict, testdict = (
    load_dataset("opus100", "de-en", split=split_name)
    for split_name in ("train", "test")
)

Found cached dataset opus100 (/Users/tonimo/.cache/huggingface/datasets/opus100/de-en/0.0.0/256f3196b69901fb0c79810ef468e2c4ed84fbd563719920b1ff1fdc750f7704)
Found cached dataset opus100 (/Users/tonimo/.cache/huggingface/datasets/opus100/de-en/0.0.0/256f3196b69901fb0c79810ef468e2c4ed84fbd563719920b1ff1fdc750f7704)


In [3]:
# Pull ("en" -> "de") input/label sequences out of each raw split, limited to
# train_size / test_size examples, and wrap them in the project's Dataset container
# (this is not `datasets.Dataset`; only `load_dataset` is imported from `datasets`,
# so the name presumably comes from one of the star imports — TODO confirm).
# NOTE(review): the recorded head()/sample() outputs in this notebook pair English
# inputs with unrelated German labels. It looks like get_split may shuffle or
# subsample the two columns independently — verify that pairs stay aligned.
train_inputs, train_labels = get_split(traindict, "en", "de", size=train_size)
test_inputs, test_labels = get_split(testdict, "en", "de", size=test_size)
trainset = Dataset(train_inputs, train_labels)
testset = Dataset(test_inputs, test_labels)

In [4]:
# Materialise the training set as a table (columns `inputs` / `labels` in the
# recorded output) and preview its first rows.
trainframe = trainset.dataframe()
trainframe.head()

Unnamed: 0,inputs,labels
0,"It was tough enough finding out he was dead, w...",Die vier Länder werden im kommenden Dezember i...
1,"And I, the old fool, sucked it in, I believed ...",Du begreifst sehr schnell.
2,In 1357 the castle was owned by Franz von Rave...,Frau Präsidentin! Der Rückbau veralteter kernt...
3,Who is correct?,Zur Förderung gemeinsamer Aufsichtskonzepte en...
4,All I see when I look in your ridiculous face ...,"Wir sind ein Pflegeheim, wo wir kümmern uns um..."


In [5]:
# Materialise the test set as a table (columns `inputs` / `labels` in the
# recorded output) and preview its first rows.
testframe = testset.dataframe()
testframe.head()

Unnamed: 0,inputs,labels
0,"Posted: 26 Mar 2010, 17:58",Ein gutes Zeichen. Vielleicht muss ich nichts ...
1,==Communities=====Cities===* Alabaster* Birmin...,# Antwort: 11 - 14.08.2012 um 11:36 Uhr
2,It's a little dull.,Hat jemand von Ihnen in der Nachbarschaft hell...
3,Garments of the type described in subheading 6...,"Nun, nicht gerade bestens."
4,"For the mystic, everything is connected: there...","Es wäre schade, wenn China, der neue industrie..."


In [6]:
# Sanity check on the training table: print whether any cell is missing,
# then show the summary statistics as the cell's rich output.
print(trainframe.isna().to_numpy().any())
trainframe.describe()

False


Unnamed: 0,inputs,labels
count,100000,100000
unique,100000,100000
top,"It was tough enough finding out he was dead, w...",Die vier Länder werden im kommenden Dezember i...
freq,1,1


In [7]:
# Sanity check on the test table: print whether any cell is missing,
# then show the summary statistics as the cell's rich output.
print(testframe.isna().to_numpy().any())
testframe.describe()

False


Unnamed: 0,inputs,labels
count,1000,1000
unique,1000,1000
top,"Posted: 26 Mar 2010, 17:58",Ein gutes Zeichen. Vielleicht muss ich nichts ...
freq,1,1


In [8]:
# Show an (input, label) pair from the training set.
# NOTE(review): the recorded output pairs an English Bible verse with a bare image
# URL as its German label, so the pair looks misaligned — verify the data pipeline.
trainset.sample()

[('22 Then Joab, falling down on his face on the earth, gave the king honour and blessing; and Joab said, Today it is clear to your servant that I have grace in your eyes, my lord king, because the king has given effect to the request of his servant.',
  'https://www.hoteldiagonalzero.com/wp-content/blogs.dir/406/files/rooms_triple/triple001.jpg')]

In [9]:
# Show an (input, label) pair from the test set.
# NOTE(review): the recorded output pairs a casual English sentence with an
# unrelated German technical-standard sentence, so the pair looks misaligned —
# verify the data pipeline.
testset.sample()

[('A week ago, she was basically Amish.',
  'Die nach Abschnitt 11.2.1. ermittelten Maße jeder Tafel der Probe dürfen nicht unter dem bestellten Nennmaß liegen.')]

In [10]:
# Assemble one corpus per language from the train and test sets: "inputs" feeds
# the English tokenizer and "labels" the German one.
# NOTE(review): test-set text is included in tokenizer training here; confirm
# that this is intended and acceptable for the evaluation protocol.
corpus_en, corpus_de = (
    trainset.corpus(data=column) + testset.corpus(data=column)
    for column in ("inputs", "labels")
)

# One tokenizer per language, each trained to its configured vocabulary size.
tokenizer_en, tokenizer_de = Nerdimizer(), Nerdimizer()
tokenizer_en.train(corpus_en, size=vocab_size_english)
tokenizer_de.train(corpus_de, size=vocab_size_german)

# Bundle both tokenizers into a single translator object and save it under
# the name "translator".
translator = Translator(tokenizer_en, tokenizer_de)
save_tokenizer(translator, "translator")

Tokenizer saved


In [11]:
# Vocabulary sizes for the input and output sides, as reported by the translator.
en_vocab, de_vocab = translator.vocab_size()

# Look up the "[S]", "[E]" and "[P]" entries of the translator, in that order.
start, end, pad = (translator[token] for token in ("[S]", "[E]", "[P]"))

print(f"Number of input tokens: {en_vocab}\nNumber of output tokens: {de_vocab}")

Number of input tokens: 32000
Number of output tokens: 32000


In [12]:
# Configure the translator's padding and truncation to `maxlen` (padding uses the
# "[P]" id looked up earlier), then tokenize the training set and wrap it in a
# batched dataloader. The exact meaning of `end=True` and `model=True` is defined
# by the project's tokenizer/dataset code — TODO confirm.
# These calls set state on `translator`, so they must run before `tokenized(...)`.
translator.padon(maxlen, end=True, pad_id=pad)
translator.truncon(maxlen, end=True)
tokenized_trainset = trainset.tokenized(translator, model=True)
# drop_last=False is passed explicitly — presumably keeps a final partial batch; verify.
dataloader = tokenized_trainset.dataloader(batch_size=batch_size, drop_last=False)
print(f"Maxlen: {maxlen}")

Maxlen: 256


In [13]:
# Build the model and place it on the target device *before* creating anything that
# captures the model or its parameters (optimizer, scheduler, search, evaluator,
# checkpoint). torch.optim expects the optimized parameters to live in a consistent
# location when the optimizer is constructed and used; previously the move happened
# only after all of those objects had been built.
model = Transformer(en_vocab, de_vocab, maxlen, pad_id=pad, dm=dm, nhead=nhead, layers=layers, dff=dff,
                    bias=bias, dropout=dropout, eps=eps)
model.to(device)

# Optimisation: Adam, with a scheduler that reduces the learning rate on plateau.
optimizer = optim.Adam(model.parameters(), lr=lr, betas=betas, eps=adam_eps)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=factor, patience=patience)

# Decoding and evaluation on the test set.
# NOTE(review): Beam receives the start/end token *ids* while Evaluator receives the
# token *strings* "[S]"/"[E]" — confirm that this matches both APIs.
search = Beam(model, start, end, maxlen, beam_width=beam_width, breadth=max_breadth,
              mode=search_mode, alpha=alpha)
evaluator = Evaluator(testset, translator, search, "[S]", "[E]", maxlen, sample=sample_size, ngrams=ngrams,
                      bleu_goal=bleu_goal, mode="geometric", device=device)

# Timing and checkpointing helpers used by the training loop.
clock = Clock()
checkpoint = Checkpoint(dataloader, model, optimizer, scheduler, evaluator, clock, epochs=save_every,
                        path="checkpoint", overwrite=overwrite)

print(f"Number of Trainable Paramaters: {parameter_count(model):.1f}M\nSize of Model: {model_size(model):.1f}MB")

Number of Trainable Paramaters: 76.9M
Size of Model: 294.3MB


In [14]:
# The training call below is currently disabled. The recorded output of this cell
# ("Training Started" followed by KeyboardInterrupt) comes from an earlier run in
# which the call was active and was interrupted manually; uncomment to train.
# train(dataloader, model, optimizer, scheduler, evaluator, checkpoint, clock,
    # epochs=epochs, warmups=warmups, verbose=verbose, device=device)

Training Started


KeyboardInterrupt: 