In [1]:
import torch
import torch.optim as optim
import numpy as np
from transformer import Transformer
from metrics import Evaluator
from datasets import load_dataset
from utils import *
from train import *
from inference import *
from tokenizer import *
from config import *

# Report which compute device this run will use. `device` is not defined in
# this notebook; it arrives through one of the star imports above
# (presumably `config` — TODO confirm).
print(f"Using {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using cpu


In [2]:
# Fetch the train and test splits of the OPUS-100 German-English pair.
# The generator is consumed left to right, so "train" is loaded before "test".
traindict, testdict = (
    load_dataset("opus100", "de-en", split=split_name)
    for split_name in ("train", "test")
)

Found cached dataset opus100 (/Users/tonimo/.cache/huggingface/datasets/opus100/de-en/0.0.0/256f3196b69901fb0c79810ef468e2c4ed84fbd563719920b1ff1fdc750f7704)
Found cached dataset opus100 (/Users/tonimo/.cache/huggingface/datasets/opus100/de-en/0.0.0/256f3196b69901fb0c79810ef468e2c4ed84fbd563719920b1ff1fdc750f7704)


In [3]:
# Extract parallel sentence lists from each raw split and wrap them in the
# project's Dataset container. The "en" / "de" arguments appear to select
# English as the inputs and German as the labels (the previews saved in this
# notebook are consistent with that — TODO confirm against get_split).
# `train_size` / `test_size` are not defined here; they come from a star import.
#
# NOTE(review): the head() and sample() outputs saved in this notebook pair
# unrelated sentences (e.g. input "Tanggal pengambilan: 2004:08:12 18:41:49"
# with label "Gleich nach dem Abwasch."). Verify that get_split keeps every
# input aligned with its own label (for instance, that it does not shuffle the
# two lists independently) before training on these pairs.
train_inputs, train_labels = get_split(traindict, "en", "de", size=train_size)
test_inputs, test_labels = get_split(testdict, "en", "de", size=test_size)
trainset = Dataset(train_inputs, train_labels)
testset = Dataset(test_inputs, test_labels)

In [4]:
# Tabular view of the training pairs; the bare last expression renders the
# first rows as the cell output. `trainframe` is reused by a later cell.
trainframe = trainset.dataframe()
trainframe.head()

Unnamed: 0,inputs,labels
0,Lucroy missed the first ten games of the 2011 ...,Was hast du? -Das Gesicht.
1,Box 1 : Industrial investment survey by branch...,"Ja, das taten sie."
2,So are you ready to test your wits? Then go fo...,The time now is 12:51 PM .
3,Tanggal pengambilan: 2004:08:12 18:41:49,Gleich nach dem Abwasch.
4,Your father used to do that.,Kris bleibt eine Weile bei ihm. Sie gibt sich ...


In [5]:
# Tabular view of the test pairs; the bare last expression renders the first
# rows as the cell output. `testframe` is reused by a later cell.
testframe = testset.dataframe()
testframe.head()

Unnamed: 0,inputs,labels
0,Apartments Furnela - La Val - Alta Badia,Die Bildfeldwölbung ist anscheinend derjenigen...
1,Have you ever considered how you are destrying...,Die Fahrwerks-Federnsätze für den X6 M.
2,A new transparent European cohesion policy,"* Allows for over/undershoot ""slop"" in aiming."
3,"The former leader of the Bektashi Order, Hamdu...","Woher dachten Sie, hatte er all die schmackhaf..."
4,I got started.,"Restaurants in Utrecht, UT, Niederlande"


In [6]:
# Data-quality check on the training table: print whether any cell is missing,
# then show per-column summary statistics as the cell output.
print(trainframe.isna().to_numpy().any())
trainframe.describe()

False


Unnamed: 0,inputs,labels
count,100000,100000
unique,100000,100000
top,Lucroy missed the first ten games of the 2011 ...,Was hast du? -Das Gesicht.
freq,1,1


In [7]:
# Data-quality check on the test table: print whether any cell is missing,
# then show per-column summary statistics as the cell output.
print(testframe.isna().to_numpy().any())
testframe.describe()

False


Unnamed: 0,inputs,labels
count,1000,1000
unique,1000,1000
top,Apartments Furnela - La Val - Alta Badia,Die Bildfeldwölbung ist anscheinend derjenigen...
freq,1,1


In [8]:
# Display an (input, label) pair drawn from the training set.
# NOTE(review): presumably a random draw; no seed is set in the visible cells,
# so the saved output may not reproduce on re-run — confirm.
trainset.sample()

[('The Talmud is the core of Judaism. Understanding of the Talmud is understanding of Judaism, defamation of the Talmud is the defamation of Judaism, to turn away from the Talmud is to turn away from Judaism www.hagalil.com/judentum',
  'Ich finde es krank, dass es hier hängt.')]

In [9]:
# Display an (input, label) pair drawn from the test set.
# NOTE(review): presumably a random draw; no seed is set in the visible cells,
# so the saved output may not reproduce on re-run — confirm.
testset.sample()

[('2. Each Party shall seek to ensure that such authority or authorities have sufficient resources to perform their tasks effectively.',
  '- Platzhalter8 -')]

In [10]:
# Assemble one text corpus per language from the inputs (English) and labels
# (German) of both datasets.
# NOTE(review): the test-set sentences are part of both corpora, so the
# tokenizer vocabularies are fitted on test text as well as training text.
# Confirm this is intended rather than an evaluation leak.
corpus_en = trainset.corpus(data="inputs") + testset.corpus(data="inputs")
corpus_de = trainset.corpus(data="labels") + testset.corpus(data="labels")

# One independent tokenizer per language, each trained to its own vocabulary
# size (both sizes come from a star import).
tokenizer_en, tokenizer_de = Nerdimizer(), Nerdimizer()
tokenizer_en.train(corpus_en, size=vocab_size_english)
tokenizer_de.train(corpus_de, size=vocab_size_german)

# Bundle the pair into a Translator and persist it under the name "translator".
translator = Translator(tokenizer_en, tokenizer_de)
save_tokenizer(translator, "translator")

Tokenizer saved


In [11]:
# Vocabulary sizes for the source (English) and target (German) sides.
en_vocab, de_vocab = translator.vocab_size()

# Look up the ids of the start, end and padding markers, in that order.
start, end, pad = (translator[marker] for marker in ("[S]", "[E]", "[P]"))

print(f"Number of input tokens: {en_vocab}\nNumber of output tokens: {de_vocab}")

Number of input tokens: 32000
Number of output tokens: 32000


In [12]:
# Configure the translator before tokenizing. padon / truncon presumably
# switch on padding and truncation to `maxlen` tokens, with end=True relating
# to the end-of-sequence marker — TODO confirm against the tokenizer module.
# These calls change `translator` state, so they must run before tokenized().
translator.padon(maxlen, end=True, pad_id=pad)
translator.truncon(maxlen, end=True)
# Tokenize the training pairs and wrap them in a batched loader. With
# drop_last=False the final batch is presumably kept even if it is smaller than
# batch_size — verify against the project's dataloader().
tokenized_trainset = trainset.tokenized(translator, model=True)
dataloader = tokenized_trainset.dataloader(batch_size=batch_size, drop_last=False)
print(f"Maxlen: {maxlen}")

Maxlen: 256


In [13]:
# Build the translation model. All hyperparameters (dm, nhead, layers, dff,
# bias, dropout, eps, ...) are not defined in this notebook; they come from a
# star import.
model = Transformer(en_vocab, de_vocab, maxlen, pad_id=pad, dm=dm, nhead=nhead, layers=layers, dff=dff,
                    bias=bias, dropout=dropout, eps=eps)
# Move the model to the target device *before* the optimizer, beam search,
# evaluator and checkpoint capture it, so every object that holds its
# parameters is constructed against tensors already on their final device.
model.to(device)

# Adam optimizer plus a plateau-based learning-rate scheduler driving it.
optimizer = optim.Adam(model.parameters(), lr=lr, betas=betas, eps=adam_eps)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=factor, patience=patience)

# Beam-search decoder used for inference; it is handed to the evaluator below.
search = Beam(model, start, end, maxlen, beam_width=beam_width, breadth=max_breadth,
                mode=search_mode, alpha=alpha)
# Evaluator over the test set. Note it receives the start/end markers as the
# strings "[S]" / "[E]", whereas Beam receives their ids.
evaluator = Evaluator(testset, translator, search, "[S]", "[E]", maxlen, sample=sample_size, ngrams=ngrams,
                    bleu_goal=bleu_goal, mode="geometric", device=device)

# Timing helper and checkpoint manager bound to all the training objects;
# checkpoints use the path "checkpoint".
clock = Clock()
checkpoint = Checkpoint(dataloader, model, optimizer, scheduler, evaluator, clock, epochs=save_every,
                    path="checkpoint", overwrite=overwrite)

# Summary of model scale (spelling of "Parameters" corrected in the message).
print(f"Number of Trainable Parameters: {parameter_count(model):.1f}M\nSize of Model: {model_size(model):.1f}MB")

Number of Trainable Paramaters: 76.9M
Size of Model: 294.3MB


In [14]:
# Training entry point. The call is commented out in the saved notebook, so
# running all cells builds every training object but does not start training.
# Uncomment both lines below to train.
# train(dataloader, model, optimizer, scheduler, evaluator, checkpoint, clock,
#       epochs=epochs, warmups=warmups, verbose=verbose, device=device)