In [1]:
import torch
import torch.optim as optim
from utils import Dataset, load_model, save_model, get_split
from training import train, predict, prompt
from datasets import load_dataset
from tokenizer import Nerdimizer, save_tokenizer, load_tokenizer
from transformer import Transformer

# Pick the compute device: use CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using cpu


In [2]:
# Load the training split of the "de-en" configuration of the opus100 dataset.
traindict = load_dataset(path="opus100", name="de-en", split="train")

Found cached dataset opus100 (/Users/tonimo/.cache/huggingface/datasets/opus100/de-en/0.0.0/256f3196b69901fb0c79810ef468e2c4ed84fbd563719920b1ff1fdc750f7704)


In [3]:
# Extract 100,000 "en" -> "de" pairs from the loaded split and wrap them in
# the project's Dataset helper ("en" side as inputs, "de" side as labels).
inputs, labels = get_split(traindict, "en", "de", size=100_000)
dataset = Dataset(inputs, labels)

In [4]:
# View the sentence pairs as a DataFrame and preview the first five rows.
df = dataset.dataframe()
df.head(n=5)

Unnamed: 0,inputs,labels
0,It's greed that it's gonna be the death of you...,Deine Habgier wird noch dein Tod sein.
1,Vega.,- Vega.
2,Just say when.,Sagen Sie einfach stopp.
3,- Wait.,- Warte.
4,I don't wanna be here.,Ich will nicht hier sein.


In [5]:
# Report whether any cell in the frame is missing, then show summary
# statistics for the columns.
print(df.isna().values.any())
df.describe()

False


Unnamed: 0,inputs,labels
count,100000,100000
unique,96328,96581
top,Okay.,Okay.
freq,75,52


In [6]:
# Display one (inputs, labels) example from the dataset as a sanity check.
# NOTE(review): presumably drawn at random, so the output may differ between runs — confirm.
dataset.sample()

(['And you a drug addict, you trifling bitch, through and through.'],
 ['- Du drogensüchtige, nutzlose Schlampe.'])

In [7]:
# Train a fresh tokenizer on the dataset's corpus with a requested size of
# 32,000, then record the resulting vocabulary size for the model.
tokenizer = Nerdimizer()
corpus = dataset.corpus()
tokenizer.train(corpus, size=32_000)
vocab_size = len(tokenizer)

In [8]:
# Derive the sequence length from the dataset's average length (factor=15).
maxlen = dataset.avglen(tokenizer, factor=15)

# Look up the ids of the special tokens.
# NOTE(review): presumably start-of-sequence, end-of-sequence and padding — confirm.
start = tokenizer["[S]"]
end = tokenizer["[E]"]
pad = tokenizer["[P]"]

# Enable padding and truncation to `maxlen` on the tokenizer.
tokenizer.padon(maxlen, pad_id=pad, end=True)
tokenizer.truncon(maxlen, end=True)

print(f"Number of word piece tokens: {vocab_size}\nMaxlen: {maxlen}")

Number of word piece tokens: 32000
Maxlen: 285


In [9]:
# Tokenize the dataset, then batch it in groups of 128.
# drop_last=False is passed explicitly.
# NOTE(review): presumably keeps a final partial batch — confirm.
tokenized_dataset = dataset.tokenized(tokenizer)
dataloader = tokenized_dataset.dataloader(
    batch_size=128,
    drop_last=False,
)

In [10]:
# Build the Transformer: dm=512, 8 heads, 6 layers, dff=2048, no bias terms.
model = Transformer(vocab_size, maxlen, pad_id=pad, dm=512, nhead=8, layers=6,
                    dff=2048, bias=False)

# Move the parameters to the target device before the optimizer is built on them.
model.to(device)

# eps was previously written as 10e-9, which is 1e-8 rather than the 1e-9 that
# accompanies betas=(0.9, 0.98) in "Attention Is All You Need".
# NOTE(review): lr=0.1 is very large for Adam; confirm that train() (called with
# a `warmups` argument) overrides it, otherwise lower it.
optimizer = optim.Adam(model.parameters(), lr=0.1, betas=(0.9, 0.98), eps=1e-9)

# Scale the learning rate by 0.9 once the monitored metric has failed to improve
# for more than `patience` (10) consecutive scheduler steps.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.9, patience=10)

In [11]:
# Training is disabled in this saved notebook; uncomment the call below to run it.
# train(model, optimizer, scheduler, dataloader, epochs=1000, warmups=100, verbose=True, device=device)

In [12]:
# Persisting the model and tokenizer is disabled; uncomment to write both under "saves/".
# save_model(model, "Transformer-Base", "saves/")
# save_tokenizer(tokenizer, "Tokenizer-en-de", "saves/")

In [13]:
# Restoring a saved tokenizer and model is disabled; uncomment to load them.
# NOTE(review): these paths point at "models/", while the save cell writes to
# "saves/" — confirm which directory is intended before enabling.
# tokenizer = load_tokenizer("models/Tokenizer-en-de")
# model = load_model(model, "models/Transformer-Base", device)