In [1]:
import torch
import torch.optim as optim
from utils import Dataset, load_model, save_model
from training import train, predict, prompt
from datasets import load_dataset
from tokenizer import Nerdimizer, save_tokenizer, load_tokenizer
from transformer import Transformer

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using cpu


In [2]:
datadict = load_dataset("europa_eac_tm", language_pair=("en", "de"), split="train")

Using custom data configuration en2de-176cd02372067e72
Found cached dataset europa_eac_tm (/Users/tonimo/.cache/huggingface/datasets/europa_eac_tm/en2de-176cd02372067e72/0.0.0/955b2501a836c2ea49cfe3e719aec65dcbbc3356bbbe53cf46f08406eb77386a)


In [3]:
inputs = [pair["en"] for pair in datadict["translation"]]
labels = [pair["de"] for pair in datadict["translation"]]
dataset = Dataset(inputs, labels)

In [4]:
df = dataset.dataframe()
df.head(5)

Unnamed: 0,inputs,labels
0,Nr. teachers/trainers,Anzahl Lehrer(innen)/Trainer
1,APPLICANT,ANTRAGSTELLERIN/ANTRAGSTELLER
2,The grant application will be processed by com...,Der Förderantrag wird elektronisch verarbeitet...
3,To be signed by the person legally authorised ...,"Unterschrift der Person, die rechtsverbindlich..."
4,DATE OF BIRTH,GEBURTSDATUM


In [5]:
print(df.isnull().values.any())
df.describe()

False


Unnamed: 0,inputs,labels
count,4473,4473
unique,4452,4312
top,Event,x
freq,2,29


In [6]:
dataset.sample()

(['Social inclusion in higher education'], ['x'])

In [7]:
corpus = dataset.corpus()
tokenizer = Nerdimizer()
tokenizer.train(corpus)
vocab_size = len(tokenizer)
maxlen = dataset.avg_tokenized_len(tokenizer, factor=3)
start, end, pad = tokenizer["[S]"], tokenizer["[E]"], tokenizer["[P]"]
tokenizer.padon(maxlen, pad_id=pad)
tokenizer.truncon(maxlen)
print(f"Number of word piece tokens: {len(tokenizer)}")

Number of word piece tokens: 15695


In [8]:
tokenized_dataset = dataset.tokenized(tokenizer)
dataloader = tokenized_dataset.dataloader(batch_size=64, drop_last=False)


In [9]:
model = Transformer(vocab_size=len(tokenizer), maxlen=maxlen, pad_id=pad, 
                    dm=512, nhead=8, layers=6, dff=2048)
optimizer = optim.Adam(model.parameters(), lr=1e-5, betas=(0.9, 0.98), eps=10e-9)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.9, patience=10)
model.to(device)

Transformer(
  (embeddings): Embeddings(
    (embedding): Embedding(15695, 512, padding_idx=3)
  )
  (pos_encoder): PositionalEncoder(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Encoder(
    (stack): ModuleList(
      (0): EncoderLayer(
        (multihead): MultiHeadAttention(
          (wq): Linear(in_features=512, out_features=512, bias=False)
          (wk): Linear(in_features=512, out_features=512, bias=False)
          (wv): Linear(in_features=512, out_features=512, bias=False)
          (wo): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (scaled_dot_prod_attn): ScaledDotProductAttention(
            (softmax): Softmax(dim=-1)
          )
        )
        (norm1): Norm()
        (norm2): Norm()
        (feedforward): FeedForwardNetwork(
          (w1): Linear(in_features=512, out_features=2048, bias=True)
          (w2): Linear(in_features=2048, out_features=512, bias=True)
          (relu): Re

In [10]:
# train(model, optimizer, scheduler, dataloader, epochs=1000, warmups=100, verbose=True, device=device)

In [11]:
# save_model(model, "Transformer-Base", "saves/")
# save_tokenizer(tokenizer, "Tokenizer-en-de", "saves/")

In [13]:
tokenizer = load_tokenizer("Tokenizer-en-de", "saves/")
model = load_model("")

Tokenizer loaded from Tokenizer-en-de.json


TypeError: Expected state_dict to be dict-like, got <class 'str'>.