In [1]:
from torch.nn import CrossEntropyLoss
from transformer import TransformerMT
from torch import optim
from data_loader import *
from TrainEval import TrainEval
from DataLoaderProvider import DataLoaderProvider
from torchtext.data import get_tokenizer

def _build_vocabulary(spacy_model):
    """Create a spaCy tokenizer and the Vocabulary that wraps it.

    Args:
        spacy_model: Name of the spaCy pipeline passed to ``get_tokenizer``
            as ``language``.

    Returns:
        A ``(tokenizer, vocabulary)`` tuple. ``filter_out_rare_keys`` has
        already been called on the vocabulary with ``threshold=1``.
    """
    tokenizer = get_tokenizer(tokenizer='spacy', language=spacy_model)
    vocabulary = Vocabulary(tokenizer=tokenizer)
    vocabulary.filter_out_rare_keys(threshold=1)
    return tokenizer, vocabulary


# English is set up first, then French, mirroring the original statement order.
en_tokenizer, en_vocab = _build_vocabulary('en_core_web_sm')
fr_tokenizer, fr_vocab = _build_vocabulary('fr_core_news_sm')

# Data source for training/evaluation, batching 32 examples at a time.
# NOTE(review): abridge=True presumably restricts the run to a reduced
# subset of the corpus — confirm against DataLoaderProvider.
dataloader_provider = DataLoaderProvider(abridge=True, batch_size=32)

# Architecture settings that do not depend on the data; the vocabulary sizes
# are supplied separately below because they come from the built vocabularies.
transformer_hyperparameters = dict(
    embedding_size=512,
    max_num_embeddings=100,
    num_attention_heads=8,
    num_encoder_layers=6,
    num_decoder_layers=6,
    linear_layer_size=2048,
    dropout=0.1,
    activation='relu',
    layer_norm_eps=1e-5,
    batch_first=False,
    norm_first=False,
    bias=True,
)

# English vocabulary feeds the source side, French the target side.
transformer_mt = TransformerMT(
    source_vocabulary_size=en_vocab.vocabulary_size,
    target_vocabulary_size=fr_vocab.vocabulary_size,
    **transformer_hyperparameters,
)

# `model` is a second name bound to the very same object, not a copy.
model = transformer_mt

# Adam over all parameters of the transformer.
adam_optimizer = optim.Adam(
    transformer_mt.parameters(),
    lr=0.0001,
    betas=(0.9, 0.98),
    eps=1e-9,
)

# Targets equal to ignore_index are excluded from the loss.
# NOTE(review): index 0 is presumably the padding token — confirm against
# the Vocabulary implementation.
cross_entropy = CrossEntropyLoss(ignore_index=0)

# Bundle data, model, optimizer and loss into the training/evaluation driver.
train_eval = TrainEval(
    dataloader_provider=dataloader_provider,
    num_epochs=10,
    optimizer=adam_optimizer,
    loss_function=cross_entropy,
    model=model,
    model_shortname='transformer',
)

# Kick off the run.
train_eval.execute()

Reading the dataframe and storing untokenized pairs...


100%|████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 94292.16it/s]


Adding sentences to Langs amd geting data pairs...


100%|█████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2125.74it/s]


Creating tokenized pairs of english and french sentences...


100%|█████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 6281.46it/s]
