In [1]:
from pathlib import Path
from tokenizer.BPE import Tokenizer
from utils.raw_data import load_wmt_chunk_df, get_wmt_df_len
import numpy as np
import math
from typing import Optional

import torch
import torch.nn as nn

from model import Transformer

from torch.utils.data import DataLoader
from utils.dataset import NeuralTranslationDataset

# DEBUG
from matplotlib import pyplot as plt

In [2]:
# Instantiate the project's BPE tokenizer for the WMT corpus.
# compute_vocab=False: presumably the vocabulary is read from
# `vocab_dest_file` instead of being rebuilt -- TODO confirm against
# tokenizer.BPE.Tokenizer.
tokenizer = Tokenizer(
    compute_vocab=False,
    max_vocab_size=37_005,
    corpus_source='wmt',
    vocab_dest_file=Path('./data/dest/wmt_37k_tokens.yaml'),
)

In [3]:
# Test split of the project's translation dataset.
data_test = NeuralTranslationDataset(subset='test')

# Small loader used only to pull one sample batch for a smoke test.
# shuffle=False keeps the iteration order fixed between runs.
loader = DataLoader(
    data_test,
    batch_size=3,
    shuffle=False,
    num_workers=4,    # samples are loaded in 4 worker subprocesses
    pin_memory=True,  # batches are copied into pinned (page-locked) host memory
)

# Take the first batch only. `next(iter(...))` states that intent directly
# and raises StopIteration right here if the loader is empty, rather than
# leaving `batch` undefined until a later cell fails with a NameError.
# The forward-pass cell unpacks `batch` with `**`, so it is expected to be
# a mapping of keyword arguments for the model.
batch = next(iter(loader))

In [4]:
# Device selection: use the GPU when one is available, otherwise fall back
# to the CPU so the notebook still runs end-to-end on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the project's Transformer on the selected device.
# NOTE(review): the hyperparameter meanings below are inferred from the
# names and from the forward-pass output shape (3, 64, 512), which lines up
# with batch_size=3, L=64 and d=512 -- confirm against model.Transformer.
transformer = Transformer(
    N=3,        # presumably the number of stacked layers
    L=64,       # presumably the (maximum) sequence length
    d=512,      # presumably the model / embedding width
    h=8,        # presumably the number of attention heads
    d_ff=2048,  # presumably the feed-forward hidden width
    n_vocab=len(tokenizer.token_vocab),
    padding_idx=tokenizer.token_vocab['<PAD>'],
    bos_idx=tokenizer.token_vocab['<BOS>'],
    dtype=torch.float,
    device=device,
)

In [5]:
# Smoke test: run one forward pass on the sampled batch (unpacked as keyword
# arguments) and display the shape of the returned tensor.
transformer(**batch).shape

torch.Size([3, 64, 512])