In [85]:
from datasets import load_dataset, DatasetDict

In [86]:
dataset = load_dataset("news_commentary", 'de-en')

Found cached dataset news_commentary (/Users/gcm/.cache/huggingface/datasets/news_commentary/de-en/11.0.0/cfab724ce975dc2da51cdae45302389860badc88b74db8570d561ced6004f8b4)


  0%|          | 0/1 [00:00<?, ?it/s]

In [87]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 223153
    })
})

In [88]:
dataset['train'][1]

{'id': '1',
 'translation': {'de': 'SAN FRANCISCO – Es war noch nie leicht, ein rationales Gespräch über den Wert von Gold zu führen.',
  'en': 'SAN FRANCISCO – It has never been easy to have a rational conversation about the value of gold.'}}

In [89]:
dataset['train'][1]['translation']

{'de': 'SAN FRANCISCO – Es war noch nie leicht, ein rationales Gespräch über den Wert von Gold zu führen.',
 'en': 'SAN FRANCISCO – It has never been easy to have a rational conversation about the value of gold.'}

In [90]:
len(dataset['train'])

223153

In [91]:
# function to normalise my text
import unicodedata
import re

# From https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
def _normalize(s):
	s = unicodeToAscii(s.lower().strip())
	s = re.sub(r"([.!?])", r" \1", s)
	s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
	return s

#Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
	return ''.join(
		c for c in unicodedata.normalize('NFD', s)
		if unicodedata.category(c) != 'Mn'
	)


In [92]:
# normalise text and save it in variable "data_norm"
MAX_WORDS = 12
data_norm  = []
count = 0
for i in range(len(dataset['train'])):
	en = _normalize(dataset['train'][i]['translation']['en'])
	de = _normalize(dataset['train'][i]['translation']['de'])
	if len(en.split()) <= MAX_WORDS and len(de.split()) <= MAX_WORDS:
		data_norm.append({'idx': count,
						 'en_text': en,
						 'de_text': de,
						 'en_word_idx': [],
						 'de_word_idx': []})
		count = count + 1

In [93]:
data_norm[:5]

[{'idx': 0,
  'en_text': ' gold ?',
  'de_text': 'steigt gold auf . dollar ?',
  'en_word_idx': [],
  'de_word_idx': []},
 {'idx': 1,
  'en_text': 'wouldn t you know it ?',
  'de_text': 'und es kam wie es kommen musste .',
  'en_word_idx': [],
  'de_word_idx': []},
 {'idx': 2,
  'en_text': 'gold prices even hit a record high recently .',
  'de_text': 'jungst erreichte er sogar ein rekordhoch von . dollar .',
  'en_word_idx': [],
  'de_word_idx': []},
 {'idx': 3,
  'en_text': 'since then the index has climbed above .',
  'de_text': 'seit damals ist er auf uber . punkte gestiegen .',
  'en_word_idx': [],
  'de_word_idx': []},
 {'idx': 4,
  'en_text': 'gold prices are extremely sensitive to global interest rate movements .',
  'de_text': 'der goldpreis ist extrem empfindlich hinsichtlich globaler veranderungen des zinssatzes .',
  'en_word_idx': [],
  'de_word_idx': []}]

In [94]:
len(data_norm) # dataset contains still 33k observations

33048

In [95]:
import numpy as np
import math
import random
import copy
import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_

In [96]:
# Set seeds for reproducibility
SEED = 42
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

data = copy.deepcopy(data_norm)
# Shuffling the data
# dataset_shuffled = dataset.shuffle(seed=42)
random.shuffle(data)
# Splitting the data
data_train = data[:15000]
data_val = data[15000:18500]
data_test = data[18500:22000]

In [97]:
len_train = len(data_train)
len_val = len(data_val)
len_test = len(data_test)
print(f'train data length: {len_train} \nval data length: {len_val} \ntest data length: {len_test}')

train data length: 15000 
val data length: 3500 
test data length: 3500


In [98]:
from torch.utils.data import Dataset, DataLoader
import torch

class TranslationDataset(Dataset):
    def __init__(self, data):
        self.data = data

    # We return the length of the dataset
    def __len__(self):
        return len(self.data)

    # We return the idx'th sample
    def __getitem__(self, idx):
        return {
            'idx': idx,
            'en_word_idx': torch.tensor(self.data[idx]['en_word_idx']).long(),
            'de_word_idx': torch.tensor(self.data[idx]['de_word_idx']).long(),
        }

In [99]:
train_data = TranslationDataset(data_train)
val_data = TranslationDataset(data_val)
test_data = TranslationDataset(data_test)

In [100]:
train_data[0]

{'idx': 0,
 'en_word_idx': tensor([], dtype=torch.int64),
 'de_word_idx': tensor([], dtype=torch.int64)}

In [101]:
len(train_data), len(val_data), len(test_data)

(15000, 3500, 3500)

### Build the vocabulary and word embedding matrix

We now have all our data. Let's build the vocabulary and prepare the word-embeddings for the next steps **FOR EACH LANGUAGE**.

In [102]:
vocabulary = {'en': set(), 'de': set()}

for lang in ('en', 'de'):
  for sample in train_data.data: # Pay attention we only use the training set!
    for token in sample['{}_text'.format(lang)].split(): # tokenize
      vocabulary[lang].add(token)
  print(lang, len(vocabulary[lang]))

en 11472
de 16705


In [103]:
# Create the mapping word - index and vice-versa
word2idx = {'en': {'_PAD_': 0, '_UNK_': 1, '_SOS_': 2, '_EOS_': 3},
            'de': {'_PAD_': 0, '_UNK_': 1, '_SOS_': 2, '_EOS_': 3}}

idx2word = {'en': {}, 'de': {}}
for lang in ('en', 'de'):
  for word in vocabulary[lang]:
    word2idx[lang][word] = len(word2idx[lang])
  idx2word[lang] = {idx:word for word, idx in word2idx[lang].items()}
  print(word2idx[lang])

{'_PAD_': 0, '_UNK_': 1, '_SOS_': 2, '_EOS_': 3, 'pulverfass': 4, 'hand': 5, 'ausgestaltung': 6, 'textilien': 7, 'berechtigt': 8, 'erzeugung': 9, 'verschleppen': 10, 'engagement': 11, 'naturwissenschaftliche': 12, 'lebensmittelverschwendung': 13, 'stress': 14, 'vierte': 15, 'illegitime': 16, 'finanzinstitute': 17, 'zollhauser': 18, 'kollektives': 19, 'eingegangen': 20, 'glorreichen': 21, 'hinausschie': 22, 'gordische': 23, 'weise': 24, 'uberschlagen': 25, 'anlass': 26, 'potenziellem': 27, 'bulgarien': 28, 'eigentlichen': 29, 'acht': 30, 'studienteilnehmer': 31, 'herstellung': 32, 'zynismus': 33, 'schwach': 34, 'prasent': 35, 'verstarkt': 36, 'dauern': 37, 'bereinigung': 38, 'unbesicherten': 39, 'abwickeln': 40, 'fourier': 41, 'hugel': 42, 'damaskus': 43, 'terrorhilfe': 44, 'franzosen': 45, 'verbindet': 46, 'ungreifbaren': 47, 'rache': 48, 'zerbricht': 49, 'verbundeten': 50, 'hysterie': 51, 'soup': 52, 'sealevel': 53, 'pacta': 54, 'hinnehmen': 55, 'atomzeitalter': 56, 'vernetzt': 57, 'b

In [104]:
idx2word['en']

{0: '_PAD_',
 1: '_UNK_',
 2: '_SOS_',
 3: '_EOS_',
 4: 'hand',
 5: 'want',
 6: 'smile',
 7: 'hardships',
 8: 'engagement',
 9: 'extremely',
 10: 'stress',
 11: 'fallout',
 12: 'koreans',
 13: 'cooperating',
 14: 'comparison',
 15: 'enthusiasm',
 16: 'stagnate',
 17: 'beggaring',
 18: 'twice',
 19: 'sustainability',
 20: 'fourier',
 21: 'participation',
 22: 'indexing',
 23: 'even',
 24: 'demography',
 25: 'emotional',
 26: 'palace',
 27: 'accuse',
 28: 'hal',
 29: 'imprisoned',
 30: 'emphasis',
 31: 'soup',
 32: 'sealevel',
 33: 'ghost',
 34: 'agrees',
 35: 'pacta',
 36: 'pretence',
 37: 'hobble',
 38: 'banal',
 39: 'arbitrage',
 40: 'documented',
 41: 'immobility',
 42: 'communists',
 43: 'views',
 44: 'governance',
 45: 'drug',
 46: 'crossroads',
 47: 'enormous',
 48: 'mountain',
 49: 'kinds',
 50: 'textiles',
 51: 'fuse',
 52: 'afraid',
 53: 'blanket',
 54: 'columbine',
 55: 'dissident',
 56: 'favorite',
 57: 'catalyzed',
 58: 'woes',
 59: 'mismanagement',
 60: 'shall',
 61: 'gilts

In [105]:
idx2word['de']

{0: '_PAD_',
 1: '_UNK_',
 2: '_SOS_',
 3: '_EOS_',
 4: 'pulverfass',
 5: 'hand',
 6: 'ausgestaltung',
 7: 'textilien',
 8: 'berechtigt',
 9: 'erzeugung',
 10: 'verschleppen',
 11: 'engagement',
 12: 'naturwissenschaftliche',
 13: 'lebensmittelverschwendung',
 14: 'stress',
 15: 'vierte',
 16: 'illegitime',
 17: 'finanzinstitute',
 18: 'zollhauser',
 19: 'kollektives',
 20: 'eingegangen',
 21: 'glorreichen',
 22: 'hinausschie',
 23: 'gordische',
 24: 'weise',
 25: 'uberschlagen',
 26: 'anlass',
 27: 'potenziellem',
 28: 'bulgarien',
 29: 'eigentlichen',
 30: 'acht',
 31: 'studienteilnehmer',
 32: 'herstellung',
 33: 'zynismus',
 34: 'schwach',
 35: 'prasent',
 36: 'verstarkt',
 37: 'dauern',
 38: 'bereinigung',
 39: 'unbesicherten',
 40: 'abwickeln',
 41: 'fourier',
 42: 'hugel',
 43: 'damaskus',
 44: 'terrorhilfe',
 45: 'franzosen',
 46: 'verbindet',
 47: 'ungreifbaren',
 48: 'rache',
 49: 'zerbricht',
 50: 'verbundeten',
 51: 'hysterie',
 52: 'soup',
 53: 'sealevel',
 54: 'pacta',
 5

In [106]:
# Now we create the word embedding matrix randomly
WORD_DIM = 300

# Initialize randomly the word embedding matrix
word_embeddings = {'en': np.random.rand(len(word2idx['en']), WORD_DIM),
                   'de': np.random.rand(len(word2idx['de']), WORD_DIM)}

# Set the values to 0 for padding
word_embeddings['en'][word2idx['en']['_PAD_']] = np.zeros(WORD_DIM)
word_embeddings['de'][word2idx['de']['_PAD_']] = np.zeros(WORD_DIM)

In [107]:
print(f'shape en: {word_embeddings["en"].shape}')
print(f'shape de: {word_embeddings["de"].shape}')

shape en: (11476, 300)
shape de: (16709, 300)


# Process the data.

Now, we can add the word embedding indeces to our data to be fed to the model.

Let's use the attribute `word_idx` for all samples. In case a word is unknown, we will simply replace it with the work "_UNK_".

In [108]:
# We add the word indeces to all data splits
for split_data in [train_data.data, val_data.data, test_data.data]:
	for lang in ['en', 'de']:
		for sample in split_data:
			sample['{}_word_idx'.format(lang)] = []
			for token in sample['{}_text'.format(lang)].split():
				# If a word is not in our vocabulary, we put the UNK token instead
				sample['{}_word_idx'.format(lang)].append(word2idx[lang][token] if token in word2idx[lang] else word2idx[lang]['_UNK_'])

In [109]:
train_data[0], train_data.data[0]

({'idx': 0,
  'en_word_idx': tensor([ 4295, 10301,  1737,  6025,  9059,   409,  7890,  3787,  3261,  2522,
           2604,  9974]),
  'de_word_idx': tensor([13731, 12846, 15173, 13388,  4209,  1000, 14866,  2285, 14963,  2725,
          14474])},
 {'idx': 32674,
  'en_text': 'if greece still had its own currency everything would be easier .',
  'de_text': 'wenn griechenland noch seine eigene wahrung hatte ware alles einfacher .',
  'en_word_idx': [4295,
   10301,
   1737,
   6025,
   9059,
   409,
   7890,
   3787,
   3261,
   2522,
   2604,
   9974],
  'de_word_idx': [13731,
   12846,
   15173,
   13388,
   4209,
   1000,
   14866,
   2285,
   14963,
   2725,
   14474]})

In [110]:
for lang in ['en', 'de']:
	for split_data in [train_data.data, val_data.data, test_data.data]:
		for sample in split_data:

			# Add _SOS_ and _EOS_ tokens!
			sample['{}_word_idx'.format(lang)] = [word2idx[lang]['_SOS_']] + sample['{}_word_idx'.format(lang)] + [word2idx[lang]['_EOS_']]

			# Pad
			while len(sample['{}_word_idx'.format(lang)]) < (MAX_WORDS + 3): # + 2 because of SOS and EOS
				sample['{}_word_idx'.format(lang)].append(word2idx[lang]['_PAD_'])

			# Sanity check
			assert len(sample['{}_word_idx'.format(lang)]) == (MAX_WORDS + 3)

In [111]:
train_data[5]

{'idx': 5,
 'en_word_idx': tensor([    2,  9198,  1916,  5843, 10068,  4549,  5834,   255,  9974,     3,
             0,     0,     0,     0,     0]),
 'de_word_idx': tensor([    2, 14449, 16207,  4382,  8279,  5848,  5025,  1523, 14968, 14474,
             3,     0,     0,     0,     0])}

# Modeling

In [112]:
class TranslationModel(nn.Module):
    def __init__(self, dropout, hidden_dim, words_num_src, words_num_des, word_dim):
        super(TranslationModel, self).__init__()

        # ENCODER
        self.src_word_embedding = nn.Embedding(num_embeddings=words_num_src, embedding_dim=word_dim)
        self.encoder = nn.GRU(input_size=word_dim,
                               hidden_size=hidden_dim,
                               num_layers=2, # Let's add an extra-layer
                               batch_first=True,
                               dropout=0, # No dropout; it is complicated for RNNs. Do you have an intuition why?
                               bidirectional=True) # Let's assume bi-directional

        # DECODER
        self.des_word_embedding = nn.Embedding(num_embeddings=words_num_des, embedding_dim=word_dim)
        self.decoder = nn.GRU(input_size=word_dim,
                               hidden_size=hidden_dim,
                               num_layers=1, # Here we keep it simple
                               batch_first=True,
                               dropout=0, # No dropout; it is complicated for RNNs. Do you have an intuition why?
                               bidirectional=False) # ATTENTION: we cannot have bidirectionality here!
        # The last layer to compute the probabilities for the output classes
        self.final_layer = nn.Linear(in_features=hidden_dim, out_features=words_num_des)

        # BONUS: How would you implement the attention component?

        self.dropout = nn.Dropout(dropout)
        self.activation = nn.Tanh()


    def forward(self, x_src, len_src, x_des, len_des, last_hidden=None):
        if last_hidden is None: # It means we have already encoded the input sentence and we are generated the output. This is a trick to not recompute at each decoding step the latent representation of the input sentence.
          ################
          # ENCODING PART#
          ################

          x_src = self.src_word_embedding(x_src)
          x_src = self.dropout(x_src)

          # PyTorch (and other frameworks) are using CuDNN for efficient neural networks.
          # For some reason, for LSTM, GRU, and RNN, the input sequences MUST be sorted by decreasing length
          # Therefore, we will do it manually before feeding the network AND also after in order to have the hidden vectors in the right order

          # 1. We sort the batch (and keep the sorting index) according to their sequencelength in decreasing order
          sorted_lengths, sorted_order = len_src.sort(0, descending=True)
          sorted_input = x_src[sorted_order] # Here we rearrange the batch
          _, invert_order = sorted_order.sort(0, descending=False) # We store the indeces of the sorting

          # 2. Because all sequences DO NOT have the same length, we might not want to waste GPU resources on useless computation.
          # For example:
          # a) I     am       Diego PAD  PAD
          # b) Ciao  ragazzi  PAD   PAD  PAD
          # c) We    are      doing deep learning
          #
          # Only c) needs to process the full sequence. In a) and b), only 3 and 2 steps are needed.
          # pack_padded_sequence will exactly help us and not compute 15 "operations" but only 10 in total. Here we save already ~33% of the GPU!
          prepared_input = nn.utils.rnn.pack_padded_sequence(sorted_input, sorted_lengths.cpu(), batch_first=True, enforce_sorted=True)

          # If you check the documentation https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html#torch.nn.LSTM
          # hidden and cell (useless) will be the final hidden/cell state (NbDirection xBxH)
          # whereas output contains the values of the hidden states for each time step (NbDirection xBxLxH)
          _, hidden = self.encoder(prepared_input)# Bidirectional*nblayers x B x H

          # We aggregate into one vector
          hidden = torch.sum(hidden, dim=0)

          # 3. We sort back the hidden vectors
          hidden = hidden.index_select(0, invert_order)
          hidden = self.dropout(self.activation(hidden))
          hidden = hidden.unsqueeze(0)

          # Hidden is now our latent representation of our input sentence!
        else:
          hidden = last_hidden

        #################
        # DECODING PART #
        #################

        # We will be using Teacher-Focring, meaning that we will provide to the model the previous REAL words as input.
        # Hence, we need to encode the output sequence as well.
        # At inference (as we will see later), we will generate the output text in an auto-regressive manner, where is word has been predicted.
        x_des = self.des_word_embedding(x_des)
        x_des = self.dropout(x_des)

        sorted_lengths, sorted_order = len_des.sort(0, descending=True)
        sorted_input = x_des[sorted_order] # Here we rearrange the batch
        _, invert_order = sorted_order.sort(0, descending=False) # We store the indeces of the sorting
        hidden = hidden[:, sorted_order] # Pay attention: we have also to sort it!

        prepared_input = nn.utils.rnn.pack_padded_sequence(sorted_input, sorted_lengths.cpu(), batch_first=True, enforce_sorted=True)
        # We will parallelize and predict ALL next tokens in once step.
        # The most important here is to provide the context to initialize the hidden states
        output, new_last_hidden = self.decoder(prepared_input, hidden)

        # We unpack the output and reorder all hidden states.
        output = nn.utils.rnn.pad_packed_sequence(output, batch_first=True, total_length=torch.max(len_des) if last_hidden is None else 1)[0] #BxLxH
        output = output.index_select(0, invert_order)

        new_last_hidden = new_last_hidden.index_select(1, invert_order)
        output = self.dropout(self.activation(output))

        # Compute the log probabilities. We do not compute the probabilities with Softmax because depending the loss function, we might require unnormalized or probabilities.
        # Worst case, we can always apply F.softmax(output, dim=-1) later.
        word_logits = self.final_layer(output)
        return word_logits, new_last_hidden

In [113]:
x_src = torch.tensor([[1,2,3,4,5,0], [10,20,30,40,0,0], [5,6,0,0,0,0]])
len_src = torch.sum(x_src != 0, dim=1)
x_des = x_src[:, 1:]
len_des = len_src - 1

In [114]:
x_src, len_src

(tensor([[ 1,  2,  3,  4,  5,  0],
         [10, 20, 30, 40,  0,  0],
         [ 5,  6,  0,  0,  0,  0]]),
 tensor([5, 4, 2]))

In [115]:
x_des, len_des

(tensor([[ 2,  3,  4,  5,  0],
         [20, 30, 40,  0,  0],
         [ 6,  0,  0,  0,  0]]),
 tensor([4, 3, 1]))

In [116]:
translator = TranslationModel(dropout=0.1, hidden_dim=24, words_num_src=50, words_num_des=50, word_dim=12)
logits, last_hidden = translator(x_src, len_src, x_des, len_des)
logits, logits.size(), last_hidden.size()

(tensor([[[ 7.0946e-02,  1.8270e-01,  1.1369e-01,  1.4958e-01,  2.8110e-02,
            2.2634e-01,  1.2713e-02,  3.0825e-01, -4.8444e-01, -1.1362e-01,
            2.5104e-01,  7.4229e-02,  1.6561e-01,  2.4883e-03, -6.1863e-02,
            2.0518e-02, -1.0992e-01,  2.3432e-01,  9.7835e-02, -3.7266e-01,
            4.8402e-02, -1.6725e-01,  1.2586e-02, -2.1086e-01,  1.7871e-01,
            2.5846e-01,  1.1262e-01,  3.4478e-01,  1.1535e-01, -2.5898e-01,
            3.5374e-01, -3.2672e-01, -6.3701e-02, -4.1694e-01, -4.3081e-02,
            2.5746e-01,  6.6177e-02, -1.1578e-01,  1.1648e-01,  9.3252e-02,
           -8.0353e-02, -2.0053e-02, -1.4071e-01,  7.1730e-02,  3.2159e-01,
            3.0349e-02, -9.6051e-03,  2.5669e-03, -1.5729e-01,  8.7764e-02],
          [ 1.2079e-01,  1.8161e-01,  1.1327e-02,  7.2552e-02, -1.2333e-01,
            7.3084e-02, -6.4113e-02,  1.8542e-01, -5.3381e-01, -6.5512e-02,
            2.9753e-01, -2.2139e-01,  8.1073e-02,  1.9807e-02, -1.1500e-01,
           

# Putting all together

We now have the model, the data, and the loss function. We are ready to implement the training loop.

In [117]:
BATCH_SIZE = 128
SRC = 'en'
DES = 'de'

# We initialize our model
model = TranslationModel(dropout=0.1,
                         hidden_dim=128,
                         words_num_src=len(word2idx[SRC]),
                         words_num_des=len(word2idx[DES]),
                         word_dim=WORD_DIM)

# Copy the word embedding matrix
model.src_word_embedding.weight.data = torch.from_numpy(word_embeddings[SRC]).float()
model.src_word_embedding.weight.requires_grad = True

# Copy the word embedding matrix
model.des_word_embedding.weight.data = torch.from_numpy(word_embeddings[DES]).float()
model.des_word_embedding.weight.requires_grad = True

In [118]:
# We initialize our optimizer to update the weights of the model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=0)

In [119]:
# We can load our dataset using a dataloader
train_loader = DataLoader(
        train_data,
        batch_size=BATCH_SIZE,
        shuffle=True, # Pay attention that we can shuffle the samples for training
        num_workers=0, # And specify how many working we want. 0/1 = 1
        drop_last=False) # Finally, it is possible to drop the last batch if its size is smaller than args.batch_size. In some applications, it is easier to ignore it instead of handling it.

val_loader = DataLoader(
        val_data,
        batch_size=BATCH_SIZE,
        shuffle=False, # Pay attention here that the data is not shuffled.
        num_workers=0,
        drop_last=False)

test_loader = DataLoader(
        test_data,
        batch_size=BATCH_SIZE,
        shuffle=False, # Pay attention here that the data is not shuffled.
        num_workers=0,
        drop_last=False)

In [122]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [123]:
torch.cuda.is_available()

False

In [124]:
torch.cuda.device_count()

0

NameError: name 'nvidia' is not defined

In [126]:
# Move the model to the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Initialize the loss function
criterion = nn.CrossEntropyLoss(reduction='mean', ignore_index=0)

best_epoch = 0
best_val_so_far = float('inf')
test_perf = 0


# TRAINING LOOP
for epoch in range(50):
	print('------------------------------------------------------------------------')
	print('Epoch: {}'.format(epoch))

	# TRAIN
	# We set the model in train mode. It will store information to compute the gradients
	# Also, the implementation of dropout, batchnorm, etc is different at training and inference time.
	model.train()

	train_losses = []
	# Pay attention how the data loading become easiers!
	for idx, batch in tqdm.tqdm(enumerate(train_loader), desc='Training'):
		# Prepare data
		x_src = batch['{}_word_idx'.format(SRC)]
		len_src = torch.sum(x_src != 0, dim=1)
		x_des = batch['{}_word_idx'.format(DES)]
		len_des = torch.sum(x_des != 0, dim=1)

		# Move to GPU
		x_src = x_src.to(device)
		len_src = len_src.to(device)
		x_des = x_des.to(device)
		len_des = len_des.to(device)

		# Compute the model output and the loss
		y_logits, last_hidden = model(x_src, len_src, x_des, len_des)
		y_preds = y_logits
		y_gold = x_des[:, 1:y_preds.size(1)+1].contiguous()

		# We have to "flatten" the predictions because CE only handle tensors like BxC and B
		# print(y_preds.size(0)*y_preds.size(1),-1)
		# print(len(y_gold.view(-1)))
		loss = criterion(y_preds.view(y_preds.size(0)*y_preds.size(1), -1), y_gold.view(-1))
		ppl = math.exp(loss)

		# Update model parameters
		optimizer.zero_grad() # This is very important! By default, gradients are cumulated in tensors.
		loss.backward() # Now that gradients have been empties, we compute the new ones using the loss.
		clip_grad_norm_(model.parameters(), 3.0) # PAY ATTENTION: Protection against gradient exploding
		optimizer.step() # We do gradient update with our optimization function (i.e., the weights of the model are updated).

		train_losses.append(loss.item())

	# VAL + TEST
	val_test_losses = {'val': [], 'test': []}
	val_test_ppl = {'val': [], 'test': []}

	# Unlike before, we set the model in eval mode to compute correctly dropout, batchnorm etc
	model.eval()

	# We do not store information relative to gradients as we do not update the model.
	# That's the reason why inference requires less memory and is faster.
	with torch.no_grad():
		for split_data, data in [('val', val_loader), ('test', test_loader)]:
			# Pay attention how the data loading become easiers!
			for idx, batch in tqdm.tqdm(enumerate(data), desc=split_data.capitalize()):
				# Prepare data
				x_src = batch['{}_word_idx'.format(SRC)]
				len_src = torch.sum(x_src != 0, dim=1)
				x_des = batch['{}_word_idx'.format(DES)]
				len_des = torch.sum(x_des != 0, dim=1)

				# Move to GPU
				x_src = x_src.to(device)
				len_src = len_src.to(device)
				x_des = x_des.to(device)
				len_des = len_des.to(device)

				# Compute the model output and the loss
				y_logits, last_hidden = model(x_src, len_src, x_des, len_des) #BxLxC (17)
				y_preds = y_logits
				y_gold = x_des[:, 1:y_preds.size(1)+1].contiguous()

				# We have to "flatten" the predictions because CE only handle tensors like BxC and B
				loss = criterion(y_preds.view(y_preds.size(0)*y_preds.size(1), -1), y_gold.view(-1))
				ppl = math.exp(loss)

				val_test_losses[split_data].append(loss.item())
				val_test_ppl[split_data].append(ppl)

	# Monitoring
	print('Train loss: {:.4f}'.format(np.mean(train_losses)))
	print('Val   loss: {:.4f}'.format(np.mean(val_test_losses['val'])))
	print('Test  loss: {:.4f}'.format(np.mean(val_test_losses['test'])))
	print()

	val_ppl = np.mean(val_test_ppl['val'])
	test_ppl = np.mean(val_test_ppl['test'])
	print('Val   PPL: {:.4f}'.format(val_ppl))
	print('Test  PPL: {:.4f}'.format(test_ppl))
	print()

	if val_ppl < best_val_so_far:
		best_val_so_far = val_ppl
		test_perf = test_ppl
		best_epoch = epoch
		torch.save(model.state_dict(), 'my_model.ckpt')

	print('Best Epoch: {}, best val PPL: {:.4f}, test PPL: {:.4f}'.format(best_epoch, best_val_so_far, test_perf))
	print()
	print()

------------------------------------------------------------------------
Epoch: 0


Training: 118it [01:38,  1.20it/s]
Val: 28it [00:07,  3.69it/s]
Test: 28it [00:07,  3.71it/s]


Train loss: 5.8176
Val   loss: 6.0439
Test  loss: 6.0292

Val   PPL: 422.8143
Test  PPL: 417.1273

Best Epoch: 0, best val PPL: 422.8143, test PPL: 417.1273


------------------------------------------------------------------------
Epoch: 1


Training: 118it [01:38,  1.20it/s]
Val: 28it [00:07,  3.71it/s]
Test: 28it [00:07,  3.73it/s]


Train loss: 5.6506
Val   loss: 5.9607
Test  loss: 5.9450

Val   PPL: 389.1851
Test  PPL: 383.5495

Best Epoch: 1, best val PPL: 389.1851, test PPL: 383.5495


------------------------------------------------------------------------
Epoch: 2


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.70it/s]
Test: 28it [00:07,  3.73it/s]


Train loss: 5.5017
Val   loss: 5.9049
Test  loss: 5.8883

Val   PPL: 368.2311
Test  PPL: 362.5582

Best Epoch: 2, best val PPL: 368.2311, test PPL: 362.5582


------------------------------------------------------------------------
Epoch: 3


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.71it/s]
Test: 28it [00:07,  3.73it/s]


Train loss: 5.3616
Val   loss: 5.8302
Test  loss: 5.8136

Val   PPL: 341.8554
Test  PPL: 336.4780

Best Epoch: 3, best val PPL: 341.8554, test PPL: 336.4780


------------------------------------------------------------------------
Epoch: 4


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.73it/s]
Test: 28it [00:07,  3.74it/s]


Train loss: 5.2246
Val   loss: 5.7745
Test  loss: 5.7582

Val   PPL: 323.4972
Test  PPL: 318.5616

Best Epoch: 4, best val PPL: 323.4972, test PPL: 318.5616


------------------------------------------------------------------------
Epoch: 5


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.73it/s]
Test: 28it [00:07,  3.74it/s]


Train loss: 5.0899
Val   loss: 5.7402
Test  loss: 5.7243

Val   PPL: 312.7614
Test  PPL: 308.0227

Best Epoch: 5, best val PPL: 312.7614, test PPL: 308.0227


------------------------------------------------------------------------
Epoch: 6


Training: 118it [01:38,  1.20it/s]
Val: 28it [00:07,  3.71it/s]
Test: 28it [00:07,  3.73it/s]


Train loss: 4.9665
Val   loss: 5.7018
Test  loss: 5.6878

Val   PPL: 301.1329
Test  PPL: 297.0633

Best Epoch: 6, best val PPL: 301.1329, test PPL: 297.0633


------------------------------------------------------------------------
Epoch: 7


Training: 118it [01:38,  1.20it/s]
Val: 28it [00:07,  3.71it/s]
Test: 28it [00:07,  3.73it/s]


Train loss: 4.8432
Val   loss: 5.6806
Test  loss: 5.6698

Val   PPL: 294.9081
Test  PPL: 291.8569

Best Epoch: 7, best val PPL: 294.9081, test PPL: 291.8569


------------------------------------------------------------------------
Epoch: 8


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.72it/s]
Test: 28it [00:07,  3.74it/s]


Train loss: 4.7265
Val   loss: 5.6358
Test  loss: 5.6235

Val   PPL: 282.1391
Test  PPL: 278.6223

Best Epoch: 8, best val PPL: 282.1391, test PPL: 278.6223


------------------------------------------------------------------------
Epoch: 9


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.71it/s]
Test: 28it [00:07,  3.74it/s]


Train loss: 4.6069
Val   loss: 5.6272
Test  loss: 5.6154

Val   PPL: 279.8933
Test  PPL: 276.5927

Best Epoch: 9, best val PPL: 279.8933, test PPL: 276.5927


------------------------------------------------------------------------
Epoch: 10


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.72it/s]
Test: 28it [00:07,  3.74it/s]


Train loss: 4.4965
Val   loss: 5.5940
Test  loss: 5.5807

Val   PPL: 270.7343
Test  PPL: 267.1560

Best Epoch: 10, best val PPL: 270.7343, test PPL: 267.1560


------------------------------------------------------------------------
Epoch: 11


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.73it/s]
Test: 28it [00:07,  3.74it/s]


Train loss: 4.3867
Val   loss: 5.5892
Test  loss: 5.5770

Val   PPL: 269.6520
Test  PPL: 266.2048

Best Epoch: 11, best val PPL: 269.6520, test PPL: 266.2048


------------------------------------------------------------------------
Epoch: 12


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.73it/s]
Test: 28it [00:07,  3.74it/s]


Train loss: 4.2837
Val   loss: 5.5733
Test  loss: 5.5542

Val   PPL: 265.3843
Test  PPL: 260.2485

Best Epoch: 12, best val PPL: 265.3843, test PPL: 260.2485


------------------------------------------------------------------------
Epoch: 13


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.73it/s]
Test: 28it [00:07,  3.74it/s]


Train loss: 4.1754
Val   loss: 5.5722
Test  loss: 5.5567

Val   PPL: 265.2102
Test  PPL: 261.0083

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 14


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.73it/s]
Test: 28it [00:07,  3.73it/s]


Train loss: 4.0764
Val   loss: 5.5744
Test  loss: 5.5562

Val   PPL: 265.8882
Test  PPL: 260.9218

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 15


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.72it/s]
Test: 28it [00:07,  3.74it/s]


Train loss: 3.9795
Val   loss: 5.5786
Test  loss: 5.5614

Val   PPL: 267.0338
Test  PPL: 262.3245

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 16


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.70it/s]
Test: 28it [00:07,  3.73it/s]


Train loss: 3.8871
Val   loss: 5.5858
Test  loss: 5.5673

Val   PPL: 269.1302
Test  PPL: 263.9802

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 17


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.72it/s]
Test: 28it [00:07,  3.73it/s]


Train loss: 3.7936
Val   loss: 5.5942
Test  loss: 5.5760

Val   PPL: 271.4350
Test  PPL: 266.4139

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 18


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.71it/s]
Test: 28it [00:07,  3.73it/s]


Train loss: 3.7033
Val   loss: 5.5913
Test  loss: 5.5741

Val   PPL: 270.7180
Test  PPL: 265.9378

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 19


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.72it/s]
Test: 28it [00:07,  3.74it/s]


Train loss: 3.6122
Val   loss: 5.6163
Test  loss: 5.5965

Val   PPL: 277.6187
Test  PPL: 272.0474

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 20


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.73it/s]
Test: 28it [00:07,  3.74it/s]


Train loss: 3.5294
Val   loss: 5.6070
Test  loss: 5.5891

Val   PPL: 274.9734
Test  PPL: 270.0844

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 21


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.73it/s]
Test: 28it [00:07,  3.75it/s]


Train loss: 3.4446
Val   loss: 5.6208
Test  loss: 5.6036

Val   PPL: 279.0636
Test  PPL: 274.1417

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 22


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.72it/s]
Test: 28it [00:07,  3.74it/s]


Train loss: 3.3663
Val   loss: 5.6491
Test  loss: 5.6311

Val   PPL: 287.0578
Test  PPL: 281.8088

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 23


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.72it/s]
Test: 28it [00:07,  3.74it/s]


Train loss: 3.2877
Val   loss: 5.6639
Test  loss: 5.6476

Val   PPL: 291.3922
Test  PPL: 286.4569

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 24


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.73it/s]
Test: 28it [00:07,  3.74it/s]


Train loss: 3.2089
Val   loss: 5.6627
Test  loss: 5.6448

Val   PPL: 291.0402
Test  PPL: 285.7776

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 25


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.73it/s]
Test: 28it [00:07,  3.74it/s]


Train loss: 3.1365
Val   loss: 5.6916
Test  loss: 5.6733

Val   PPL: 299.8412
Test  PPL: 294.1517

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 26


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.73it/s]
Test: 28it [00:07,  3.74it/s]


Train loss: 3.0627
Val   loss: 5.7042
Test  loss: 5.6797

Val   PPL: 303.6842
Test  PPL: 296.0785

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 27


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.73it/s]
Test: 28it [00:07,  3.74it/s]


Train loss: 2.9934
Val   loss: 5.7173
Test  loss: 5.6952

Val   PPL: 307.6333
Test  PPL: 300.7126

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 28


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.73it/s]
Test: 28it [00:07,  3.74it/s]


Train loss: 2.9260
Val   loss: 5.7278
Test  loss: 5.7090

Val   PPL: 310.9538
Test  PPL: 304.9565

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 29


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.74it/s]
Test: 28it [00:07,  3.74it/s]


Train loss: 2.8592
Val   loss: 5.7474
Test  loss: 5.7234

Val   PPL: 317.2017
Test  PPL: 309.3351

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 30


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.74it/s]
Test: 28it [00:07,  3.74it/s]


Train loss: 2.7963
Val   loss: 5.7772
Test  loss: 5.7516

Val   PPL: 326.9684
Test  PPL: 318.2896

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 31


Training: 118it [01:37,  1.22it/s]
Val: 28it [00:07,  3.74it/s]
Test: 28it [00:07,  3.75it/s]


Train loss: 2.7343
Val   loss: 5.7942
Test  loss: 5.7702

Val   PPL: 332.6188
Test  PPL: 324.3201

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 32


Training: 118it [01:36,  1.22it/s]
Val: 28it [00:07,  3.74it/s]
Test: 28it [00:07,  3.75it/s]


Train loss: 2.6776
Val   loss: 5.8125
Test  loss: 5.7858

Val   PPL: 338.8097
Test  PPL: 329.6742

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 33


Training: 118it [01:36,  1.22it/s]
Val: 28it [00:07,  3.74it/s]
Test: 28it [00:07,  3.74it/s]


Train loss: 2.6176
Val   loss: 5.8401
Test  loss: 5.8138

Val   PPL: 348.6279
Test  PPL: 339.0641

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 34


Training: 118it [01:36,  1.22it/s]
Val: 28it [00:07,  3.74it/s]
Test: 28it [00:07,  3.75it/s]


Train loss: 2.5654
Val   loss: 5.8439
Test  loss: 5.8190

Val   PPL: 349.6409
Test  PPL: 340.8665

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 35


Training: 118it [01:36,  1.22it/s]
Val: 28it [00:07,  3.74it/s]
Test: 28it [00:07,  3.75it/s]


Train loss: 2.5086
Val   loss: 5.8673
Test  loss: 5.8396

Val   PPL: 358.0781
Test  PPL: 347.9559

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 36


Training: 118it [01:36,  1.22it/s]
Val: 28it [00:07,  3.74it/s]
Test: 28it [00:07,  3.75it/s]


Train loss: 2.4568
Val   loss: 5.8817
Test  loss: 5.8528

Val   PPL: 363.3548
Test  PPL: 352.4819

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 37


Training: 118it [01:36,  1.22it/s]
Val: 28it [00:07,  3.74it/s]
Test: 28it [00:07,  3.75it/s]


Train loss: 2.4077
Val   loss: 5.9043
Test  loss: 5.8741

Val   PPL: 371.5799
Test  PPL: 360.1451

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 38


Training: 118it [01:36,  1.22it/s]
Val: 28it [00:07,  3.73it/s]
Test: 28it [00:07,  3.75it/s]


Train loss: 2.3619
Val   loss: 5.9408
Test  loss: 5.9103

Val   PPL: 385.6082
Test  PPL: 373.7955

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 39


Training: 118it [02:41,  1.36s/it]
Val: 28it [00:07,  3.70it/s]
Test: 28it [00:07,  3.74it/s]


Train loss: 2.3151
Val   loss: 5.9411
Test  loss: 5.9178

Val   PPL: 385.9788
Test  PPL: 376.4702

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 40


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.72it/s]
Test: 28it [00:07,  3.73it/s]


Train loss: 2.2695
Val   loss: 5.9673
Test  loss: 5.9388

Val   PPL: 396.3046
Test  PPL: 384.5687

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 41


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.73it/s]
Test: 28it [00:07,  3.74it/s]


Train loss: 2.2248
Val   loss: 5.9917
Test  loss: 5.9601

Val   PPL: 406.1898
Test  PPL: 392.9380

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 42


Training: 118it [01:37,  1.21it/s]
Val: 28it [00:07,  3.71it/s]
Test: 28it [00:07,  3.73it/s]


Train loss: 2.1836
Val   loss: 6.0279
Test  loss: 5.9985

Val   PPL: 421.8728
Test  PPL: 408.2022

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 43


Training: 118it [01:36,  1.22it/s]
Val: 28it [00:07,  3.75it/s]
Test: 28it [00:07,  3.75it/s]


Train loss: 2.1410
Val   loss: 6.0268
Test  loss: 5.9991

Val   PPL: 421.2659
Test  PPL: 408.5827

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 44


Training: 118it [01:36,  1.22it/s]
Val: 28it [00:07,  3.73it/s]
Test: 28it [00:07,  3.75it/s]


Train loss: 2.1000
Val   loss: 6.0573
Test  loss: 6.0231

Val   PPL: 434.7237
Test  PPL: 418.7194

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 45


Training: 118it [01:36,  1.22it/s]
Val: 28it [00:07,  3.73it/s]
Test: 28it [00:07,  3.75it/s]


Train loss: 2.0629
Val   loss: 6.0595
Test  loss: 6.0311

Val   PPL: 435.0058
Test  PPL: 422.1083

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 46


Training: 118it [01:36,  1.22it/s]
Val: 28it [00:07,  3.73it/s]
Test: 28it [00:07,  3.75it/s]


Train loss: 2.0211
Val   loss: 6.0884
Test  loss: 6.0606

Val   PPL: 448.0156
Test  PPL: 434.8251

Best Epoch: 13, best val PPL: 265.2102, test PPL: 261.0083


------------------------------------------------------------------------
Epoch: 47


Training: 39it [00:33,  1.17it/s]


KeyboardInterrupt: 

In [None]:
vocabulary = set()

for sample in data_train['translation']: # Pay attention we only use the training set!
	for token in sample['en']:
		vocabulary.add(token)
len(vocabulary)

In [None]:
vocabulary = set()

for sample in data_train['translation'][:1]:
	print(sample)
	for token in sample['de']:
		print(token)
		vocabulary.add(token)
len(vocabulary)

In [None]:
data_train['translation'][0]['en']

In [None]:
len(data_train['translation'])

In [None]:
type(data_train['translation'])

In [None]:
data_train['translation']

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch

class TranslationDataset(Dataset):
    def __init__(self, data):
        self.data = data

    # We return the length of the dataset
    def __len__(self):
        return len(self.data)

    # We return the idx'th sample
    def __getitem__(self, idx):
        return {
            'idx': idx,
            'en_word_idx': torch.tensor(self.data[idx]['en_word_idx']).long(),
            'fr_word_idx': torch.tensor(self.data[idx]['fr_word_idx']).long(),
        }

In [None]:
train_data = TranslationDataset(data_train['translation'])

In [None]:
data_train