!conda install -c conda-forge spacy  
!python -m spacy download en_core_web_sm  
!python -m spacy download de_core_news_sm
!pip install wasabi==0.9.1  

In [128]:
import urllib
import random
import os
import collections
import math
import numpy as np
from tqdm import tqdm
import torch
from torch import nn
from torch.nn import functional as F
import torchtext
from torch.utils.data import Dataset, DataLoader, random_split



In [129]:
os.makedirs('data', exist_ok=True)
dataset_train, dataset_test = torchtext.datasets.Multi30k(root='./data', split=('train', 'test'), language_pair=('de', 'en'))
tokenizer_en = torchtext.data.utils.get_tokenizer('spacy', language="en_core_web_sm")
tokenizer_de = torchtext.data.utils.get_tokenizer('spacy', language="de_core_news_sm")
dataset_train = list(dataset_train)
dataset_test = list(dataset_test)

In [130]:
tokenized_en_data = []
tokenized_de_data = []
counter_en = collections.Counter()
counter_de = collections.Counter()

print('Start Tokenizing...')
for de, en in tqdm(dataset_train):
    tokens_de = tokenizer_en(de)
    tokens_en = tokenizer_de(en)
    tokenized_de_data.append(tokens_de)
    tokenized_en_data.append(tokens_en)
    
print('Making En Vocab')
for line in tqdm(tokenized_en_data):
    counter_en.update(line)

print('Making De Vocab')
for line in tqdm(tokenized_de_data):
    counter_de.update(line)

vocab_en = torchtext.vocab.vocab(counter_en, min_freq=1, specials=["<unk>", "<pad>", "<sos>", "<eos>"])
vocab_de = torchtext.vocab.vocab(counter_de, min_freq=1, specials=["<unk>", "<pad>", "<sos>", "<eos>"])




Start Tokenizing...


100%|██████████| 29001/29001 [00:05<00:00, 5307.82it/s]


Making En Vocab


100%|██████████| 29001/29001 [00:00<00:00, 201757.20it/s]


Making De Vocab


100%|██████████| 29001/29001 [00:00<00:00, 188096.52it/s]


In [131]:
class Multi30kDataset(Dataset):
    def __init__(self, tokenized_src, tokenized_tgt, vocab_src, vocab_tgt, max_seq = 256):
        self.src = []
        self.tgt = []

        for tokens in tqdm(tokenized_src, "Src data"):
            token_ids = [vocab_src['<sos>']]
            token_ids += [vocab_src[token] for token in tokens]
            token_ids += [vocab_src['<eos>']]
            self.src.append(token_ids)

        for tokens in tqdm(tokenized_tgt, "Tgt data"):
            token_ids = [vocab_tgt['<sos>']]
            token_ids += [vocab_tgt[token] for token in tokens]
            token_ids += [vocab_tgt['<eos>']]
            self.tgt.append(token_ids)

    def __len__(self):
        return len(self.src)

    def __getitem__(self, idx):
        return torch.tensor(self.src[idx]), torch.tensor(self.tgt[idx])
        


In [132]:
multi30k_dataset = Multi30kDataset(tokenized_de_data, tokenized_en_data, vocab_de, vocab_en)

Src data: 100%|██████████| 29001/29001 [00:00<00:00, 62651.37it/s]
Tgt data: 100%|██████████| 29001/29001 [00:00<00:00, 62425.24it/s]


In [133]:
train_ratio = 0.8
train_size = int(train_ratio*len(multi30k_dataset))
valid_size = len(multi30k_dataset) - train_size
train_dataset, valid_dataset = random_split(multi30k_dataset, [train_size, valid_size])

In [134]:
#Query: decoder_hidden
#Key: encoder_outputs
#Value: encoder_outputs
class Attention(nn.Module):
    def __init__(self):
        super().__init__()
    
    def forward(self, decoder_hiddens, encoder_outputs):
        #decoder_hiddens: [batch_size, 1, decoder_hidden]
        #encoder_outputs: [batch_size, sentence_length, decoder_hidden]
        Q = decoder_hiddens
        K = encoder_outputs

        energy = torch.sum(torch.mul(Q,K), dim=-1)
        #energy: [batch_size, sentence_length]
        attn_scores = F.softmax(energy, dim=-1)
        #attn_scores: [batch_size, sentence_length]
        attn_values = torch.sum(torch.mul(K, attn_scores.unsqueeze(2)), dim=1)
        #attn_values: [batch_size, decoder_hidden]
        return  attn_values

In [135]:
#Ref: https://github.com/bentrevett/pytorch-seq2seq/blob/master/1%20-%20Sequence%20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb
class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True)
    def forward(self, input):

        input = self.embedding(input)
        #input: [batch_size, sentence_len, emb_size]

        outputs, (hidden, cell) = self.lstm(input)

        return outputs, (hidden, cell)

In [136]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.attention = Attention()
        self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True)
        self.out = nn.Linear(2*hidden_size, vocab_size)

    def forward(self, input, hidden, cell, encoder_outputs):
        #input: [batch_size]
        input = input.unsqueeze(1)
        #input: [batch_size, 1]
        input = self.embedding(input)
        #input: [batch_size, 1, emb_size]
        output ,(hidden,cell) = self.lstm(input, (hidden, cell))
        #output: [batch_size, 1, hidden_size]
        #hidden: [1, batch_size, hidden_size]
        attn_values = self.attention(hidden.transpose(0,1), encoder_outputs)
        #attn_values: [batch_size, decoder_hidden]
        attn_values = attn_values.unsqueeze(1)
        #attn_values: [batch_size, 1, decoder_hidden]

        concat_output = torch.cat((output, attn_values), dim=-1)
        #concat_output: [batch_size, 1, 2 * decoder_hidden]
        
        prediction = self.out(concat_output.squeeze(1))
        #prediction: [batch_size, output_size]
        return prediction, (hidden, cell)


In [137]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, tgt, teacher_forcing_ratio= 0.5):
        batch_size, tgt_len = tgt.size()
        tgt_vocab_size = self.decoder.vocab_size

        outputs = torch.zeros(tgt_len, batch_size, tgt_vocab_size).to(self.device)

        encoder_outputs, (hidden, cell) = self.encoder(src)
        input = tgt[:,0]
        for t in range(1, tgt_len):
            output, (hidden, cell) = self.decoder(input, hidden, cell,encoder_outputs)
            outputs[t] = output
            top1 = output.argmax(1)

            is_teacher_forcing = random.random() < teacher_forcing_ratio

            input = tgt[:,t] if is_teacher_forcing else top1
            #outputs: [tgt_len, batch_size, output_size]
        
        outputs = outputs.transpose(0,1)
        #outputs: [batch_size, tgt_len, output_size]
        return outputs
        






In [138]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [139]:
batch_size=256
learning_rate = 0.001
num_epochs = 10

In [140]:
def pad_collate_fn(batch):
    collate_x = []
    collate_y = []
    for src, tgt in batch:
        collate_x.append(src)
        collate_y.append(tgt)
    collate_x = nn.utils.rnn.pad_sequence(collate_x, padding_value=vocab_de['<pad>'], batch_first=True)
    collate_y = nn.utils.rnn.pad_sequence(collate_y, padding_value=vocab_en['<pad>'], batch_first=True)
    return (collate_x, collate_y)

In [141]:
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn= pad_collate_fn)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn= pad_collate_fn)

In [142]:
encoder = Encoder(vocab_size = len(vocab_de), embedding_size=256, hidden_size=512)
decoder = Decoder(vocab_size = len(vocab_en), embedding_size=256, hidden_size=512)
model = Seq2Seq(encoder, decoder, device).to(device)

In [143]:
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss(ignore_index = vocab_en['<pad>'])

In [144]:
def train(dataloader, epoch):
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    data_num = 0
    for src,tgt in tqdm(dataloader, desc=f"Epoch {epoch}"):
        src, tgt = src.to(device), tgt.to(device)
        outputs = model(src, tgt)
        #outputs: [batch_size, tgt_len, output_size]
        #tgt: [batch_size, tgt_len, output_size]
        output_size = outputs.shape[-1]
        
        output = outputs.reshape(-1,output_size)
        #output: [(tgt_len - 1) * batch_size, output_size]
        tgt = tgt.reshape(-1)
        #tgt: [(tgt_len - 1) * batch_size]

        optimizer.zero_grad()
        loss = criterion(output, tgt)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        data_num += src.size(0)

    print(f"Train Epoch: {epoch}, Loss: {epoch_loss/len(dataloader)}, PPL: {math.exp(epoch_loss/len(dataloader))}")


In [145]:
def evaluate(dataloader, epoch):
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    data_num = 0
    for src,tgt in tqdm(dataloader, desc=f"Epoch {epoch}"):
        src, tgt = src.to(device), tgt.to(device)
        outputs = model(src, tgt, 0)
        #outputs: [batch_size, tgt_len, output_size]
        #tgt: [batch_size, tgt_len, output_size]
        output_size = outputs.shape[-1]
        output = outputs.reshape(-1,output_size)
        #output: [(tgt_len - 1) * batch_size, output_size]
        tgt = tgt.reshape(-1)
        #tgt: [(tgt_len - 1) * batch_size]

        loss = criterion(output, tgt)
        epoch_loss += loss.item()
        data_num += src.size(0)

    print(f"Evaluate Epoch: {epoch}, Loss: {epoch_loss/len(dataloader)}, PPL: {math.exp(epoch_loss/len(dataloader))}")


In [146]:

for epoch in range(num_epochs):
    train(train_dataloader, epoch)
    evaluate(valid_dataloader, epoch)
    print('-'*50)

Epoch 0: 100%|██████████| 91/91 [00:46<00:00,  1.97it/s]


Train Epoch: 0, Loss: 5.538500953506637, PPL: 254.29651130220927


Epoch 0: 100%|██████████| 23/23 [00:05<00:00,  4.29it/s]


Evaluate Epoch: 0, Loss: 5.379855114480724, PPL: 216.99083431761358
--------------------------------------------------


Epoch 1: 100%|██████████| 91/91 [00:47<00:00,  1.92it/s]


Train Epoch: 1, Loss: 4.754146051930857, PPL: 116.06449777987498


Epoch 1: 100%|██████████| 23/23 [00:05<00:00,  4.16it/s]


Evaluate Epoch: 1, Loss: 5.030419370402461, PPL: 152.99716172486038
--------------------------------------------------


Epoch 2: 100%|██████████| 91/91 [00:46<00:00,  1.95it/s]


Train Epoch: 2, Loss: 4.212612325018579, PPL: 67.53272701315136


Epoch 2: 100%|██████████| 23/23 [00:05<00:00,  4.10it/s]


Evaluate Epoch: 2, Loss: 4.6819741829581885, PPL: 107.9830405617939
--------------------------------------------------


Epoch 3: 100%|██████████| 91/91 [00:46<00:00,  1.96it/s]


Train Epoch: 3, Loss: 3.7573107310703824, PPL: 42.833081277179446


Epoch 3: 100%|██████████| 23/23 [00:05<00:00,  4.23it/s]


Evaluate Epoch: 3, Loss: 4.508318880329961, PPL: 90.76909646098467
--------------------------------------------------


Epoch 4: 100%|██████████| 91/91 [00:46<00:00,  1.97it/s]


Train Epoch: 4, Loss: 3.4627742086138045, PPL: 31.905365993718142


Epoch 4: 100%|██████████| 23/23 [00:05<00:00,  4.50it/s]


Evaluate Epoch: 4, Loss: 4.430466817772907, PPL: 83.97060673392924
--------------------------------------------------


Epoch 5: 100%|██████████| 91/91 [00:44<00:00,  2.02it/s]


Train Epoch: 5, Loss: 3.177011618247399, PPL: 23.97499993953258


Epoch 5: 100%|██████████| 23/23 [00:05<00:00,  4.47it/s]


Evaluate Epoch: 5, Loss: 4.3419154208639394, PPL: 76.85460736318322
--------------------------------------------------


Epoch 6: 100%|██████████| 91/91 [00:44<00:00,  2.03it/s]


Train Epoch: 6, Loss: 2.9562604139139363, PPL: 19.22594009707434


Epoch 6: 100%|██████████| 23/23 [00:05<00:00,  4.23it/s]


Evaluate Epoch: 6, Loss: 4.306499584861424, PPL: 74.18037185043445
--------------------------------------------------


Epoch 7: 100%|██████████| 91/91 [00:44<00:00,  2.03it/s]


Train Epoch: 7, Loss: 2.757489115327269, PPL: 15.76022112849648


Epoch 7: 100%|██████████| 23/23 [00:05<00:00,  4.16it/s]


Evaluate Epoch: 7, Loss: 4.279578727224599, PPL: 72.21001348604977
--------------------------------------------------


Epoch 8: 100%|██████████| 91/91 [00:47<00:00,  1.93it/s]


Train Epoch: 8, Loss: 2.612686733623127, PPL: 13.635637005613704


Epoch 8: 100%|██████████| 23/23 [00:05<00:00,  3.94it/s]


Evaluate Epoch: 8, Loss: 4.309366951818052, PPL: 74.39337943702441
--------------------------------------------------


Epoch 9: 100%|██████████| 91/91 [00:45<00:00,  1.99it/s]


Train Epoch: 9, Loss: 2.444383222978194, PPL: 11.523440013537298


Epoch 9: 100%|██████████| 23/23 [00:05<00:00,  4.49it/s]

Evaluate Epoch: 9, Loss: 4.2937089671259345, PPL: 73.23760123297254
--------------------------------------------------



