!conda install -c conda-forge spacy
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm
!pip install wasabi==0.9.1

In [61]:
import urllib
import random
import os
import collections
import math
import numpy as np
from tqdm import tqdm
import torch
from torch import nn
import torchtext
from torch.utils.data import Dataset, DataLoader, random_split



In [62]:
os.makedirs('data', exist_ok=True)
dataset_train, dataset_test = torchtext.datasets.Multi30k(root='./data', split=('train', 'test'), language_pair=('de', 'en'))
tokenizer_en = torchtext.data.utils.get_tokenizer('spacy', language="en_core_web_sm")
tokenizer_de = torchtext.data.utils.get_tokenizer('spacy', language="de_core_news_sm")
dataset_train = list(dataset_train)
dataset_test = list(dataset_test)

In [63]:
tokenized_en_data = []
tokenized_de_data = []
counter_en = collections.Counter()
counter_de = collections.Counter()

print('Start Tokenizing...')
for de, en in tqdm(dataset_train):
    tokens_de = tokenizer_en(de)
    tokens_en = tokenizer_de(en)
    tokenized_de_data.append(tokens_de)
    tokenized_en_data.append(tokens_en)
    
print('Making En Vocab')
for line in tqdm(tokenized_en_data):
    counter_en.update(line)

print('Making De Vocab')
for line in tqdm(tokenized_de_data):
    counter_de.update(line)

vocab_en = torchtext.vocab.vocab(counter_en, min_freq=1, specials=["<unk>", "<pad>", "<sos>", "<eos>"])
vocab_de = torchtext.vocab.vocab(counter_de, min_freq=1, specials=["<unk>", "<pad>", "<sos>", "<eos>"])




Start Tokenizing...


100%|██████████| 29001/29001 [00:04<00:00, 6112.96it/s]


Making En Vocab


100%|██████████| 29001/29001 [00:00<00:00, 255177.99it/s]


Making De Vocab


100%|██████████| 29001/29001 [00:00<00:00, 219270.13it/s]


In [64]:
class Multi30kDataset(Dataset):
    def __init__(self, tokenized_src, tokenized_tgt, vocab_src, vocab_tgt, max_seq = 256):
        self.src = []
        self.tgt = []

        for tokens in tqdm(tokenized_src, "Src data"):
            token_ids = [vocab_src['<sos>']]
            token_ids += [vocab_src[token] for token in tokens]
            token_ids += [vocab_src['<eos>']]
            self.src.append(token_ids)

        for tokens in tqdm(tokenized_tgt, "Tgt data"):
            token_ids = [vocab_tgt['<sos>']]
            token_ids += [vocab_tgt[token] for token in tokens]
            token_ids += [vocab_tgt['<eos>']]
            self.tgt.append(token_ids)

    def __len__(self):
        return len(self.src)

    def __getitem__(self, idx):
        return torch.tensor(self.src[idx]), torch.tensor(self.tgt[idx])
        


In [65]:
multi30k_dataset = Multi30kDataset(tokenized_de_data, tokenized_en_data, vocab_de, vocab_en)

Src data: 100%|██████████| 29001/29001 [00:00<00:00, 69710.33it/s]
Tgt data: 100%|██████████| 29001/29001 [00:00<00:00, 83243.63it/s]


In [66]:
train_ratio = 0.8
train_size = int(train_ratio*len(multi30k_dataset))
valid_size = len(multi30k_dataset) - train_size
train_dataset, valid_dataset = random_split(multi30k_dataset, [train_size, valid_size])

In [67]:
#Ref: https://github.com/bentrevett/pytorch-seq2seq/blob/master/1%20-%20Sequence%20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb
class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True)
    def forward(self, input):

        input = self.embedding(input)
        #input: [batch_size, sentence_len, emb_size]

        output, (hidden, cell) = self.lstm(input)

        return hidden, cell

In [68]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, vocab_size)

    def forward(self, input, hidden, cell):
        #input: [batch_size]
        input = input.unsqueeze(1)
        #input: [batch_size, 1]
        input = self.embedding(input)
        #input: [batch_size, 1, emb_size]
        output ,(hidden,cell) = self.lstm(input, (hidden, cell))
        #output: [batch_size, 1, hidden_size]
        prediction = self.out(output.squeeze(1))
        #prediction: [batch_size, output_size]
        return prediction, (hidden, cell)


In [69]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, tgt, teacher_forcing_ratio= 0.5):
        batch_size, tgt_len = tgt.size()
        tgt_vocab_size = self.decoder.vocab_size

        outputs = torch.zeros(tgt_len, batch_size, tgt_vocab_size).to(self.device)

        hidden, cell = self.encoder(src)
        input = tgt[:,0]
        for t in range(1, tgt_len):
            output, (hidden, cell) = self.decoder(input, hidden, cell)
            outputs[t] = output
            top1 = output.argmax(1)

            is_teacher_forcing = random.random() < teacher_forcing_ratio

            input = tgt[:,t] if is_teacher_forcing else top1
            #outputs: [tgt_len, batch_size, output_size]
        
        outputs = outputs.transpose(0,1)
        #outputs: [batch_size, tgt_len, output_size]
        return outputs
        






In [70]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [71]:
batch_size=256
learning_rate = 0.001
num_epochs = 10

In [72]:
def pad_collate_fn(batch):
    collate_x = []
    collate_y = []
    for src, tgt in batch:
        collate_x.append(src)
        collate_y.append(tgt)
    collate_x = nn.utils.rnn.pad_sequence(collate_x, padding_value=vocab_de['<pad>'], batch_first=True)
    collate_y = nn.utils.rnn.pad_sequence(collate_y, padding_value=vocab_en['<pad>'], batch_first=True)
    return (collate_x, collate_y)

In [73]:
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn= pad_collate_fn)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn= pad_collate_fn)

In [74]:
encoder = Encoder(vocab_size = len(vocab_de), embedding_size=256, hidden_size=512)
decoder = Decoder(vocab_size = len(vocab_en), embedding_size=256, hidden_size=512)
model = Seq2Seq(encoder, decoder, device).to(device)

In [75]:
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss(ignore_index = vocab_en['<pad>'])

In [76]:
def train(dataloader, epoch):
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    data_num = 0
    for src,tgt in tqdm(dataloader, desc=f"Epoch {epoch}"):
        src, tgt = src.to(device), tgt.to(device)
        outputs = model(src, tgt)
        #outputs: [batch_size, tgt_len, output_size]
        #tgt: [batch_size, tgt_len, output_size]
        output_size = outputs.shape[-1]
        
        output = outputs.reshape(-1,output_size)
        #output: [(tgt_len - 1) * batch_size, output_size]
        tgt = tgt.reshape(-1)
        #tgt: [(tgt_len - 1) * batch_size]

        optimizer.zero_grad()
        loss = criterion(output, tgt)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        data_num += src.size(0)

    print(f"Train Epoch: {epoch}, Loss: {epoch_loss/len(dataloader)}, PPL: {math.exp(epoch_loss/len(dataloader))}")


In [77]:
def evaluate(dataloader, epoch):
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    data_num = 0
    for src,tgt in tqdm(dataloader, desc=f"Epoch {epoch}"):
        src, tgt = src.to(device), tgt.to(device)
        outputs = model(src, tgt, 0)
        #outputs: [batch_size, tgt_len, output_size]
        #tgt: [batch_size, tgt_len, output_size]
        output_size = outputs.shape[-1]
        output = outputs.reshape(-1,output_size)
        #output: [(tgt_len - 1) * batch_size, output_size]
        tgt = tgt.reshape(-1)
        #tgt: [(tgt_len - 1) * batch_size]

        loss = criterion(output, tgt)
        epoch_loss += loss.item()
        data_num += src.size(0)

    print(f"Evaluate Epoch: {epoch}, Loss: {epoch_loss/len(dataloader)}, PPL: {math.exp(epoch_loss/len(dataloader))}")


In [78]:

for epoch in range(num_epochs):
    train(train_dataloader, epoch)
    evaluate(valid_dataloader, epoch)
    print('-'*50)

Epoch 0: 100%|██████████| 91/91 [00:35<00:00,  2.58it/s]


Train Epoch: 0, Loss: 5.698838155348223, PPL: 298.5203651155682


Epoch 0: 100%|██████████| 23/23 [00:04<00:00,  4.99it/s]


Evaluate Epoch: 0, Loss: 5.464295262875765, PPL: 236.10940140214757
--------------------------------------------------


Epoch 1: 100%|██████████| 91/91 [00:36<00:00,  2.52it/s]


Train Epoch: 1, Loss: 5.047465041443542, PPL: 155.62745488306055


Epoch 1: 100%|██████████| 23/23 [00:04<00:00,  5.29it/s]


Evaluate Epoch: 1, Loss: 5.497044666953709, PPL: 243.96985362761995
--------------------------------------------------


Epoch 2: 100%|██████████| 91/91 [00:36<00:00,  2.50it/s]


Train Epoch: 2, Loss: 4.80440109378689, PPL: 122.0463747963338


Epoch 2: 100%|██████████| 23/23 [00:04<00:00,  5.21it/s]


Evaluate Epoch: 2, Loss: 5.344540886257006, PPL: 209.46169577659325
--------------------------------------------------


Epoch 3: 100%|██████████| 91/91 [00:35<00:00,  2.54it/s]


Train Epoch: 3, Loss: 4.651103051154168, PPL: 104.70041181498266


Epoch 3: 100%|██████████| 23/23 [00:04<00:00,  4.95it/s]


Evaluate Epoch: 3, Loss: 5.194650567096213, PPL: 180.3051272436166
--------------------------------------------------


Epoch 4: 100%|██████████| 91/91 [00:35<00:00,  2.55it/s]


Train Epoch: 4, Loss: 4.495924912966215, PPL: 89.65105006898598


Epoch 4: 100%|██████████| 23/23 [00:04<00:00,  5.21it/s]


Evaluate Epoch: 4, Loss: 5.15604947960895, PPL: 173.47777264243575
--------------------------------------------------


Epoch 5: 100%|██████████| 91/91 [00:35<00:00,  2.53it/s]


Train Epoch: 5, Loss: 4.381862912859235, PPL: 79.98690332696309


Epoch 5: 100%|██████████| 23/23 [00:04<00:00,  5.32it/s]


Evaluate Epoch: 5, Loss: 5.101881276006284, PPL: 164.33076821382997
--------------------------------------------------


Epoch 6: 100%|██████████| 91/91 [00:36<00:00,  2.51it/s]


Train Epoch: 6, Loss: 4.214052375856337, PPL: 67.63004762976226


Epoch 6: 100%|██████████| 23/23 [00:04<00:00,  4.95it/s]


Evaluate Epoch: 6, Loss: 5.00699520111084, PPL: 149.45497861824882
--------------------------------------------------


Epoch 7: 100%|██████████| 91/91 [00:36<00:00,  2.51it/s]


Train Epoch: 7, Loss: 4.091618042725783, PPL: 59.836631644928985


Epoch 7: 100%|██████████| 23/23 [00:04<00:00,  5.10it/s]


Evaluate Epoch: 7, Loss: 4.9635215012923535, PPL: 143.09682532307124
--------------------------------------------------


Epoch 8: 100%|██████████| 91/91 [00:34<00:00,  2.60it/s]


Train Epoch: 8, Loss: 4.011935951945546, PPL: 55.253735664295114


Epoch 8: 100%|██████████| 23/23 [00:04<00:00,  5.26it/s]


Evaluate Epoch: 8, Loss: 4.8594800285671065, PPL: 128.95713064726328
--------------------------------------------------


Epoch 9: 100%|██████████| 91/91 [00:35<00:00,  2.58it/s]


Train Epoch: 9, Loss: 3.848877353982611, PPL: 46.9403362586983


Epoch 9: 100%|██████████| 23/23 [00:04<00:00,  5.60it/s]

Evaluate Epoch: 9, Loss: 4.884683774865192, PPL: 132.24863837777667
--------------------------------------------------



