In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import random

"""
from konlpy.tag import Hannanum
import pandas as pd

from nltk.tokenize import TreebankWordTokenizer
import nltk
"""
from torchtext.legacy.data import TabularDataset
from tqdm import tqdm
from tensorboardX import SummaryWriter

In [2]:
# Pre-built vocabularies for the Korean (encoder input) and English (decoder
# output) sides; their lengths size the model's embedding/output layers below.
# NOTE: torch.load unpickles the file, which can execute arbitrary code --
# only load vocabulary files that come from a trusted source.
kor_vocab = torch.load('./datasets/korNeng_corpus/kor_vocab.pt')
eng_vocab = torch.load('./datasets/korNeng_corpus/eng_vocab.pt')

In [3]:
# Pre-tensorised train / validation / test splits. The collate function used
# later unpacks each element as a (kor_item, eng_item) pair and concatenates
# each item with 1-D index tensors -- presumably each item is a 1-D tensor of
# token indices; TODO confirm against the preprocessing notebook.
# NOTE: torch.load unpickles the file; only load trusted files.
train_data = torch.load('./datasets/korNeng_corpus/train_tensor.pt')
valid_data = torch.load('./datasets/korNeng_corpus/valid_tensor.pt')
test_data = torch.load('./datasets/korNeng_corpus/test_tensor.pt')

In [4]:
class Encoder(nn.Module):
    """Bidirectional GRU encoder that also produces the decoder's initial state.

    Args:
        n_inputs: size of the source vocabulary (rows of the embedding table).
        n_embeddings: dimensionality of the token embeddings.
        n_hiddens: hidden size of each GRU direction and of the returned state.
    """

    def __init__(self, n_inputs, n_embeddings, n_hiddens):
        super().__init__()
        self.n_hiddens = n_hiddens
        self.embedding = nn.Embedding(num_embeddings=n_inputs,
                                      embedding_dim=n_embeddings)
        self.bidirectional_gru = nn.GRU(input_size=n_embeddings,
                                        hidden_size=n_hiddens,
                                        bidirectional=True)
        # Projects the concatenated final states of both directions back down
        # to a single n_hiddens-sized vector.
        self.fc = nn.Linear(in_features=n_hiddens * 2, out_features=n_hiddens)

    def forward(self, x):
        """Encode a batch of source token indices.

        Args:
            x: integer token indices; the GRU is not batch_first, so this is
                assumed to be laid out as (src_len, batch) -- TODO confirm.

        Returns:
            A pair ``(annotations, state)``: the per-position GRU outputs
            (last dimension ``2 * n_hiddens``) and a ``(batch, n_hiddens)``
            tensor obtained by passing the concatenated final states through
            ``fc`` and ``tanh``.
        """
        annotations, final_states = self.bidirectional_gru(self.embedding(x))
        # The last two entries of the final-state tensor are the two
        # directions' final hidden states.
        second_last, last = final_states[-2, :, :], final_states[-1, :, :]
        summary = torch.cat((second_last, last), dim=1)
        return annotations, torch.tanh(self.fc(summary))



In [5]:
class Alignment(nn.Module):
    """Additive alignment model: ``v^T tanh(W [h; s])`` for every source position.

    Args:
        n_hiddens: decoder hidden size; the linear layer expects inputs of
            width ``3 * n_hiddens`` (decoder state concatenated with a
            ``2 * n_hiddens`` encoder annotation).
    """

    def __init__(self, n_hiddens):
        super().__init__()
        self.n_hiddens = n_hiddens
        # Scoring vector, initialised from U(0, 1).
        self.v = nn.Parameter(nn.init.uniform_(torch.empty(n_hiddens)))
        self.align = nn.Linear(self.n_hiddens * 3, self.n_hiddens)

    def forward(self, h, s):
        """Return un-normalised alignment scores.

        Args:
            h: tensor concatenated with ``s`` along dim 2; together their last
                dimensions must total ``3 * n_hiddens``.
            s: tensor whose first dimension is used as the batch size.

        Returns:
            A tensor of shape ``(s.size(0), positions)`` holding one score per
            position along dim 1 of the inputs.
        """
        features = torch.tanh(self.align(torch.cat((h, s), dim=2)))
        batch_size = s.size(0)
        # One copy of v per batch element, shaped (batch, 1, n_hiddens) for bmm.
        v_rows = self.v.repeat(batch_size, 1).unsqueeze(1)
        scores = torch.bmm(v_rows, features.transpose(1, 2))
        return scores.squeeze(1)


In [6]:
class Attention(nn.Module):
    """Turns alignment scores into attention weights over source positions.

    Args:
        n_hiddens: hidden size forwarded to the ``Alignment`` scorer.
    """

    def __init__(self, n_hiddens):
        super().__init__()
        self.n_hiddens = n_hiddens
        self.align = Alignment(self.n_hiddens)

    def forward(self, h, s):
        """Compute attention weights.

        Args:
            h: 2-D tensor (one row per batch element) that is broadcast across
                every position of ``s``.
            s: 3-D tensor whose first dimension is the number of positions and
                whose second dimension is the batch.

        Returns:
            Softmax-normalised weights of shape ``(batch, 1, positions)``.
        """
        n_positions = s.shape[0]
        # Present the same state at every source position: (batch, positions, hid).
        tiled_state = h.unsqueeze(1).expand(-1, n_positions, -1)
        batch_first_s = s.permute(1, 0, 2)
        scores = self.align(tiled_state, batch_first_s)
        weights = F.softmax(scores, dim=1)
        return weights.unsqueeze(1)


In [7]:
class Decoder(nn.Module):
    """Single-step attention decoder with a maxout output layer.

    Args:
        n_outputs: size of the target vocabulary.
        n_embeddings: dimensionality of the target token embeddings.
        n_hiddens: GRU hidden size (encoder annotations are ``2 * n_hiddens``).
        n_maxout: number of maxout units feeding the final linear layer.
    """

    def __init__(self, n_outputs, n_embeddings, n_hiddens, n_maxout):
        super().__init__()
        self.n_hiddens = n_hiddens
        self.embedding = nn.Embedding(n_outputs, n_embeddings)
        self.attention_layer = Attention(self.n_hiddens)
        # The GRU consumes the token embedding concatenated with the context.
        self.gru = nn.GRU(n_embeddings + n_hiddens * 2, n_hiddens)

        # Maxout sees [previous state; embedding; context], pooled in pairs.
        self.maxout = Maxout(n_hiddens * 3 + n_embeddings, n_maxout, 2)
        self.out = nn.Linear(n_maxout, n_outputs)

    def forward(self, input, h, s):
        """Run one decoding step.

        Args:
            input: token indices for the current step, one per batch element.
            h: previous decoder state, ``(batch, n_hiddens)``.
            s: encoder annotations, positions first:
                ``(src_len, batch, 2 * n_hiddens)``.

        Returns:
            ``(log_probs, new_state)``: log-softmax scores over the target
            vocabulary of shape ``(batch, n_outputs)`` and the updated decoder
            state of shape ``(batch, n_hiddens)``.
        """
        token_emb = self.embedding(input)
        weights = self.attention_layer(h, s)
        # Weighted sum of annotations, moved to (1, batch, 2 * n_hiddens).
        context = torch.bmm(weights, s.transpose(0, 1)).transpose(0, 1)
        token_emb = token_emb.unsqueeze(0)
        prev_state = h.unsqueeze(0)

        _, next_state = self.gru(torch.cat((token_emb, context), dim=2), prev_state)

        # The output layer is fed the *previous* state, not the updated one.
        readout = torch.cat((prev_state, token_emb, context), dim=2)
        logits = self.out(self.maxout(readout).squeeze(0))
        return F.log_softmax(logits, dim=1), next_state.squeeze(0)

In [8]:
class Maxout(nn.Module):
    """Maxout layer: a linear map followed by a max over groups of units.

    Args:
        d_in: size of the last dimension of the input.
        d_out: size of the last dimension of the output.
        pool_size: number of linear units competing for each output unit.
    """

    def __init__(self, d_in, d_out, pool_size):
        super().__init__()
        self.d_in, self.d_out, self.pool_size = d_in, d_out, pool_size
        self.lin = nn.Linear(d_in, d_out * pool_size)

    def forward(self, inputs):
        """Apply the layer along the last dimension of ``inputs``.

        Leading dimensions are preserved; the last dimension becomes ``d_out``.
        """
        projected = self.lin(inputs)
        # Split the projection into (d_out, pool_size) groups and keep the
        # largest activation in each group.
        leading_dims = tuple(inputs.size())[:-1]
        grouped = projected.view(*leading_dims, self.d_out, self.pool_size)
        pooled, _ = grouped.max(grouped.dim() - 1)
        return pooled
        

In [9]:
class RNNsearch(nn.Module):
    """Encoder / attention-decoder sequence-to-sequence model.

    Args:
        n_inputs: source vocabulary size.
        n_outputs: target vocabulary size.
        n_embeddings: embedding dimensionality used by encoder and decoder.
        n_hiddens: GRU hidden size.
        n_maxout: number of maxout units in the decoder's output layer.
        device: kept for backward compatibility and stored on ``self.device``.
            ``forward`` no longer consults it: the output buffer is allocated
            on whatever device the encoder outputs live on, so the model keeps
            working after ``model.to(...)`` moves it elsewhere.
    """

    def __init__(self, n_inputs, n_outputs, n_embeddings, n_hiddens, n_maxout, device):
        super().__init__()
        self.n_outputs = n_outputs
        self.device = device

        self.encoder = Encoder(n_inputs, n_embeddings, n_hiddens)
        self.decoder = Decoder(n_outputs, n_embeddings, n_hiddens, n_maxout)

    def forward(self, x, target, teacher_forcing_ratio):
        """Decode ``target.shape[0] - 1`` steps, optionally with teacher forcing.

        Args:
            x: source token indices, passed straight to the encoder.
            target: target token indices, time first: ``(trg_len, batch)``.
                Row 0 supplies the first decoder input.
            teacher_forcing_ratio: probability, drawn once per step for the
                whole batch, of feeding the ground-truth token instead of the
                model's own argmax prediction as the next input.

        Returns:
            Float tensor of shape ``(trg_len, batch, n_outputs)`` holding the
            decoder's scores for steps ``1..trg_len-1``; row 0 is all zeros.
        """
        encoder_outputs, hiddens = self.encoder(x)

        trg_len, batch_size = target.shape[0], target.shape[1]

        # Allocate directly on the device the decoder will write from. This
        # avoids a host allocation plus copy on every call and cannot drift
        # out of sync with the module's real location the way a stored device
        # string can.
        outputs = torch.zeros(trg_len, batch_size, self.n_outputs,
                              device=encoder_outputs.device)

        output = target[0, :]
        for t in range(1, trg_len):
            output, hiddens = self.decoder(output, hiddens, encoder_outputs)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            output = target[t] if teacher_force else output.argmax(1)

        return outputs


In [10]:
# Model hyper-parameters.
n_hiddens = 1000 #1000
n_inputs = len(kor_vocab)    # source vocabulary size
n_outputs = len(eng_vocab)   # target vocabulary size
n_embeddings = 620 #620
n_maxout = 500 # 500


# n_maxout is now actually passed as the maxout width; previously n_hiddens was
# passed twice, which left n_maxout unused and made the maxout layer 1000 wide.
model = RNNsearch(n_inputs, n_outputs, n_embeddings, n_hiddens, n_maxout, 'cuda')
model.to('cuda')

# Adam is the optimizer that was effectively in use: an Adadelta instance
# (eps=1e-6, rho=0.95) used to be created on the line above and was immediately
# overwritten, so that dead assignment has been dropped.
optimizer = optim.Adam(model.parameters())

In [11]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

BATCH_SIZE = 128

# Special-token indices are looked up per vocabulary so that each side of the
# parallel corpus is wrapped and padded with indices from its *own* vocabulary.
# (Assumes eng_vocab defines '<bos>' and '<eos>' just like kor_vocab does --
# TODO confirm against the preprocessing notebook.)
KOR_PAD_IDX = kor_vocab['<pad>']
KOR_BOS_IDX = kor_vocab['<bos>']
KOR_EOS_IDX = kor_vocab['<eos>']
ENG_PAD_IDX = eng_vocab['<pad>']
ENG_BOS_IDX = eng_vocab['<bos>']
ENG_EOS_IDX = eng_vocab['<eos>']

# Legacy names, kept with their original (Korean-vocabulary) values for any
# code that still refers to them. generate_batch deliberately does NOT read
# them, so rebinding one of these names cannot change how batches are built.
PAD_IDX = KOR_PAD_IDX
BOS_IDX = KOR_BOS_IDX
EOS_IDX = KOR_EOS_IDX
device = torch.device('cuda')


def _wrap_with_bos_eos(tokens, bos_idx, eos_idx):
    """Return ``tokens`` with ``bos_idx`` prepended and ``eos_idx`` appended."""
    return torch.cat([torch.tensor([bos_idx]), tokens, torch.tensor([eos_idx])], dim=0)


def generate_batch(data_batch):
    """Collate (kor_item, eng_item) pairs into two padded, time-first tensors.

    Each sequence is wrapped in its own vocabulary's BOS/EOS indices and the
    batch is padded with that vocabulary's pad index. ``pad_sequence`` is used
    with its default layout, so both results are ``(max_len, batch)``.

    Args:
        data_batch: iterable of ``(kor_item, eng_item)`` pairs of 1-D index
            tensors.

    Returns:
        ``(kor_batch, eng_batch)``.
    """
    kor_batch, eng_batch = [], []
    for kor_item, eng_item in data_batch:
        kor_batch.append(_wrap_with_bos_eos(kor_item, KOR_BOS_IDX, KOR_EOS_IDX))
        eng_batch.append(_wrap_with_bos_eos(eng_item, ENG_BOS_IDX, ENG_EOS_IDX))
    kor_batch = pad_sequence(kor_batch, padding_value=KOR_PAD_IDX)
    eng_batch = pad_sequence(eng_batch, padding_value=ENG_PAD_IDX)
    return kor_batch, eng_batch


train_iter = DataLoader(train_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch, num_workers=4)
# Evaluation loaders are not shuffled: the batch composition (and therefore the
# averaged per-batch loss) is then identical from run to run.
valid_iter = DataLoader(valid_data, batch_size=BATCH_SIZE,
                        shuffle=False, collate_fn=generate_batch, num_workers=4)
test_iter = DataLoader(test_data, batch_size=BATCH_SIZE,
                       shuffle=False, collate_fn=generate_batch, num_workers=4)

In [12]:
# Rebinds the module-level PAD_IDX to the English (target) vocabulary's pad
# index; any code that reads PAD_IDX after this cell has run sees this value.
PAD_IDX = eng_vocab['<pad>']

# Target positions equal to PAD_IDX are excluded from the loss.
# NOTE(review): Decoder.forward already returns log_softmax outputs and
# CrossEntropyLoss applies log-softmax again internally. Re-normalising
# log-probabilities leaves them unchanged, so the loss value is unaffected,
# but nn.NLLLoss would avoid the redundant pass.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [13]:
# Gradient-accumulation setting read by train(): each batch loss is divided by
# this value before backward(), and the optimizer steps once every
# `accumulation_step` batches. 1 means a step after every batch.
accumulation_step = 1

In [14]:
import math
import time

# tensorboardX event writer created with 'attention_logs' as its log location;
# train() records the per-batch training loss on it under the tag
# 'attention_log/train_loss'.
writer = SummaryWriter('attention_logs')
def train(model: nn.Module,
          iterator: torch.utils.data.DataLoader,
          optimizer: optim.Optimizer,
          criterion: nn.Module,
          clip: float):
    """Train ``model`` for one pass over ``iterator`` and return the mean batch loss.

    Uses the module-level ``device``, ``accumulation_step`` and ``writer``.
    Gradients are accumulated over ``accumulation_step`` batches, clipped to a
    total norm of ``clip`` and then applied. Teacher forcing is used with
    probability 0.5.

    Args:
        model: called as ``model(src, trg, teacher_forcing_ratio)``.
        iterator: yields ``(src, trg)`` batches, time dimension first.
        optimizer: optimizer over ``model``'s parameters.
        criterion: loss taking ``(flattened_scores, flattened_targets)``.
        clip: maximum gradient norm.

    Returns:
        The mean of the un-scaled per-batch losses, i.e. a value that does not
        depend on ``accumulation_step``.
    """
    model.train()

    n_batches = len(iterator)
    epoch_loss = 0.0

    # Start from clean gradients so nothing left over from an earlier call
    # (e.g. an incomplete accumulation window) is folded into the first step.
    optimizer.zero_grad()

    # tqdm wraps the loader itself (not enumerate) so it can read the length
    # and show a total / ETA.
    for step, (src, trg) in enumerate(tqdm(iterator), start=1):
        src, trg = src.to(device), trg.to(device)

        output = model(src, trg, 0.5)

        # Drop time step 0 (never predicted) and flatten time x batch.
        output = output[1:].view(-1, output.shape[-1])
        trg = trg[1:].view(-1)

        loss = criterion(output, trg)
        batch_loss = loss.item()
        # Only the gradient is scaled for accumulation; the reported loss is not.
        scaled_loss = loss / accumulation_step

        # Best-effort memory relief kept from the original implementation.
        # NOTE(review): the autograd graph still references these tensors, so
        # little is likely freed here and empty_cache() costs time every
        # step -- consider removing once memory headroom is confirmed.
        del src, trg, output
        torch.cuda.empty_cache()
        scaled_loss.backward()

        # Step at the end of every accumulation window, and also on the final
        # batch so a trailing partial window is applied rather than dropped.
        if step % accumulation_step == 0 or step == n_batches:
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

            optimizer.step()
            optimizer.zero_grad()

        # NOTE: the step index restarts at 1 on every call, so multi-epoch
        # runs write overlapping x-values for this tag.
        writer.add_scalar('attention_log/train_loss', batch_loss, step)
        epoch_loss += batch_loss

    return epoch_loss / n_batches


def evaluate(model: nn.Module,
             iterator: torch.utils.data.DataLoader,
             criterion: nn.Module):
    """Return the mean per-batch loss of ``model`` over ``iterator``.

    The model is put in eval mode, gradients are disabled and teacher forcing
    is switched off (ratio 0). Batches are moved to the module-level ``device``.

    Args:
        model: called as ``model(src, trg, 0)``.
        iterator: yields ``(src, trg)`` batches, time dimension first.
        criterion: loss taking ``(flattened_scores, flattened_targets)``.
    """
    model.eval()

    running_loss = 0

    with torch.no_grad():
        for src, trg in iterator:
            src = src.to(device)
            trg = trg.to(device)

            # Ratio 0: the decoder always consumes its own predictions.
            scores = model(src, trg, 0)

            # Skip time step 0 and flatten time x batch for the criterion.
            vocab_size = scores.shape[-1]
            flat_scores = scores[1:].view(-1, vocab_size)
            flat_targets = trg[1:].view(-1)

            running_loss += criterion(flat_scores, flat_targets).item()

    return running_loss / len(iterator)


def epoch_time(start_time: int,
               end_time: int):
    """Split the span between two timestamps (in seconds) into minutes and seconds.

    Both parts are truncated toward zero.

    Returns:
        ``(whole_minutes, remaining_seconds)`` as integers.
    """
    span = end_time - start_time
    whole_minutes = int(span / 60)
    remaining_seconds = int(span - (whole_minutes * 60))
    return whole_minutes, remaining_seconds


N_EPOCHS = 1 # 10
CLIP = 1  # maximum gradient norm passed to train()

# Lowest validation loss observed so far; updated after every epoch.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(model, train_iter, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iter, criterion)

    end_time = time.time()

    # Previously this variable was initialised but never updated.
    best_valid_loss = min(best_valid_loss, valid_loss)

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

test_loss = evaluate(model, test_iter, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

# Flush and close the TensorBoard writer so the last logged scalars reach disk.
writer.close()