# Naver Coding Test

In [1]:
# Print the path and the version of the `python` executable found on the shell PATH
!which python
!python --version

/usr/local/bin/python
Python 3.6.9


## Adding Data



*   Running **locally**: change into the folder that contains the datasets, so that `pandas.read_csv` can find the files via relative paths
*   Running on **Google Colab**: create a folder with the datasets in Google Drive and mount Google Drive in the cell below




In [2]:
# Mount Google Drive (Colab only) and change into the folder that holds the data files,
# so that the relative file names used further down resolve.
from google.colab import drive
drive.mount('/content/gdrive')

dataset_folder = '/content/gdrive/My Drive/Naver'
# IPython substitutes $dataset_folder with the value of the Python variable
%cd $dataset_folder
!ls
# expected output:
# /content/gdrive/My Drive/Naver
# Naver_Codingtest.ipynb  test_source.txt  train_source.txt
# README.md               test_target.txt  train_target.txt

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/My Drive/Naver
Naver_Codingtest.ipynb	test_source.txt  train_source.txt   train_target.txt
README.md		test_target.txt  train_target.gdoc  weights


## Dependencies



In [3]:
# general
import time
import math
import random
from tqdm.notebook import tqdm
import os

#plotting
import matplotlib.pyplot as plt
plt.switch_backend('agg')
%matplotlib inline
import matplotlib.ticker as ticker
import numpy as np

# dataset
import pandas as pd

#training
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# `device` is read as a global by the dataset, the models and the train/eval functions.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('Running on', device)

Running on cuda


## Dataset and Dataloader

In [4]:
# end-of-sequence token: appended to every source and target sequence by the dataset
EOS_token = 659
# start-of-sequence token: fed to the decoder as its very first input
SOS_token = 660

class Seq2SeqDataSet(Dataset):
  """Paired source/target token sequences read from two text files.

  Line ``i`` of ``source_path`` is paired with line ``i`` of ``target_path``.
  Every line is expected to hold integer token ids separated by whitespace.
  Both files are parsed with ``pd.read_csv(sep='\\t')`` into a single
  ``sequence`` column (assumes the lines contain no tab characters -- TODO confirm).
  """

  def __init__(self, source_path, target_path):
    """Load both files into memory.

    Args:
      source_path: path of the file with the input sequences.
      target_path: path of the file with the expected output sequences.
    """
    self.source_path = source_path
    self.target_path = target_path
    self.source_df = pd.read_csv(source_path, sep='\t', header=None, names=['sequence'])
    self.target_df = pd.read_csv(target_path, sep='\t', header=None, names=['sequence'])
    # assert len(self.source_df) == len(self.target_df)

  def __len__(self):
    # The files may differ in length; only rows present in both are exposed.
    return min(len(self.source_df), len(self.target_df))

  @staticmethod
  def _to_tensor(raw_sequence):
    """Convert one raw line into a ``(length + 1, 1)`` long tensor ending in EOS."""
    # str() because pandas may have parsed a single-token line as a number.
    # split() without an argument tolerates leading, trailing and repeated
    # whitespace, which would otherwise yield '' tokens and crash int().
    tokens = [int(token) for token in str(raw_sequence).split()]
    tokens.append(EOS_token)
    return torch.tensor(tokens, dtype=torch.long, device=device).view(-1, 1)

  def __getitem__(self, idx):
    """Return the ``idx``-th pair as a dict of two column tensors on ``device``."""
    source_data = self.source_df.iloc[idx]
    target_data = self.target_df.iloc[idx]

    return {'source_sequence': self._to_tensor(source_data.sequence),
            'target_sequence': self._to_tensor(target_data.sequence)
            }


# File names are relative: they resolve against the current working directory.
# batch_size=1 because the sequences have different lengths and are not padded;
# the train/eval loops squeeze the batch dimension away again.
train_dataset = Seq2SeqDataSet('train_source.txt', 'train_target.txt')
print('train dataset with %i samples created' % len(train_dataset))
train_loader = DataLoader(dataset=train_dataset, batch_size=1, shuffle=True)

# The test files serve as the validation set that is evaluated after every epoch.
val_dataset = Seq2SeqDataSet('test_source.txt', 'test_target.txt')
print('test dataset with %i samples created' % len(val_dataset))
val_loader = DataLoader(dataset=val_dataset, batch_size=1, shuffle=True)

train dataset with 5540 samples created
test dataset with 2000 samples created


## Overview of the Data



In [5]:
# Running statistics over the evaluation loader: value range of the token ids
# and the longest sequence on either side.
source_min, source_max = 100000, -1
target_min, target_max = 100000, -1
source_max_length, target_max_length = 0, 0

for idx, batch in enumerate(val_loader):
  source_sequence = batch['source_sequence']
  target_sequence = batch['target_sequence']

  # batches are (1, sequence_length, 1), so dim 1 is the length (EOS included)
  source_max_length = max(source_max_length, source_sequence.shape[1])
  target_max_length = max(target_max_length, target_sequence.shape[1])

  # note: the maxima are taken over the full sequence, i.e. including EOS_token
  source_max = max(source_max, torch.max(source_sequence).item())
  target_max = max(target_max, torch.max(target_sequence).item())

  # the minima skip the last position, i.e. they exclude EOS_token
  source_min = min(source_min, torch.min(source_sequence[:,:-1,:]).item())
  target_min = min(target_min, torch.min(target_sequence[:,:-1,:]).item())

max_length = max(source_max_length, target_max_length)
print('source')
print('max ',source_max, '\nmin ',source_min, '\nmax length ', source_max_length)
print('target')
print('max ',target_max, '\nmin ',target_min, '\nmax length ', target_max_length)
# reference values when the loop is run over train_loader instead:
# source
# max  619
# min  21
# max length  82
# target
# max  658
# min  0
# max length  48

# Pin max_length to the longest sequence of the eval set. It sizes the attention
# layer of AttnDecoderRNN, so it must not depend on which loader was scanned above.
max_length = 85

source
max  659 
min  21 
max length  85
target
max  659 
min  2 
max length  55


# Model

### Encoder RNN

In [6]:
class EncoderRNN(nn.Module):
    """GRU encoder that consumes a sequence one token at a time.

    Args:
        input_size: number of distinct token ids (rows of the embedding table).
        hidden_size: width of the embedding and of the GRU state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Encode a single token.

        Returns:
            ``(output, hidden)`` exactly as produced by the GRU for this one step.
        """
        # the GRU wants (seq_len, batch, features); here that is one step, one sample
        step_input = self.embedding(input).view(1, 1, -1)
        return self.gru(step_input, hidden)

    def initHidden(self):
        """Return the all-zero initial hidden state on the global ``device``."""
        return torch.zeros((1, 1, self.hidden_size), device=device)


### RNN Decoder

In [7]:
class DecoderRNN(nn.Module):
    """Plain GRU decoder without attention, emitting one token per call.

    Args:
        hidden_size: width of the embedding and of the GRU state.
        output_size: number of distinct token ids the decoder can emit.
    """

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one decoding step.

        Returns:
            ``(log_probs, hidden)``: log-probabilities over the ``output_size``
            tokens with shape ``(1, output_size)``, and the updated GRU state.
        """
        # one step, one sample: (seq_len=1, batch=1, hidden_size)
        embedded = F.relu(self.embedding(input).view(1, 1, -1))
        gru_output, hidden = self.gru(embedded, hidden)
        log_probs = self.softmax(self.out(gru_output[0]))
        return log_probs, hidden

    def initHidden(self):
        """Return an all-zero hidden state on the global ``device``."""
        return torch.zeros((1, 1, self.hidden_size), device=device)

### Attentional Decoder

In [8]:
class AttnDecoderRNN(nn.Module):
    """GRU decoder with attention over a fixed number of encoder positions.

    Args:
        hidden_size: width of the embedding and of the GRU state.
        output_size: number of distinct token ids the decoder can emit.
        dropout_p: dropout probability applied to the embedded input token.
        max_length: number of encoder positions the attention layer scores.
            The default is the global ``max_length`` at class-definition time.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=max_length):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        # scores every encoder position from [embedded token ; hidden state]
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        # merges [embedded token ; attention context] back to hidden_size
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        Args:
            input: id of the previous output token (or the start token).
            hidden: current GRU state.
            encoder_outputs: encoder outputs; multiplied with the attention
                weights, so it needs ``max_length`` rows.

        Returns:
            ``(log_probs, hidden, attn_weights)``: log-probabilities over the
            output tokens, the updated GRU state and the attention weights.
        """
        embedded = self.dropout(self.embedding(input).view(1, 1, -1))

        # attention distribution over the encoder positions
        attn_input = torch.cat((embedded[0], hidden[0]), 1)
        attn_weights = F.softmax(self.attn(attn_input), dim=1)

        # weighted sum of the encoder outputs
        context = torch.bmm(attn_weights.unsqueeze(0),
                            encoder_outputs.unsqueeze(0))

        combined = torch.cat((embedded[0], context[0]), 1)
        gru_input = F.relu(self.attn_combine(combined).unsqueeze(0))
        gru_output, hidden = self.gru(gru_input, hidden)

        log_probs = F.log_softmax(self.out(gru_output[0]), dim=1)
        return log_probs, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero hidden state on the global ``device``."""
        return torch.zeros((1, 1, self.hidden_size), device=device)

# Training

In [9]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion, teacher_forcing_ratio, max_length=max_length):
  """Run one optimisation step on a single source/target pair.

  Args:
    input_tensor: source token ids, indexed along dim 0 (one token per step).
    target_tensor: target token ids, indexed along dim 0.
    encoder, decoder: the models; the decoder is called with
      ``(input, hidden, encoder_outputs)`` and must return three values.
    encoder_optimizer, decoder_optimizer: optimizers stepped after backward.
    criterion: loss applied to each decoder output and target token.
    teacher_forcing_ratio: probability of feeding the ground-truth token
      (instead of the model's own prediction) as the next decoder input.
    max_length: number of rows of the encoder-output buffer.

  Returns:
    The summed loss divided by the target length, as a Python float.
  """
  encoder_optimizer.zero_grad()
  decoder_optimizer.zero_grad()

  input_length = input_tensor.size(0)
  target_length = target_tensor.size(0)

  # Encode the source token by token; rows past input_length stay zero.
  encoder_hidden = encoder.initHidden()
  encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
  for position in range(input_length):
    encoder_output, encoder_hidden = encoder(input_tensor[position], encoder_hidden)
    encoder_outputs[position] = encoder_output[0, 0]

  # The decoder starts from SOS and from the encoder's final state.
  decoder_input = torch.tensor([[SOS_token]], device=device)
  decoder_hidden = encoder_hidden

  # The choice is made once per sequence, not per step.
  use_teacher_forcing = random.random() < teacher_forcing_ratio

  loss = 0
  for di in range(target_length):
    decoder_output, decoder_hidden, decoder_attention = decoder(
        decoder_input, decoder_hidden, encoder_outputs)
    loss += criterion(decoder_output, target_tensor[di])

    if use_teacher_forcing:
      # Teacher forcing: feed the target as the next input
      decoder_input = target_tensor[di]
      continue

    # Without teacher forcing: feed the model's own best guess, detached from
    # the graph, and stop as soon as it predicts end-of-sequence.
    _, best_index = decoder_output.topk(1)
    decoder_input = best_index.squeeze().detach()
    if decoder_input.item() == EOS_token:
      break

  loss.backward()

  encoder_optimizer.step()
  decoder_optimizer.step()

  return loss.item() / target_length

### Helper Functions

In [10]:
def asMinutes(s):
    """Format a duration given in seconds as ``'<minutes>m <seconds>s'``."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Describe elapsed and estimated remaining time.

    Args:
        since: start timestamp as returned by ``time.time()``.
        percent: fraction of the work already done; must be non-zero.

    Returns:
        ``'<elapsed> (- <remaining>)'`` with both parts formatted by ``asMinutes``.
    """
    elapsed = time.time() - since
    # extrapolate the total duration linearly from the progress made so far
    remaining = elapsed / percent - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [11]:
def train_epoch(encoder, decoder,
                enc_optimizer, dec_optimizer,
                train_loader, device, teacher_forcing_ratio, print_every=500,
                plot_every=100):
  """Train for one pass over ``train_loader`` and plot the loss curve.

  Args:
    encoder, decoder: models handed to ``train`` for every sample.
    enc_optimizer, dec_optimizer: their optimizers.
    train_loader: yields dicts with 'source_sequence' and 'target_sequence';
      the leading batch dimension is squeezed away, so batch_size must be 1.
    device: device the tensors are moved to before training.
    teacher_forcing_ratio: forwarded to ``train``.
    print_every: print the average loss every this many samples.
    plot_every: record one point of the loss curve every this many samples.
  """
  start = time.time()
  plot_losses = []
  print_loss_total = 0  # Reset every print_every
  plot_loss_total = 0  # Reset every plot_every

  n_iters = len(train_loader)
  criterion = nn.NLLLoss()
  # total must be the real number of batches, otherwise the bar ends one short
  progress_bar = tqdm(train_loader, total=n_iters)

  # 1-based step counter, so that it can be used directly for the periodic
  # reporting and for the progress fraction
  for step, batch in enumerate(progress_bar, start=1):
    input_tensor = batch['source_sequence'].squeeze(0).to(device)
    target_tensor = batch['target_sequence'].squeeze(0).to(device)

    loss = train(input_tensor, target_tensor, encoder, decoder,
                 enc_optimizer, dec_optimizer, criterion, teacher_forcing_ratio)
    print_loss_total += loss
    plot_loss_total += loss

    if step % print_every == 0:
      print_loss_avg = print_loss_total / print_every
      print_loss_total = 0
      print('\n%s (%d %d%%) %.4f' % (timeSince(start, step / n_iters),
                                  step, step / n_iters * 100, print_loss_avg))

    if step % plot_every == 0:
      plot_loss_avg = plot_loss_total / plot_every
      plot_losses.append(plot_loss_avg)
      plot_loss_total = 0

  showPlot(plot_losses)

### Plotting

In [12]:
def showPlot(points):
    """Plot ``points`` as a line chart with a y-axis tick every 0.2.

    Args:
        points: sequence of values to plot against their index.
    """
    # plt.subplots() already creates the figure; an extra plt.figure() before
    # it would leave an empty figure behind on every call.
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

## Evaluation function

In [13]:
def evaluate(encoder, decoder, source_tensor, target_tensor, criterion, max_length=max_length):
  """Greedily decode one source sequence and score it against its target.

  The decoder always receives its own previous prediction (no teacher
  forcing) and stops at the first predicted EOS or after ``target_length``
  steps, whichever comes first.

  Args:
    encoder, decoder: the models; the decoder is called with
      ``(input, hidden, encoder_outputs)`` and must return three values.
    source_tensor: source token ids, indexed along dim 0.
    target_tensor: target token ids, indexed along dim 0.
    criterion: loss applied to each decoder output and target token.
    max_length: number of rows of the encoder-output and attention buffers.

  Returns:
    ``(decoded_words, attentions, loss)``: the predicted tokens as strings
    (``'<EOS>'`` for the end token), one attention row per decoding step,
    and the summed loss divided by the target length.
  """
  # Evaluation must run in eval mode, otherwise the decoder's dropout stays
  # active and makes loss and predictions noisy. The previous modes are
  # restored afterwards so that an enclosing training loop is unaffected.
  encoder_was_training = encoder.training
  decoder_was_training = decoder.training
  encoder.eval()
  decoder.eval()
  try:
    with torch.no_grad():
      loss = 0
      input_length = source_tensor.size()[0]
      target_length = target_tensor.size()[0]
      encoder_hidden = encoder.initHidden()

      # rows past input_length stay zero
      encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

      for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(source_tensor[ei],
                                                  encoder_hidden)
        encoder_outputs[ei] += encoder_output[0, 0]

      decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

      decoder_hidden = encoder_hidden

      decoded_words = []
      decoder_attentions = torch.zeros(max_length, max_length)

      for di in range(target_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        decoder_attentions[di] = decoder_attention.data
        topv, topi = decoder_output.data.topk(1)
        loss += criterion(decoder_output, target_tensor[di])
        if topi.item() == EOS_token:
          decoded_words.append('<EOS>')
          break
        else:
          decoded_words.append(str(topi.item()))

        # feed the prediction back in as the next input
        decoder_input = topi.squeeze().detach()
      return decoded_words, decoder_attentions[:di + 1], loss.item() / target_length
  finally:
    encoder.train(encoder_was_training)
    decoder.train(decoder_was_training)

In [14]:
def eval_epoch(encoder, decoder, val_loader, device):
  """Evaluate the model on every sample of ``val_loader``.

  Args:
    encoder, decoder: models handed to ``evaluate`` for every sample.
    val_loader: yields dicts with 'source_sequence' and 'target_sequence';
      the leading batch dimension is squeezed away, so batch_size must be 1.
    device: device the tensors are moved to before evaluation.

  Returns:
    The mean of the per-sample losses returned by ``evaluate``; it is also printed.
  """
  loss_total = 0

  n_iters = len(val_loader)
  criterion = nn.NLLLoss()
  # total must be the real number of batches, otherwise the bar ends one short
  progress_bar = tqdm(val_loader, total=n_iters)

  for batch in progress_bar:
    source_tensor = batch['source_sequence'].squeeze(0).to(device)
    target_tensor = batch['target_sequence'].squeeze(0).to(device)

    # only the loss is needed here; decoded tokens and attentions are dropped
    _, _, loss = evaluate(encoder, decoder, source_tensor, target_tensor, criterion)
    loss_total += loss

  avg_loss = loss_total / n_iters
  print(avg_loss)
  return avg_loss

## Run train and eval

Set
```
load_pretrained = True
```
to load the model and optimizer state dicts from `save_path/filename` and resume training from that checkpoint.

In [32]:
# True: resume from the checkpoint at save_path/filename; False: start from scratch
load_pretrained = True
# folder (on the mounted Google Drive) that holds the checkpoints
save_path = '/content/gdrive/My Drive/Naver/weights'
# checkpoint file to load
filename = 'attn_decoder_5epochs.pth'

In [33]:
### Hyper Parameters ###
hidden_size = 256
# number of distinct token ids: data tokens plus EOS (659) and SOS (660)
alphabet_size = 661
dropout_prob = 0.1
learning_rate = 0.01
teacher_forcing_ratio = 0.5

# initialize model
# Move the models to the target device *before* the optimizers are built and
# before any optimizer state is loaded, so that this state ends up on the same
# device as the parameters.
encoder = EncoderRNN(alphabet_size, hidden_size).to(device)
attn_decoder = AttnDecoderRNN(hidden_size, alphabet_size, dropout_p=dropout_prob).to(device)

# optimizer
enc_optim = optim.SGD(encoder.parameters(), lr=learning_rate)
dec_optim = optim.SGD(attn_decoder.parameters(), lr=learning_rate)

start_epoch = 0
eval_avg_loss = []

# continue training
checkpoint_path = os.path.join(save_path, filename)
if load_pretrained:
  if os.path.isfile(checkpoint_path):
    print("=> loading checkpoint '{}'".format(filename))
    # map_location lets a checkpoint written on a GPU runtime be loaded on a
    # CPU-only runtime as well.
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    encoder.load_state_dict(checkpoint['state_dict_encoder'])
    attn_decoder.load_state_dict(checkpoint['state_dict_decoder'])
    enc_optim.load_state_dict(checkpoint['optimizer_encoder'])
    dec_optim.load_state_dict(checkpoint['optimizer_decoder'])
    # resume the epoch count and the validation-loss history of the earlier run
    start_epoch = checkpoint['epoch']
    eval_avg_loss = checkpoint['eval_avg_loss']
    print("=> loaded checkpoint '{}' (epoch {})"
              .format(filename, checkpoint['epoch']))
  else:
    print("=> no checkpoint found at '{}'".format(checkpoint_path))

# last expression of the cell: display the decoder architecture
attn_decoder

=> loading checkpoint 'attn_decoder_5epochs.pth'
=> loaded checkpoint 'attn_decoder_5epochs.pth' (epoch 5)


AttnDecoderRNN(
  (embedding): Embedding(661, 256)
  (attn): Linear(in_features=512, out_features=85, bias=True)
  (attn_combine): Linear(in_features=512, out_features=256, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (gru): GRU(256, 256)
  (out): Linear(in_features=256, out_features=661, bias=True)
)

In [None]:
# number of additional epochs to train in this run
number_of_epochs = 2

# Epoch numbering continues from start_epoch, which is 0 unless a checkpoint
# was loaded. Each epoch trains on the full train_loader and then evaluates on
# the full val_loader.
for epoch in range(start_epoch, start_epoch + number_of_epochs):
  print('Starting epoch {}:'.format(epoch+1))
  train_epoch(encoder, attn_decoder, enc_optim, dec_optim, train_loader, device,
              teacher_forcing_ratio)
  avg_loss = eval_epoch(encoder, attn_decoder, val_loader, device)
  eval_avg_loss.append(avg_loss)

# validation-loss history over all epochs, including those restored from a checkpoint
# NOTE: `epoch` is read again by the checkpoint-saving cell further down.
showPlot(eval_avg_loss)

Starting epoch 6:


HBox(children=(FloatProgress(value=0.0, max=5539.0), HTML(value='')))


0m 28s (- 4m 51s) (500 9%) 1.4840

0m 57s (- 4m 21s) (1000 18%) 1.4306

1m 25s (- 3m 51s) (1500 27%) 1.4773

1m 54s (- 3m 22s) (2000 36%) 1.4468

2m 23s (- 2m 53s) (2500 45%) 1.3550

2m 50s (- 2m 24s) (3000 54%) 1.3832

3m 18s (- 1m 55s) (3500 63%) 1.4179

3m 47s (- 1m 27s) (4000 72%) 1.4258

4m 17s (- 0m 59s) (4500 81%) 1.4687

4m 46s (- 0m 30s) (5000 90%) 1.5187

5m 16s (- 0m 2s) (5500 99%) 1.4145



HBox(children=(FloatProgress(value=0.0, max=1999.0), HTML(value='')))


Starting epoch 7:


HBox(children=(FloatProgress(value=0.0, max=5539.0), HTML(value='')))


0m 28s (- 4m 42s) (500 9%) 1.1870

0m 57s (- 4m 21s) (1000 18%) 1.2161

1m 28s (- 3m 58s) (1500 27%) 1.2326

1m 57s (- 3m 27s) (2000 36%) 1.3212

2m 26s (- 2m 58s) (2500 45%) 1.3265

2m 56s (- 2m 29s) (3000 54%) 1.3113


In [None]:
# destination of the new checkpoint; the file name is fixed and is not derived
# from the epoch counter, so adjust it by hand when the number of epochs changes
save_path = '/content/gdrive/My Drive/Naver/weights'
filename = 'attn_decoder_7epochs.pth'
def save_checkpoint(state, save_path, filename):
    """Serialise ``state`` with ``torch.save`` to ``save_path/filename``.

    Args:
        state: object to store, e.g. a dict of state_dicts.
        save_path: target directory; created (including parents) if missing.
        filename: name of the checkpoint file inside ``save_path``.
    """
    # torch.save does not create missing directories; an empty save_path means
    # "current directory" and needs no creation.
    if save_path:
        os.makedirs(save_path, exist_ok=True)
    torch.save(state, os.path.join(save_path, filename))


# The keys written here are the ones the checkpoint-loading cell reads back.
# 'epoch' is the last loop index + 1, i.e. the number of finished epochs --
# this assumes the training loop ran to completion; TODO confirm before saving
# after an interrupted run.
save_checkpoint({'epoch': epoch + 1,
                  'state_dict_encoder': encoder.state_dict(),
                  'state_dict_decoder': attn_decoder.state_dict(),
                  'optimizer_encoder': enc_optim.state_dict(),
                  'optimizer_decoder': dec_optim.state_dict(),
                  'eval_avg_loss': eval_avg_loss},
                save_path,
                filename)
print('saved model to', os.path.join(save_path, filename))

In [None]:
def evaluate_sample(encoder, decoder, source_tensor, max_length=max_length):
    """Greedily decode one source sequence without needing a target.

    Decoding stops at the first predicted EOS or after ``max_length`` steps.

    Args:
        encoder, decoder: the models; the decoder is called with
            ``(input, hidden, encoder_outputs)`` and must return three values.
        source_tensor: source token ids, indexed along dim 0.
        max_length: number of rows of the encoder-output buffer and upper
            bound on the number of decoding steps.

    Returns:
        ``(decoded_words, attentions)``: the predicted tokens as strings
        (``'<EOS>'`` for the end token) and one attention row per step.
    """
    # Inference must run in eval mode, otherwise the decoder's dropout stays
    # active and the predictions become random. The previous modes are
    # restored afterwards so that later training is unaffected.
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_length = source_tensor.size()[0]
            encoder_hidden = encoder.initHidden()

            # rows past input_length stay zero
            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(source_tensor[ei],
                                                         encoder_hidden)
                encoder_outputs[ei] += encoder_output[0, 0]

            decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

            decoder_hidden = encoder_hidden

            decoded_words = []
            decoder_attentions = torch.zeros(max_length, max_length)

            for di in range(max_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                decoder_attentions[di] = decoder_attention.data
                topv, topi = decoder_output.data.topk(1)
                if topi.item() == EOS_token:
                    decoded_words.append('<EOS>')
                    break
                else:
                    decoded_words.append(str(topi.item()))

                # feed the prediction back in as the next input
                decoder_input = topi.squeeze().detach()

            return decoded_words, decoder_attentions[:di + 1]
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

In [None]:
def evaluateRandomly(encoder, decoder, n=10):
    """Print source, target and model output for ``n`` random validation samples.

    Samples are drawn with replacement from the global ``val_dataset``.
    """
    for _ in range(n):
        pair = random.choice(val_dataset)
        source = pair['source_sequence']
        target = pair['target_sequence']

        # the (length, 1) column tensors are flattened to plain lists for printing
        print('source', source.cpu().numpy().squeeze(1).tolist())
        print('target', target.cpu().numpy().squeeze(1).tolist())

        # the attention weights are not needed for the printout
        output_words, _ = evaluate_sample(encoder, decoder, source)
        print(' model', ' '.join(output_words))
        print('')

In [None]:
# qualitative check: print the default 10 random validation samples with the model's output
evaluateRandomly(encoder, attn_decoder)