# Sparse Sequence-to-Sequence Models

In [1]:
import numpy as np
import pandas as pd
import math

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm, tnrange, tqdm_notebook
from time import time

from IPython.core.display import display, HTML, clear_output
display(HTML("<style>.rendered_html { font-size: 18px; }</style>"))

%matplotlib inline

import warnings
warnings.filterwarnings('always')
warnings.simplefilter('ignore')


In [0]:
# Deliberate stop: this division raises ZeroDivisionError, so the cell cannot
# complete as written.
# NOTE(review): presumably the intent is to halt "Run all" until the user
# removes this line and uncomments the PROJECT_PATH assignment below - confirm.
1 / 0

# choose your path here
# PROJECT_PATH = '/content/gdrive/My Drive/Colab Notebooks/2019 Autumn - DL/Project'


In [0]:
# Mount Google Drive inside the Colab runtime under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
# PROJECT_PATH must already be defined at this point; this cell raises
# NameError otherwise.
project_path = PROJECT_PATH

import os
# Print the folder contents as a quick check that the path is valid.
print(os.listdir(project_path))

In [0]:
import sys
# Put <project_path>/modules on the import path so that modules stored there
# can be imported by later cells.
sys.path.append(os.path.join(project_path, 'modules'))

In [0]:
import torch
import torchvision

import torch.nn as nn
from torch import optim
import torch.nn.functional as F


In [0]:

from model import *
from load_data import *


### Training utils:

In [0]:
! pip3 install sacrebleu
from sacrebleu import corpus_bleu


In [0]:
def calc_loss(criterion, pred, target, pad_idx=0):
    """Compute the loss of `pred` against `target` with its first column dropped.

    Args:
        criterion: loss *class* (not an instance); it is instantiated here as
            ``criterion(ignore_index=pad_idx)``.
        pred: score tensor; flattened to
            ``(batch_size * seq_len, pred.size(-1))``.
        target: 2-D tensor of token ids; column 0 is removed before the
            comparison, and ``batch_size, seq_len`` are taken from the rest.
        pad_idx: id handed to the criterion as ``ignore_index``.

    Returns:
        Whatever the instantiated criterion returns for the flattened inputs.
    """
    gold = target[:, 1:]
    n_rows, n_cols = gold.size()
    n_tokens = n_rows * n_cols

    flat_scores = pred.reshape(n_tokens, pred.size(-1))
    flat_gold = gold.reshape(n_tokens)

    loss_fn = criterion(ignore_index=pad_idx)
    return loss_fn(flat_scores, flat_gold)


def calc_accuracy(pred, target, end_id=None):
    """Return the percentage of sequences predicted exactly right.

    Every position of `pred` that comes strictly after the first end-of-sequence
    id in its row is replaced by 0 before the comparison, so anything produced
    after the end token is compared as 0.  A row counts as correct only if all
    of its positions equal `target`.

    Unlike an in-place ``pred *= ...``, this function never modifies the
    caller's `pred` tensor.

    Args:
        pred: integer tensor of predicted ids, [batch_size, max_seq_len].
        target: tensor of reference ids with the same shape as `pred`.
        end_id: id of the end-of-sequence token; defaults to the module-level
            ``END_ID`` when omitted.

    Returns:
        Exact-match accuracy over the batch as a float in [0, 100].
    """
    if end_id is None:
        end_id = END_ID

    # True from the first end token onwards (inclusive) in every row.
    reached_end = (pred == end_id).cumsum(dim=1) > 0
    # Shift right by one along the sequence axis so the end token itself is
    # kept; the value that wraps around into column 0 is cleared explicitly.
    after_end = torch.roll(reached_end, shifts=1, dims=1)
    after_end[:, 0] = False

    cleaned = pred.masked_fill(after_end, 0)

    n_matching = torch.eq(cleaned, target.int()).sum(dim=1)
    exact_match = torch.eq(n_matching, target.size(1))
    return exact_match.float().mean().item() * 100


def unpad_string(s, end_id=3):
    """Cut a space-separated id string at its first end-of-sequence token.

    The end token and everything after it are dropped.  Tokens are compared
    whole, so an id such as ``13`` is never mistaken for end id ``3``, and the
    end token is also recognised when it is the first or the last token of
    the string (a ``' 3 '`` substring search misses both cases).

    Args:
        s: token ids joined by single spaces, e.g. ``'5 8 3 0 0'``.
        end_id: id of the end-of-sequence token.

    Returns:
        The part of `s` before the first end token, or `s` unchanged when no
        end token is present.
    """
    tokens = s.split(' ')
    end_token = str(end_id)
    if end_token in tokens:
        tokens = tokens[:tokens.index(end_token)]
    return ' '.join(tokens)


def calc_bleu(pred, target, vocab_target):
    """Score predicted id sequences against references with corpus-level BLEU.

    Each row of `pred` and `target` is rendered as a space-separated string of
    its ids, passed through `unpad_string`, and the two lists are scored with
    `corpus_bleu` using `target` as the single reference set.

    Args:
        pred: iterable of rows of predicted ids (each element supports
            ``.item()``).
        target: iterable of rows of reference ids, in the same form.
        vocab_target: accepted for interface compatibility; not used here.

    Returns:
        The ``score`` attribute of the `corpus_bleu` result.
    """
    def as_sentence(token_ids):
        return unpad_string(' '.join(str(tok.item()) for tok in token_ids))

    hypotheses = [as_sentence(row) for row in pred]
    references = [as_sentence(row) for row in target]
    return corpus_bleu(hypotheses, [references]).score


def calc_metric(pred, target, metric_name, vocab_target=None):
    """Dispatch to the requested evaluation metric.

    `pred` is reduced to ids with an argmax over its last dimension and the
    first column of `target` is dropped before either metric is computed.

    Args:
        pred: score tensor whose last dimension is argmax-ed into ids.
        target: 2-D tensor of reference ids (column 0 is skipped).
        metric_name: ``'accuracy'`` or ``'bleu'``.
        vocab_target: forwarded to `calc_bleu`; unused for accuracy.

    Returns:
        The value returned by `calc_accuracy` or `calc_bleu`.

    Raises:
        ValueError: if `metric_name` is neither ``'accuracy'`` nor ``'bleu'``.
    """
    predicted_ids = pred.argmax(dim=-1)
    gold_ids = target[:, 1:]

    if metric_name == 'bleu':
        return calc_bleu(predicted_ids, gold_ids, vocab_target)
    elif metric_name == 'accuracy':
        return calc_accuracy(predicted_ids, gold_ids)
    else:
        raise ValueError("Choose either 'accuracy' or 'bleu'")


def trainEpoch(model, data_x, data_y, opt,
               criterion, metric_name, shuffle, batch_size, max_iter=None):
    """Run one training pass over the data and log per-batch loss and metric.

    Args:
        model: callable as ``model(x_batch, y_batch)`` returning a
            ``(scores, targets)`` pair; switched to train mode here.
        data_x, data_y: inputs and targets, indexed with an array of
            sample indices.
        opt: optimiser; ``zero_grad()`` and ``step()`` are called every batch.
        criterion: loss class forwarded to `calc_loss`.
        metric_name: metric identifier forwarded to `calc_metric`.
        shuffle: if true, visit the samples in a random order.
        batch_size: number of samples per batch.
        max_iter: optional cap on the number of batches processed.

    Returns:
        ``(loss_log, metric_log)``: two lists with one entry per processed
        batch.
    """
    model.train()

    n_samples = len(data_x)
    order = np.arange(n_samples)
    if shuffle:
        np.random.shuffle(order)

    losses = []
    metrics = []
    total_batches = math.ceil(n_samples / batch_size)
    for step in tnrange(total_batches, desc='train batches:'):
        if max_iter is not None and step >= max_iter:
            break

        start = step * batch_size
        batch_ids = order[start:start + batch_size]
        batch_x = data_x[batch_ids]
        batch_y = data_y[batch_ids]

        opt.zero_grad()
        scores, gold = model(batch_x, batch_y)

        batch_loss = calc_loss(criterion, scores, gold)
        batch_loss.backward()
        opt.step()

        losses.append(batch_loss.item())
        metrics.append(calc_metric(scores, gold, metric_name))

    return losses, metrics


def test(model, data_x, data_y,
         criterion, metric_name, batch_size):
    """Evaluate `model` on a data set and return mean per-batch loss and metric.

    The model is switched to eval mode and all forward passes run under
    ``torch.no_grad()``, so no autograd graph is built during evaluation.

    Args:
        model: callable as ``model(x_batch, y_batch)`` returning a
            ``(scores, targets)`` pair.
        data_x, data_y: inputs and targets, indexed with an array of
            sample indices.
        criterion: loss class forwarded to `calc_loss`.
        metric_name: metric identifier forwarded to `calc_metric`.
        batch_size: number of samples per batch.

    Returns:
        ``(mean_loss, mean_metric)``: the per-batch values averaged over the
        number of batches (the last, possibly smaller, batch has equal weight).

    Raises:
        ZeroDivisionError: if `data_x` is empty, since there are no batches to
            average over.
    """
    model.eval()

    N = len(data_x)
    indices = np.arange(N)
    loss_sum, metric_sum = 0, 0
    n_batches = math.ceil(N / batch_size)
    # Gradients are never needed here; disabling them saves memory and time.
    with torch.no_grad():
        for i in tnrange(n_batches, desc='test batches:'):
            idx_i = indices[i * batch_size : (i + 1) * batch_size]
            x_i = data_x[idx_i]
            y_i = data_y[idx_i]

            z_i, y_i = model(x_i, y_i)

            loss_sum += calc_loss(criterion, z_i, y_i).item()
            metric_sum += calc_metric(z_i, y_i, metric_name)
    return loss_sum / n_batches, metric_sum / n_batches


def plot_logs(train_loss_log, dev_loss_log, loss_name,
              train_metric_log, dev_metric_log, metric_name,
              results_path, model_name):
    """Plot loss and metric curves side by side and save the figure as a PNG.

    The left panel shows the loss, the right panel the metric.  In each panel
    the train log is drawn as a line and the dev log as orange dots.

    Args:
        train_loss_log, train_metric_log: sequences of per-batch train values.
        dev_loss_log, dev_metric_log: sequences of ``(batch_index, value)``
            pairs.
        loss_name, metric_name: y-axis labels of the two panels.
        results_path: folder in which ``'<model_name>.png'`` is written.
        model_name: title of both panels and stem of the output file name.
    """
    def draw_panel(axis, train_curve, dev_points, y_label):
        axis.plot(train_curve, label='train', zorder=1)
        dev_steps = [point[0] for point in dev_points]
        dev_values = [point[1] for point in dev_points]
        axis.scatter(dev_steps, dev_values,
                     marker='.', s=90, c='orange', label='dev', zorder=2)
        axis.set_xlabel('batches')
        axis.set_ylabel(y_label)
        axis.legend(loc='best')
        axis.grid()
        axis.set_title(model_name)

    figure, (loss_axis, metric_axis) = plt.subplots(1, 2, figsize=(24, 6))
    draw_panel(loss_axis, train_loss_log, dev_loss_log, loss_name)
    draw_panel(metric_axis, train_metric_log, dev_metric_log, metric_name)

    figure.show()

    output_file = os.path.join(results_path, '%s.png' % model_name)
    figure.savefig(output_file)



def plot_logs_from_model_params(model_params):
    """Plot the logs stored in a model-parameter dict.

    Reads the names, logs and output folder from `model_params` and delegates
    the drawing and saving to `plot_logs`, so the two functions cannot drift
    apart.

    Args:
        model_params: dict with the keys ``'model_name'``, ``'loss_name'``,
            ``'metric_name'``, ``'results_path'`` and ``'logs'``, where
            ``model_params['logs'][<name>]`` holds ``'train'`` and ``'dev'``
            entries for both the loss name and the metric name.

    Raises:
        KeyError: if any of those keys is missing (for example when no logs
            have been stored yet).
    """
    loss_name = model_params['loss_name']
    metric_name = model_params['metric_name']
    loss_logs = model_params['logs'][loss_name]
    metric_logs = model_params['logs'][metric_name]

    plot_logs(loss_logs['train'], loss_logs['dev'], loss_name,
              metric_logs['train'], metric_logs['dev'], metric_name,
              model_params['results_path'], model_params['model_name'])




def train(model_params, n_epochs=1, max_iter=None, batch_size=64, shuffle=True,
          save_model_after_each_epoch=False):
    """Train for `n_epochs`, evaluating on the dev set after every epoch.

    After each epoch a summary line is printed, the scheduler is stepped with
    the dev loss and, optionally, the weights are saved.  Once all epochs are
    done the curves are plotted via `plot_logs`.

    Args:
        model_params: dict providing ``'model'``, ``'model_name'``, ``'data'``
            (indexed as ``['from'|'to']['train'|'dev']``), ``'optimiser'``,
            ``'scheduler'``, ``'criterion'``, ``'loss_name'``,
            ``'metric_name'`` and ``'results_path'``.
        n_epochs: number of passes over the training data.
        max_iter: optional cap on batches per epoch (forwarded to
            `trainEpoch`).
        batch_size: batch size for both training and dev evaluation.
        shuffle: whether to shuffle training samples every epoch.
        save_model_after_each_epoch: if true, save the weights after each
            epoch as ``'model <model_name> epoch <k>.pth'``.

    Returns:
        ``{loss_name: {'train': [...], 'dev': [...]},
        metric_name: {'train': [...], 'dev': [...]}}`` where the train lists
        hold per-batch values and the dev lists hold
        ``(index_of_last_train_batch, value)`` pairs.
    """
    model = model_params['model']
    model_name = model_params['model_name']
    data = model_params['data']
    source, destination = data['from'], data['to']
    train_x, train_y = source['train'], destination['train']
    dev_x, dev_y = source['dev'], destination['dev']
    opt = model_params['optimiser']
    scheduler = model_params['scheduler']
    criterion = model_params['criterion']
    loss_name = model_params['loss_name']
    metric_name = model_params['metric_name']
    results_path = model_params['results_path']

    rule = '=' * 130
    summary_format = ('train %s = %.3f, dev %s = %.3f, '
                      'train %s = %.2f, dev %s = %.2f, %d seconds')

    train_loss_log, train_metric_log = [], []
    dev_loss_log, dev_metric_log = [], []
    for epoch_number in range(1, n_epochs + 1):
        started_at = time()

        epoch_losses, epoch_metrics = trainEpoch(
            model, train_x, train_y, opt, criterion,
            metric_name, shuffle, batch_size, max_iter)
        train_loss_log += epoch_losses
        train_metric_log += epoch_metrics

        dev_loss, dev_metric = test(model, dev_x, dev_y, criterion,
                                    metric_name, batch_size)
        # Dev points are placed at the index of the last train batch so far.
        dev_loss_log.append((len(train_loss_log) - 1, dev_loss))
        dev_metric_log.append((len(train_metric_log) - 1, dev_metric))

        print('\n' + rule)
        print('Epoch %d / %d:' % (epoch_number, n_epochs), end='\t')
        print(summary_format % (
            loss_name, np.mean(epoch_losses),
            loss_name, dev_loss,
            metric_name, np.mean(epoch_metrics),
            metric_name, dev_metric,
            int(time() - started_at)))
        scheduler.step(dev_loss)
        print(rule + '\n')

        if save_model_after_each_epoch:
            checkpoint_name = 'model %s epoch %d.pth' % (model_name, epoch_number)
            save_model_to(model, results_path, checkpoint_name)

    plot_logs(train_loss_log, dev_loss_log, loss_name,
              train_metric_log, dev_metric_log, metric_name,
              results_path, model_name)

    logs = {}
    logs[loss_name] = {'train': train_loss_log, 'dev': dev_loss_log}
    logs[metric_name] = {'train': train_metric_log, 'dev': dev_metric_log}
    return logs


In [0]:
def save_model_to(model, folder_path, file_name):
    """Write the model's ``state_dict()`` to ``folder_path/file_name``.

    Args:
        model: object exposing ``state_dict()``.
        folder_path: existing folder to write into.
        file_name: name of the file to create (overwritten if present).
    """
    destination = os.path.join(folder_path, file_name)
    with open(destination, 'wb') as out_file:
        torch.save(model.state_dict(), out_file)

def load_model_from(model, folder_path, file_name):
    """Load a saved state dict from ``folder_path/file_name`` into `model`.

    The checkpoint is deserialised onto the CPU first, so a file written on a
    CUDA machine can still be read on a CPU-only runtime; ``load_state_dict``
    then copies the values into the model's existing parameters.

    Args:
        model: object exposing ``load_state_dict()``; updated in place.
        folder_path: folder containing the checkpoint.
        file_name: name of the checkpoint file.
    """
    with open(os.path.join(folder_path, file_name), 'rb') as f:
        state = torch.load(f, map_location='cpu')
    model.load_state_dict(state)

import pickle

def save_object_to(obj, folder_path, file_name):
    """Pickle `obj` to ``folder_path/file_name`` with the highest protocol.

    Args:
        obj: any picklable object.
        folder_path: existing folder to write into.
        file_name: name of the file to create (overwritten if present).
    """
    destination = os.path.join(folder_path, file_name)
    with open(destination, 'wb') as out_file:
        pickle.dump(obj, out_file, protocol=pickle.HIGHEST_PROTOCOL)

def load_object_from(folder_path, file_name):
    """Unpickle and return the object stored at ``folder_path/file_name``.

    NOTE: unpickling can execute arbitrary code; only use this on files you
    created yourself.
    """
    source = os.path.join(folder_path, file_name)
    with open(source, 'rb') as in_file:
        restored = pickle.load(in_file)
    return restored


### Get needed model:

In [0]:
def get_model(task, alpha_attn, alpha_output, project_path,
              setting=None, L1=None, L2=None, LR=1e-3):
    '''
    Load the data for a task, build the Seq2Seq model and bundle everything
    needed for training into one dict.

    Parameters
    ----------
    task : 'mt' or 'inflection'
    alpha_attn : value passed to Seq2Seq as its last positional argument and
        recorded in the model name
    alpha_output : 1.0, 1.5 or 2.0; selects the loss class
        (nn.CrossEntropyLoss, Entmax15Loss or SparsemaxLoss respectively)
    project_path : folder containing the 'data' and 'results' sub-folders
    setting : 'high' or 'medium' (required when task == 'inflection')
    L1, L2 : source and target language, one of ('en', 'de') or ('de', 'en')
        (required when task == 'mt')
    LR : learning rate of the Adam optimiser

    Returns
    -------
    dict with the keys: model, model_name, vocab, data, results_path,
    dir_to_lang, optimiser, scheduler, criterion, loss_name, metric_name, logs.
    'criterion' holds the loss class itself, not an instance; 'logs' starts as
    an empty dict; 'dir_to_lang' is None for the inflection task.

    Raises
    ------
    ValueError : unknown task, language pair or setting
    NotImplementedError : alpha_output not in {1.0, 1.5, 2.0} (raised only
        after the data has been loaded)
    '''

    data_path = os.path.join(project_path, 'data')
    results_path = os.path.join(project_path, 'results')

    vocab = dict()
    data = dict()
    direction_to_lang = None

    # The model name is built up incrementally: task, then language pair or
    # setting, then the two alpha values.
    model_name = '%s' % task

    embed_size = None
    hidden_size = None

    if task == 'mt':

        embed_size = 500
        hidden_size = 500

        data_path = os.path.join(data_path, 'translation')
        results_path = os.path.join(results_path, 'translation')

        if not ((L1 == 'en' and L2 == 'de') or
                (L1 == 'de' and L2 == 'en')):
            raise ValueError('Specify languages correctly')

        model_name += ' %s -> %s' % (L1, L2)
        direction_to_lang = { 'from' : L1, 'to' : L2 }
        lang_to_dir = { L1 : 'from', L2 : 'to'}
        for lang in [L1, L2]:
            vocab[lang_to_dir[lang]] = make_vocabulary_mt(
                os.path.join(data_path, 'vocab.%s' % lang), lang
            )
            data[lang_to_dir[lang]] = dict()
            for set_type in ['train', 'dev', 'test']:
                data[lang_to_dir[lang]][set_type] = load_data_file(data_path,
                    '%s.BPE.%s' % (set_type, lang))
                if lang == L2:
                    # add SOS, EOS for training and evaluation
                    # NOTE(review): if the sentences differ in length this
                    # np.array call receives a ragged list; recent NumPy
                    # versions reject that without dtype=object - confirm.
                    data[lang_to_dir[L2]][set_type] = np.array([
                        [BEGIN_TOKEN] + x + [END_TOKEN] for x in data[lang_to_dir[L2]][set_type]
                    ])

    elif task == 'inflection':

        embed_size = 300
        hidden_size = 300

        data_path = os.path.join(data_path, 'inflection')
        results_path = os.path.join(results_path, 'inflection')

        if setting not in {'medium', 'high'}:
            raise ValueError('Specify setting correctly')

        model_name += ' %s' % setting
        data_path = os.path.join(data_path, setting)
        results_path = os.path.join(results_path, setting)

        # Data files are named '<set_type>_x' (source) and '<set_type>_y' (target).
        direction_to_xy = { 'from' : 'x', 'to' : 'y'}
        for direction in ['from', 'to']:
            data[direction] = dict()
            for set_type in ['train', 'dev', 'test']:
                data[direction][set_type] = load_data_file(data_path,
                    '%s_%s' % (set_type, direction_to_xy[direction]))
            # The vocabulary is built from the training split only.
            vocab[direction] = make_vocabulary_inflection(data[direction]['train'], direction,
                                                          setting + ' ' + direction)

    else:
        raise ValueError('Specify task correctly')


    # Select the loss class matching alpha_output; it is stored uninstantiated.
    criterion, loss_name = None, None
    if alpha_output == 1.0:
        criterion = nn.CrossEntropyLoss
        loss_name = 'cross entropy'
    elif alpha_output == 1.5:
        criterion = Entmax15Loss
        loss_name = '1.5-entmax loss'
    elif alpha_output == 2.0:
        criterion = SparsemaxLoss
        loss_name = '2-entmax loss'
    else:
        raise NotImplementedError('Select alpha from {1.0, 1.5, 2.0}')
    model_name += ' alpha_attn %s alpha_out %s' % (str(alpha_attn), str(alpha_output))


    # Use the first CUDA device when available, otherwise the CPU.
    DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = Seq2Seq(vocab['from'], vocab['to'], embed_size, hidden_size, DEVICE,
                    alpha_attn)
    model = model.to(DEVICE)
    print('Model name: %s' % model_name)
    print('DEVICE:', DEVICE)

    n_params_grad = sum(p.numel() for p in model.parameters() if p.requires_grad)
    n_params_no_grad = sum(p.numel() for p in model.parameters() if not p.requires_grad)
    print('Parameters requiring grad: %d, other parameters: %d' % (n_params_grad, n_params_no_grad))

    optimiser = optim.Adam(model.parameters(), lr=LR)
    # Halve the learning rate as soon as the monitored value stops improving
    # (patience=0).
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimiser, factor=0.5, patience=0, verbose=True)


    # Translation is scored with BLEU, inflection with accuracy.
    metric_name = 'bleu' if task == 'mt' else 'accuracy'

    print('Loss: %s\nMetric: %s' % (loss_name, metric_name))


    return {
        'model' : model,
        'model_name' : model_name,
        'vocab' : vocab,
        'data' : data,
        'results_path' : results_path,
        'dir_to_lang' : direction_to_lang,
        'optimiser' : optimiser,
        'scheduler' : scheduler,
        'criterion' : criterion,
        'loss_name' : loss_name,
        'metric_name' : metric_name,
        'logs' : dict()
    }




### Create model:

In [0]:
# choose your model parameters:
TASK = 'inflection' # 'mt' or 'inflection'
ALPHA_ATTENTION = 1.0 # alpha for the attention mechanism
ALPHA_LOSS = 1.0 # alpha for the output loss
INFLECTION_SETTING = 'medium' # 'medium' or 'high', None by default
MT_L1 = 'de' # source language: 'en' or 'de', None by default
MT_L2 = 'en' # target language: 'de' or 'en', None by default
SAVE_MODEL_AFTER_EPOCH = False # whether to save the weights after every epoch


In [0]:
# Build the model bundle from the configuration constants.
# get_model() takes the output alpha as `alpha_output` and has no
# `save_model_after_each_epoch` parameter (that option belongs to train()),
# so only the arguments it actually accepts are passed here.
model_params = get_model(TASK,
                         alpha_attn=ALPHA_ATTENTION, alpha_output=ALPHA_LOSS,
                         project_path=PROJECT_PATH,
                         setting=INFLECTION_SETTING,
                         L1=MT_L1, L2=MT_L2)


### Load model weights or logs if needed (the model must be created first):

In [0]:
# We could not store the weights of our models on GitHub
# The call below is kept inside a string literal so that it does not run;
# remove the surrounding quotes to load previously saved weights.

'''
load_model_from(model_params['model'],
                model_params['results_path'],
                'model %s.pth' % model_params['model_name'])
'''

In [0]:
# training and validation logs:

# model_params['logs'] = load_object_from(model_params['results_path'], 'logs %s.pkl' % model_params['model_name'])


### Train model:

In [0]:
# Training hyper-parameters used by the training and evaluation cells.
BATCH_SIZE = 64 # samples per batch
N_EPOCHS = 1 # passes over the training set
MAX_ITER = None # cap on batches per epoch; None means no cap
USE_SHUFFLE = True # shuffle the training samples every epoch


In [0]:
# Train the model and keep the returned loss/metric logs in the bundle.
# SAVE_MODEL_AFTER_EPOCH is forwarded here because train() is the function
# that implements per-epoch checkpointing.
model_params['logs'] = train(model_params,
                             n_epochs=N_EPOCHS, max_iter=MAX_ITER, batch_size=BATCH_SIZE, shuffle=USE_SHUFFLE,
                             save_model_after_each_epoch=SAVE_MODEL_AFTER_EPOCH)


In [0]:
# Evaluate on the held-out test split; test() returns (mean loss, mean metric),
# which the notebook displays as the cell output.
# test() itself also switches the model to eval mode.
model_params['model'].eval()

test(model_params['model'],
     model_params['data']['from']['test'],
     model_params['data']['to']['test'],
     model_params['criterion'], model_params['metric_name'], BATCH_SIZE)


### Save results:

In [0]:
# Save the trained weights as 'model <model_name>.pth' in the results folder.
save_model_to(model_params['model'], model_params['results_path'], 'model %s.pth' % model_params['model_name'])


In [0]:
# Pickle the training/validation logs as 'logs <model_name>.pkl' in the results folder.
save_object_to(model_params['logs'], model_params['results_path'], 'logs %s.pkl' % model_params['model_name'])
