<b>Transformer</b>

adapted from: https://github.com/zhangxiangnick/Transformer-py

In [2]:

from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [3]:
! pip install pycuda

Collecting pycuda
[?25l  Downloading https://files.pythonhosted.org/packages/58/33/cced4891eddd1a3ac561ff99081019fddc7838a07cace272c941e3c2f915/pycuda-2018.1.1.tar.gz (1.6MB)
[K    100% |████████████████████████████████| 1.6MB 19.0MB/s 
[?25hCollecting pytools>=2011.2 (from pycuda)
[?25l  Downloading https://files.pythonhosted.org/packages/ac/a3/f54f7190315ad41b7334d8733350e7fcefded8f25e0b45e2329b80279921/pytools-2019.1.tar.gz (57kB)
[K    100% |████████████████████████████████| 61kB 27.6MB/s 
Collecting appdirs>=1.4.0 (from pycuda)
  Downloading https://files.pythonhosted.org/packages/56/eb/810e700ed1349edde4cbdc1b2a21e28cdf115f9faf263f6bbf8447c1abf3/appdirs-1.4.3-py2.py3-none-any.whl
Collecting mako (from pycuda)
[?25l  Downloading https://files.pythonhosted.org/packages/a1/bb/f4e5c056e883915c37bb5fb6fab7f00a923c395674f83bfb45c9ecf836b6/Mako-1.0.9.tar.gz (459kB)
[K    100% |████████████████████████████████| 460kB 30.5MB/s 
Building wheels for collected packages: pycuda, pytool

### Нужные классы и функции

In [0]:
import collections
import datetime
import inspect
import logging
import os
import signal
import types
from pprint import pprint

import dill
import torch
import torch.nn.init as init
from pycuda import autoinit, driver
from torch import nn


#######################################################################################################################

def gpu_stat():
    if torch.cuda.is_available():

        def pretty_bytes(byte_, precision=1):
            abbrevs = (
                (1 << 50, 'PB'), (1 << 40, 'TB'), (1 << 30, 'GB'), (1 << 20, 'MB'), (1 << 10, 'kB'), (1, 'bytes'))
            if byte_ == 1:
                return '1 byte'
            factor, suffix = 1, ''
            for factor, suffix in abbrevs:
                if byte_ >= factor:
                    break
            return '%.*f%s' % (precision, byte_ / factor, suffix)

        device = autoinit.device
        print('GPU Name: %s' % device.name())
        print('GPU Memory: %s' % pretty_bytes(device.total_memory()))
        print('CUDA Version: %s' % str(driver.get_version()))
        print('GPU Free/Total Memory: %d%%' % ((driver.mem_get_info()[0] / driver.mem_get_info()[1]) * 100))


#######################################################################################################################


class HYPERPARAMETERS(collections.OrderedDict):
    """
    Class to make it easier to access hyper parameters by either dictionary or attribute syntax.
    """

    def __init__(self, dictionary):
        super(HYPERPARAMETERS, self).__init__(dictionary)

    def __getattr__(self, name):
        return self[name]

    def __setattr__(self, name, value):
        self[name] = value

    def __getstate__(self):
        return self

    def __setstate__(self, d):
        self = d

    def pprint(self, path):
        with open(path, "w+") as h_file:
            pprint(self, stream=h_file)

    @staticmethod
    def create_timestamp(ts=None):
        ts = datetime.datetime.now().timestamp() if ts is None else ts
        dts = datetime.datetime.fromtimestamp(ts)
        return '{:04d}-{:02d}-{:02d}-{:02d}-{:02d}-{:02d}-{:06d}'.format(dts.year, dts.month, dts.day, dts.hour,
                                                                         dts.minute, dts.second, dts.second,
                                                                         dts.microsecond)

    @staticmethod
    def convert_timestamp(time_str=None):
        dts = datetime.datetime(*[int(parts) for parts in time_str.split('-')])
        return dts.timestamp()

    @staticmethod
    def load(path):
        with open(path, 'rb') as in_strm:
            h = dill.load(in_strm)
        return h

    @staticmethod
    def dump(h, path):
        with open(path, 'wb') as out_strm:
            dill.dump(h, out_strm)

    def __repr__(self):
        fmt_str = '{' + '\n'
        for k, v in self.items():
            if '__class__' in k:
                continue
            if isinstance(v, types.LambdaType):  # function or lambda
                if v.__name__ in '<lambda>':
                    try:
                        fmt_str += inspect.getsource(v)
                    except:
                        fmt_str += "    " + "'{}'".format(k).ljust(32) + ": '" + str(v) + "' ,\n"
                else:
                    fmt_str += "    " + "'{}'".format(k).ljust(32) + ': ' + v.__name__ + ' ,\n'
            elif isinstance(v, type):  # class
                fmt_str += "    " + "'{}'".format(k).ljust(32) + ': ' + v.__name__ + ' ,\n'
            else:  # everything else
                if isinstance(v, str):
                    fmt_str += "    " + "'{}'".format(k).ljust(32) + ": '" + str(v) + "' ,\n"
                else:
                    fmt_str += "    " + "'{}'".format(k).ljust(32) + ': ' + str(v) + ' ,\n'
        fmt_str += '}\n'
        return fmt_str


#######################################################################################################################


class Metric(object):
    """
    Class to track runtime statistics easier. Inspired by History Variables that not only store the current value,
    but also the values previously assigned. (see https://rosettacode.org/wiki/History_variables)
    """

    def __init__(self, metrics):
        self.metrics = [m[0] for m in metrics]
        self.init_vals = {m[0]: m[1] for m in metrics}
        self.values = {}
        for name in self.metrics:
            self.values[name] = []

    def __setattr__(self, name, value):
        self.__dict__[name] = value
        if name in self.metrics:
            self.values[name].append(value)

    def __getattr__(self, attr):
        if attr in self.metrics and not len(self.values[attr]):
            val = self.init_vals[attr]
        else:
            val = self.__dict__[attr]
        return val

    def values(self, metric):
        return self.values[metric]

    def state_dict(self):
        state = {}
        for m in self.metrics:
            state[m] = self.values[m]
        return state

    def load_state_dict(self, state_dict):
        for m in state_dict:
            self.values[m] = state_dict[m]


#######################################################################################################################
# https://gist.github.com/jeasinema/ed9236ce743c8efaf30fa2ff732749f5


def torch_weight_init(m):
    """
    Usage:
        model = Model()
        model.apply(weight_init)
    """
    if isinstance(m, nn.Conv1d):
        init.normal(m.weight.data)
        init.normal(m.bias.data)
    elif isinstance(m, nn.Conv2d):
        init.xavier_normal(m.weight.data)
        init.normal(m.bias.data)
    elif isinstance(m, nn.Conv3d):
        init.xavier_normal(m.weight.data)
        init.normal(m.bias.data)
    elif isinstance(m, nn.ConvTranspose1d):
        init.normal(m.weight.data)
        init.normal(m.bias.data)
    elif isinstance(m, nn.ConvTranspose2d):
        init.xavier_normal(m.weight.data)
        init.normal(m.bias.data)
    elif isinstance(m, nn.ConvTranspose3d):
        init.xavier_normal(m.weight.data)
        init.normal(m.bias.data)
    elif isinstance(m, nn.BatchNorm1d):
        init.normal(m.weight.data, mean=1, std=0.02)
        init.constant(m.bias.data, 0)
    elif isinstance(m, nn.BatchNorm2d):
        init.normal(m.weight.data, mean=1, std=0.02)
        init.constant(m.bias.data, 0)
    elif isinstance(m, nn.BatchNorm3d):
        init.normal(m.weight.data, mean=1, std=0.02)
        init.constant(m.bias.data, 0)
    elif isinstance(m, nn.Linear):
        init.xavier_normal(m.weight.data)
        init.normal(m.bias.data)
    elif isinstance(m, nn.LSTM):
        for param in m.parameters():
            if len(param.shape) >= 2:
                init.orthogonal(param.data)
            else:
                init.normal(param.data)
    elif isinstance(m, nn.LSTMCell):
        for param in m.parameters():
            if len(param.shape) >= 2:
                init.orthogonal(param.data)
            else:
                init.normal(param.data)
    elif isinstance(m, nn.GRU):
        for param in m.parameters():
            if len(param.shape) >= 2:
                init.orthogonal(param.data)
            else:
                init.normal(param.data)
    elif isinstance(m, nn.GRUCell):
        for param in m.parameters():
            if len(param.shape) >= 2:
                init.orthogonal(param.data)
            else:
                init.normal(param.data)
    elif isinstance(m, nn.Embedding):
        m.weight.data.uniform_(-0.1, 0.1)

#######################################################################################################################

def create_logger(H):
    if not os.path.exists(H.EXPERIMENT):
        os.makedirs(H.EXPERIMENT)

    logFormatter = logging.Formatter('%(asctime)s | %(levelname)s : %(message)s')

    fileHandler = logging.FileHandler("{0}/{1}.log".format(H.EXPERIMENT, H.MODEL_NAME))
    fileHandler.setFormatter(logFormatter)

    consoleHandler = logging.StreamHandler()
    consoleHandler.setFormatter(logFormatter)

    logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s',
                        level=logging.INFO, handlers=[consoleHandler, fileHandler])


#######################################################################################################################
# https://github.com/SeanNaren/deepspeech.pytorch/blob/master/model.py


class SequenceWise(nn.Module):
    def __init__(self, module):
        """
        Collapses input of dim T*N*H to (T*N)*H, and applies to a module.
        Allows handling of variable sequence lengths and minibatch sizes.
        :param module: Module to apply input to.
        """
        super(SequenceWise, self).__init__()
        self.module = module

    def forward(self, x):
        t, n = x.size(0), x.size(1)
        x = x.contiguous().view(t * n, -1)
        x = self.module(x)
        x = x.view(t, n, -1)
        return x

    def __repr__(self):
        tmpstr = self.__class__.__name__ + ' (\n'
        tmpstr += self.module.__repr__()
        tmpstr += ')'
        return tmpstr


#######################################################################################################################
# https://stackoverflow.com/questions/842557/how-to-prevent-a-block-of-code-from-being-interrupted-by-keyboardinterrupt-in-py


class DelayedKeyboardInterrupt(object):
    def __init__(self):
        self.signal_received = None

    def __enter__(self):
        self.signal_received = None
        self.old_handler = signal.signal(signal.SIGINT, self.handler)

    def handler(self, sig, frame):
        self.signal_received = (sig, frame)
        print('SIGINT received. Delaying KeyboardInterrupt.')

    def __exit__(self, type_, value, traceback):
        signal.signal(signal.SIGINT, self.old_handler)
        if self.signal_received:
            self.old_handler(*self.signal_received)

#######################################################################################################################


In [0]:
import os
import shutil

import torch


#######################################################################################################################
# https://github.com/IBM/pytorch-seq2seq/blob/master/seq2seq/util/checkpoint.py


class Checkpoint(object):
    """
    Class that manages the saving and loading of a model during training. It allows training to be suspended
    and resumed at a later time.
    """

    def __init__(self, module, optimizer=None, stopping=None, metrics=None,
                 root_dir='./', experiment_dir="model", restore_from=-1, interval=10, verbose=0):

        self.CHECKPOINT_DIR_NAME = 'chkpt'
        self.CHECKPOINT_FILE_NAME = 'state.tar'

        self.module = module
        self.optimizer = optimizer
        self.stopping = stopping
        self.metrics = metrics
        self.interval = interval

        self.root_dir = root_dir
        self.experiment_dir = experiment_dir
        self.restore_from = restore_from
        self.verbose = verbose

        self.timestamp = None

    def create(self, epoch):
        """
        Creates a checkpoint of the current model and related training parameters into a subdirectory of the checkpoint
        directory. The name of the subdirectory is the current local time in Y_M_D_H_M_S format.
        """

        self.timestamp = HYPERPARAMETERS.create_timestamp()
        path = os.path.join(self.root_dir, self.CHECKPOINT_DIR_NAME, self.experiment_dir, self.timestamp)

        if os.path.exists(path):
            shutil.rmtree(path)

        os.makedirs(path)

        state = {
            'timestamp': self.timestamp,
            'epoch': epoch,
            'module': self.module.state_dict(),
            'optimizer': self.optimizer.state_dict() if self.optimizer else None,
            'stopping': self.stopping.state_dict() if self.stopping else None,
            'metrics': self.metrics.state_dict() if self.metrics else None
        }

        torch.save(state, os.path.join(path, self.CHECKPOINT_FILE_NAME))

        if self.verbose:
            print("Created checkpoint in '{}' ".format(path))

    def restore(self):
        """
        Restores a current model and related training parameters from a checkpoint object that was previously
        saved to disk.
        """

        file_name = self.last()

        assert file_name is not None

        state = torch.load(file_name)

        self.timestamp = state['timestamp']
        self.module.load_state_dict(state['module'])
        if self.optimizer:
            self.optimizer.load_state_dict(state['optimizer'])
        if self.stopping:
            self.stopping.load_state_dict(state['stopping'])
        if self.metrics:
            self.metrics.load_state_dict(state['metrics'])

        if self.verbose:
            print("Restored checkpoint from '{}' ".format(file_name))

        return state['epoch']

    def step(self, epoch):
        """"""
        if not epoch % self.interval:
            self.create(epoch)
            if self.verbose:
                print("Epoch: %d checkpoint created!" % epoch)

    def last(self):
        """
        Returns the path to the last saved checkpoint file for a given set of parameters.
        Precondition: at least one checkpoint has been made (i.e., latest checkpoint subdirectory exists).
         """
        checkpoints_path = os.path.join(self.root_dir, self.CHECKPOINT_DIR_NAME, self.experiment_dir)

        try:
            path = sorted(os.listdir(checkpoints_path), reverse=False)[self.restore_from]

            last_path = os.path.join(checkpoints_path, os.path.join(path, self.CHECKPOINT_FILE_NAME))
        except:
            last_path = "undefined"

        return last_path

    def __repr__(self):
        fmt_str = self.__class__.__name__ + '\n'
        fmt_str += '    Timestamp: {}\n'.format(self.timestamp)
        fmt_str += '    Last Checkpoint: {}\n'.format(self.last())
        return fmt_str

#######################################################################################################################


In [0]:
import copy


#######################################################################################################################


class Stopping(object):
    """
    Class implement some of regularization techniques to avoid over-training as described in
    http://page.mi.fu-berlin.de/prechelt/Biblio/stop_tricks1997.pdf
    """

    def __init__(self, model, patience=50):
        self.model = model
        self.patience = patience

        self.best_score = -1
        self.best_score_epoch = 0
        self.best_score_state = None

    def step(self, epoch, train_score, valid_score):
        if valid_score > self.best_score:
            self.best_score = valid_score
            self.best_score_epoch = epoch
            self.best_score_state = copy.deepcopy(self.model.state_dict())
            return False
        elif self.best_score_epoch + self.patience < epoch:
            return True

    def state_dict(self):
        return {
            'patience': self.patience,
            'best_score': self.best_score,
            'best_score_epoch': self.best_score_epoch,
            'best_score_state': self.best_score_state,
        }

    def load_state_dict(self, state_dict):
        self.patience = state_dict['patience']
        self.best_score = state_dict['best_score']
        self.best_score_epoch = state_dict['best_score_epoch']
        self.best_score_state = state_dict['best_score_state']

    def __repr__(self):
        fmt_str = self.__class__.__name__ + '\n'
        fmt_str += '    Patience: {}\n'.format(self.patience)
        fmt_str += '    Best Score: {:.4f}\n'.format(self.best_score)
        fmt_str += '    Epoch of Best Score: {}\n'.format(self.best_score_epoch)
        return fmt_str

#######################################################################################################################


In [0]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import sklearn
import torch
import torch.utils.data
import torchvision.transforms as transforms
from IPython.display import display
from graphviz import Digraph
from sklearn import metrics


#######################################################################################################################


def visualize_data(img, tks, vocab, figsize=None, ax=None):
    if img.size(0) is 3:
        pil_img = transforms.ToPILImage()(img)
        pil_img = pil_img.convert('L')
        img = transforms.ToTensor()(pil_img)

    img = img.squeeze().cpu().numpy()
    if isinstance(tks, str):
        txt = tks
    else:
        txt = ''.join([vocab.idx2token[tkn.item()] for tkn in tks])

    if figsize is not None:
        plt.figure(figsize=figsize)

    if not ax:
        plt.title(txt)
        plt.imshow(img, cmap='gray')
    else:
        ax.set_title(txt)
        ax.imshow(img, cmap='gray')


###################################################################################################################


def plot_learning_curves(m, loss_ylim=(0, 1.0), score_ylim=(0.0, 1.0), figsize=(14, 6)):
    train_loss = m.values['train_loss'] if 'train_loss' in m.values else None
    train_score = m.values['train_score'] if 'train_score' in m.values else None
    train_lr = m.values['train_lr'] if 'train_lr' in m.values else None
    valid_loss = m.values['valid_loss'] if 'valid_loss' in m.values else None
    valid_ppl = m.values['valid_ppl'] if 'valid_ppl' in m.values else None
    valid_score = m.values['valid_score'] if 'valid_score' in m.values else None

    train_epochs = np.linspace(1, len(train_loss), len(train_loss))

    fig, ax = plt.subplots(1, 2, figsize=figsize)

    if train_loss is not None:
        loss_train_min = np.min(train_loss)
        ax[0].plot(train_epochs, train_loss, color="r",
                   label="Trainings loss (min %.4f)" % loss_train_min)  # alpha=0.3)

    if valid_loss is not None:
        loss_valid_min = np.min(valid_loss)
        ax[0].plot(train_epochs, valid_loss, color="b",
                   label="Validation loss (min %.4f)" % loss_valid_min)  # alpha=0.3)
        ax[0].legend(loc="best")

    if train_lr is not None:
        ax0 = ax[0].twinx()
        ax0.plot(train_epochs, train_lr, color="g", label="Learning Rate")  # alpha=0.3)
        ax0.set_ylabel('learning rate')

    ax[0].set_title("Loss")
    ax[0].set_xlim(0, np.max(train_epochs))
    ax[0].set_ylim(*loss_ylim)
    ax[0].set_xlabel('epochs')
    ax[0].set_ylabel('loss')

    if train_score is not None:
        score_train_max = np.max(train_score)
        ax[1].plot(train_epochs, train_score, color="r",
                   label="Trainings score (max %.4f)" % score_train_max)

    if valid_score is not None:
        score_valid_max = np.max(valid_score)
        ax[1].plot(train_epochs, valid_score, color="b",
                   label="Validation score (max %.4f)" % score_valid_max)

    if train_lr is not None:
        ax1 = ax[1].twinx()
        ax1.plot(train_epochs, train_lr, color="g", label="Learning Rate")  # alpha=0.3)
        ax1.set_ylabel('learning rate')

    ax[1].set_title("Score")
    ax[1].set_xlim(0, np.max(train_epochs))
    ax[1].set_ylim(*score_ylim)
    ax[1].set_xlabel('epochs')
    ax[1].set_ylabel('score')
    ax[1].legend(loc="best")

    plt.grid(False)
    plt.tight_layout()


#####################################################################################################################


def plot_cross_validation_scores(scores, figsize=(12, 4)):
    train_score = scores['train_score']
    valid_scores = scores['test_score']
    score_difference = train_score - valid_scores

    plt.figure(figsize=figsize)
    plt.subplot(211)

    train_score_line, = plt.plot(train_score, color='r')
    valid_scores_line, = plt.plot(valid_scores, color='b')
    plt.ylabel("Score", fontsize="14")
    plt.legend([train_score_line, valid_scores_line], ["Train CV", "Validate CV"], bbox_to_anchor=(0, .4, .5, 0))
    plt.title("Train and Validation Cross Validation", x=.5, y=1.1, fontsize="15")

    # Plot bar chart of the difference.
    plt.subplot(212)
    difference_plot = plt.bar(range(len(score_difference)), score_difference)
    plt.xlabel("Cross-fold #")
    plt.legend([difference_plot], ["Test CV - Validation CV Score"], bbox_to_anchor=(0, 1, .8, 0))
    plt.ylabel("Score difference", fontsize="14")

    plt.show()


#####################################################################################################################


def plot_roc_curve(y_true, y_pred, y_proba):
    plt.figure()

    fpr, tpr, _ = metrics.roc_curve(y_true, y_proba[:, 1])
    plt.plot(fpr, tpr, color='red', label="predict_proba")

    fpr, tpr, _ = metrics.roc_curve(y_true, y_pred)
    plt.plot(fpr, tpr, color='darkorange', label="predict")

    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")


#####################################################################################################################
# http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

from sklearn.metrics import confusion_matrix, classification_report
import matplotlib
from matplotlib import cm
import itertools


def cm2inch(*tupl):
    '''
    Specify figure size in centimeter in matplotlib
    Source: http://stackoverflow.com/a/22787457/395857
    By gns-ank
    '''
    inch = 2.54
    if type(tupl[0]) == tuple:
        return tuple(i / inch for i in tupl[0])
    else:
        return tuple(i / inch for i in tupl)


def show_values(pc, fmt="%.2f", **kw):
    '''
    Heatmap with text in each cell with matplotlib's pyplot
    Source: http://stackoverflow.com/a/25074150/395857
    By HYRY
    '''
    pc.update_scalarmappable()
    ax = pc.axes
    for p, color, value in zip(pc.get_paths(), pc.get_facecolors(), pc.get_array()):
        x, y = p.vertices[:-2, :].mean(0)
        if np.all(color[:3] > 0.5):
            color = (0.0, 0.0, 0.0)
        else:
            color = (1.0, 1.0, 1.0)
        ax.text(x, y, fmt % value, ha="center", va="center", color=color, **kw)


def get_cmap():
    '''
    http://stackoverflow.com/questions/37517587/how-can-i-change-the-intensity-of-a-colormap-in-matplotlib
    '''
    cmap = cm.get_cmap('RdBu', 256)  # set how many colors you want in color map
    # modify colormap
    alpha = 1.0
    colors = []
    for ind in range(cmap.N):
        c = []
        if ind < 128 or ind > 210: continue
        for x in cmap(ind)[:3]: c.append(min(1, x * alpha))
        colors.append(tuple(c))
    my_cmap = matplotlib.colors.ListedColormap(colors, name='my_name')
    return my_cmap


def heatmap(AUC, title, xlabel, ylabel, xticklabels, yticklabels, ax, correct_orientation=False,
            cmap='RdBu', fmt="%.2f", graph_filepath='', normalize=False, remove_diagonal=False):
    '''
    Inspired by:
    - http://stackoverflow.com/a/16124677/395857
    - http://stackoverflow.com/a/25074150/395857
    '''
    if normalize:
        AUC = sklearn.preprocessing.normalize(AUC, norm='l1', axis=1)

    if remove_diagonal:
        matrix = np.copy(AUC)
        np.fill_diagonal(matrix, 0)
        if len(xticklabels) > 2:
            matrix[:, -1] = 0
            matrix[-1, :] = 0
        values = matrix.flatten()
    else:
        values = AUC.flatten()
    vmin = values.min()
    vmax = values.max()

    # c = ax.pcolor(AUC, edgecolors='k', linestyle= 'dashed', linewidths=0.2, cmap='RdBu', vmin=0.0, vmax=1.0)
    c = ax.pcolor(AUC, edgecolors='k', linestyle='dashed', linewidths=0.2, cmap=get_cmap(), vmin=vmin, vmax=vmax)

    # put the major ticks at the middle of each cell
    ax.set_yticks(np.arange(AUC.shape[0]) + 0.5, minor=False)
    ax.set_xticks(np.arange(AUC.shape[1]) + 0.5, minor=False)

    # set tick labels
    ax.set_xticklabels(xticklabels, minor=False)
    ax.set_yticklabels(yticklabels, minor=False)

    # set title and x/y labels
    plt.title(title, y=1.08)

    plt.tight_layout()

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    # Remove last blank column
    plt.xlim((0, AUC.shape[1]))

    # Turn off all the ticks
    ax = plt.gca()
    for t in ax.xaxis.get_major_ticks():
        t.tick1On = False
        t.tick2On = False
    for t in ax.yaxis.get_major_ticks():
        t.tick1On = False
        t.tick2On = False

    # Add color bar
    plt.colorbar(c)

    # Add text in each cell
    show_values(c, fmt=fmt)

    # Proper orientation (origin at the top left instead of bottom left)
    if correct_orientation:
        ax.invert_yaxis()
        ax.xaxis.tick_top()

    if graph_filepath != '':
        plt.savefig(graph_filepath, dpi=300, format='png', bbox_inches='tight')
        plt.close()


def plot_classification_report(classification_report, title='Classification report ', cmap='RdBu',
                               figsize=(12, 9), ax=None):
    '''
    Plot scikit-learn classification report.
    Extension based on http://stackoverflow.com/a/31689645/395857
    '''

    from matplotlib.cbook import MatplotlibDeprecationWarning
    import warnings
    warnings.simplefilter('ignore', MatplotlibDeprecationWarning)

    classes = []
    plotMat = []
    support = []
    class_names = []

    lines = classification_report.split('\n')
    for line in lines[2: (len(lines) - 1)]:
        t = line.strip().replace('avg / total', 'micro-avg').split()
        if len(t) < 2: continue
        classes.append(t[0])
        v = [float(x) * 100 for x in t[1: len(t) - 1]]
        support.append(int(t[-1]))
        class_names.append(t[0])
        plotMat.append(v)

    xlabel = 'Metrics'
    ylabel = 'Classes'
    xticklabels = ['Precision', 'Recall', 'F1-score']
    yticklabels = ['{0} ({1})'.format(class_names[idx], sup) for idx, sup in enumerate(support)]
    #    figure_width = 16
    #    figure_height = len(class_names) + 8
    correct_orientation = True

    # Plot it out
    if ax is None:
        fig, ax = plt.subplots(figsize=figsize)
    else:
        fig = plt.gcf()
        fig.sca(ax)

    heatmap(np.array(plotMat), title, xlabel, ylabel, xticklabels, yticklabels, ax, correct_orientation, cmap=cmap)

    # resize
    # fig.set_size_inches(cm2inch(figsize[0], figsize[1]))


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Greens, ax=None):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """

    if not ax is None:
        plt.gcf().sca(ax)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, y=1.08)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    ax.xaxis.set_label_position('top')


def plot_classifier_summary(y_true, y_pred, target_names, figsize=(12, 5)):
    fig, ax = plt.subplots(1, 2, figsize=figsize)

    plot_classification_report(classification_report(y_true, y_pred, target_names=target_names), ax=ax[0])
    plot_confusion_matrix(confusion_matrix(y_true, y_pred), target_names, False, ax=ax[1])


####################################################################################################################

from sklearn.manifold import TSNE


def plot_scatter_plots(X, y_pred, y_proba, y_true, target_names, figsize=(12, 4)):
    tsne = TSNE(n_components=2, init='pca', random_state=0)
    tsne_data = tsne.fit_transform(X)

    idx = y_pred != y_true

    # set up figure
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=figsize)

    # Plot
    ax1.scatter(tsne_data[np.where(y_true == 1), 0], tsne_data[np.where(y_true == 1), 1],
                c='r', label=target_names[1])
    ax1.scatter(tsne_data[np.where(y_true == 0), 0], tsne_data[np.where(y_true == 0), 1],
                c='b', label=target_names[0])

    ax1.scatter(tsne_data[idx, 0], tsne_data[idx, 1], alpha=.8, lw=2, label="Error",
                facecolors='none', edgecolors='black', marker='o', s=80)

    ax2.scatter(y_proba[np.where(y_true == 1), 0], y_proba[np.where(y_true == 1), 1], c='r', label=target_names[1])
    ax2.scatter(y_proba[np.where(y_true == 0), 0], y_proba[np.where(y_true == 0), 1], c='b', label=target_names[0])

    ax2.scatter(y_proba[idx, 0], y_proba[idx, 1], alpha=.8, lw=2, label="Error",
                facecolors='none', edgecolors='black', marker='o', s=80)

    ax1.axes.get_xaxis().set_ticks([])
    ax1.axes.get_yaxis().set_ticks([])
    ax2.axes.get_xaxis().set_ticks([])
    ax2.axes.get_yaxis().set_ticks([])

    fig.suptitle('Scatter Plots', fontsize=20, fontweight='bold')
    plt.legend(loc=2, borderaxespad=.1, scatterpoints=1, bbox_to_anchor=(1.05, 1))

    fig.text(.25, .05, 'TSNE Test Data', fontsize=15)
    fig.text(.65, .05, 'CLF Proba Data', fontsize=15)


###################################################################################################################

def classifier_summary_report(X, y_true, y_pred, target_names):
    valid_score = metrics.f1_score(y_true, y_pred)
    acc_score = metrics.accuracy_score(y_true, y_pred)
    roc_score = metrics.roc_auc_score(y_true, y_pred)
    loss_score = metrics.log_loss(y_true, y_pred)

    print("Note: weighted average f1-score \n",
          metrics.classification_report(y_true, y_pred, target_names=target_names)
          )

    display(
        'Data points=%d' % X.shape[0],
        'Features=%d' % X.shape[1],
        'Class dist.=%f' % np.mean(y_true),
        'F1 valid=%f' % valid_score,
        'ACC=%f' % acc_score,
        'ROC_AUC=%f' % roc_score,
        'LOG_LOSS=%f' % loss_score,
        'Misclassified=%d' % np.sum(y_true != y_pred),
        'Data points=' + str([i for (i, v) in enumerate(y_true != y_pred) if v][:20])
    )


###################################################################################################################

def class_info(classes):
    counts = Counter(classes)
    total = sum(counts.values())
    print("class percentages:")
    for cls in counts.keys():
        print("%6s: % 7d  =  % 5.1f%%" % (cls, counts[cls], counts[cls] / total * 100))


def dataset_statistics(X_train, y_train, X_valid, y_valid, X_test, y_test, target_names):
    print("")
    print("Dataset statistics:")
    print("===================")
    print("%s %d" % ("number of features:".ljust(30), X_train.shape[1]))
    print("%s %d" % ("number of classes:".ljust(30), np.unique(y_train).shape[0]))
    print("%s %s" % ("data type:".ljust(30), X_train.dtype))
    print("%s %d (size=%dMB)"
          % ("number of train samples:".ljust(30), X_train.shape[0], int(X_train.nbytes / 1e6)))
    print("%s %d (size=%dMB)"
          % ("number of validation samples:".ljust(30), X_valid.shape[0], int(X_valid.nbytes / 1e6)))
    print("%s %d (size=%dMB)"
          % ("number of test samples:".ljust(30), X_test.shape[0], int(X_test.nbytes / 1e6)))
    print("%s %s" % ("classes".ljust(30), str(target_names)))
    class_info(y_train)


###################################################################################################################

def plot_loss_curve(train_loss, train_score=None, valid_loss=None, valid_score=None, train_lr=None):
    train_epochs = np.linspace(1, len(train_loss), len(train_loss))

    fig, ax = plt.subplots(1, 2, figsize=(14, 6))

    if not train_loss is None:
        loss_train_min = np.min(train_loss)
        ax[0].plot(train_epochs, train_loss, color="r",
                   label="Trainings loss (min %.4f)" % loss_train_min)  # alpha=0.3)

    if not valid_loss is None:
        loss_valid_min = np.min(valid_loss)
        ax[0].plot(train_epochs, valid_loss, color="b",
                   label="Validation loss (min %.4f)" % loss_valid_min)  # alpha=0.3)

    if not train_lr is None:
        ax0 = ax[0].twinx()
        ax0.plot(train_epochs, train_lr, color="g", label="Learning Rate")  # alpha=0.3)
        ax0.set_ylabel('lr')

    ax[0].set_title("Loss")
    ax[0].set_xlim(0, np.max(train_epochs))
    #     ax[0].set_ylim(0, 1)
    ax[0].set_xlabel('epochs')
    ax[0].set_ylabel('loss')
    ax[0].grid(True)
    ax[0].legend(loc="best")

    if not train_score is None:
        score_train_max = np.max(train_score)
        ax[1].plot(train_epochs, train_score, color="r",
                   label="Trainings score (max %.4f)" % score_train_max)

    if not valid_score is None:
        score_valid_max = np.max(valid_score)
        ax[1].plot(train_epochs, valid_score, color="b",
                   label="Validation score (max %.4f)" % score_valid_max)

    ax[1].set_title("Score")
    ax[1].set_xlim(0, np.max(train_epochs))
    ax[1].set_ylim(0.0, 1.02)
    ax[1].set_xlabel('epochs')
    ax[1].set_ylabel('score')
    ax[1].grid(True)
    ax[1].legend(loc="best")

    plt.legend(loc="best")


#####################################################################################################################

# https://stackoverflow.com/questions/42480111/model-summary-in-pytorch
# https://github.com/fchollet/keras/blob/master/keras/utils/layer_utils.py

def model_summary(model, line_length=None, positions=None):
    """Prints a summary of a model.
    # Arguments
        model: model instance.
        line_length: Total length of printed lines
            (e.g. set this to adapt the display to different
            terminal window sizes).
        positions: Relative or absolute positions of log elements in each line.
            If not provided, defaults to `[.33, .55, .67, 1.]`.
        print_fn: Print function to use.
            It will be called on each line of the summary.
            You can set it to a custom function
            in order to capture the string summary.
    """
    out_str = ""
    line_length = line_length or 80
    positions = positions or [.45, .85, 1.]
    if positions[-1] <= 1:
        positions = [int(line_length * p) for p in positions]
    # header names for the different log elements
    to_display = ['Layer (type)', 'Shape', 'Param #']

    def print_row(fields, positions):
        line = ''
        for i in range(len(fields)):
            if i > 0:
                line = line[:-1] + ' '
            line += str(fields[i])
            line = line[:positions[i]]
            line += ' ' * (positions[i] - len(line))
        return line

    out_str += "Summary for model: " + model.__class__.__name__ + "\n"
    out_str += '_' * line_length + "\n"
    out_str += print_row(to_display, positions) + "\n"
    out_str += '=' * line_length + "\n"

    def print_module_summary(name, module):
        count_params = sum([np.prod(p.size()) for p in module.parameters()])
        output_shape = tuple([tuple(p.size()) for p in module.parameters()])
        cls_name = module.__class__.__name__
        fields = [name + ' (' + cls_name + ')', output_shape, count_params]
        return print_row(fields, positions)

    module_count = len(set(model.modules()))
    for i, item in enumerate(model.named_modules()):
        name, module = item
        cls_name = str(module.__class__)
        if not 'torch' in cls_name or 'container' in cls_name:
            continue

        out_str += print_module_summary(name, module) + "\n"
        if i == module_count - 1:
            out_str += '=' * line_length + "\n"
        else:
            out_str += '_' * line_length + "\n"

    trainable_count = 0
    non_trainable_count = 0
    for name, param in model.named_parameters():
        if 'bias' in name or 'weight' in name:
            trainable_count += np.prod(param.size())
        else:
            non_trainable_count += np.prod(param.size())

    out_str += 'Total params:         {:,}'.format(trainable_count + non_trainable_count) + "\n"
    out_str += 'Trainable params:     {:,}'.format(trainable_count) + "\n"
    out_str += '_' * line_length + "\n"
    return out_str


#####################################################################################################################

def layer_weight(data):
    mean = np.mean(data)
    std = np.std(data)

    hist, bins = np.histogram(data, bins=50)
    width = np.diff(bins)
    center = (bins[:-1] + bins[1:]) / 2

    return {'mean': mean,
            'std': std,
            'hist': hist,
            'center': center,
            'width': width
            }


def plot_layer_stats(net):
    def to_np(x):
        return x.data.cpu().numpy()

    for name, module in net.named_modules():
        weight_attr = ['weight', 'weight_ih_l0', 'weight_hh_l0']
        weight_list = [w for w in weight_attr if hasattr(module, w)]

        bias_attr = ['bias', 'bias_ih_l0', 'bias_hh_l0']
        bias_list = [b for b in bias_attr if hasattr(module, b)]

        if not (weight_list and bias_list):
            continue

        for idx in range(len(weight_attr)):
            plt.figure(idx, figsize=(10, 4))

            if hasattr(module, weight_attr[idx]):
                if type(getattr(module, weight_attr[idx])) is torch.nn.parameter.Parameter:
                    w = layer_weight(to_np(getattr(module, weight_attr[idx])))

                    ax = plt.subplot2grid((1, 2), (0, 0))
                    ax.set_title("Module: %s-" % name + weight_attr[idx] +
                                 "\n Mean # %.4f" % w['mean'] + " STD # %.2e" % w['std'])
                    ax.bar(w['center'], w['hist'], align='center', width=w['width'])

            if hasattr(module, bias_attr[idx]):
                if type(getattr(module, bias_attr[idx])) is torch.nn.parameter.Parameter:
                    b = layer_weight(to_np(getattr(module, bias_attr[idx])))

                    ax = plt.subplot2grid((1, 2), (0, 1))
                    ax.set_title("Module: %s-" % name + bias_attr[idx] +
                                 "\n Mean # %.4f" % b['mean'] + " STD # %.2e" % b['std'])
                    ax.bar(b['center'], b['hist'], align='center', width=b['width'])

            plt.show()


#####################################################################################################################

def plot_model_graph(var, params):
    """ Produces Graphviz representation of PyTorch autograd graph

    Blue nodes are the Variables that require grad, orange are Tensors
    saved for backward in torch.autograd.Function

    Args:
        var: output Variable
        params: dict of (name, Variable) to add names to node that
            require grad (TODO: make optional)
    """
    param_map = {id(v): k for k, v in params.items()}

    node_attr = dict(style='filled',
                     shape='box',
                     align='left',
                     fontsize='12',
                     ranksep='0.1',
                     height='0.2')
    dot = Digraph(node_attr=node_attr, graph_attr=dict(size="8,8"))
    seen = set()

    def size_to_str(size):
        # noinspection PyRedundantParentheses
        return '(' + (', ').join(['%d' % v for v in size]) + ')'

    def add_nodes(var):
        if var not in seen:
            if torch.is_tensor(var):
                dot.node(str(id(var)), size_to_str(var.size()), fillcolor='orange')
            elif hasattr(var, 'variable'):
                u = var.variable
                node_name = '%s\n %s' % (param_map.get(id(u)), size_to_str(u.size()))
                dot.node(str(id(var)), node_name, fillcolor='lightblue')
            else:
                dot.node(str(id(var)), str(type(var).__name__))
            seen.add(var)
            if hasattr(var, 'next_functions'):
                for u in var.next_functions:
                    if u[0] is not None:
                        dot.edge(str(id(u[0])), str(id(var)))
                        add_nodes(u[0])
            if hasattr(var, 'saved_tensors'):
                for t in var.saved_tensors:
                    dot.edge(str(id(t)), str(id(var)))
                    add_nodes(t)

    add_nodes(var.grad_fn)
    return dot

###################################################################################################################

# from imblearn.base import *
# from imblearn.utils import check_target_type, hash_X_y
# import logging
#
#
# class OutlierSampler(SamplerMixin):
#     def __init__(self, threshold=1.5, memory=None, verbose=0):
#         self.threshold = threshold
#         self.verbose = verbose
#         self.logger = logging.getLogger(__name__)
#
#         self.X_hash_, self.y_hash_ = None, None
#
#     def sample(self, X, y):
#         # Check the consistency of X and y
#         X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
#
#         check_is_fitted(self, 'X_hash_')
#         self._check_X_y(X, y)
#
#         X_out, y_out = self._sample(X, y)
#
#         return X_out, y_out
#
#     def _sample(self, X, y):
#         outliers = []
#         for col in X.T:  # loop over feature columns
#             Q1 = np.percentile(col, 25)  # Calculate Q1 (25th percentile of the data) for the given feature
#             Q3 = np.percentile(col, 75)  # Calculate Q3 (75th percentile of the data) for the given feature
#
#             step = self.threshold * (Q3 - Q1)  # Use the interquartile range to calculate an outlier step
#
#             feature_outliers = np.where(~((col >= Q1 - step) & (col <= Q3 + step)))[0]
#             outliers.extend(feature_outliers)
#
#         # Find the data points that where considered outliers for more than one feature
#         multi_feature_outliers = list((Counter(outliers) - Counter(set(outliers))).keys())
#
#         X_out = np.delete(X, multi_feature_outliers, axis=0)
#         y_out = np.delete(y, multi_feature_outliers, axis=0)
#
#         if self.verbose:
#             print('Sampled - reduced points form / to: ', X.shape, X_out.shape)
#         return X_out, y_out
#
#     def fit(self, X, y):
#         X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
#         y = check_target_type(y)
#         self.X_hash_, self.y_hash_ = hash_X_y(X, y)
#
#         self._fit(X, y)
#
#         return self
#
#     def _fit(self, X, y):
#         if self.verbose:
#             print('OutlierSampler Fitted X/y: ', X.shape, y.shape)
#         return self
#
#     def fit_sample(self, X, y):
#         return self.fit(X, y).sample(X, y)

###################################################################################################################


In [0]:
import inspect
import os
import shutil

import tensorflow as tf
from tqdm import tqdm


#######################################################################################################################


class TensorboardLogger(object):
    """ Visualize the training results of running a pytorch model to Tensorboard """

    def __init__(self, root_dir="./", experiment_dir="model", verbose=0):
        self.root_dir = root_dir
        self.experiment_dir = experiment_dir
        self.verbose = verbose

        self.LOG_DIR_NAME = 'logs'

        self.iterable = None

        self.last_logged_values = []
        self.epoch = -1

        date_time = HYPERPARAMETERS.create_timestamp()
        path = os.path.join(self.root_dir, self.LOG_DIR_NAME, self.experiment_dir, date_time)

        if os.path.exists(path):
            shutil.rmtree(path)

        os.makedirs(path)

        self.writer = tf.summary.FileWriter(path)

    def set_itr(self, iterable):
        self.iterable = iterable
        return self

    def __iter__(self):

        assert self.iterable is not None

        for obj in self.iterable:
            self.epoch = obj
            yield obj

    def log_values(self, train_loss, train_score, train_lr,
                   valid_loss, valid_score, best_score_epoch, best_score):

        args, _, _, values = inspect.getargvalues(inspect.currentframe())
        self.last_logged_values = [(i, values[i]) for i in args[1:]]

        summary = tf.Summary(value=[
            tf.Summary.Value(tag='train_loss', simple_value=train_loss),
            tf.Summary.Value(tag='train_score', simple_value=train_score),
            tf.Summary.Value(tag='train_lr', simple_value=train_lr),
            tf.Summary.Value(tag='valid_loss', simple_value=valid_loss),
            tf.Summary.Value(tag='valid_score', simple_value=valid_score),
            tf.Summary.Value(tag='best_score_epoch', simple_value=best_score_epoch),
            tf.Summary.Value(tag='best_score', simple_value=best_score)
        ])
        self.writer.add_summary(summary, self.epoch)

    def __repr__(self):
        fmt_str = self.__class__.__name__ + '\n'
        fmt_str += '    Last Epoch/LR:    {} / {}\n'.format(self.epoch, self.last_logged_values[2][1])
        fmt_str += '    Train Loss/Score: {} / {}\n'.format(self.last_logged_values[0][1],
                                                            self.last_logged_values[1][1])
        fmt_str += '    Valid Loss/Score: {} / {}\n'.format(self.last_logged_values[3][1],
                                                            self.last_logged_values[4][1])
        fmt_str += '    Best Epoch/Score: {} / {}\n'.format(self.last_logged_values[5][1],
                                                            self.last_logged_values[6][1])
        return fmt_str


#######################################################################################################################


class PytorchLogger(object):
    """ Visualize the training results of running a pytorch model using Tqdm """

    def __init__(self, tqdm_cls=tqdm):
        self.iterable = None
        self.last_logged_values = []
        self.epoch = -1
        self.tqdm_cls = tqdm_cls

    def set_itr(self, iterable):
        self.iterable = iterable
        self.iterable = self.tqdm_cls(iterable)
        self.iterable.set_description('Epoch')
        return self

    def __iter__(self):
        assert self.iterable is not None

        for obj in self.iterable:
            self.epoch = obj
            yield obj

    def log_values(self, train_loss, train_score, train_lr,
                   valid_loss, valid_score, best_score_epoch, best_score):
        args, _, _, values = inspect.getargvalues(inspect.currentframe())
        self.last_logged_values = [(i, values[i]) for i in args[1:]]

        self.iterable.set_postfix(last="%i" % self.epoch + "/%.4f" % train_loss + "/%.4f" % train_score,
                                  lr=train_lr,
                                  best="%i" % best_score_epoch + "/%.4f" % best_score)

    def __repr__(self):
        fmt_str = self.__class__.__name__ + '\n'
        fmt_str += '    Last Epoch/LR:    {} / {}\n'.format(self.epoch, self.last_logged_values[2][1])
        fmt_str += '    Train Loss/Score: {} / {}\n'.format(self.last_logged_values[0][1],
                                                            self.last_logged_values[1][1])
        fmt_str += '    Valid Loss/Score: {} / {}\n'.format(self.last_logged_values[3][1],
                                                            self.last_logged_values[4][1])
        fmt_str += '    Best Epoch/Score: {} / {}\n'.format(self.last_logged_values[5][1],
                                                            self.last_logged_values[6][1])
        return fmt_str

#######################################################################################################################


In [10]:
! pip install watermark

Collecting watermark
  Downloading https://files.pythonhosted.org/packages/4b/dc/fb451c174b4f603231875c9ca7116d1b81cdd635172f0fbab248b1d94cd5/watermark-1.8.1-py3-none-any.whl
Installing collected packages: watermark
Successfully installed watermark-1.8.1


### Конец нужных классов и функций

In [11]:
%matplotlib inline
import matplotlib.pyplot as plt

import random

from tqdm import tqdm_notebook
import numpy as np
import math, copy, time

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import torchtext
from torchtext.data import Field

logger = logging.getLogger(__name__)

%load_ext watermark
%watermark -a "tb" -d -v -m -p sys,numpy,pandas,sklearn,torch,IPython
gpu_stat()

tb 2019-04-23 

CPython 3.6.7
IPython 5.5.0

sys 3.6.7 (default, Oct 22 2018, 11:32:17) 
[GCC 8.2.0]
numpy 1.16.2
pandas 0.24.2
sklearn 0.20.3
torch 1.0.1.post2
IPython 5.5.0

compiler   : GCC 8.2.0
system     : Linux
release    : 4.14.79+
machine    : x86_64
processor  : x86_64
CPU cores  : 2
interpreter: 64bit
GPU Name: Tesla T4
GPU Memory: 14.7GB
CUDA Version: (10, 0, 0)
GPU Free/Total Memory: 99%


In [0]:
# torch.cuda.is_available = lambda : False
# torch.backends.cudnn.enabled=False
torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.benchmark = True

In [0]:
H = HYPERPARAMETERS({
    'EXPERIMENT': 'Eng2Ger',
    'DESCRIPTION': 'Transformer model',
    'TIMESTAMP': HYPERPARAMETERS.create_timestamp(),

    'MODEL_NAME': 'Eng2Ger_TRANSFORMER',

    'PRELOAD_MODEL_PATH': None,

    'ROOT_DIR': 'data',

    'TARGET_ENCODING': 'sts',  # ' ctc

    'BATCH_SIZE': 64,
    'NUM_WORKERS': 8,

    'EMBEDDING_SIZE': 256,
    'EMBEDDING_DROPOUT': 0.2,
    'RNN_HIDDEN_SIZE': 256,
    'RNN_NUM_LAYERS': 2,
    'RNN_DROPOUT': 0.2,
    'BIDIRECTIONAL': True,

    'LR': 0.0003,
    'LR_LAMBDA': lambda epoch: max(math.pow(0.78, math.floor((1 + epoch) / 200.0)), 0.01),
    'WEIGHT_DECAY': 0,
    'MOMENTUM': 0.9,
    'NESTEROV': True,

    'LABEL_SMOOTHING' : 0.2,

    'MAX_GRAD_NORM': 1,

    'MAX_EPOCHS': 30,

    'STOPPING_PATIENCE': 80,

    'CHECKPOINT_INTERVAL': 10,
    'CHECKPOINT_RESTORE': False,

    'USE_CUDA': torch.cuda.is_available(),

    'SEED': 123456,
    
    'SEQ_MAX_LEN' :         50,
    'SRC_VOCAB_MAX_SIZE' :  50000,
    'TGT_VOCAB_MAX_SIZE' :  50000,

})

In [0]:
random.seed(H.SEED)
np.random.seed(H.SEED)
torch.manual_seed(H.SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(H.SEED)
    torch.cuda.manual_seed_all(H.SEED)

In [0]:
SYM_SOS = '<sos>'
SYM_EOS = '<eos>'
SYM_PAD = '<pad>'
IDX_SOS = -1
IDX_EOS = -1
IDX_PAD = -1

In [16]:
!python -m spacy download en


[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/en

    You can now load the model via spacy.load('en')



In [18]:
!python -m spacy download de

Collecting de_core_news_sm==2.0.0 from https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.0.0/de_core_news_sm-2.0.0.tar.gz#egg=de_core_news_sm==2.0.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.0.0/de_core_news_sm-2.0.0.tar.gz (38.2MB)
[K    100% |████████████████████████████████| 38.2MB 105.1MB/s 
[?25hInstalling collected packages: de-core-news-sm
  Running setup.py install for de-core-news-sm ... [?25ldone
[?25hSuccessfully installed de-core-news-sm-2.0.0

[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/de_core_news_sm -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/de

    You can now load the model via spacy.load('de')



In [0]:
class Voc:
    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    def trim(self, min_count):
        if self.trimmed:
            return
        self.trimmed = True

        keep_words = []

        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)

        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
        ))

        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3

        for word in keep_words:
            self.addWord(word)

In [0]:
import pickle

In [0]:
with open('gdrive/My Drive/Colab Notebooks/pairs.pickle', 'rb') as handle:
    pairs = pickle.load(handle)

In [0]:
with open('gdrive/My Drive/Colab Notebooks/voc.pickle', 'rb') as handle:
    voc = pickle.load(handle)

In [0]:
def column(matrix, i):
    return [row[i] for row in matrix]

In [0]:
def indexesFromSentence(voc, sentence):
    return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]

In [0]:
import spacy

spacy_en = spacy.load('en')
spacy_de = spacy.load('de')

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer( text )]
    return text.split()

def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)]
    return text.split()

preproc = lambda seq: [SYM_SOS] + seq + [SYM_EOS]

src = Field(sequential=True, tokenize=tokenize_en, lower=True, batch_first=True, 
            include_lengths=True)
tgt = Field(sequential=True, tokenize=tokenize_de, lower=True, batch_first=True, 
            include_lengths=True, preprocessing=preproc)

In [0]:
def len_filter(example):
    return len(example.src) <= H.SEQ_MAX_LEN and len(example.tgt) <= H.SEQ_MAX_LEN

path = "gdrive/My Drive/Colab Notebooks/enggerdata.tsv"
SRC_FIELD_NAME = 'src'
TGT_FIELD_NAME = 'tgt'

train_data, valid_data, test_data = torchtext.data.TabularDataset(
    path=path, format='tsv',
    fields=[(SRC_FIELD_NAME, src), (TGT_FIELD_NAME, tgt)],
    filter_pred=len_filter
    ).split(split_ratio=[0.8, 0.1, 0.1])

In [0]:
class Vocabulary(object):
    def __init__(self, vocab):
        self.vocab = vocab
        
    def __call__(self, val):
        if isinstance(val, str):
            res = self.vocab.stoi[val] if val in self.vocab.stoi else None
        elif isinstance(val, int):
            res = self.vocab.itos[val] if val <= self.__len__() else None
        else:
            raise RuntimeError
        return res   
    
    def __len__(self):
        return len(self.vocab.itos)
    
    def __repr__(self):
        return 'Vocab(size=' + str(len(self.vocab.itos)) + ')'

In [36]:
src.build_vocab(train_data, max_size=H.SRC_VOCAB_MAX_SIZE, min_freq=2)
tgt.build_vocab(train_data, max_size=H.TGT_VOCAB_MAX_SIZE, min_freq=2)

input_vocab = Vocabulary(src.vocab)
output_vocab = Vocabulary(tgt.vocab)

print(input_vocab, output_vocab)

IDX_PAD = output_vocab(SYM_PAD)
IDX_SOS = output_vocab(SYM_SOS)
IDX_EOS = output_vocab(SYM_EOS)

IDX_PAD, IDX_SOS, IDX_EOS

Vocab(size=9510) Vocab(size=15657)


(1, 3, 2)

In [38]:
train_iter, valid_iter, test_iter = torchtext.data.BucketIterator.splits(
                                (train_data, valid_data, test_data), 
                                batch_size=H.BATCH_SIZE, repeat=False, 
                                sort=False, sort_within_batch=True, 
                                sort_key=lambda x: len(x.src))


batch = next(train_iter.__iter__())
input_variables = getattr(batch, 'src')
target_variables = getattr(batch, 'tgt')

len(train_iter), len(valid_iter), len(test_iter)

(2115, 265, 265)

In [0]:
for idx_batch, batch in enumerate(train_iter):
    inputs_cpu, input_sizes_cpu = getattr(batch, SRC_FIELD_NAME)
    labels_cpu, label_sizes_cpu = getattr(batch, TGT_FIELD_NAME)
    break

In [0]:
import math

import numpy as np
import torch
import torch.nn as nn


class PositionalEncoding(nn.Module):
    def __init__(self, dim, dropout=0.0, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dim = dim
        self.dropout = dropout

        pe = torch.zeros(max_len, dim)
        position = torch.arange(0.0, max_len).unsqueeze(1).float()
        div_term = torch.exp((torch.arange(0.0, dim, 2) * -(math.log(10000.0) / dim)))
        pe[:, 0::2] = torch.sin(position.float() * div_term)
        pe[:, 1::2] = torch.cos(position.float() * div_term)
        pe = pe.unsqueeze(0)

        self.register_buffer('pe', pe)
        self.dropout = nn.Dropout(p=self.dropout)

    def forward(self, x):
        x = x * math.sqrt(self.dim)
        x = x + self.pe[:, :x.size(1)]
        x = self.dropout(x)
        return x


class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, d_model, droput):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.droput = droput

        self.d_head = d_model // self.num_heads

        self.fc_query = nn.Linear(self.d_model, self.num_heads * self.d_head, bias=False)
        self.fc_key = nn.Linear(self.d_model, self.num_heads * self.d_head, bias=False)
        self.fc_value = nn.Linear(self.d_model, self.num_heads * self.d_head, bias=False)

        self.fc_concat = nn.Linear(self.num_heads * self.d_head, self.d_model, bias=False)

        self.softmax = nn.Softmax(dim=1)

        self.attn_dropout = nn.Dropout(self.droput)
        self.dropout = nn.Dropout(self.droput)

        self.norm = nn.LayerNorm(self.d_model)

    def _prepare_proj(self, x):
        """Reshape the projectons to apply softmax on each head
        """
        b, l, d = x.size()
        return x.view(b, l, self.num_heads, self.d_head).transpose(1, 2).contiguous().view(b * self.num_heads, l,
                                                                                           self.d_head)

    def forward(self, query, key, value, mask):
        b, len_query = query.size(0), query.size(1)
        len_key = key.size(1)

        # project inputs to multi-heads
        proj_query = self.fc_query(query)  # batch_size x len_query x h*d_head
        proj_key = self.fc_key(key)  # batch_size x len_key x h*d_head
        proj_value = self.fc_value(value)  # batch_size x len_key x h*d_head

        # prepare the shape for applying softmax
        proj_query = self._prepare_proj(proj_query)  # batch_size*h x len_query x d_head
        proj_key = self._prepare_proj(proj_key)  # batch_size*h x len_key x d_head
        proj_value = self._prepare_proj(proj_value)  # batch_size*h x len_key x d_head

        # get dotproduct softmax attns for each head
        attns = torch.bmm(proj_query, proj_key.transpose(1, 2))  # batch_size*h x len_query x len_key
        attns = attns / math.sqrt(self.d_head)
        attns = attns.view(b, self.num_heads, len_query, len_key)
        attns = attns.masked_fill_(mask.unsqueeze(1), -float('inf'))
        attns = self.softmax(attns.view(-1, len_key))

        # return mean attention from all heads as coverage
        coverage = torch.mean(attns.view(b, self.num_heads, len_query, len_key), dim=1)

        attns = self.attn_dropout(attns)
        attns = attns.view(b * self.num_heads, len_query, len_key)

        # apply attns on value
        out = torch.bmm(attns, proj_value)  # batch_size*h x len_query x d_head
        out = out.view(b, self.num_heads, len_query, self.d_head).transpose(1, 2).contiguous()

        out = self.fc_concat(out.view(b, len_query, self.num_heads * self.d_head))

        out = self.dropout(out).add_(query)
        out = self.norm(out)
        return out, coverage

class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout):
        super(PositionwiseFeedForward, self).__init__()
        self.d_model = d_model
        self.d_ff = d_ff
        self.dropout = dropout

        self.fc = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
        )
        self.drop = nn.Dropout(self.dropout)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, inputs):
        out = self.fc(inputs)
        out = self.drop(out).add_(inputs)
        out = self.norm(out)
        return out


class EncoderLayer(nn.Module):
    def __init__(self, num_heads, d_model, dropout, d_ff):
        super(EncoderLayer, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.d_ff = d_ff
        self.dropout = dropout

        self.attention = MultiHeadAttention(self.num_heads, self.d_model, self.dropout)

        self.ff = PositionwiseFeedForward(self.d_model, self.d_ff, self.dropout)

    def forward(self, query, key, value, mask):
        out, _ = self.attention(query, key, value, mask)
        out = self.ff(out)
        return out


class DecoderLayer(nn.Module):
    def __init__(self, num_heads, d_model, dropout, d_ff):
        super(DecoderLayer, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.d_ff = d_ff
        self.dropout = dropout

        self.attention_tgt = MultiHeadAttention(self.num_heads, self.d_model, self.dropout)

        self.attention_src = MultiHeadAttention(self.num_heads, self.d_model, self.dropout)

        self.ff = PositionwiseFeedForward(d_model, self.d_ff, self.dropout)

    def forward(self, query, key, value, context, mask_tgt, mask_src):
        out, _ = self.attention_tgt(query, key, value, mask_tgt)
        out, coverage = self.attention_src(out, context, context, mask_src)
        out = self.ff(out)
        return out, coverage


class Encoder(nn.Module):
    def __init__(self, vocab_size, num_heads, d_model, dropout, d_ff, num_layers=6, padding_idx=1):
        super(Encoder, self).__init__()
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.d_model = d_model
        self.padding_idx = padding_idx
        self.num_layers = num_layers
        self.d_ff = d_ff
        self.dropout = dropout

        self.embeddings = nn.Embedding(self.vocab_size, self.d_model, padding_idx=self.padding_idx)

        self.pos_emb = PositionalEncoding(self.d_model, self.dropout, max_len=512)

        self.layers = nn.ModuleList(
            [EncoderLayer(self.num_heads, self.d_model, self.dropout, self.d_ff) for _ in range(self.num_layers)]
        )

    def forward(self, src):
        context = self.embeddings(src)  # batch_size x len_src x d_model

        context = self.pos_emb(context)

        mask_src = src.data.eq(self.padding_idx).unsqueeze(1)
        for _, layer in enumerate(self.layers):
            context = layer(context, context, context, mask_src)  # batch_size x len_src x d_model
        return context, mask_src


class Decoder(nn.Module):
    def __init__(self, vocab_size, num_heads, d_model, dropout, d_ff, num_layers=6, padding_idx=1):
        super(Decoder, self).__init__()
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.d_model = d_model
        self.padding_idx = padding_idx
        self.num_layers = num_layers
        self.d_ff = d_ff
        self.dropout = dropout

        self.embedding = nn.Embedding(self.vocab_size, self.d_model, padding_idx=self.padding_idx)

        self.pos_emb = PositionalEncoding(self.d_model, self.dropout, max_len=512)

        self.layers = nn.ModuleList(
            [DecoderLayer(self.num_heads, self.d_model, self.dropout, self.d_ff) for _ in range(self.num_layers)]
        )

        self.fc = nn.Linear(self.d_model, self.vocab_size, bias=True)

        # tie weight between word embedding and generator
        self.fc.weight = self.embedding.weight

        self.logsoftmax = nn.LogSoftmax(dim=1)

        # pre-save a mask to avoid future information in self-attentions in decoder
        # save as a buffer, otherwise will need to recreate it and move to GPU during every call
        mask = torch.ByteTensor(np.triu(np.ones((self.d_model, self.d_model)), k=1).astype('uint8'))
        self.register_buffer('mask', mask)

    def forward(self, tgt, context, mask_src):
        out = self.embedding(tgt)  # batch_size x len_tgt x d_model

        out = self.pos_emb(out)

        len_tgt = tgt.size(1)
        mask_tgt = tgt.data.eq(self.padding_idx).unsqueeze(1) + self.mask[:len_tgt, :len_tgt]
        mask_tgt = torch.gt(mask_tgt, 0)
        for _, layer in enumerate(self.layers):
            out, coverage = layer(out, out, out, context, mask_tgt, mask_src)  # batch_size x len_tgt x d_model

        out = self.fc(out)  # batch_size x len_tgt x bpe_size

        out = self.logsoftmax(out.view(-1, self.vocab_size))
        return out, coverage


class Transformer(nn.Module):
    def __init__(self, src_vocab, tgt_vocab, num_heads, d_model, dropout, d_ff, num_layers=6, padding_idx=1):
        super(Transformer, self).__init__()
        self.src_vocab = src_vocab
        self.src_vocab_size = len(src_vocab)
        self.tgt_vocab = tgt_vocab
        self.tgt_vocab_size = len(tgt_vocab)
        self.num_heads = num_heads
        self.d_model = d_model
        self.d_ff = d_ff
        self.num_layers = num_layers
        self.dropout = dropout
        self.padding_idx = padding_idx

        self.encode = Encoder(self.src_vocab_size, self.num_heads, self.d_model, self.dropout, self.d_ff,
                              self.num_layers, self.padding_idx)
        self.decode = Decoder(self.tgt_vocab_size, self.num_heads, self.d_model, self.dropout, self.d_ff,
                              self.num_layers, self.padding_idx)

    def forward(self, src, src_sizes, tgt, tgt_sizes, ):
        context, mask_src = self.encode(src)
        outputs, _ = self.decode(tgt, context, mask_src)

        probas = outputs.view(src.size(0), -1, self.tgt_vocab_size)
        
        return probas, tgt_sizes-1

    def decode_greedy(self, inputs, max_seq_length=50, fixed_length=False):

        self.eval()
        with torch.no_grad():
            
            idx_sos, idx_eos = self.tgt_vocab('<sos>'), self.tgt_vocab('<eos>')

            context, mask_src = self.encode(inputs)

            batch_size = inputs.size(0)
            decode_input = torch.ones(batch_size, 1).fill_(idx_sos).type_as(inputs)

            dec_output_sizes = torch.LongTensor(batch_size).fill_(max_seq_length).type_as(inputs)

            dec_outputs = []
            for step in range(max_seq_length):
                outputs, _ = self.decode(decode_input, context, mask_src)
                outputs = outputs.view(batch_size, -1, self.tgt_vocab_size)

                dec_outputs.append(outputs[:, step, :].unsqueeze(1))

                preds = torch.max(outputs[:, -1, :], dim=1)[1]

                dec_output_sizes[preds.eq(idx_eos) * dec_output_sizes.gt(step)] = step
                if not fixed_length and dec_output_sizes.le(step + 1).all():
                    dec_output_sizes += 1
                    break

                decode_input = torch.cat([decode_input, preds.unsqueeze(1)], dim=1)

            dec_outputs = torch.cat(dec_outputs, dim=1)

        return dec_outputs, dec_output_sizes

    def decode_beam(self, inputs, labels=None, max_seq_length=50, beam_size=64, alpha=0.1, beta=0.3):

        context, mask_src = self.encode(inputs)

        max_seq_len = labels.size(1) if labels is not None else max_seq_length

        dec_outputs = []
        for idx in range(context.size(0)):
            target, _ = beam_search(self, self.tgt_vocab, context[idx].unsqueeze(0), mask_src[idx].unsqueeze(0),
                                    beam_size=beam_size, alpha=alpha, beta=beta, max_seq_len=max_seq_len)
            dec_outputs.append(target)

        return dec_outputs


def beam_search(model, vocab, context, mask_src, beam_size=64, alpha=0.1, beta=0.3, max_seq_len=64):
    probas = []
    preds = []
    probs = []
    coverage_penalties = []

    vocab_size = len(vocab)
    idx_sos, idx_eos, idx_pad = vocab('<sos>'), vocab('<eos>'), vocab('<pad>')

    decode_inputs = torch.LongTensor([idx_sos]).unsqueeze(1)
    if next(model.parameters()).is_cuda:
        decode_inputs = decode_inputs.cuda()

    decode_outputs, coverage = model.decode(decode_inputs, context, mask_src)

    scores, scores_idx = decode_outputs.view(-1).topk(beam_size)
    beam_idx = scores_idx / vocab_size
    pred_idx = (scores_idx - beam_idx * vocab_size).view(beam_size, -1)

    decode_inputs = torch.cat((decode_inputs.repeat(beam_size, 1), pred_idx), 1)
    context = context.repeat(beam_size, 1, 1)

    remaining_beams = beam_size
    for step in range(max_seq_len):
        decode_outputs, coverage = model.decode(decode_inputs, context, mask_src)

        decode_outputs = decode_outputs.view(remaining_beams, -1, vocab_size)
        decode_outputs = scores.unsqueeze(1) + decode_outputs[:, -1, :]
        scores, scores_idx = decode_outputs.view(-1).topk(remaining_beams)

        beam_idx = scores_idx / vocab_size
        pred_idx = (scores_idx - beam_idx * vocab_size).view(remaining_beams, -1)

        decode_inputs = torch.cat((decode_inputs[beam_idx], pred_idx), 1)

        index = decode_inputs[:, -1].eq(idx_eos) + decode_inputs[:, -1].eq(idx_pad)
        finished = index.nonzero().flatten()
        continue_idx = (index ^ 1).nonzero().flatten()

        for idx in finished:
            probas.append(scores[idx].item())
            preds.append(decode_inputs[idx, :].tolist())
            probs.append(coverage[idx, :, :])

            atten_prob = torch.sum(coverage[idx, :, :], dim=0)
            coverage_penalty = torch.log(atten_prob.masked_select(atten_prob.le(1)))
            coverage_penalty = beta * torch.sum(coverage_penalty).item()
            coverage_penalties.append(coverage_penalty)

            remaining_beams -= 1

        if len(continue_idx) > 0:
            scores = scores.index_select(0, continue_idx)
            decode_inputs = decode_inputs.index_select(0, continue_idx)
            context = context.index_select(0, continue_idx)

        if remaining_beams <= 0:
            break

    len_penalties = [math.pow(len(pred), alpha) for pred in preds]
    #     final_scores = [probas[i] / len_penalties[i] + coverage_penalties[i] for i in range(len(preds))]
    final_scores = [probas[i] / len_penalties[i] for i in range(len(preds))]

    sorted_scores_arg = sorted(range(len(preds)), key=lambda i: -final_scores[i])

    best_beam = sorted_scores_arg[0]

    return preds[best_beam], probs[best_beam]


In [0]:
model_cpu = Transformer(input_vocab, output_vocab, num_heads=8, d_model=512, 
                        dropout=0.1, d_ff=1024, num_layers=6, padding_idx=IDX_PAD)

outputs_cpu = model_cpu(inputs_cpu, input_sizes_cpu, labels_cpu, label_sizes_cpu)

# print(outputs_cpu.shape, output_sizes_cpu.shape)

# outputs_cpu, output_sizes_cpu = model_cpu.decode_greedy(inputs_cpu, labels_cpu)

# print(outputs_cpu.shape, output_sizes_cpu.shape)

# outputs_cpu, output_sizes_cpu = model_cpu.decode_beam(inputs_cpu, labels_cpu)

# print(outputs_cpu.shape, output_sizes_cpu.shape)

In [0]:
class STSDecoder(object):
    def __init__(self, vocab):
        self.vocab = vocab

    @staticmethod
    def decode_labels(labels, label_sizes, vocab):
        idx_sos, idx_eos, idx_pad = vocab('<sos>'), vocab('<eos>'), vocab('<pad>')
        lseq = []
        for seq, size in zip(labels, label_sizes):
            lseq.append(
                ' '.join([vocab(c.item()) for c in seq[0:size - 1] if c.item() not in [idx_sos, idx_eos, idx_pad]])
            )

        return lseq

    @staticmethod
    def decode_probas(probas, probas_sizes, vocab, probabilities=False):
        max_vals, max_indices = torch.max(probas, 2)
        idx_sos, idx_eos, idx_pad = vocab('<sos>'), vocab('<eos>'), vocab('<pad>')

        decoded_seq = []
        for seq_idx, seq_len, seq_proba in zip(max_indices.cpu(), probas_sizes, max_vals):
            txt, probas = '', []

            for i in range(min(seq_len, len(seq_idx))):
                c = seq_idx[i].item()
                if c in [idx_sos, idx_eos, idx_pad]:
                    continue
                txt += vocab(c) + ' '
                probas.append(math.exp(seq_proba[i].item()))

            if probabilities:
                decoded_seq.append((txt.strip(), stats.mean(probas) if len(probas) > 0 else 0))
            else:
                decoded_seq.append(txt.strip())
        return decoded_seq

    def __call__(self, inputs, inputs_sizes, labels=None, label_sizes=None, probabilities=False):

        decoder_seq = self.decode_probas(inputs, inputs_sizes, self.vocab, probabilities=probabilities)

        label_seq = None
        if labels is not None and label_sizes is not None:
            label_seq = self.decode_labels(labels, label_sizes, self.vocab)

        return decoder_seq, label_seq

In [0]:
class LabelSmoothingLoss(nn.Module):
    def __init__(self, tgt_vocab_size, label_smoothing=0.0, padding_idx=0):
        super(LabelSmoothingLoss, self).__init__()
        assert 0.0 < label_smoothing <= 1.0
        self.ignore_index = padding_idx

        smoothing_value = label_smoothing / (tgt_vocab_size - 2)
        one_hot = torch.full((tgt_vocab_size,), smoothing_value)
        one_hot[self.ignore_index] = 0
        self.register_buffer('one_hot', one_hot.unsqueeze(0))

        self.confidence = 1.0 - label_smoothing

    def forward(self, outputs, output_sizes, targets, target_sizes):
        b, t, c = outputs.size()
        outputs = outputs.view(b * t, c)

        b, t = targets.size()
        targets = targets.view(b * t)        
        
        model_prob = self.one_hot.repeat(targets.size(0), 1)
        model_prob.scatter_(1, targets.unsqueeze(1), self.confidence)
        model_prob.masked_fill_((targets == self.ignore_index).unsqueeze(1), 0)

        return F.kl_div(outputs, model_prob, reduction='sum')

In [0]:
class NoamOptimizer(optim.Adam):
    def __init__(self, params, d_model, factor=2, warmup_steps=4000, betas=(0.9, 0.98), eps=1e-9):
        super(NoamOptimizer, self).__init__(params, betas=betas, eps=eps)
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        self.lr = 0
        self.step_num = 0
        self.factor = factor

    def step(self, closure=None):
        self.step_num += 1
        self.lr = self.lrate(self.step_num)
        for group in self.param_groups:
            group['lr'] = self.lr
        super(NoamOptimizer, self).step()

    def lrate(self, epoch):
        return self.factor * self.d_model ** (-0.5) * min(epoch ** (-0.5), epoch * self.warmup_steps ** (-1.5))


In [0]:
from torch import nn

class AccuracyScorer(nn.Module):

    def __init__(self, pad_index=0):
        super(AccuracyScorer, self).__init__()

        self.pad_index = pad_index

    def forward(self, outputs, output_sizes, targets, target_sizes):

        batch_size, seq_len, vocabulary_size = outputs.size()

        outputs = outputs.view(batch_size * seq_len, vocabulary_size)
        targets = targets.view(batch_size * seq_len)

        predicts = outputs.argmax(dim=1)
        corrects = predicts == targets

        corrects.masked_fill_((targets == self.pad_index), 0)

        correct_count = corrects.sum().item()
        count = (targets != self.pad_index).sum().item()

        return correct_count / float(count)

In [0]:
# https://discuss.pytorch.org/t/implementation-of-function-like-numpy-roll/964/8
def roll(x, shift, dim=-1, fill_pad = None):

    if 0 == shift:
        return x

    elif shift < 0:
        shift = -shift
        gap = x.index_select(dim, torch.arange(shift).to(x.device))
        if fill_pad is not None:
            gap = fill_pad * torch.ones_like(gap, device=x.device)
        return torch.cat([x.index_select(dim, torch.arange(shift, x.size(dim)).to(x.device)), gap], dim=dim)

    else:
        shift = x.size(dim) - shift
        gap = x.index_select(dim, torch.arange(shift, x.size(dim)).to(x.device))
        if fill_pad is not None:
            gap = fill_pad * torch.ones_like(gap, device=x.device)
        return torch.cat([gap, x.index_select(dim, torch.arange(shift).to(x.device))], dim=dim)

In [0]:
m = Metric([('train_loss', np.inf), ('train_score', np.inf), ('valid_loss', np.inf), ('valid_score', 0),
            ('train_lr', 0), ('valid_cer', np.inf)])

model = Transformer(input_vocab, output_vocab, num_heads=8, d_model=512,  dropout=0.1, d_ff=1024, 
                    num_layers=6, padding_idx=IDX_PAD)

for p in model.parameters():
    if p.dim() > 1:
        torch_weight_init(p)

if H.USE_CUDA:
    model.cuda()

logging.info(model_summary(model, line_length=100))

# if H.PRELOAD_MODEL_PATH:
#     path = os.path.join(H.EXPERIMENT, H.PRELOAD_MODEL_PATH)
#     state = torch.load(path)
#     model.load_state_dict(state)
#     logging.info("Preloaded model: {}".format(path))

if H.PRELOAD_MODEL_PATH:
    path = os.path.join(H.EXPERIMENT, H.PRELOAD_MODEL_PATH)
    state = torch.load(path)
    model.load_state_dict(state)
    logging.info("Preloaded model: {}".format(path))    
    
    
criterion = LabelSmoothingLoss(len(output_vocab), label_smoothing=H.LABEL_SMOOTHING, padding_idx=IDX_PAD)
if H.USE_CUDA:
    criterion.cuda()
    
sts_decoder = STSDecoder(output_vocab)

#scorer = Scorer()
scorer = AccuracyScorer(pad_index=IDX_PAD)

optimizer = optim.Adam(list(filter(lambda p: p.requires_grad, model.parameters())),
                       amsgrad=False,
                       betas=(0.9, 0.999),
                       eps=1e-08,
                       lr=H.LR,
                       weight_decay=H.WEIGHT_DECAY)

# optimizer = NoamOptimizer(list(filter(lambda p:p.requires_grad, model.parameters())),
#                           d_model=256, factor=2, warmup_steps=20000, betas=(0.9, 0.98), eps=1e-9)

stopping = Stopping(model, patience=H.STOPPING_PATIENCE)

scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=[H.LR_LAMBDA])

tlogger = TensorboardLogger(root_dir=H.EXPERIMENT, experiment_dir=H.TIMESTAMP)  # PytorchLogger()

checkpoint = Checkpoint(model, optimizer, stopping, m,
                        root_dir=H.EXPERIMENT, experiment_dir=H.TIMESTAMP, restore_from=-1,
                        interval=H.CHECKPOINT_INTERVAL, verbose=0)


In [0]:
# path = os.path.join(H.EXPERIMENT, H.MODEL_NAME + '.tar')
# state = torch.load(path)
# model.load_state_dict(state)

In [57]:
epoch_start = 1
if H.CHECKPOINT_RESTORE:
    epoch_start = checkpoint.restore() + 1
#     train_loader.batch_sampler.shuffle(epoch_start)

epoch = epoch_start
try:
    epoch_itr = tlogger.set_itr(range(epoch_start, H.MAX_EPOCHS + 1))

    for epoch in epoch_itr:
        
#         with DelayedKeyboardInterrupt():

        model.train(True)

#         scheduler.step()
    
        train_lr = [float(param_group['lr']) for param_group in optimizer.param_groups][0]

        total_size, total_loss, total_score = 0, 0.0, 0.0
        for idx_batch, batch in enumerate(train_iter):
            inputs, input_sizes = getattr(batch, SRC_FIELD_NAME)
            labels, label_sizes = getattr(batch, TGT_FIELD_NAME)
            if next(model.parameters()).is_cuda:
                inputs, labels = inputs.cuda(), labels.cuda()

            probas, proba_sizes = model(inputs, input_sizes, labels, label_sizes) 

            loss = criterion(probas, proba_sizes, roll(labels, -1, dim=-1, fill_pad=IDX_PAD), label_sizes-1)
            total_loss += loss.item()      
            
            preds_seq, label_seq = sts_decoder(probas, proba_sizes, labels, label_sizes)
            total_score += scorer(preds_seq, label_seq)
#             total_score += scorer(probas, proba_sizes, labels, label_sizes)
            
            total_size += inputs.size(0)

            optimizer.zero_grad()
            loss.backward()
            
            if H.MAX_GRAD_NORM is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), H.MAX_GRAD_NORM)
            optimizer.step()

            del probas
            del loss
            
        m.train_loss = total_loss / total_size
        m.train_score = 1.0 - min(1.0, total_score / total_size)
        m.train_lr = train_lr
    
        #-----------------------------------------------------------
        
        model.eval()
        
        with torch.no_grad():

            hypotheses = []
            references = []
            total_size, total_loss, total_score = 0, 0.0, 0.0
            for idx_batch, batch in enumerate(valid_iter):
                inputs, input_sizes = getattr(batch, SRC_FIELD_NAME)
                labels, label_sizes = getattr(batch, TGT_FIELD_NAME)
                if next(model.parameters()).is_cuda:
                    inputs, labels = inputs.cuda(), labels.cuda()

                probas, proba_sizes = model.decode_greedy(inputs, labels.size(1), fixed_length=True)
                
                loss = criterion(probas, proba_sizes, roll(labels, -1, dim=-1, fill_pad=IDX_PAD), label_sizes-1)
                total_loss += loss.item()      

                preds_seq, label_seq = sts_decoder(probas, proba_sizes, labels, label_sizes)
                total_score += scorer(preds_seq, label_seq)
#                 total_score += scorer(probas, proba_sizes, labels, label_sizes)

                total_size += inputs.size(0)
                
            del probas
            del loss

        m.valid_loss = total_loss / total_size
        m.valid_score = 1.0 - min(1.0, total_score / total_size)

        if checkpoint:
            checkpoint.step(epoch)

        stopping_flag = stopping.step(epoch, m.valid_loss, m.valid_score)

        epoch_itr.log_values(m.train_loss, m.train_score, m.train_lr, m.valid_loss, m.valid_score,
                             stopping.best_score_epoch, stopping.best_score)

        if stopping_flag:
            logger.info(
                "Early stopping at epoch: %d, score %f" % (stopping.best_score_epoch, stopping.best_score))
            break

#             train_loader.batch_sampler.shuffle(epoch)

except KeyboardInterrupt:
    logger.info("Training interrupted at: {}".format(epoch))
    pass

checkpoint.create(epoch)

model.load_state_dict(stopping.best_score_state)
torch.save(model.state_dict(), os.path.join(H.EXPERIMENT, H.MODEL_NAME + '.tar'))

logger.info(repr(tlogger))
logger.info(repr(stopping))
logger.info(repr(checkpoint))

logger.info("Training end.")

TypeError: ignored

In [0]:
model_pre = Transformer(input_vocab, output_vocab, num_heads=8, d_model=512,  dropout=0.1, d_ff=1024, 
                    num_layers=6, padding_idx=IDX_PAD)


if H.USE_CUDA:
    model_pre.cuda()

path = os.path.join(H.EXPERIMENT, 'Eng2Ger_TRANSFORMER' + '.tar')
state = torch.load(path)
model_pre.load_state_dict(state)

scorer = Scorer()


In [0]:
hypotheses = []
references = []
for idx_batch, batch in enumerate(test_iter):
    inputs, input_sizes = getattr(batch, SRC_FIELD_NAME)
    labels, label_sizes = getattr(batch, TGT_FIELD_NAME)
    if next(model_pre.parameters()).is_cuda:
        inputs, labels = inputs.cuda(), labels.cuda()

    probas, proba_sizes = model_pre.decode_greedy(inputs, labels.size(1))
    break


In [0]:
%%time 

model_pre.eval()
with torch.no_grad():

    hypotheses = []
    references = []
    for idx_batch, batch in enumerate(test_iter):
        inputs, input_sizes = getattr(batch, SRC_FIELD_NAME)
        labels, label_sizes = getattr(batch, TGT_FIELD_NAME)
        if next(model_pre.parameters()).is_cuda:
            inputs, labels = inputs.cuda(), labels.cuda()

        probas, proba_sizes = model_pre.decode_greedy( inputs, H.SEQ_MAX_LEN)
        
        preds_seq, label_seq = sts_decoder(probas, proba_sizes, labels, label_sizes)

        hypotheses.extend(preds_seq)
        references.extend(label_seq)
        

CPU times: user 1min 13s, sys: 87.8 ms, total: 1min 13s
Wall time: 1min 13s


In [0]:
from lib.scorer import Scorer

bleu = Scorer.get_moses_multi_bleu(hypotheses, references, lowercase=False)
wer, cer = Scorer.get_wer_cer(hypotheses, references)
acc = Scorer.get_acc(hypotheses, references)


print('Test Summary \n'
            'Bleu: {bleu:.3f}\n'
            'WER:  {wer:.3f}\n'
            'CER:  {cer:.3f}\n'
            'ACC:  {acc:.3f}'.format(bleu=bleu, wer=wer * 100, cer=cer * 100, acc=acc * 100))


Test Summary 
Bleu: 38.390
WER:  39.463
CER:  37.274
ACC:  18.671


In [0]:
%%time

model_pre.eval()
with torch.no_grad():

    hypotheses = []
    references = []
    for idx_batch, batch in enumerate(test_iter):
        inputs, input_sizes = getattr(batch, SRC_FIELD_NAME)
        labels, label_sizes = getattr(batch, TGT_FIELD_NAME)
        if next(model.parameters()).is_cuda:
            inputs, labels = inputs.cuda(), labels.cuda()

        context, mask_src = model_pre.encode(inputs)
        
        max_seq_len = labels.size(1) if labels is not None else H.MAX_SEQ_LENGTH

        outputs = model_pre.decode_beam(inputs, None, max_seq_len, beam_size=20, alpha=0.1, beta=0.3)

        for entry in outputs:
            hypotheses.append(' '.join([output_vocab(t) for t in entry if t not in [IDX_PAD, IDX_SOS, IDX_EOS]]))

        references.extend(STSDecoder.decode_labels(labels[:,1:], label_sizes-1, output_vocab)) 


In [0]:
from lib.scorer import Scorer

bleu = Scorer.get_moses_multi_bleu(hypotheses, references, lowercase=False)
wer, cer = Scorer.get_wer_cer(hypotheses, references)
acc = Scorer.get_acc(hypotheses, references)


print('Test Summary \n'
            'Bleu: {bleu:.3f}\n'
            'WER:  {wer:.3f}\n'
            'CER:  {cer:.3f}\n'
            'ACC:  {acc:.3f}'.format(bleu=bleu, wer=wer * 100, cer=cer * 100, acc=acc * 100))


Test Summary 
Bleu: 40.270
WER:  35.531
CER:  34.209
ACC:  22.051


In [0]:
while True:
    seq_str = input("Type in a source sequence:")
    print(">> ", seq_str)
    if not len(seq_str):
        break
    #seq = seq_str.strip().lower().split()
    seq = tokenize_en(seq_str.strip().lower())
    print(seq)

    seq_id = [input_vocab(tok) for tok in seq]

    model_pre.eval()
    with torch.no_grad():

        src_id_seq = torch.LongTensor(seq_id).view(1, -1)
        src_id_seq = src_id_seq.cuda() if torch.cuda.is_available() else src_id_seq
        
        probas, proba_sizes = model_pre.decode_greedy( src_id_seq)

        tgt_seq = STSDecoder.decode_probas(probas, proba_sizes, output_vocab)
        
        print("<< ", ' '.join(tgt_seq))

print("Finished.")

Type in a source sequence:I am at home.
>>  I am at home.
['i', 'am', 'at', 'home', '.']
<<  ich bin zu hause .
Type in a source sequence:
>>  
Finished.
