In [1]:
!pip install tensorflow==1.14
!pip install tensorflow-gpu==1.14



In [2]:
# -*- coding: utf-8 -*-
import numpy as np
import tensorflow as tf


def margin_loss(labels, raw_logits, margin=0.4, downweight=0.5):
    """Compute the element-wise margin loss of the logits.

    The logits are first shifted by -0.5, which turns the 0.9 / 0.1 targets
    for positive / negative classes into a symmetric margin of +/- `margin`.
    A positive logit is charged the squared distance to `+margin` while it is
    below that margin; a negative logit is charged the squared distance to
    `-margin` while it is above it.

    Args:
    labels: tensor, one hot encoding of ground truth.
    raw_logits: tensor, model predictions (expected in range [0, 1]).
    margin: scalar, the margin after subtracting 0.5 from raw_logits.
    downweight: scalar, the factor applied to the negative cost.
    Returns:
    A tensor holding the cost of every logit. No reduction is performed
    here, so any summing over classes is left to the caller.
    """
    shifted = raw_logits - 0.5

    # Positive classes only pay while they sit below the upper margin.
    under_margin = tf.cast(tf.less(shifted, margin), tf.float32)
    positive_cost = labels * under_margin * tf.pow(shifted - margin, 2)

    # Negative classes only pay while they sit above the lower margin.
    negatives = 1 - labels
    over_margin = tf.cast(tf.greater(shifted, -margin), tf.float32)
    negative_cost = negatives * over_margin * tf.pow(shifted + margin, 2)

    return 0.5 * positive_cost + downweight * 0.5 * negative_cost


def createVocabulary(input_path, output_path, pad=True, unk=True):
    """Count the tokens of a text file and write a vocabulary file.

    Tokens are whitespace separated. Purely numeric tokens are all counted
    as '0'. A literal '_UNK' token stops the processing of the rest of its
    line. The output holds one token per line, most frequent first (ties keep
    first-seen order), followed by '_PAD' and/or '_UNK' when requested.

    Args:
    input_path: string, path of the text file to read.
    output_path: string, path of the vocabulary file to write.
    pad: if true, append the '_PAD' entry.
    unk: if true, append the '_UNK' entry.
    Raises:
    TypeError: if one of the paths is not a string.
    """
    if not isinstance(input_path, str):
        raise TypeError('input_path should be string')
    if not isinstance(output_path, str):
        raise TypeError('output_path should be string')

    counts = {}
    with open(input_path, 'r') as source, \
            open(output_path, 'w+') as sink:
        for raw_line in source:
            for token in raw_line.rstrip('\r\n').split():
                if token == '_UNK':
                    break
                if str.isdigit(token):
                    token = '0'
                counts[token] = counts.get(token, 0) + 1

        specials = [name for name, wanted in (('_PAD', pad), ('_UNK', unk))
                    if wanted]
        by_frequency = sorted(counts, key=counts.get, reverse=True)
        sink.write(''.join(entry + '\n' for entry in by_frequency + specials))


def loadVocabulary(path):
    """Load a vocabulary file that stores one token per line.

    Args:
    path: string, path of the vocabulary file.
    Returns:
    A dict with two entries: 'rev' is the list of tokens in file order and
    'vocab' maps each token to its line index (for a repeated token the last
    occurrence wins).
    Raises:
    TypeError: if path is not a string.
    """
    if not isinstance(path, str):
        raise TypeError('path should be a string')

    with open(path) as handle:
        rev = [row.rstrip('\r\n') for row in handle]
    vocab = {token: index for index, token in enumerate(rev)}

    return {'vocab': vocab, 'rev': rev}


def sentenceToIds(data, vocab, unk):
    """Translate a sentence into the list of its vocabulary ids.

    Args:
    data: a whitespace separated string or a list of words.
    vocab: dict as returned by loadVocabulary (its 'vocab' entry is used).
    unk: if true, unknown words map to the id of '_UNK'; otherwise they map
      to None.
    Returns:
    A list with one id (or None) per word. Purely numeric words are looked
    up as '0'.
    Raises:
    TypeError: if vocab is not a dict or data is neither a string nor a list.
    """
    if not isinstance(vocab, dict):
        raise TypeError('vocab should be a dict that contains vocab and rev')
    table = vocab['vocab']

    if isinstance(data, str):
        tokens = data.split()
    elif isinstance(data, list):
        tokens = data
    else:
        raise TypeError('data should be a string or a list contains words')

    if unk:
        # '_UNK' is looked up per word on purpose: an empty sentence must not
        # require the entry to exist.
        return [table.get('0' if str.isdigit(token) else token, table['_UNK'])
                for token in tokens]
    return [table.get('0' if str.isdigit(token) else token)
            for token in tokens]


def padSentence(s, max_length, vocab):
    """Return a copy of the id list `s` right-padded with the '_PAD' id.

    Nothing is appended when `s` already has `max_length` or more items.
    """
    pad_id = vocab['vocab']['_PAD']
    missing = max_length - len(s)
    return s + [pad_id] * missing


# The F1-score computation below is adapted from conlleval.pl.
def __startOfChunk(prevTag, tag, prevTagType, tagType, chunkStart=False):
    """Tell whether a chunk starts between the previous and the current tag.

    A chunk starts for a fixed set of (previous tag, tag) transitions, or
    whenever the current tag is neither 'O' nor '.' and the tag type changed.
    `chunkStart` is the value returned when none of the rules fires.
    """
    startingTransitions = (
        ('B', 'B'), ('I', 'B'), ('O', 'B'), ('O', 'I'),
        ('E', 'E'), ('E', 'I'), ('O', 'E'),
    )
    if (prevTag, tag) in startingTransitions:
        chunkStart = True

    if tag not in ('O', '.') and prevTagType != tagType:
        chunkStart = True
    return chunkStart


def __endOfChunk(prevTag, tag, prevTagType, tagType, chunkEnd=False):
    """Tell whether a chunk ends between the previous and the current tag.

    A chunk ends for a fixed set of (previous tag, tag) transitions, or
    whenever the previous tag is neither 'O' nor '.' and the tag type changed.
    `chunkEnd` is the value returned when none of the rules fires.
    """
    endingTransitions = (
        ('B', 'B'), ('B', 'O'), ('I', 'B'), ('I', 'O'),
        ('E', 'E'), ('E', 'I'), ('E', 'O'),
    )
    if (prevTag, tag) in endingTransitions:
        chunkEnd = True

    if prevTag not in ('O', '.') and prevTagType != tagType:
        chunkEnd = True
    return chunkEnd


def __splitTagType(tag):
    """Split a label such as 'B-city' into its tag and its type.

    Returns:
    A (tag, type) pair; the type is the empty string for labels without a
    '-' (for instance 'O').
    Raises:
    ValueError: if the label contains more than one '-'.
    """
    pieces = tag.split('-')
    if not 1 <= len(pieces) <= 2:
        raise ValueError('tag format wrong. it must be B-xxx.xxx')

    prefix = pieces[0]
    suffix = pieces[1] if len(pieces) == 2 else ""
    return prefix, suffix


def computeF1Score(correct_slots, pred_slots):
    """Compute chunk-level F1, precision and recall of predicted slot tags.

    Args:
    correct_slots: list of reference label sequences (one list per sentence).
    pred_slots: list of predicted label sequences, aligned with
      correct_slots.
    Returns:
    A tuple (f1, precision, recall), each expressed as a percentage. A score
    is 0 when its denominator is 0.
    """
    matchedByType = {}
    goldByType = {}
    predByType = {}
    matchedChunks = 0
    goldChunks = 0
    predChunks = 0
    matchedTags = 0
    tokens = 0

    for goldSeq, predSeq in zip(correct_slots, pred_slots):
        # True while the gold chunk and the predicted chunk currently being
        # read started together with the same type.
        insideMatch = False
        prevGoldTag, prevGoldType = 'O', ''
        prevPredTag, prevPredType = 'O', ''

        for goldLabel, predLabel in zip(goldSeq, predSeq):
            goldTag, goldType = __splitTagType(goldLabel)
            predTag, predType = __splitTagType(predLabel)

            goldStarts = __startOfChunk(prevGoldTag, goldTag, prevGoldType, goldType)
            predStarts = __startOfChunk(prevPredTag, predTag, prevPredType, predType)

            if insideMatch:
                goldEnds = __endOfChunk(prevGoldTag, goldTag, prevGoldType, goldType)
                predEnds = __endOfChunk(prevPredTag, predTag, prevPredType, predType)
                if goldEnds and predEnds and prevGoldType == prevPredType:
                    # Both chunks closed at the same position: a full match.
                    insideMatch = False
                    matchedChunks += 1
                    matchedByType[prevGoldType] = matchedByType.get(prevGoldType, 0) + 1
                elif goldEnds != predEnds or goldType != predType:
                    # The two chunks diverged; the candidate match is dropped.
                    insideMatch = False

            if goldStarts and predStarts and goldType == predType:
                insideMatch = True

            if goldStarts:
                goldChunks += 1
                goldByType[goldType] = goldByType.get(goldType, 0) + 1

            if predStarts:
                predChunks += 1
                predByType[predType] = predByType.get(predType, 0) + 1

            if goldTag == predTag and goldType == predType:
                matchedTags += 1
            tokens += 1

            prevGoldTag, prevGoldType = goldTag, goldType
            prevPredTag, prevPredType = predTag, predType

        # A match that is still open at the end of the sentence counts too.
        if insideMatch:
            matchedChunks += 1
            matchedByType[prevGoldType] = matchedByType.get(prevGoldType, 0) + 1

    precision = 100 * matchedChunks / predChunks if predChunks > 0 else 0
    recall = 100 * matchedChunks / goldChunks if goldChunks > 0 else 0

    total = precision + recall
    f1 = (2 * precision * recall) / total if total > 0 else 0

    return f1, precision, recall


class DataProcessor(object):
    """Serve padded mini-batches built from parallel input/slot/intent files.

    The three files are expected to hold one example per line, with line i of
    each file describing the same example. All lines are read into memory by
    the constructor; `get_batch` then consumes them from the END of the
    lists (via `pop`), i.e. in reverse file order unless shuffled.

    Attributes (public):
      end: 0 while data may remain; set to 1 once `get_batch` tried to read
        past the last example.
    """

    def __init__(self, in_path, slot_path, intent_path, in_vocab, slot_vocab, intent_vocab, shuffle=False):
        """Load the three data files and remember the vocabularies.

        Args:
          in_path: path of the file with the input sentences.
          slot_path: path of the file with the slot label sequences.
          intent_path: path of the file with the intent labels.
          in_vocab: vocabulary dict (as built by loadVocabulary) for inputs.
          slot_vocab: vocabulary dict for slot labels.
          intent_vocab: vocabulary dict for intents.
          shuffle: if true, shuffle the examples once after loading.
        """
        # The files are read completely and closed immediately, so no file
        # handle outlives the constructor.
        self.__fd_in = self.__read_lines(in_path)
        self.__fd_slot = self.__read_lines(slot_path)
        self.__fd_intent = self.__read_lines(intent_path)
        if shuffle:
            self.shuffle()
        self.__in_vocab = in_vocab
        self.__slot_vocab = slot_vocab
        self.__intent_vocab = intent_vocab
        self.end = 0

    @staticmethod
    def __read_lines(path):
        """Return all lines of the text file at `path`, closing the file."""
        with open(path, 'r') as handle:
            return handle.readlines()

    def close(self):
        """Do nothing; kept so that existing callers can still call it.

        The data files are already closed when the constructor returns, so
        there is no resource left to release here.
        """
        return None

    def shuffle(self):
        """Shuffle the remaining examples, keeping the three lists aligned."""
        from sklearn.utils import shuffle
        self.__fd_in, self.__fd_slot, self.__fd_intent = shuffle(self.__fd_in, self.__fd_slot, self.__fd_intent)

    def get_batch(self, batch_size):
        """Build the next batch of at most `batch_size` examples.

        Starting with the second example of a batch, the intent of the
        previously read example is appended to the input sentence as an extra
        token, and a matching 'O' label is appended to the slot sequence.

        Args:
          batch_size: maximum number of examples to read.

        Returns:
          A tuple (in_data, slot_data, slot_weight, length, intents, in_seq,
          slot_seq, intent_seq): the padded input ids, the padded slot ids, a
          float32 mask that is 0 where the slot id equals the slot '_PAD' id,
          the unpadded lengths, the intent ids, and the raw text of the
          inputs, slots and intents that were read.

        Raises:
          ValueError: if an example has a different number of input tokens
            and slot labels.
        """
        batch_in = []
        batch_slot = []
        length = []
        intents = []
        max_len = 0

        in_seq = []
        slot_seq = []
        intent_seq = []
        previous_intent = ''
        for _ in range(batch_size):
            try:
                inp = self.__fd_in.pop()
            except IndexError:
                self.end = 1
                break
            slot = self.__fd_slot.pop()
            intent = self.__fd_intent.pop()
            inp = inp.rstrip()
            slot = slot.rstrip()
            intent = intent.rstrip()

            if previous_intent != '':
                # Carry the previous example's intent over as context.
                inp = inp + ' ' + previous_intent
                slot = slot + ' O'
            in_seq.append(inp)
            slot_seq.append(slot)
            intent_seq.append(intent)
            previous_intent = intent

            inp_ids = sentenceToIds(inp, self.__in_vocab, unk=True)
            slot_ids = sentenceToIds(slot, self.__slot_vocab, unk=True)
            intent_ids = sentenceToIds(intent, self.__intent_vocab, unk=False)

            if len(inp_ids) != len(slot_ids):
                # Corrupt data must surface as an error, not as a silent
                # interpreter exit with a success status.
                raise ValueError(
                    'input and slot sequences differ in length: %r (%d tokens) '
                    'vs %r (%d labels)'
                    % (inp, len(inp_ids), slot, len(slot_ids)))

            # Examples whose intent is missing from the intent vocabulary are
            # left out of the numeric batch.
            # NOTE(review): they are still present in in_seq / slot_seq /
            # intent_seq, so those lists can be longer than the arrays --
            # confirm that callers expect this.
            if None not in intent_ids:
                batch_in.append(np.array(inp_ids))
                batch_slot.append(np.array(slot_ids))
                length.append(len(inp_ids))
                intents.append(intent_ids[0])
            max_len = max(max_len, len(inp_ids))

        length = np.array(length)
        intents = np.array(intents)

        in_data = np.array(
            [padSentence(list(row), max_len, self.__in_vocab) for row in batch_in])
        slot_data = np.array(
            [padSentence(list(row), max_len, self.__slot_vocab) for row in batch_slot])

        # 1.0 for real labels, 0.0 for padding positions.
        slot_weight = np.array(
            [np.not_equal(row, self.__slot_vocab['vocab']['_PAD']).astype(np.float32)
             for row in slot_data])
        return in_data, slot_data, slot_weight, length, intents, in_seq, slot_seq, intent_seq

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# -*- coding: utf-8 -*-
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""## Functions for working with arbitrarily nested sequences of elements.

This module can perform operations on nested structures. A nested structure is a
Python sequence, tuple (including `namedtuple`), or dict that can contain
further sequences, tuples, and dicts.

The utilities here assume (and do not check) that the nested structures form a
'tree', i.e., no references in the structure of the input of these functions
should be recursive.

Example structures: `((3, 4), 5, (6, 7, (9, 10), 8))`, `(np.array(0),
  (np.array([3, 4]), tf.constant([3, 4])))`
"""

import collections as _collections

try:
    import collections.abc as _collections_abc  # Python 3.3+
except ImportError:  # Python 2: the ABCs live directly in `collections`.
    _collections_abc = _collections

import six as _six


def _sorted(dict_):
    """Returns the keys of `dict_` as a sorted list.

    Raises:
      TypeError: if the keys cannot be ordered.
    """
    try:
        ordered_keys = sorted(_six.iterkeys(dict_))
    except TypeError:
        raise TypeError("nest only supports dicts with sortable keys.")
    return ordered_keys


def _sequence_like(instance, args):
    """Converts the sequence `args` to the same type as `instance`.

    Args:
      instance: an instance of `tuple`, `list`, `namedtuple`, `dict`, or
          `collections.OrderedDict`.
      args: elements to be converted to the `instance` type.

    Returns:
      `args` with the type of `instance`.
    """
    if isinstance(instance, dict):
        # Pack dictionaries in a deterministic order by sorting the keys.
        # Notice this means that we ignore the original order of `OrderedDict`
        # instances. This is intentional, to avoid potential bugs caused by mixing
        # ordered and plain dicts (e.g., flattening a dict but using a
        # corresponding `OrderedDict` to pack it back).
        result = dict(zip(_sorted(instance), args))
        # Rebuild with the key order of `instance` itself.
        return type(instance)((key, result[key]) for key in _six.iterkeys(instance))
    elif (isinstance(instance, tuple) and
          hasattr(instance, "_fields") and
          # The ABC must come from `collections.abc`: the `collections.Sequence`
          # alias no longer exists on Python 3.10+.
          isinstance(instance._fields, _collections_abc.Sequence) and
          all(isinstance(f, _six.string_types) for f in instance._fields)):
        # This is a namedtuple: its constructor takes the fields positionally.
        return type(instance)(*args)
    else:
        # Not a namedtuple: the constructor takes one iterable.
        return type(instance)(args)


def _yield_value(iterable):
    """Yields the direct children of `iterable`.

    For a dict the values are produced in sorted-key order, so the original
    order of an `OrderedDict` is ignored on purpose: flattening one kind of
    dict and repacking with the other kind then stays consistent. Any other
    iterable is traversed in its own order.
    """
    if isinstance(iterable, dict):
        children = (iterable[key] for key in _sorted(iterable))
    else:
        children = iterable
    for child in children:
        yield child


def _yield_flat_nest(nest):
    """Yields every leaf of `nest`, depth first."""
    for element in _yield_value(nest):
        if not is_sequence(element):
            yield element
            continue
        for leaf in _yield_flat_nest(element):
            yield leaf


# Messages already passed to `_warn_once`, mapped to True.
_ALREADY_WARNED = {}


def _warn_once(message):
    """Records `message` as seen, once per unique string.

    Only the bookkeeping is done here; nothing is printed or logged.
    """
    _ALREADY_WARNED.setdefault(message, True)


def is_sequence(seq):
    """Returns true if its input is a dict or a non-string Sequence.

    Args:
      seq: an input sequence.

    Returns:
      True if `seq` is a dict, or is a `collections.abc.Sequence` that is not
      a string. False otherwise (sets included).
    """
    if isinstance(seq, dict):
        return True
    if isinstance(seq, set):
        _warn_once("Sets are not currently considered sequences, but this may "
                   "change in the future, so consider avoiding using them.")
    # The ABC must come from `collections.abc`: the `collections.Sequence`
    # alias no longer exists on Python 3.10+.
    return (isinstance(seq, _collections_abc.Sequence)
            and not isinstance(seq, _six.string_types))


def flatten(nest):
    """Returns a flat list from a given nested structure.

    If `nest` is not a sequence, tuple, or dict, then returns a single-element
    list: `[nest]`.

    For dict instances the values are taken in sorted-key order, which makes
    the result deterministic. The same holds for `OrderedDict` instances:
    their own order is ignored. `pack_sequence_as` follows the same
    convention, so a dict can be flattened and repacked, even through a
    corresponding `OrderedDict` or vice-versa.
    Dictionaries with non-sortable keys cannot be flattened.

    Args:
      nest: an arbitrarily nested structure or a scalar object. Note, numpy
          arrays are considered scalars.

    Returns:
      A Python list, the flattened version of the input.

    Raises:
      TypeError: The nest is or contains a dict with non-sortable keys.
    """
    if not is_sequence(nest):
        return [nest]
    return list(_yield_flat_nest(nest))


def _recursive_assert_same_structure(nest1, nest2, check_types):
    """Helper function for `assert_same_structure`.

    Walks both structures in parallel and raises as soon as they differ.
    """
    first_is_sequence = is_sequence(nest1)
    if first_is_sequence != is_sequence(nest2):
        raise ValueError(
            "The two structures don't have the same nested structure.\n\n"
            "First structure: %s\n\nSecond structure: %s." % (nest1, nest2))

    if not first_is_sequence:
        # Two leaves: nothing left to compare.
        return

    if check_types:
        first_type, second_type = type(nest1), type(nest2)
        if first_type != second_type:
            raise TypeError(
                "The two structures don't have the same sequence type. First "
                "structure has type %s, while second structure has type %s."
                % (first_type, second_type))

        if isinstance(nest1, dict):
            first_keys = set(_six.iterkeys(nest1))
            second_keys = set(_six.iterkeys(nest2))
            if first_keys != second_keys:
                raise ValueError(
                    "The two dictionaries don't have the same set of keys. First "
                    "structure has keys {}, while second structure has keys {}."
                        .format(first_keys, second_keys))

    first_children = list(_yield_value(nest1))
    second_children = list(_yield_value(nest2))
    for first_child, second_child in zip(first_children, second_children):
        _recursive_assert_same_structure(first_child, second_child, check_types)


def assert_same_structure(nest1, nest2, check_types=True):
    """Asserts that two structures are nested in the same way.

    Args:
      nest1: an arbitrarily nested structure.
      nest2: an arbitrarily nested structure.
      check_types: if `True` (default) the sequence types are compared as
          well, including the keys of dictionaries. If `False`, a list and a
          tuple of objects, for example, look the same when they have the
          same size.

    Raises:
      ValueError: If the two structures do not have the same number of elements or
        if the two structures are not nested in the same way.
      TypeError: If the two structures differ in the type of sequence in any of
        their substructures. Only possible if `check_types` is `True`.
    """

    def _leaf_count(nest):
        # A scalar counts as a single element.
        if is_sequence(nest):
            return len(flatten(nest))
        return 1

    count1 = _leaf_count(nest1)
    count2 = _leaf_count(nest2)
    if count1 != count2:
        raise ValueError("The two structures don't have the same number of "
                         "elements.\n\nFirst structure (%i elements): %s\n\n"
                         "Second structure (%i elements): %s"
                         % (count1, nest1, count2, nest2))
    _recursive_assert_same_structure(nest1, nest2, check_types)


def flatten_dict_items(dictionary):
    """Returns a dictionary with flattened keys and values.

    Keys and values of the input may be arbitrarily nested structures; each
    key structure is flattened together with its value structure and the
    resulting leaves are paired up:

    ```python
    example_dictionary = {(4, 5, (6, 8)): ("a", "b", ("c", "d"))}
    result = {4: "a", 5: "b", 6: "c", 8: "d"}
    flatten_dict_items(example_dictionary) == result
    ```

    The input dictionary must satisfy two properties:

    1. Its keys and values should have the same exact nested structure.
    2. The set of all flattened keys of the dictionary must not contain repeated
       keys.

    Args:
      dictionary: the dictionary to zip

    Returns:
      The zipped dictionary.

    Raises:
      TypeError: If the input is not a dictionary.
      ValueError: If any key and value have not the same structure, or if keys are
        not unique.
    """
    if not isinstance(dictionary, dict):
        raise TypeError("input must be a dictionary")

    flattened = {}
    for key, value in _six.iteritems(dictionary):
        if is_sequence(key):
            key_leaves = flatten(key)
            value_leaves = flatten(value)
            if len(key_leaves) != len(value_leaves):
                raise ValueError(
                    "Could not flatten dictionary. Key had %d elements, but value had "
                    "%d elements. Key: %s, value: %s."
                    % (len(key_leaves), len(value_leaves), key_leaves, value_leaves))
            for leaf_key, leaf_value in zip(key_leaves, value_leaves):
                if leaf_key in flattened:
                    raise ValueError(
                        "Could not flatten dictionary: key %s is not unique."
                        % (leaf_key))
                flattened[leaf_key] = leaf_value
        else:
            # A scalar key keeps its value untouched, nested or not.
            if key in flattened:
                raise ValueError(
                    "Could not flatten dictionary: key %s is not unique." % key)
            flattened[key] = value
    return flattened


def _packed_nest_with_indices(structure, flat, index):
    """Helper function for pack_sequence_as.

    Args:
      structure: Substructure (list / tuple / dict) to mimic.
      flat: Flattened values to output substructure for.
      index: Index at which to start reading from flat.

    Returns:
      The tuple (new_index, packed), where:
        * new_index - the updated index into `flat` having processed `structure`.
        * packed - the list of children built from `flat`, starting at
                   `index`; nested children already have the type of the
                   matching part of `structure`.

    Raises:
      ValueError: if `structure` contains more elements than `flat`
        (assuming indexing starts from `index`).
    """
    children = []
    for item in _yield_value(structure):
        if not is_sequence(item):
            # A leaf consumes exactly one flat value.
            children.append(flat[index])
            index += 1
            continue
        next_index, nested = _packed_nest_with_indices(item, flat, index)
        children.append(_sequence_like(item, nested))
        index = next_index
    return index, children


def pack_sequence_as(structure, flat_sequence):
    """Returns a given flattened sequence packed into a given structure.

    If `structure` is a scalar, `flat_sequence` must be a single-element list;
    in this case the return value is `flat_sequence[0]`.

    If `structure` is or contains a dict instance, the keys are sorted so the
    flat sequence is packed in a deterministic order. This also holds for
    `OrderedDict` instances: their own order is ignored. `flatten` follows
    the same convention, so a flattened dict can be repacked correctly, even
    through a corresponding `OrderedDict` or vice-versa.
    Dictionaries with non-sortable keys cannot be flattened.

    Args:
      structure: Nested structure, whose structure is given by nested lists,
          tuples, and dicts. Note: numpy arrays and strings are considered
          scalars.
      flat_sequence: flat sequence to pack.

    Returns:
      packed: `flat_sequence` converted to have the same recursive structure as
        `structure`.

    Raises:
      ValueError: If `flat_sequence` and `structure` have different
        element counts.
      TypeError: `flat_sequence` is not a sequence, or `structure` is or
        contains a dict with non-sortable keys.
    """
    if not is_sequence(flat_sequence):
        raise TypeError("flat_sequence must be a sequence")

    if not is_sequence(structure):
        given = len(flat_sequence)
        if given != 1:
            raise ValueError("Structure is a scalar but len(flat_sequence) == %d > 1"
                             % given)
        return flat_sequence[0]

    expected = len(flatten(structure))
    if expected != len(flat_sequence):
        raise ValueError(
            "Could not pack sequence. Structure had %d elements, but flat_sequence "
            "had %d elements.  Structure: %s, flat_sequence: %s."
            % (expected, len(flat_sequence), structure, flat_sequence))

    packed = _packed_nest_with_indices(structure, flat_sequence, 0)[1]
    return _sequence_like(structure, packed)


def map_structure(func, *structure, **check_types_dict):
    """Applies `func` to each entry in `structure` and returns a new structure.

    Applies `func(x[0], x[1], ...)` where x[i] is an entry in
    `structure[i]`.  All structures in `structure` must have the same arity,
    and the return value will contain the results in the same structure.

    Args:
      func: A callable that accepts as many arguments as there are structures.
      *structure: scalar, or tuple or list of constructed scalars and/or other
        tuples/lists, or scalars.  Note: numpy arrays are considered as scalars.
      **check_types_dict: only valid keyword argument is `check_types`. If set to
        `True` (default) the types of iterables within the structures have to be
        same (e.g. `map_structure(func, [1], (1,))` raises a `TypeError`
        exception). To allow this set this argument to `False`.

    Returns:
      A new structure with the same arity as `structure`, whose values correspond
      to `func(x[0], x[1], ...)` where `x[i]` is a value in the corresponding
      location in `structure[i]`. If there are different sequence types and
      `check_types` is `False` the sequence types of the first structure will be
      used.

    Raises:
      TypeError: If `func` is not callable or if the structures do not match
        each other by type.
      ValueError: If no structure is provided or if the structures do not match
        each other by depth tree.
      ValueError: If wrong keyword arguments are provided.
    """
    if not callable(func):
        raise TypeError("func must be callable, got: %s" % func)

    if not structure:
        raise ValueError("Must provide at least one structure")

    check_types = True
    if check_types_dict:
        if set(check_types_dict) != {"check_types"}:
            raise ValueError("Only valid keyword argument is check_types")
        check_types = check_types_dict["check_types"]

    reference = structure[0]
    for other in structure[1:]:
        assert_same_structure(reference, other, check_types=check_types)

    # Group the i-th leaf of every structure and hand each group to `func`.
    leaves_per_structure = [flatten(s) for s in structure]
    results = [func(*group) for group in zip(*leaves_per_structure)]

    return pack_sequence_as(reference, results)


def _yield_flat_up_to(shallow_tree, input_tree):
    """Yields elements `input_tree` partially flattened up to `shallow_tree`."""
    if not is_sequence(shallow_tree):
        # A leaf of the shallow tree: the matching input part is kept whole.
        yield input_tree
        return

    branch_pairs = zip(_yield_value(shallow_tree), _yield_value(input_tree))
    for shallow_branch, input_branch in branch_pairs:
        for input_leaf in _yield_flat_up_to(shallow_branch, input_branch):
            yield input_leaf


def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
    """Asserts that `shallow_tree` is a shallow structure of `input_tree`.

    That is, this function tests if the `input_tree` structure can be created from
    the `shallow_tree` structure by replacing its leaf nodes with deeper
    tree structures.

    Examples:

    The following code will raise an exception:
    ```python
      shallow_tree = ["a", "b"]
      input_tree = ["c", ["d", "e"], "f"]
      assert_shallow_structure(shallow_tree, input_tree)
    ```

    The following code will not raise an exception:
    ```python
      shallow_tree = ["a", "b"]
      input_tree = ["c", ["d", "e"]]
      assert_shallow_structure(shallow_tree, input_tree)
    ```

    Args:
      shallow_tree: an arbitrarily nested structure.
      input_tree: an arbitrarily nested structure.
      check_types: if `True` (default) the sequence types of `shallow_tree` and
        `input_tree` have to be the same.

    Raises:
      TypeError: If `shallow_tree` is a sequence but `input_tree` is not.
      TypeError: If the sequence types of `shallow_tree` are different from
        `input_tree`. Only raised if `check_types` is `True`.
      TypeError: If a dict in either tree has non-sortable keys.
      ValueError: If the sequence lengths of `shallow_tree` are different from
        `input_tree`.
    """
    if is_sequence(shallow_tree):
        if not is_sequence(input_tree):
            raise TypeError(
                "If shallow structure is a sequence, input must also be a sequence. "
                "Input has type: %s." % type(input_tree))

        if check_types and not isinstance(input_tree, type(shallow_tree)):
            raise TypeError(
                "The two structures don't have the same sequence type. Input "
                "structure has type %s, while shallow structure has type %s."
                % (type(input_tree), type(shallow_tree)))

        if len(input_tree) != len(shallow_tree):
            raise ValueError(
                "The two structures don't have the same sequence length. Input "
                "structure has length %s, while shallow structure has length %s."
                % (len(input_tree), len(shallow_tree)))

        # Pair the branches through `_yield_value`, exactly as
        # `_yield_flat_up_to` does. Iterating the trees directly would walk
        # the KEYS of a dict, so the structures stored under dict values would
        # never be checked.
        for shallow_branch, input_branch in zip(_yield_value(shallow_tree),
                                                _yield_value(input_tree)):
            assert_shallow_structure(shallow_branch, input_branch,
                                     check_types=check_types)


def flatten_up_to(shallow_tree, input_tree):
    """Partially flattens `input_tree`, stopping at the depth of `shallow_tree`.

    `shallow_tree` describes how deep the flattening goes. Whatever sits in
    `input_tree` at a position where `shallow_tree` has a leaf is emitted as a
    single element of the result, even if it is itself a nested structure.

    When neither `shallow_tree` nor `input_tree` is a sequence the result is
    the single-element list `[input_tree]`.

    Examples:

    ```python
    input_tree = [[[2, 2], [3, 3]], [[4, 9], [5, 5]]]
    shallow_tree = [[True, True], [False, True]]

    flatten_up_to(shallow_tree, input_tree)
    # -> [[2, 2], [3, 3], [4, 9], [5, 5]]
    flatten_up_to(shallow_tree, shallow_tree)
    # -> [True, True, False, True]
    ```

    ```python
    input_tree = [[('a', 1), [('b', 2), [('c', 3), [('d', 4)]]]]]
    shallow_tree = [['level_1', ['level_2', ['level_3', ['level_4']]]]]

    flatten_up_to(shallow_tree, input_tree)
    # -> [('a', 1), ('b', 2), ('c', 3), ('d', 4)]
    flatten(input_tree)
    # -> ['a', 1, 'b', 2, 'c', 3, 'd', 4]
    ```

    Non-sequence edge cases:

    ```python
    flatten_up_to(0, 0)                  # -> [0]
    flatten_up_to(0, [0, 1, 2])          # -> [[0, 1, 2]]
    flatten_up_to([0, 1, 2], 0)          # -> TypeError
    flatten_up_to([0, 1, 2], [0, 1, 2])  # -> [0, 1, 2]
    ```

    Args:
      shallow_tree: a possibly pruned structure of `input_tree`.
      input_tree: an arbitrarily nested structure or a scalar object.
        Note, numpy arrays are considered scalars.

    Returns:
      A Python list holding the partially flattened `input_tree`.

    Raises:
      TypeError: If `shallow_tree` is a sequence but `input_tree` is not.
      TypeError: If the sequence types of `shallow_tree` are different from
        `input_tree`.
      ValueError: If the sequence lengths of `shallow_tree` are different from
        `input_tree`.
    """
    # Validate compatibility first so that mismatches surface as the documented
    # TypeError / ValueError instead of an obscure failure while iterating.
    assert_shallow_structure(shallow_tree, input_tree)
    partially_flat = []
    partially_flat.extend(_yield_flat_up_to(shallow_tree, input_tree))
    return partially_flat


def map_structure_up_to(shallow_tree, func, *inputs):
    """Maps `func` over inputs that are partially flattened up to `shallow_tree`.

    Every element of `inputs` is flattened only as deep as `shallow_tree`
    goes; `func` is then called once per leaf position of `shallow_tree` with
    the corresponding (possibly still nested) element of each input. The
    results are packed back into the structure of `shallow_tree`.

    This is useful when `func` itself expects structured arguments, so a full
    flatten would go too deep.

    Examples:

    ```python
    ab_tuple = collections.namedtuple("ab_tuple", "a, b")
    op_tuple = collections.namedtuple("op_tuple", "add, mul")
    inp_val = ab_tuple(a=2, b=3)
    inp_ops = ab_tuple(a=op_tuple(add=1, mul=2), b=op_tuple(add=2, mul=3))
    out = map_structure_up_to(inp_val, lambda val, ops: (val + ops.add) * ops.mul,
                              inp_val, inp_ops)

    # Output is: ab_tuple(a=6, b=15)
    ```

    ```python
    data_list = [[2, 4, 6, 8], [[1, 3, 5, 7, 9], [3, 5, 7]]]
    name_list = ['evens', ['odds', 'primes']]
    out = map_structure_up_to(
        name_list,
        lambda name, sec: "first_{}_{}".format(len(sec), name),
        name_list, data_list)

    # Output is: ['first_4_evens', ['first_5_odds', 'first_3_primes']]
    ```

    Args:
      shallow_tree: a shallow tree, common to all the inputs.
      func: callable invoked with one partially flattened element from each
        input, so it must accept `len(inputs)` positional arguments.
      *inputs: arbitrarily nested structures that are compatible with
        `shallow_tree`.

    Returns:
      The results of `func`, arranged with the same structure as
      `shallow_tree`.

    Raises:
      ValueError: If no inputs are given.
      TypeError: If `shallow_tree` is a sequence but an input is not.
      TypeError: If the sequence types of `shallow_tree` are different from
        an input.
      ValueError: If the sequence lengths of `shallow_tree` are different from
        an input.
    """
    if len(inputs) == 0:
        raise ValueError("Cannot map over no sequences")

    # Check every input up front so that no flattening happens when any of
    # them is incompatible with the shallow structure.
    for candidate in inputs:
        assert_shallow_structure(shallow_tree, candidate)

    # One partially flattened list per input; position k of each list lines up
    # with leaf k of `shallow_tree`.
    flat_per_input = []
    for candidate in inputs:
        flat_per_input.append(flatten_up_to(shallow_tree, candidate))

    mapped = []
    for aligned_args in zip(*flat_per_input):
        mapped.append(func(*aligned_args))

    # Restore the layout of `shallow_tree` around the mapped leaves.
    return pack_sequence_as(structure=shallow_tree, flat_sequence=mapped)


def get_traverse_shallow_structure(traverse_fn, structure):
    """Builds a shallow structure of bools by applying `traverse_fn` recursively.

    `traverse_fn` is called on `structure` and, recursively, on every subtree
    it allows to be traversed. For each call it must return either

    * a scalar `bool`: `True` means "descend into every child", `False` means
      "do not descend at all", or
    * a depth=1 structure of `bool`s matching the subtree, selecting child by
      child whether to descend.

    Examples are available in the unit tests (nest_test.py).

    Args:
      traverse_fn: Function taking a substructure and returning a scalar
        `bool` or a depth=1 shallow structure of `bool`s of the same type.
      structure: The structure to traverse.

    Returns:
      A shallow structure containing python bools, which can be passed to
      `map_structure_up_to` and `flatten_up_to`.

    Raises:
      TypeError: if `traverse_fn` returns a sequence for a non-sequence input,
        or a structure with depth higher than 1 for a sequence input,
        or if any leaf values in the returned structure or scalar are not type
        `bool`.
    """
    decision = traverse_fn(structure)
    decision_is_bool = isinstance(decision, bool)

    # Leaf input: only a plain bool is an acceptable answer.
    if not is_sequence(structure):
        if decision_is_bool:
            return decision
        raise TypeError("traverse_fn returned structure: %s for non-structure: %s"
                        % (decision, structure))

    if decision_is_bool:
        if not decision:
            # The whole subtree is off limits; no need to look at children.
            return False
        # The whole subtree is allowed: ask again for each child.
        children = [get_traverse_shallow_structure(traverse_fn, child)
                    for child in _yield_value(structure)]
        return _sequence_like(structure, children)

    if not is_sequence(decision):
        raise TypeError("traverse_fn returned a non-bool scalar: %s for input: %s"
                        % (decision, structure))

    # Per-child selection: the answer must line up with the subtree.
    assert_shallow_structure(decision, structure)
    children = []
    for flag, child in zip(_yield_value(decision), _yield_value(structure)):
        if not isinstance(flag, bool):
            raise TypeError(
                "traverse_fn didn't return a depth=1 structure of bools.  saw: %s "
                " for structure: %s" % (decision, structure))
        if flag:
            children.append(get_traverse_shallow_structure(traverse_fn, child))
        else:
            children.append(False)
    return _sequence_like(structure, children)

In [4]:
# -*- coding: utf-8 -*-
import numpy as np
import tensorflow as tf

#import nest


def mkMask(input_tensor, maxLen):
    """Builds a sequence mask for a tensor of lengths of any rank.

    The lengths are flattened, turned into a 2-D mask with
    `tf.sequence_mask`, and the result is reshaped so that it has the shape
    of `input_tensor` with one extra trailing dimension of size `maxLen`.

    Args:
        input_tensor: tensor of sequence lengths.
        maxLen: size of the appended mask dimension.
    Returns:
        The mask tensor of shape `shape(input_tensor) + [maxLen]`.
    """
    flat_lengths = tf.reshape(input_tensor, shape=(-1,))
    mask_2d = tf.sequence_mask(flat_lengths, maxlen=maxLen)
    target_shape = tf.concat(axis=0, values=[tf.shape(input_tensor), [maxLen]])
    return tf.reshape(mask_2d, target_shape)


def reduce_avg(reduce_target, lengths, dim):
    """Length-aware mean along dimension `dim`.

    Entries of `reduce_target` at positions beyond the given length are zeroed
    with a mask built by `mkMask`, the remainder is summed along `dim`, and
    the sum is divided by the length (plus 1e-30 to avoid division by zero).

    Args:
        reduce_target : shape(d_0, d_1,..,d_dim, .., d_k)
        lengths : shape(d0, .., d_(dim-1))
        dim : which dimension to average, should be a python number
    Returns:
        The masked average, i.e. `reduce_target` with dimension `dim` removed.
    Raises:
        ValueError: if `lengths` is not rank `dim`, or `reduce_target` has
            rank lower than `dim + 1`.
    """
    lengths_rank = len(lengths.get_shape())
    if lengths_rank != dim:
        raise ValueError('Second input tensor should be rank %d, '
                         'while it got rank %d' % (dim, lengths_rank))
    target_rank = len(reduce_target.get_shape())
    if target_rank < dim + 1:
        raise ValueError('First input tensor should be at least rank %d, '
                         'while it got rank %d' % (dim + 1, target_rank))

    # Number of dimensions of the target that come after `dim`.
    n_trailing = target_rank - lengths_rank - 1
    valid = mkMask(lengths, tf.shape(reduce_target)[dim])

    len_shape = tf.shape(lengths)
    valid_shape = tf.shape(valid)
    if n_trailing != 0:
        # Append singleton dimensions so lengths and mask broadcast against
        # the trailing dimensions of the target.
        singletons = [1] * n_trailing
        len_shape = tf.concat(axis=0, values=[len_shape, singletons])
        valid_shape = tf.concat(axis=0, values=[valid_shape, singletons])
    lengths_bcast = tf.reshape(lengths, shape=len_shape)
    valid = tf.reshape(valid, shape=valid_shape)

    kept = reduce_target * tf.cast(valid, dtype=reduce_target.dtype)
    total = tf.reduce_sum(kept, axis=[dim], keep_dims=False)
    return total / (tf.to_float(lengths_bcast) + 1e-30)


def reduce_sum(reduce_target, lengths, dim):
    """Length-aware sum along dimension `dim`.

    Entries of `reduce_target` at positions beyond the given length are zeroed
    with a mask built by `mkMask`; the remainder is summed along `dim`.

    Args:
        reduce_target : shape(d_0, d_1,..,d_dim, .., d_k)
        lengths : shape(d0, .., d_(dim-1))
        dim : which dimension to sum over, should be a python number
    Returns:
        The masked sum, i.e. `reduce_target` with dimension `dim` removed.
    Raises:
        ValueError: if `lengths` is not rank `dim`, or `reduce_target` has
            rank lower than `dim + 1`.
    """
    shape_of_lengths = lengths.get_shape()
    shape_of_target = reduce_target.get_shape()
    if len(shape_of_lengths) != dim:
        raise ValueError(('Second input tensor should be rank %d, ' +
                          'while it got rank %d') % (dim, len(shape_of_lengths)))
    if len(shape_of_target) < dim + 1:
        raise ValueError(('First input tensor should be at least rank %d, ' +
                          'while it got rank %d') % (dim + 1, len(shape_of_target)))

    # Number of dimensions of the target that come after `dim`.
    rank_diff = len(shape_of_target) - len(shape_of_lengths) - 1
    mxlen = tf.shape(reduce_target)[dim]
    mask = mkMask(lengths, mxlen)
    if rank_diff != 0:
        # Append singleton dimensions so the mask broadcasts against the
        # trailing dimensions of the target.
        mask_shape = tf.concat(axis=0, values=[tf.shape(mask), [1] * rank_diff])
        mask = tf.reshape(mask, shape=mask_shape)

    mask_target = reduce_target * tf.cast(mask, dtype=reduce_target.dtype)

    # The reduced dimension is dropped (the reduction's default), so the
    # deprecated `keep_dims` argument is not needed. Unlike the average, the
    # sum never uses a reshaped copy of `lengths`, so none is built.
    return tf.reduce_sum(mask_target, axis=[dim])


def embed_lookup_last_dim(embedding, ids):
    '''
    Looks up rows of a per-example embedding table.

    `embedding` and `ids` are split along their first dimension; for every
    index i, `tf.nn.embedding_lookup(embedding[i], ids[i])` is computed inside
    a `tf.while_loop`, and the per-example results are stacked back together.

    Args:
        embedding: shape(b_sz, tstp, emb_sz)
        ids : shape(b_sz, tstp)
    Returns:
        ret_t: shape(b_sz, tstp, emb_sz)
    '''
    # Number of slices along the leading dimension; drives the loop length.
    num_slices = tf.shape(embedding)[0]

    def _create_ta(name, dtype):
        return tf.TensorArray(dtype=dtype,
                              size=num_slices,
                              tensor_array_name=name)

    # `TensorArray.unstack` / `.stack` replace the pre-1.0 `unpack` / `pack`
    # spellings that the TensorFlow 1.x TensorArray class no longer provides.
    input_ta = _create_ta('input_ta', embedding.dtype).unstack(embedding)
    fetch_ta = _create_ta('fetch_ta', ids.dtype).unstack(ids)
    output_ta = _create_ta('output_ta', embedding.dtype)

    def loop_body(step, out_ta):
        table = input_ta.read(step)    # one slice of `embedding`
        row_ids = fetch_ta.read(step)  # the ids to fetch from that slice
        out_ta = out_ta.write(step, tf.nn.embedding_lookup(table, row_ids))
        return step + 1, out_ta

    _, output_ta = tf.while_loop(cond=lambda step, *_: step < num_slices,
                                 body=loop_body,
                                 loop_vars=(tf.constant(0), output_ta),
                                 swap_memory=True)
    return output_ta.stack()


def entry_stop_gradients(target, mask):
    '''
    Blocks the gradient of selected entries of a tensor.

    The result equals `target` in value. It is assembled from two parts: the
    entries where `mask` is False go through `tf.stop_gradient`, the entries
    where `mask` is True are passed through unchanged.

    Args:
        target: a tensor
        mask: a boolean tensor that can be broadcast against `target`;
            True keeps the gradient, False stops it
    Returns:
        ret: a tensor with the same values as target,
            but with no gradient for the entries where mask is False
    '''
    pass_through = tf.cast(mask, dtype=target.dtype)
    blocked = tf.cast(tf.logical_not(mask), dtype=target.dtype)

    frozen_part = tf.stop_gradient(blocked * target)
    live_part = pass_through * target
    return frozen_part + live_part


def last_dim_linear(inputs, output_size, bias, scope):
    '''
    Applies `linear` to the last dimension of a tensor of any rank.

    All leading dimensions are collapsed into one, the 2-D tensor is passed
    through `linear` (bias initialised at 0.0), and the leading dimensions
    are restored afterwards.

    Args:
        inputs: shape(b_sz, ..., rep_sz); the last dimension must be
            statically known
        output_size: a scalar, python number
        bias: whether `linear` adds a bias term
        scope: variable scope handed to `linear`
    Returns:
        tensor of shape(b_sz, ..., output_size)
    '''
    leading_dims = tf.shape(inputs)[:-1]
    restored_shape = tf.concat(axis=0, values=[leading_dims, [output_size]])

    rep_sz = int(inputs.get_shape()[-1])
    flat_inputs = tf.reshape(inputs, shape=[-1, rep_sz])
    flat_outputs = linear(flat_inputs, output_size, bias=bias,
                          bias_start=0.0, scope=scope)

    return tf.reshape(flat_outputs, shape=restored_shape)


def linear(args, output_size, bias, bias_start=0.0, scope=None):
    """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.

    The inputs are concatenated along axis 1 and multiplied by a single
    "weights" variable of shape [sum of input widths, output_size]; an
    optional "biases" variable is added afterwards. Both variables live in
    the variable scope `scope` (default 'Linear').

    Args:
      args: a 2D Tensor or a list of 2D, batch x n, Tensors.
      output_size: int, second dimension of W[i].
      bias: boolean, whether to add a bias term or not.
      bias_start: starting value to initialize the bias; 0 by default.
      scope: (optional) Variable scope to create parameters in.

    Returns:
      A 2D Tensor with shape [batch x output_size] equal to
      sum_i(args[i] * W[i]), where W[i]s are newly created matrices.

    Raises:
      ValueError: if some of the arguments has unspecified or wrong shape.
    """
    # The `import nest` line of this file is commented out, so the
    # module-qualified `nest.is_sequence` is not guaranteed to resolve. The
    # file defines `is_sequence` at top level and calls it by its bare name
    # elsewhere, so the same spelling is used here.
    if args is None or (is_sequence(args) and not args):
        raise ValueError("`args` must be specified")
    if not is_sequence(args):
        args = [args]

    # Width of the concatenated input; every argument must be 2-D with a
    # statically known second dimension.
    total_arg_size = 0
    shapes = [a.get_shape() for a in args]
    for shape in shapes:
        if shape.ndims != 2:
            raise ValueError("linear is expecting 2D arguments: %s" % shapes)
        if shape[1].value is None:
            raise ValueError("linear expects shape[1] to be provided for shape %s, "
                             "but saw %s" % (shape, shape[1]))
        total_arg_size += shape[1].value

    # Variables take the dtype of the first argument.
    dtype = args[0].dtype

    with tf.variable_scope(scope or 'Linear') as outer_scope:
        weights = tf.get_variable(
            "weights", [total_arg_size, output_size], dtype=dtype)
        if len(args) == 1:
            res = tf.matmul(args[0], weights)
        else:
            res = tf.matmul(tf.concat(args, 1), weights)
        if not bias:
            return res
        with tf.variable_scope(outer_scope) as inner_scope:
            # The bias vector is created without any partitioner.
            inner_scope.set_partitioner(None)
            biases = tf.get_variable(
                "biases", [output_size],
                dtype=dtype,
                initializer=tf.constant_initializer(bias_start, dtype=dtype))
    return tf.nn.bias_add(res, biases)


def masked_softmax(inp, seqLen):
    """Softmax over the last dimension restricted to the first `seqLen` entries.

    Lengths equal to 0 are replaced by 1. Positions outside the length get a
    logit of negative infinity before `tf.nn.softmax` is applied.

    Args:
        inp: logits; its rank must be the rank of `seqLen` plus one.
        seqLen: number of valid entries along the last dimension of `inp`.
    Returns:
        The softmax of the masked logits, same shape as `inp`.
    Raises:
        ValueError: if the rank of `seqLen` is not the rank of `inp` minus one.
    """
    safe_len = tf.where(tf.equal(seqLen, 0), tf.ones_like(seqLen), seqLen)

    inp_rank = len(inp.get_shape())
    len_rank = len(safe_len.get_shape())
    if inp_rank != len_rank + 1:
        raise ValueError('rank of seqLen should be %d, but have the rank %d.\n'
                         % (inp_rank - 1, len_rank))

    valid = mkMask(safe_len, tf.shape(inp)[-1])
    neg_inf_logits = tf.ones_like(inp) * (-np.Inf)
    return tf.nn.softmax(tf.where(valid, inp, neg_inf_logits))


from tensorflow.python.client import device_lib


def get_available_gpus():
    """Returns the names of the local devices whose device type is 'GPU'."""
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names


from tensorflow.python.framework import ops
from tensorflow.python.ops import gen_math_ops


def batch_gather(params, indices, name=None):
    """Gather slices from `params` according to `indices` with leading batch dims.

    This operation assumes that the leading dimensions of `indices` are dense,
    and the gathers on the axis corresponding to the last dimension of `indices`.
    More concretely it computes:
    result[i1, ..., in] = params[i1, ..., in-1, indices[i1, ..., in]]
    Therefore `params` should be a Tensor of shape [A1, ..., AN, B1, ..., BM],
    `indices` should be a Tensor of shape [A1, ..., AN-1, C] and `result` will be
    a Tensor of size `[A1, ..., AN-1, C, B1, ..., BM]`.
    In the case in which indices is a 1D tensor, this operation is equivalent to
    `tf.gather`.
    See also `tf.gather` and `tf.gather_nd`.

    Implementation: the first `ndims` dimensions of `params` are collapsed
    into one, the per-batch indices are converted into offsets into that
    collapsed dimension, a single `tf.gather` is issued, and the result is
    reshaped back.

    Args:
      params: A Tensor. The tensor from which to gather values.
      indices: A Tensor. Must be one of the following types: int32, int64. Index
          tensor. Must be in range `[0, params.shape[axis])`, where `axis` is the
          last dimension of `indices` itself.
      name: A name for the operation (optional).
    Returns:
      A Tensor. Has the same type as `params`.
    Raises:
      ValueError: if `indices` has an unknown shape.
    """

    with ops.name_scope(name):
        indices = ops.convert_to_tensor(indices, name="indices")
        params = ops.convert_to_tensor(params, name="params")
        # Dynamic shapes are used for the reshapes; the static rank of
        # `indices` drives the Python-level loop below.
        indices_shape = tf.shape(indices)
        params_shape = tf.shape(params)
        ndims = indices.shape.ndims
        if ndims is None:
            raise ValueError("batch_gather does not allow indices with unknown "
                             "shape.")
        # Turn the per-batch indices into indices into `params` with its first
        # `ndims` dimensions flattened: walking from the innermost batch
        # dimension outwards, add (position along that dimension) * (product
        # of the sizes of all later dimensions among the first `ndims`).
        batch_indices = indices
        accum_dim_value = 1
        for dim in range(ndims - 1, 0, -1):
            dim_value = params_shape[dim - 1]
            accum_dim_value *= params_shape[dim]
            dim_indices = gen_math_ops._range(0, dim_value, 1)
            dim_indices *= accum_dim_value
            # Shape with `dim_value` on axis `dim - 1` and 1 everywhere else,
            # so the offsets broadcast against `batch_indices`.
            dim_shape = tf.stack([1] * (dim - 1) + [dim_value] + [1] * (ndims - dim),
                                 axis=0)
            batch_indices += tf.reshape(dim_indices, dim_shape)

        flat_indices = tf.reshape(batch_indices, [-1])
        # Dimensions of `params` after the first `ndims` are carried through
        # the gather untouched.
        outer_shape = params_shape[ndims:]
        flat_inner_shape = gen_math_ops.prod(
            params_shape[:ndims], [0], False)

        flat_params = tf.reshape(
            params, tf.concat([[flat_inner_shape], outer_shape], axis=0))
        flat_result = tf.gather(flat_params, flat_indices)
        result = tf.reshape(flat_result, tf.concat([indices_shape, outer_shape], axis=0))
        # Re-attach static shape information: merged batch dimensions, then
        # the last dimension of `indices`, then the trailing dims of `params`.
        final_shape = indices.get_shape()[:ndims - 1].merge_with(
            params.get_shape()[:ndims - 1])
        final_shape = final_shape.concatenate(indices.get_shape()[ndims - 1])
        final_shape = final_shape.concatenate(params.get_shape()[ndims:])
        result.set_shape(final_shape)
        return result

In [5]:
# -*- coding: utf-8 -*-
import numpy as np
import tensorflow as tf


def _softmax_with_mask(logits, lens, axis=-1):
    """Softmax over variable-length sequences.

    The exponentiated logits are multiplied by a float sequence mask, divided
    by their masked sum along `axis`, and the result is clipped to
    [1e-37, 1e+37].

        Args:
            logits: The logits before softmax. Shape is [batch, type_num, class_num]
            lens: The length of the sequence. Shape is [batch, type_num].
            axis: The axis to apply softmax operator on.
        Returns:
             A tensor with softmax-ed values. Same shape as logits.
    """
    keep = tf.sequence_mask(lens, maxlen=tf.shape(logits)[axis], dtype=tf.float32)
    numerator = tf.multiply(tf.exp(logits), keep)
    denominator = tf.expand_dims(tf.reduce_sum(numerator, axis), axis)
    probabilities = tf.div(numerator, denominator)
    return tf.clip_by_value(probabilities, 1e-37, 1e+37)


def _squash(input_tensor):
    """Squash nonlinearity for a capsule layer.

    Each vector along axis 2 is rescaled to length
    norm^2 / (1 + norm^2) while keeping its direction.

        Args:
            input_tensor: Input tensor. Shape is [batch, num_channels, num_atoms] for a
              fully connected capsule layer or
              [batch, num_channels, num_atoms, height, width] for a convolutional
              capsule layer.
        Returns:
            A tensor with same shape as input (rank 3) for output of this layer.
    """
    with tf.name_scope('norm_non_linearity'):
        length = tf.norm(input_tensor, axis=2, keep_dims=True)
        length_sq = length * length
        direction = input_tensor / length
        magnitude = length_sq / (1 + length_sq)
        return direction * magnitude


def _leaky_routing(logits, output_dim):
    """Softmax routing with one extra "leak" slot along axis 2.

    A zero logit is prepended along axis 2, the softmax is taken over the
    extended axis, and the probability assigned to the extra slot is dropped.
    This enables active capsules to be routed to the extra dim if they are not
    a good fit for any of the capsules in layer above.

    Args:
      logits: The original logits. shape is
        [input_capsule_num, output_capsule_num] if fully connected. Otherwise, it
        has two more dimensions.
      output_dim: The number of units in the second dimension of logits.
    Returns:
      Routing probabilities for each pair of capsules. Same shape as logits.
    """
    # Summing zeros over axis 2 yields a single zero logit per position.
    zero_slot = tf.reduce_sum(tf.zeros_like(logits, optimize=True),
                              axis=2, keep_dims=True)
    extended_logits = tf.concat([zero_slot, logits], axis=2)
    extended_probs = tf.nn.softmax(extended_logits, dim=2)
    _, routed = tf.split(extended_probs, [1, output_dim], 2)
    return routed


def _update_routing(votes, biases, logit_shape, num_dims, input_dim, output_dim,
                    num_routing=3, leaky=True):
    """Runs the routing loop and returns the final activations.

    Starting from zero logits, each iteration turns the logits into routing
    weights, sums the weighted votes (plus `biases`), squashes the sum into
    activations, and adds the vote/activation dot products to the logits.

    Args:
      votes: tensor, The transformed outputs of the layer below.
      biases: tensor, Bias variable.
      logit_shape: tensor, shape of the logit to be initialized.
      num_dims: scalar, number of dimensions in votes. For fully connected
        capsule it is 4, for convolutional 6.
      input_dim: scalar, number of capsules in the input layer.
      output_dim: scalar, number of capsules in the output layer.
      num_routing: scalar, Number of routing iterations.
      leaky: boolean, if set use leaky routing.
    Returns:
      A tuple `(activation, logits, route)`: the activation tensor written in
      the last iteration, the logits after the last update, and the routing
      weights used in the last iteration.
    """
    # Axes beyond the first four keep their position in both permutations.
    extra_axes = list(range(4, num_dims))
    votes_perm = [3, 0, 1, 2] + extra_axes
    route_perm = [1, 2, 3, 0] + extra_axes
    votes_trans = tf.transpose(votes, votes_perm)

    # Multiples used to repeat the activation along the input-capsule axis.
    tile_multiples = [1] * num_dims
    tile_multiples[1] = input_dim

    def _continue(step, logits, activations, routes):
        return step < num_routing

    def _body(step, logits, activations, routes):
        """Routing while loop."""
        if leaky:
            route = _leaky_routing(logits, output_dim)
        else:
            route = tf.nn.softmax(logits, dim=2)
        weighted_votes = tf.transpose(route * votes_trans, route_perm)
        preactivate = tf.reduce_sum(weighted_votes, axis=1) + biases
        activation = _squash(preactivate)
        activations = activations.write(step, activation)
        routes = routes.write(step, route)
        # distances: [batch, input_dim, output_dim]
        act_replicated = tf.tile(tf.expand_dims(activation, 1), tile_multiples)
        distances = tf.reduce_sum(votes * act_replicated, axis=3)
        logits += distances
        return (step + 1, logits, activations, routes)

    activations = tf.TensorArray(
        dtype=tf.float32, size=num_routing, clear_after_read=False)
    routes = tf.TensorArray(
        dtype=tf.float32, size=num_routing, clear_after_read=False)
    initial_logits = tf.fill(logit_shape, 0.0)
    start = tf.constant(0, dtype=tf.int32)
    _, logits, activations, routes = tf.while_loop(
        _continue,
        _body,
        loop_vars=[start, initial_logits, activations, routes],
        swap_memory=True)

    last = num_routing - 1
    return activations.read(last), logits, routes.read(last)


class Capsule:
    """Fully connected capsule layer with routing between input and output capsules.

    Owns a weight variable 'w' of shape
    [1, input_dim, input_atoms, output_dim * output_atoms] and a zero-initialised
    bias variable 'b' of shape [output_dim, output_atoms], both created under the
    variable scope `layer_name`.
    """

    def __init__(self, input_dim, input_atoms, output_dim, output_atoms, layer_name):
        """Stores the layer sizes and creates the layer variables.

        Args:
            input_dim: number of input capsules.
            input_atoms: size of each input capsule vector.
            output_dim: number of output capsules.
            output_atoms: size of each output capsule vector.
            layer_name: variable scope under which 'w' and 'b' are created.
        """
        self.input_dim = input_dim
        self.input_atoms = input_atoms
        self.output_dim = output_dim
        self.output_atoms = output_atoms

        weight_shape = [1, input_dim, input_atoms, output_dim * output_atoms]
        bias_shape = [output_dim, output_atoms]
        with tf.variable_scope(layer_name):
            self.weights = tf.get_variable(name='w', shape=weight_shape,
                                           dtype=tf.float32)
            self.biases = tf.get_variable(name='b', shape=bias_shape,
                                          dtype=tf.float32,
                                          initializer=tf.zeros_initializer())

    def vote_and_route(self, input_tensor, leaky=False):
        """Computes the votes of the input capsules and routes them.

        Args:
            input_tensor: input capsules; it is multiplied against `self.weights`
                and summed over axis 2, so presumably
                [batch, input_dim, input_atoms] - TODO confirm with callers.
            leaky: forwarded to `_update_routing`.
        Returns:
            The tuple returned by `_update_routing` (three routing iterations):
            activations, routing logits, and last routing weights.
        """
        vote_width = self.output_dim * self.output_atoms
        with tf.name_scope('Wx_plus_b'):
            # Repeat the input along a new last axis so it can be multiplied
            # element-wise with the weights, then contract the atom axis.
            replicated = tf.tile(tf.expand_dims(input_tensor, -1),
                                 [1, 1, 1, vote_width])
            flat_votes = tf.reduce_sum(replicated * self.weights, axis=2)
            votes = tf.reshape(
                flat_votes,
                [-1, self.input_dim, self.output_dim, self.output_atoms])
        with tf.name_scope('routing'):
            batch = tf.shape(input_tensor)[0]
            logit_shape = tf.stack([batch, self.input_dim, self.output_dim])
            activations, weights_c, route = _update_routing(
                votes=votes,
                biases=self.biases,
                logit_shape=logit_shape,
                num_dims=4,
                input_dim=self.input_dim,
                output_dim=self.output_dim,
                leaky=leaky,
                num_routing=3)
        return activations, weights_c, route

In [6]:
# -*- coding: utf-8 -*-
import numpy as np
import tensorflow as tf
from tensorflow.python.layers import base as base_layer

#from TfUtils import mkMask

# Small constant added under the square root in `_squash` so the division
# stays finite for a vector with zero norm.
_EPSILON = 1e-9
# Negative infinity. NOTE(review): no use of this constant is visible in the
# reviewed part of the file - confirm whether it is still needed.
_MIN_NUM = -np.Inf


class Capsule(base_layer.Layer):
    """Capsule layer that routes a masked sequence of input capsules.

    The layer projects the input capsules with `shared_routing_uhat` and runs
    `masked_routing_iter` on the projection. It also owns the variable 'w_rr'
    of shape (1, 1, wrr_dim[0], wrr_dim[1]) that is handed to the routing only
    when re-routing is requested.
    """

    def __init__(self, out_caps_num, out_caps_dim, iter_num=3, wrr_dim=(1, 1), reuse=None):
        """Stores the configuration and creates the 'w_rr' variable.

        Args:
            out_caps_num: number of output capsules.
            out_caps_dim: dimension of each output capsule.
            iter_num: number of routing iterations.
            wrr_dim: the last two dimensions of the 'w_rr' variable.
            reuse: forwarded to the base layer as `_reuse`.
        """
        super(Capsule, self).__init__(_reuse=reuse)
        self.out_caps_num = out_caps_num
        self.out_caps_dim = out_caps_dim
        self.iter_num = iter_num
        self.reuse = reuse
        rr_shape = (1, 1, wrr_dim[0], wrr_dim[1])
        self.w_rr = tf.get_variable(name='w_rr', shape=rr_shape)

    def call(self, in_caps, seqLen, caps_ihat=None, re_routing=False):
        """Projects `in_caps` and routes them.

        Args:
            in_caps: input capsules, passed to `shared_routing_uhat`.
            seqLen: sequence lengths, passed to `masked_routing_iter`.
            caps_ihat: optional tensor forwarded to `masked_routing_iter`.
            re_routing: when True, `self.w_rr` is passed to the routing;
                otherwise `w_rr` is None.
        Returns:
            (V, C, B): the first, third and fourth values returned by
            `masked_routing_iter`.
        """
        caps_uhat = shared_routing_uhat(in_caps, self.out_caps_num,
                                        self.out_caps_dim, scope='rnn_caps_uhat')
        routing_weights = self.w_rr if re_routing else None
        V, _, C, B = masked_routing_iter(caps_uhat, seqLen, self.iter_num,
                                         caps_ihat, w_rr=routing_weights)
        return V, C, B


def shared_routing_uhat(caps, out_caps_num, out_caps_dim, scope=None):
    '''
    Projects every input capsule into one prediction per output capsule.

    A single dense layer with tanh activation maps each capsule to
    out_caps_num * out_caps_dim values, which are then split into
    out_caps_num vectors of size out_caps_dim.

    Args:
        caps: # shape(b_sz, caps_num, caps_dim)
        out_caps_num: #number of output capsule
        out_caps_dim: #dimension of output capsule
        scope: variable scope for the dense layer,
            'shared_routing_uhat' when not given
    Returns:
        caps_uhat: shape(b_sz, caps_num, out_caps_num, out_caps_dim)
    '''
    dynamic_shape = tf.shape(caps)
    b_sz, tstp = dynamic_shape[0], dynamic_shape[1]

    with tf.variable_scope(scope or 'shared_routing_uhat'):
        # shape(b_sz, caps_num, out_caps_num*out_caps_dim)
        projected = tf.layers.dense(caps, out_caps_num * out_caps_dim,
                                    activation=tf.tanh)
        return tf.reshape(projected,
                          shape=[b_sz, tstp, out_caps_num, out_caps_dim])


def masked_routing_iter(caps_uhat, seqLen, iter_num, caps_ihat=None, w_rr=None):
    '''
    Routing over a masked sequence of prediction vectors.

    Each iteration turns the logits B into coupling coefficients with a softmax
    over the output capsules, zeroes the coefficients of padded time steps,
    sums the weighted predictions over time, squashes the sum and adds the
    agreement between predictions and squashed output to B. When `caps_ihat`
    is given, an additional term 0.1 * (caps_uhat x w_rr x caps_ihat) is added
    to B in every iteration.

    Args:
        caps_uhat:  shape(b_sz, tstp, out_caps_num, out_caps_dim)
        seqLen:     shape(b_sz); lengths equal to 0 are treated as 1
        iter_num:   number of iteration, must be positive
        caps_ihat:  optional tensor for the re-routing term; it is tiled
                    `tstp` times along axis 1 and used as the right operand of
                    a matmul whose last result dimension is squeezed
        w_rr:       weight tensor of the re-routing term, required when
                    `caps_ihat` is given; it is tiled to the batch and time
                    dimensions of `caps_uhat`

    Returns:
        V_ret:      #shape(b_sz, out_caps_num, out_caps_dim), squashed output
                    of the last iteration
        S_ret:      #shape(b_sz, out_caps_num, out_caps_dim), unsquashed sum
                    of the last iteration
        C_ret:      #shape(iter_num, b_sz, tstp, out_caps_num), the masked
                    coupling coefficients of every iteration
        B_logits:   #shape(b_sz, tstp, out_caps_num), the logits that entered
                    the last iteration

    Raises:
        ValueError: if `caps_ihat` is given without `w_rr`.
    '''
    assert iter_num > 0
    # Identity test: `== None` on a tensor or array is an overloaded comparison
    # and is not a reliable "argument missing" check.
    use_re_routing = caps_ihat is not None
    if use_re_routing and w_rr is None:
        raise ValueError('`w_rr` must be provided when `caps_ihat` is given')

    b_sz = tf.shape(caps_uhat)[0]
    tstp = tf.shape(caps_uhat)[1]
    out_caps_num = int(caps_uhat.get_shape()[2])
    seqLen = tf.where(tf.equal(seqLen, 0), tf.ones_like(seqLen), seqLen)
    mask = mkMask(seqLen, tstp)  # shape(b_sz, tstp)
    floatmask = tf.cast(tf.expand_dims(mask, axis=-1), dtype=tf.float32)  # shape(b_sz, tstp, 1)

    # The re-routing term depends neither on B nor on V, so it is built once
    # instead of once per iteration.
    re_routing_term = None
    if use_re_routing:
        w_tiled = tf.tile(w_rr, [b_sz, tstp, 1, 1])
        ihat_tiled = tf.tile(caps_ihat, [1, tstp, 1, 1])
        re_routing_term = 0.1 * tf.squeeze(
            tf.matmul(tf.matmul(caps_uhat, w_tiled), ihat_tiled),
            axis=-1)  # shape(b_sz, tstp, out_caps_num)

    B = tf.zeros([b_sz, tstp, out_caps_num], dtype=tf.float32)
    C_list = list()
    for _ in range(iter_num):
        # Remember the logits that produce this iteration's coefficients; the
        # value kept after the loop belongs to the last iteration.
        B_logits = B
        C = tf.nn.softmax(B, axis=2)  # shape(b_sz, tstp, out_caps_num)
        C = tf.expand_dims(C * floatmask, axis=-1)  # shape(b_sz, tstp, out_caps_num, 1)
        weighted_uhat = C * caps_uhat  # shape(b_sz, tstp, out_caps_num, out_caps_dim)
        C_list.append(C)
        S = tf.reduce_sum(weighted_uhat, axis=1)  # shape(b_sz, out_caps_num, out_caps_dim)
        V = _squash(S, axes=[2])  # shape(b_sz, out_caps_num, out_caps_dim)
        V = tf.expand_dims(V, axis=1)  # shape(b_sz, 1, out_caps_num, out_caps_dim)
        agreement = tf.reduce_sum(caps_uhat * V, axis=-1)  # shape(b_sz, tstp, out_caps_num)
        if re_routing_term is None:
            B = agreement + B
        else:
            B = agreement + re_routing_term + B
    V_ret = tf.squeeze(V, axis=[1])  # shape(b_sz, out_caps_num, out_caps_dim)
    S_ret = S
    C_ret = tf.squeeze(tf.stack(C_list), axis=[4])
    return V_ret, S_ret, C_ret, B_logits


def margin_loss1(y_true, y_pred):
    """
    Margin loss with margins 0.9 / 0.1 and a 0.5 weight on the negative term.

    The per-class loss is summed over axis 1 and averaged over the remaining
    axis. Graph assertions fail the computation if the per-class loss contains
    an infinite or NaN value.

    :param y_true: [None, n_classes]
    :param y_pred: [None, n_classes]
    :return: a scalar loss value.
    """
    present_term = y_true * tf.square(tf.maximum(0., 0.9 - y_pred))
    absent_term = 0.5 * (1 - y_true) * tf.square(tf.maximum(0., y_pred - 0.1))
    per_class = present_term + absent_term

    has_inf = tf.reduce_any(tf.is_inf(per_class))
    no_inf_check = tf.Assert(tf.logical_not(has_inf),
                             ['assert_inf_L', per_class], summarize=100)
    has_nan = tf.reduce_any(tf.is_nan(per_class))
    no_nan_check = tf.Assert(tf.logical_not(has_nan),
                             ['assert_nan_L', per_class], summarize=100)

    # The reduction only runs after both checks have passed.
    with tf.control_dependencies([no_inf_check, no_nan_check]):
        return tf.reduce_mean(tf.reduce_sum(per_class, axis=1))


def _squash(in_caps, axes):
    '''
    Squashing function corresponding to Eq. 1

    Every vector spanned by `axes` is multiplied by
    |v|^2 / (1 + |v|^2) / sqrt(|v|^2 + _EPSILON).

    Args:
        in_caps:  a tensor
        axes:     dimensions along which to apply squash

    Returns:
        vec_squashed:   squashed tensor, same shape as in_caps

    '''
    sq_norm = tf.reduce_sum(tf.square(in_caps), axis=axes, keepdims=True)
    shrink = sq_norm / (1 + sq_norm)
    # _EPSILON keeps the square root away from zero for all-zero vectors.
    scale = shrink / tf.sqrt(sq_norm + _EPSILON)
    return scale * in_caps  # element-wise

In [7]:
# -*- coding: utf-8 -*-
import tensorflow as tf




def build_model(input_data, input_size, sequence_length, slot_size, intent_size, intent_dim, layer_size, embed_dim,
                num_rnn=1, isTraining=True, iter_slot=2, iter_intent=2, re_routing=True):
    """Build the slot/intent capsule graph on top of a bidirectional LSTM encoder.

    Pipeline: embedding lookup -> stacked bi-LSTM -> slot capsule layer ->
    intent capsule layer. With `re_routing` enabled, the slot capsule layer is
    run a second time (sharing variables) with the strongest intent capsule
    fed back in as `caps_ihat`.

    Args:
        input_data: integer tensor of word ids (fed to the embedding lookup).
        input_size: number of rows in the embedding table.
        sequence_length: per-example lengths passed to the RNN and slot capsule.
        slot_size: number of slot capsules / width of the slot logits.
        intent_size: number of intent capsules.
        intent_dim: dimensionality of each intent capsule.
        layer_size: LSTM unit count, also used as the slot capsule dimension.
        embed_dim: embedding width.
        num_rnn: number of LSTM layers stacked per direction.
        isTraining: when equal to True, both stacks are wrapped with dropout
            (keep probability 0.8 on inputs and outputs).
        iter_slot: routing iterations for the slot capsule layer.
        iter_intent: routing iterations for the intent capsule layer.
        re_routing: whether to run the second, intent-informed slot routing.

    Returns:
        A list ``[slot_logits, intent_capsule, slot_routing_weight,
        intent_routing_weight]`` where slot_logits is reshaped to
        ``[-1, slot_size]``. With re-routing the slot entries come from the
        second routing pass.
    """
    rnn = tf.contrib.rnn

    # Forward stack is constructed first, backward stack second.
    cell_fw_list, cell_bw_list = [
        rnn.MultiRNNCell([rnn.BasicLSTMCell(layer_size) for _ in range(num_rnn)])
        for _direction in range(2)
    ]
    if isTraining == True:
        cell_fw_list, cell_bw_list = [
            rnn.DropoutWrapper(stack, input_keep_prob=0.8, output_keep_prob=0.8)
            for stack in (cell_fw_list, cell_bw_list)
        ]

    embedding = tf.get_variable('embedding', [input_size, embed_dim],
                                initializer=tf.contrib.layers.xavier_initializer())
    inputs = tf.nn.embedding_lookup(embedding, input_data)

    with tf.variable_scope('slot_capsule'):
        encoder_states, _, _ = rnn.stack_bidirectional_dynamic_rnn(
            [cell_fw_list],
            [cell_bw_list],
            inputs=inputs,
            sequence_length=sequence_length,
            dtype=tf.float32)
        # Kept in `sc` so the very same layer object can be re-invoked below.
        sc = Capsule(slot_size, layer_size, reuse=tf.AUTO_REUSE, iter_num=iter_slot,
                     wrr_dim=(layer_size, intent_dim))
        slot_capsule, routing_weight, routing_logits = sc(encoder_states, sequence_length, re_routing=False)
    with tf.variable_scope('slot_proj'):
        slot_p = tf.reshape(routing_logits, [-1, slot_size])
    with tf.variable_scope('intent_capsule'):
        intent_layer = Capsule(intent_size, intent_dim, reuse=tf.AUTO_REUSE, iter_num=iter_intent)
        intent_capsule, intent_routing_weight, _ = intent_layer(slot_capsule, slot_size)
    with tf.variable_scope('intent_proj'):
        intent = intent_capsule

    if not re_routing:
        return [slot_p, intent, routing_weight, intent_routing_weight]

    # Select, per example, the intent capsule with the largest norm by masking
    # all other capsules to zero and summing over the capsule axis.
    intent_norms = tf.norm(intent_capsule, axis=-1)
    winner_onehot = tf.one_hot(tf.argmax(intent_norms, axis=-1), intent_size)
    winner_mask = tf.tile(tf.expand_dims(winner_onehot, 2),
                          [1, 1, tf.shape(intent_capsule)[2]])
    intent_capsule_max = tf.reduce_sum(tf.multiply(intent_capsule, tf.cast(winner_mask, tf.float32)),
                                       axis=1,
                                       keepdims=False)
    # Insert singleton axes at positions 1 and 3 for the routing step.
    caps_ihat = tf.expand_dims(tf.expand_dims(intent_capsule_max, 1), 3)

    with tf.variable_scope('slot_capsule', reuse=True):
        slot_capsule_new, routing_weight_new, routing_logits_new = sc(
            encoder_states, sequence_length, caps_ihat=caps_ihat, re_routing=True)
    with tf.variable_scope('slot_proj', reuse=True):
        slot_p_new = tf.reshape(routing_logits_new, [-1, slot_size])
    return [slot_p_new, intent, routing_weight_new, intent_routing_weight]

In [8]:
# -*- coding: utf-8 -*-
import argparse
import logging
import os

import numpy as np
import tensorflow as tf



# Processing Units logs: set to True to have TensorFlow log op/device placement.
log_device_placement = False
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Command-line style configuration. Every option has a default, so the parser
# can be evaluated with an empty argument list (see parse_args call below).
parser = argparse.ArgumentParser(allow_abbrev=False)
# Network
parser.add_argument("--num_units", type=int, default=512, help="Network size.", dest='layer_size')
parser.add_argument("--embed_dim", type=int, default=1024, help="Embedding dim.", dest='embed_dim')
parser.add_argument("--intent_dim", type=int, default=128, help="Intent dim.", dest='intent_dim')
parser.add_argument("--model_type", type=str, default='full', help="""full(default) | without_rerouting.
                                                                    full: full model with re-routing
                                                                    without_rerouting: model without re-routing""")
parser.add_argument("--num_rnn", type=int, default=1, help="Num of layers for stacked RNNs.")
parser.add_argument("--iter_slot", type=int, default=2, help="Num of iteration for slots.")
parser.add_argument("--iter_intent", type=int, default=2, help="Num of iteration for intents.")

# Training Environment
parser.add_argument("--optimizer", type=str, default='adam', help="Optimizer.")
parser.add_argument("--batch_size", type=int, default=8, help="Batch size.")
# Help text previously read "Batch size." (copy-paste slip).
parser.add_argument("--learning_rate", type=float, default=0.001, help="Learning rate.")
parser.add_argument("--margin", type=float, default=0.4, help="Margin in the max-margin loss.")
parser.add_argument("--downweight", type=float, default=0.5, help="Downweight for the max-margin loss.")
parser.add_argument("--max_epochs", type=int, default=20, help="Max epochs to train.")
# store_false: early stopping is ON unless --no_early_stop is passed.
parser.add_argument("--no_early_stop", action='store_false', dest='early_stop',
                    help="Disable early stop, which is based on sentence level accuracy.")
parser.add_argument("--patience", type=int, default=10, help="Patience to wait before stop.")
parser.add_argument("--run_name", type=str, default='capsule_nlu', help="Run name.")

# Model and Data
parser.add_argument("--dataset", type=str, default='snips', help="""Type 'snips' to use dataset provided by us or enter what ever you named your own dataset.
                Note, if you don't want to use this part, enter --dataset=''. It can not be None""")
parser.add_argument("--model_path", type=str, default='./model', help="Path to save model.")
parser.add_argument("--vocab_path", type=str, default='./vocab', help="Path to vocabulary files.")
parser.add_argument("--train_data_path", type=str, default='train', help="Path to training data files.")
parser.add_argument("--test_data_path", type=str, default='test', help="Path to testing data files.")
parser.add_argument("--valid_data_path", type=str, default='valid', help="Path to validation data files.")
parser.add_argument("--input_file", type=str, default='seq.in', help="Input file name.")
parser.add_argument("--slot_file", type=str, default='seq.out', help="Slot file name.")
parser.add_argument("--intent_file", type=str, default='label', help="Intent file name.")

# Parse an explicitly empty argument list rather than sys.argv, so only the
# defaults above apply (edit the string to override options).
arg = parser.parse_args(''.split())
logs_path = './log/' + arg.run_name

# Print arguments
for k, v in sorted(vars(arg).items()):
    print(k, '=', v)
print()

# Optimizers: look up the optimizer class by its --optimizer name and
# instantiate only the selected one with the configured learning rate.
_optimizer_classes = {
    'adam': tf.train.AdamOptimizer,
    'rmsprop': tf.train.RMSPropOptimizer,
    'adadelta': tf.train.AdadeltaOptimizer,
    'adagrad': tf.train.AdagradOptimizer,
}
_optimizer_cls = _optimizer_classes.get(arg.optimizer)
if _optimizer_cls is None:
    print('unknown optimizer!')
    exit(1)
else:
    opt = _optimizer_cls(learning_rate=arg.learning_rate)

# Ablation: translate --model_type into the re-routing on/off switch.
_rerouting_by_model_type = {'full': True, 'without_rerouting': False}
if arg.model_type in _rerouting_by_model_type:
    re_routing = _rerouting_by_model_type[arg.model_type]
else:
    print('unknown model type!')
    exit(1)

# Full path to data will be: ./data/ + dataset + train/test/valid
_known_dataset_messages = {
    'snips': 'use snips dataset',
    'atis': 'use atis dataset',
}
if arg.dataset is None:
    print('name of dataset can not be None')
    exit(1)
elif arg.dataset in _known_dataset_messages:
    print(_known_dataset_messages[arg.dataset])
else:
    print('use own dataset: ', arg.dataset)

# Resolve the three split directories underneath ./data/<dataset>.
_dataset_root = os.path.join('./data', arg.dataset)
full_train_path = os.path.join(_dataset_root, arg.train_data_path)
full_test_path = os.path.join(_dataset_root, arg.test_data_path)
full_valid_path = os.path.join(_dataset_root, arg.valid_data_path)

# Create vocabulary files from the training split, then load them back.
# NOTE: the locations below are fixed Google Drive paths; --vocab_path and
# --dataset are not consulted in this step.
_train_dir = '/content/drive/My Drive/Capsule-NLU/data/snips/train'
_vocab_dir = '/content/drive/My Drive/Capsule-NLU/vocab'
_in_vocab_file = os.path.join(_vocab_dir, 'in_vocab')
_slot_vocab_file = os.path.join(_vocab_dir, 'slot_vocab')
_intent_vocab_file = os.path.join(_vocab_dir, 'intent_vocab')

createVocabulary(os.path.join(_train_dir, arg.input_file), _in_vocab_file)
createVocabulary(os.path.join(_train_dir, arg.slot_file), _slot_vocab_file)
# Intent labels get neither the _PAD nor the _UNK entry.
createVocabulary(os.path.join(_train_dir, arg.intent_file), _intent_vocab_file,
                 pad=False, unk=False)

# Load vocab
in_vocab = loadVocabulary(_in_vocab_file)
slot_vocab = loadVocabulary(_slot_vocab_file)
intent_vocab = loadVocabulary(_intent_vocab_file)
intent_dim = arg.intent_dim


# Create training model (starting from an empty default graph)
tf.reset_default_graph()

# Graph inputs; batch size and sequence length are left dynamic.
input_data = tf.placeholder(tf.int32, [None, None], name='inputs')  # word ids
sequence_length = tf.placeholder(tf.int32, [None], name="sequence_length")  # sequence length
global_step = tf.Variable(0, trainable=False, name='global_step')
slots = tf.placeholder(tf.int32, [None, None], name='slots')  # slot ids
slot_weights = tf.placeholder(tf.float32, [None, None], name='slot_weights')  # sequence mask
intent = tf.placeholder(tf.int32, [None], name='intent')  # intent label

_train_model_options = dict(
    layer_size=arg.layer_size,
    embed_dim=arg.embed_dim,
    num_rnn=arg.num_rnn,
    isTraining=True,
    iter_slot=arg.iter_slot,
    iter_intent=arg.iter_intent,
    re_routing=re_routing,
)
with tf.variable_scope('model'):
    # Slot size is the slot vocabulary minus two entries
    # (presumably the _PAD/_UNK tokens - confirm against loadVocabulary).
    training_outputs = build_model(
        input_data,
        len(in_vocab['vocab']),
        sequence_length,
        len(slot_vocab['vocab']) - 2,
        len(intent_vocab['vocab']),
        intent_dim,
        **_train_model_options)

slots_shape = tf.shape(slots)
slots_reshape = tf.reshape(slots, [-1])
# build_model returns [slot logits, intent capsules, slot routing, intent routing].
slot_outputs, intent_outputs, slot_routing_weight, intent_routing_weight = training_outputs[:4]
# The norm over the last axis of the intent capsules is used as the intent score.
intent_outputs_norm = tf.norm(intent_outputs, axis=-1)

# Define slot loss: masked softmax cross-entropy, averaged per example over
# the positions selected by slot_weights.
with tf.variable_scope('slot_loss'):
    slots_reshape_onehot = tf.one_hot(slots_reshape, len(slot_vocab['vocab']) - 2)  # e.g. [16*18, 74]
    crossent = tf.reshape(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels=slots_reshape_onehot, logits=slot_outputs),
        slots_shape)
    slot_loss = tf.reduce_sum(crossent * slot_weights, 1)
    # The small constant keeps the division defined when a row of weights sums to zero.
    total_size = tf.reduce_sum(slot_weights, 1) + 1e-12
    slot_loss /= total_size

# Define intent loss: margin loss on the intent capsule norms, averaged over classes.
with tf.variable_scope('intent_loss'):
    intent_onehot = tf.one_hot(intent, len(intent_vocab['vocab']))
    marginloss = margin_loss(intent_onehot, intent_outputs_norm, arg.margin, arg.downweight)
    intent_loss = tf.reduce_mean(marginloss, axis=-1)

# Specify the learning environment.
# Trainable variables are assigned to a loss by substring match on their name:
# 'slot' or 'embedding' -> slot loss, 'intent' -> intent loss.
params = tf.trainable_variables()
slot_params = [var for var in params if 'slot' in var.name or 'embedding' in var.name]
intent_params = [var for var in params if 'intent' in var.name]

gradients_slot = tf.gradients(slot_loss, slot_params)
gradients_intent = tf.gradients(intent_loss, intent_params)

# Each gradient group is clipped separately to a global norm of 5.0.
clipped_gradients_slot, norm_slot = tf.clip_by_global_norm(gradients_slot, 5.0)
clipped_gradients_intent, norm_intent = tf.clip_by_global_norm(gradients_intent, 5.0)
gradient_norm_slot, gradient_norm_intent = norm_slot, norm_intent

# Only the intent update is given global_step, so the step counter is
# incremented by that op alone.
update_slot = opt.apply_gradients(zip(clipped_gradients_slot, slot_params))
update_intent = opt.apply_gradients(zip(clipped_gradients_intent, intent_params), global_step=global_step)

# Fetch list for one training step; the loop reads index 0 (step),
# 1 (slot loss) and 2 (intent loss).
training_outputs = [
    global_step,
    slot_loss,
    intent_loss,
    slot_routing_weight,
    intent_routing_weight,
    update_slot,
    update_intent,
    gradient_norm_slot,
    gradient_norm_intent,
]
inputs = [input_data, sequence_length, slots, slot_weights, intent]

# Create Inference Model: same 'model' scope with reuse=True and
# isTraining=False (no dropout wrappers).
_inference_model_options = dict(
    layer_size=arg.layer_size,
    embed_dim=arg.embed_dim,
    num_rnn=arg.num_rnn,
    isTraining=False,
    iter_slot=arg.iter_slot,
    iter_intent=arg.iter_intent,
    re_routing=re_routing,
)
with tf.variable_scope('model', reuse=True):
    inference_outputs = build_model(
        input_data,
        len(in_vocab['vocab']),
        sequence_length,
        len(slot_vocab['vocab']) - 2,
        len(intent_vocab['vocab']),
        intent_dim,
        **_inference_model_options)

_inf_slot_logits, _inf_intent_caps, _inf_slot_routing, _inf_intent_routing = inference_outputs[:4]
inference_intent_outputs_norm = tf.norm(_inf_intent_caps, axis=-1)
# Final fetch order: slot logits, intent capsules, intent norms,
# slot routing weights, intent routing weights.
inference_outputs = [
    _inf_slot_logits,
    _inf_intent_caps,
    inference_intent_outputs_norm,
    _inf_slot_routing,
    _inf_intent_routing,
]
inference_inputs = [input_data, sequence_length]

saver = tf.train.Saver()

# Start Training
with tf.Session(config=tf.ConfigProto(allow_soft_placement=False, log_device_placement=log_device_placement)) as sess:
    sess.run(tf.global_variables_initializer())
    logging.info('Training Start')
    epochs = 0
    eval_slot_loss = 0.0
    eval_intent_loss = 0.0
    eval_slot_p = 0.0
    data_processor = None
    line = 0
    num_loss = 0
    step = 0
    no_improve = 0
    save_path = None

    # variables to store highest values among epochs, only use 'valid_err' for now
    valid_slot = 0
    test_slot = 0
    valid_intent = 0
    test_intent = 0
    valid_err = 0
    test_err = 0

    # NOTE: the data splits are read from fixed Google Drive locations;
    # --dataset and the *_data_path options are not consulted here.
    data_root = '/content/drive/My Drive/Capsule-NLU/data/snips'
    train_dir = os.path.join(data_root, 'train')
    valid_dir = os.path.join(data_root, 'valid')
    test_dir = os.path.join(data_root, 'test')

    # Checkpoints are written under --model_path (default './model'); make
    # sure the directory exists before the first save.
    os.makedirs(arg.model_path, exist_ok=True)

    # Load from saved checkpoints
    # saver.restore(sess, './model/' + arg.run_name + ".ckpt")
    # logging.info("Model restored.")


    def valid(in_path, slot_path, intent_path):
        """Run the inference graph over one data split and log its metrics.

        Args:
            in_path: path of the input-sentence file.
            slot_path: path of the slot-label file.
            intent_path: path of the intent-label file.

        Returns:
            Tuple ``(f1, accuracy, semantic_error, pred_intents,
            correct_intents, slot_outputs, correct_slots, input_words)``.
            ``accuracy`` is the intent accuracy in percent. Despite its name,
            ``semantic_error`` is the percentage of sentences whose intent
            AND every slot tag are correct (higher is better).

        Raises:
            ValueError: if a predicted and a reference slot sequence differ
                in length.
        """
        # Imported lazily so scikit-learn is only required once evaluation runs.
        from sklearn.metrics import classification_report

        data_processor_valid = DataProcessor(in_path, slot_path, intent_path, in_vocab, slot_vocab,
                                             intent_vocab)
        pred_intents = []
        correct_intents = []
        slot_outputs = []
        correct_slots = []
        input_words = []

        while True:
            in_data, slot_data, slot_weight, length, intents, in_seq, slot_seq, intent_seq = \
                data_processor_valid.get_batch(arg.batch_size)
            feed_dict = {input_data.name: in_data, sequence_length.name: length}
            if len(in_data) != 0:
                ret = sess.run(inference_outputs, feed_dict)
                # ret[2] holds the intent capsule norms; the largest one wins.
                pred_intents.extend(np.argmax(norms) for norms in ret[2])
                correct_intents.extend(intents)

                # Slot tags are decoded from the last entry along the first
                # axis of the slot routing weights (ret[3]).
                pred_slots = ret[3][-1, :, :, :].reshape((slot_data.shape[0], slot_data.shape[1], -1))
                for p, t, i, l, _ in zip(pred_slots, slot_data, in_data, length, slot_seq):
                    p = np.argmax(p, 1)
                    # Only the first l positions are decoded.
                    slot_outputs.append([slot_vocab['rev'][p[j]] for j in range(l)])
                    correct_slots.append([slot_vocab['rev'][t[j]] for j in range(l)])
                    input_words.append([in_vocab['rev'][i[j]] for j in range(l)])
            if data_processor_valid.end == 1:
                break

        pred_intents = np.array(pred_intents)
        correct_intents = np.array(correct_intents)
        logging.info(classification_report(y_true=correct_intents, y_pred=pred_intents, digits=4))

        intent_hits = (pred_intents == correct_intents)
        accuracy = np.mean(intent_hits.astype(float)) * 100.0

        # A sentence counts as correct only if the intent is right and every
        # slot tag matches the reference.
        sentence_hits = intent_hits.copy()
        for index, (t, p) in enumerate(zip(correct_slots, slot_outputs)):
            if len(t) != len(p):
                raise ValueError('Error!!')
            if p != t:
                sentence_hits[index] = False
        semantic_error = np.mean(sentence_hits.astype(float)) * 100.0

        f1, precision, recall = computeF1Score(correct_slots, slot_outputs)
        logging.info('slot f1: ' + str(f1))
        logging.info('intent accuracy: ' + str(accuracy))
        logging.info('semantic error(intent, slots are all correct): ' + str(semantic_error))

        return f1, accuracy, semantic_error, pred_intents, correct_intents, slot_outputs, correct_slots, input_words


    while True:
        # A fresh, shuffled processor is created at the start of every epoch.
        if data_processor is None:
            data_processor = DataProcessor(os.path.join(train_dir, arg.input_file),
                                           os.path.join(train_dir, arg.slot_file),
                                           os.path.join(train_dir, arg.intent_file), in_vocab, slot_vocab,
                                           intent_vocab, shuffle=True)
        in_data, slot_data, slot_weight, length, intents, in_seq, slot_seq, intent_seq = data_processor.get_batch(
            arg.batch_size)
        feed_dict = {input_data.name: in_data, slots.name: slot_data, slot_weights.name: slot_weight,
                     sequence_length.name: length, intent.name: intents}

        if len(in_data) != 0:
            ret = sess.run(training_outputs, feed_dict)
            eval_slot_loss += np.mean(ret[1])
            eval_intent_loss += np.mean(ret[2])

            line += len(in_data)
            step = ret[0]
            num_loss += 1

        if data_processor.end != 1:
            continue

        # ---- end of epoch: report, evaluate, early-stop check, checkpoint ----
        line = 0
        data_processor = None
        epochs += 1
        # Guard the averages against an epoch that produced no batches.
        batches_seen = max(num_loss, 1)
        logging.info('Step: ' + str(step))
        logging.info('Epochs: ' + str(epochs))
        logging.info('Slot Loss: ' + str(eval_slot_loss / batches_seen))
        logging.info('Intent Loss: ' + str(eval_intent_loss / batches_seen))
        num_loss = 0
        eval_slot_loss = 0.0
        eval_slot_p = 0.0
        eval_intent_loss = 0.0

        logging.info('Valid:')
        (epoch_valid_slot, epoch_valid_intent, epoch_valid_err, valid_pred_intent, valid_correct_intent,
         valid_pred_slot, valid_correct_slot, valid_words) = valid(
            os.path.join(valid_dir, arg.input_file),
            os.path.join(valid_dir, arg.slot_file),
            os.path.join(valid_dir, arg.intent_file))

        logging.info('Test:')
        (epoch_test_slot, epoch_test_intent, epoch_test_err, test_pred_intent, test_correct_intent,
         test_pred_slot, test_correct_slot, test_words) = valid(
            os.path.join(test_dir, arg.input_file),
            os.path.join(test_dir, arg.slot_file),
            os.path.join(test_dir, arg.intent_file))

        # valid_err tracks the best sentence-level score seen so far; an epoch
        # that does not beat it counts towards the patience budget.
        if epoch_valid_err <= valid_err:
            no_improve += 1
        else:
            valid_err = epoch_valid_err
            no_improve = 0
        if epochs == arg.max_epochs:
            break
        if arg.early_stop:
            if no_improve > arg.patience:
                break

        # Not reached for the epoch on which training stops (the breaks above
        # come first), so that final epoch is not checkpointed.
        save_path = saver.save(sess, os.path.join(arg.model_path, arg.run_name + "_" + str(epochs) + ".ckpt"))
        # logging.info("Model saved in path: " + str(save_path))

batch_size = 8
dataset = snips
downweight = 0.5
early_stop = True
embed_dim = 1024
input_file = seq.in
intent_dim = 128
intent_file = label
iter_intent = 2
iter_slot = 2
layer_size = 512
learning_rate = 0.001
margin = 0.4
max_epochs = 20
model_path = ./model
model_type = full
num_rnn = 1
optimizer = adam
patience = 10
run_name = capsule_nlu
slot_file = seq.out
test_data_path = test
train_data_path = train
valid_data_path = valid
vocab_path = ./vocab

use snips dataset


2020-07-25 16:56:14,007 : INFO : NumExpr defaulting to 2 threads.


The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.


Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.


Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.


Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.


Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API


Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API


Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API


Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API






Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor






Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where














Instructions for updating:
Use keras.layers.dense instead.


Instructions for updating:
Use keras.layers.dense instead.
































































2020-07-25 16:56:22,445 : INFO : Training Start
2020-07-25 17:00:16,759 : INFO : Step: 1636
2020-07-25 17:00:16,761 : INFO : Epochs: 1
2020-07-25 17:00:16,763 : INFO : Slot Loss: 0.4911196038558451
2020-07-25 17:00:16,764 : INFO : Intent Loss: 0.007022488338507196
2020-07-25 17:00:16,766 : INFO : Valid:
2020-07-25 17:00:19,939 : INFO :               precision    recall  f1-score   support

           0     0.9897    0.9600    0.9746       100
           1     1.0000    0.9600    0.9796       100
           2     0.9900    0.9900    0.9900       100
           3     1.0000    1.0000    1.0000       100
           4     0.9346    1.0000    0.9662       100
           5     1.0000    0.9600    0.9796       100
           6     0.9615    1.0000    0.9804       100

    accuracy                         0.9814       700
   macro avg     0.9823    0.9814    0.9815       700
weighted avg     0.9823    0.9814    0.9815       700

2020-07-25 17:00:19,969 : INFO : slot f1: 76.55076495132127
2020-

Instructions for updating:
Use standard file APIs to delete files with this prefix.


Instructions for updating:
Use standard file APIs to delete files with this prefix.
2020-07-25 17:24:32,870 : INFO : Step: 11452
2020-07-25 17:24:32,871 : INFO : Epochs: 7
2020-07-25 17:24:32,876 : INFO : Slot Loss: 0.009560559338676167
2020-07-25 17:24:32,877 : INFO : Intent Loss: 0.0010457240059476632
2020-07-25 17:24:32,879 : INFO : Valid:
2020-07-25 17:24:35,822 : INFO :               precision    recall  f1-score   support

           0     0.9898    0.9700    0.9798       100
           1     0.9615    1.0000    0.9804       100
           2     0.9899    0.9800    0.9849       100
           3     1.0000    1.0000    1.0000       100
           4     0.9890    0.9000    0.9424       100
           5     0.9167    0.9900    0.9519       100
           6     0.9900    0.9900    0.9900       100

    accuracy                         0.9757       700
   macro avg     0.9767    0.9757    0.9756       700
weighted avg     0.9767    0.9757    0.9756       700

2020-07-25 17:24:35,853 :