**Pointer-Generator Network, Beam Search and Bidirectional RNN Seq2Seq Model in Keras**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import tensorflow as tf
import keras
from keras.layers import (
    Bidirectional,
    Concatenate,
    Dense,
    Embedding,
    Input,
    LSTM,
    TimeDistributed,
)
from keras.models import Model

"""Attention mechanism"""
# Define an attention layer
class Attention(keras.Model):
    """Additive attention that collapses a sequence into one context vector.

    Two dense projections (``W1`` for the attended features, ``W2`` for the
    query state) are summed, squashed with ``tanh`` and scored by ``V``; the
    scores are normalised along axis 1 and used to weight the features.
    """

    def __init__(self, units):
        """Create the projection layers.

        Args:
            units: output width of the ``W1`` and ``W2`` projections.
        """
        super().__init__()
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    def call(self, features, hidden):
        """Return the attention-weighted sum of ``features`` over axis 1.

        Args:
            features: tensor to attend over; axis 1 is the axis that gets
                normalised and summed away.
                # assumes (batch, timesteps, dim) -- TODO confirm with callers
            hidden: query tensor; an axis is inserted at position 1 so it
                broadcasts against ``features``.

        Returns:
            The context vector: ``features`` weighted by the attention
            weights and reduced over axis 1.
        """
        # Insert a length-1 axis so the query broadcasts across axis 1.
        query = tf.expand_dims(hidden, 1)
        energies = self.V(tf.nn.tanh(self.W1(features) + self.W2(query)))
        # Normalise across axis 1 so the weights along that axis sum to one.
        weights = tf.nn.softmax(energies, axis=1)
        return tf.reduce_sum(weights * features, axis=1)

In [None]:
"""
Some parameters:

num_encoder_tokens = ?
num_decoder_tokens = ?
vocab_size = ?
embedding_size = ?
rnn_cell_size = ?

"""


"""Embedding layer"""

# Embedding consumes integer token ids of shape (batch, timesteps) and emits
# (batch, timesteps, embedding_size). Declaring the input with an extra
# trailing axis would yield a 4D tensor that the recurrent encoder below
# cannot consume, so the input carries only the (variable-length) time axis.
embedding_inputs = Input(shape=(None,))
embedding_layer = Embedding(vocab_size, embedding_size)

encoder_inputs = embedding_layer(embedding_inputs)


"""The bidirectional encoder with LSTM"""

# First encoder layer: forward only the output sequence. With
# return_state=True the call returns a 5-element list, and handing that list
# to the next layer makes Keras treat the trailing four tensors as initial
# states rather than as part of the input.
# NOTE(review): recurrent_activation='relu' is kept from the original; the
# Keras default for LSTM gates is a sigmoid -- confirm this is intentional.
encoder = Bidirectional(
    LSTM(
        256,
        dropout=0.2,
        recurrent_dropout=0.2,
        return_sequences=True,
        recurrent_activation='relu',
        recurrent_initializer='glorot_uniform',
    ),
    name="bi_lstm_0",
)(encoder_inputs)

# Second encoder layer: returns the per-timestep outputs plus the final
# hidden/cell state of each direction.
encoder_output, forward_h, forward_c, backward_h, backward_c = Bidirectional(
    LSTM(
        rnn_cell_size,
        dropout=0.2,
        recurrent_dropout=0.2,
        return_sequences=True,
        return_state=True,
        recurrent_activation='relu',
        recurrent_initializer='glorot_uniform',
    )
)(encoder)

"""Concatenate the results of the two directions as the encoder states"""
# Each concatenated state is 2 * rnn_cell_size wide.
encoder_state_h = Concatenate()([forward_h, backward_h])
encoder_state_c = Concatenate()([forward_c, backward_c])


In [None]:
"""Calculate the vocabulary distribution, P_vocab = softmax(V2(V1([decoder_state, context_vector])+b1)+b2)"""

def Attn_dist(context_vector, decoder_state, units):
    """Return P_vocab computed from the decoder state and context vector.

    The two inputs are concatenated, passed through two stacked dense layers
    and normalised with a softmax over the last axis.

    Args:
        context_vector: attention context tensor.
        decoder_state: decoder state tensor; it must be concatenable with
            ``context_vector`` along the last axis.
        units: output width of both dense layers, and therefore the size of
            the returned distribution.

    Returns:
        A tensor whose last axis sums to one.

    Note:
        New ``Dense`` layers (and hence new weights) are created on every
        call, so call this once while building the graph.
    """
    v1 = Dense(units)
    v2 = Dense(units)
    e = Concatenate()([decoder_state, context_vector])
    # Normalise over the last axis explicitly.
    P_vocab = tf.nn.softmax(v2(v1(e)), axis=-1)

    return P_vocab

In [None]:
from keras import backend


decoder_inputs = Input(shape=(None, num_decoder_tokens))
encoder_states = [encoder_state_h, encoder_state_c]

"""Unidirectional decoder LSTM"""
# The encoder states are the concatenation of the forward and backward
# states, so each is 2 * rnn_cell_size wide. The decoder is given the same
# width so it can accept them as its initial state.
decoder = LSTM(2 * rnn_cell_size,
               dropout=0.2,
               recurrent_dropout=0.2,
               return_sequences=True,
               return_state=True,
               recurrent_activation='relu')

# decoder_out contains the hidden states for all time steps.
# decoder_state_h is the final hidden state and decoder_state_c the final cell state.
decoder_out, decoder_state_h, decoder_state_c = decoder(decoder_inputs, initial_state=encoder_states)



"""Use the attention network to obtain the context vector, then calculate the vocabulary distribution"""
# Width of the attention projections. The original left this as a `?`
# placeholder; rnn_cell_size is used as a starting value.
# NOTE(review): tune as needed.
units = rnn_cell_size

attention = Attention(units)
# Attend over the encoder output sequence (features) using the decoder's
# final hidden state as the query (hidden). The layer is invoked through
# __call__ rather than .call() so Keras handles building and tracking.
context_vector = attention(encoder_output, hidden=decoder_state_h)

# Attn_dist requires the output width as its third argument.
# NOTE(review): num_decoder_tokens is assumed to be the target vocabulary
# size for P_vocab -- confirm.
attn_dist = Attn_dist(context_vector, decoder_state_h, num_decoder_tokens)

**Incomplete decoder: the coverage mechanism and the final distribution (`final_dist`, i.e. the output) calculation still need to be implemented.**

In [None]:


def linear(args, output_size, bias, bias_start=0.0, scope=None):
    """Apply one shared linear map to the column-wise concatenation of `args`.

    Args:
      args: a single 2D tensor, or a list/tuple of 2D tensors, each shaped
        batch x n_i with a known second dimension.
      output_size: number of output columns of the weight matrix.
      bias: when truthy, a bias vector is added to the product.
      bias_start: constant used to initialise the bias (0 by default).
      scope: variable scope for the created variables; "Linear" when falsy.

    Returns:
      A 2D tensor shaped batch x output_size: the concatenated inputs times a
      newly requested "Matrix" variable, plus the "Bias" variable if `bias`.

    Raises:
      ValueError: if `args` is None or an empty list/tuple, if any argument is
        not 2D, or if any argument's second dimension is unknown or zero.
    """
    if args is None:
        raise ValueError("`args` must be specified")
    is_sequence = isinstance(args, (list, tuple))
    if is_sequence and len(args) == 0:
        raise ValueError("`args` must be specified")
    inputs = args if is_sequence else [args]

    # Add up the second dimension of every input; that is the row count of
    # the weight matrix.
    shapes = [tensor.get_shape().as_list() for tensor in inputs]
    total_width = 0
    for dims in shapes:
        if len(dims) != 2:
            raise ValueError("Linear is expecting 2D arguments: %s" % str(shapes))
        width = dims[1]
        if not width:
            raise ValueError("Linear expects shape[1] of arguments: %s" % str(shapes))
        total_width += width

    # Build the variables and the product inside the requested scope.
    with tf.variable_scope(scope or "Linear"):
        weights = tf.get_variable("Matrix", [total_width, output_size])
        if len(inputs) == 1:
            stacked = inputs[0]
        else:
            stacked = tf.concat(axis=1, values=inputs)
        product = tf.matmul(stacked, weights)
        if not bias:
            return product
        offset = tf.get_variable(
            "Bias", [output_size], initializer=tf.constant_initializer(bias_start))

    return product + offset


# Calculate p_gen and organize output
p_gens = []
outputs = []

# NOTE(review): linear() rejects anything that is not 2D, while
# decoder_inputs and decoder_out are sequence tensors with a time axis.
# A per-timestep slice (or a decoding loop) is still needed here -- TODO.

# Each linear() call gets its own scope: with the default scope both calls
# would request "Linear/Matrix" with different shapes.
p_gen = linear([context_vector, decoder_state_c, decoder_state_h, decoder_inputs], 1, True, scope="p_gen")
p_gen = tf.sigmoid(p_gen)
p_gens.append(p_gen)


# The projection keeps the decoder's feature width, read from the static
# shape of decoder_out (len() of a tensor does not give this).
decoder_output_size = decoder_out.get_shape().as_list()[-1]
output = linear([decoder_out] + [context_vector], decoder_output_size, True, scope="output")
outputs.append(output)

**Beam Search Decoder (for inference)**

In [None]:
'''Beam search output, returning top_paths possible output sequences and their log likelihoods'''
# NOTE(review): OUTPUT is not defined anywhere in the visible notebook; it
# stands for the prediction tensor to decode and must be supplied.
# NOTE(review): input_length is presumably meant to hold one sequence length
# per batch item; context_vector.shape[0] is only the leading dimension of
# the context vector -- confirm against the ctc_decode documentation.
# NOTE(review): this rebinds `decoder`, which earlier named the decoder LSTM
# layer.
decoder = backend.ctc_decode(OUTPUT,
                            input_length=context_vector.shape[0],
                            greedy=False,
                            beam_width=5,
                            top_paths=1)

In [None]:
import tensorflow as tf 

# Show the name of the GPU device that TensorFlow reports.
gpu_device = tf.test.gpu_device_name()
print(gpu_device)
