In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import time

from six.moves import xrange

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"

import numpy as np
import tensorflow as tf

import utils

from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_curve


# Hyper-parameters and paths for the notebook, registered as TensorFlow flags.
# Each DEFINE_* call takes (flag name, default value, help text); the values are
# read later through the module-level `FLAGS` object.

# --- Output location ---------------------------------------------------------
tf.flags.DEFINE_string("checkpoint_dir", "data/tflogs/",
                       "Directory to save checkpoints and summaries of the model.")

# --- Optimization ------------------------------------------------------------
tf.flags.DEFINE_float("learning_rate", 2e-4,
                      "Learning rate.")

tf.flags.DEFINE_float("max_gradient_norm", 5.0,
                      "Clip gradients to this norm.")

# --- Classification threshold ------------------------------------------------
tf.flags.DEFINE_float("decision_threshold", 0.99,
                      "Decision threshold to predict a positive label.")

# --- Dropout (values are KEEP probabilities, not drop probabilities) ---------
tf.flags.DEFINE_float("keep_prob_input", 0.8,
                      "Keep probability for dropout applied at embedding layer.")

tf.flags.DEFINE_float("keep_prob_output", 0.7,
                      "Keep probability for dropout applied at prediction layer.")

# --- Model and training sizes ------------------------------------------------
tf.flags.DEFINE_integer("batch_size", 128,
                        "Batch size to use during training.")

tf.flags.DEFINE_integer("embedding_size", 300,
                        "Size of each word embedding.")

tf.flags.DEFINE_integer("state_size", 300,
                        "Size of each hidden state.")

tf.flags.DEFINE_integer("num_layers", 1,
                        "Number of layers in the model.")

tf.flags.DEFINE_integer("num_epochs", 15,
                        "Number of epochs to train the model.")

tf.flags.DEFINE_integer("n_negative", 7,
                        "Number of negative examples to sample per pair of parallel sentences "
                        "in training dataset.")

# Also used as the interval (in steps) between summary writes in this notebook.
tf.flags.DEFINE_integer("steps_per_checkpoint", 200,
                        "Number of steps to save a checkpoint.")

# --- Optional pretrained embeddings ------------------------------------------
# Both paths must be set for the pretrained branch of the graph to be used.
tf.flags.DEFINE_string("source_embeddings_path", None,
                       "Pretrained embeddings to initialize the source embeddings matrix.")

tf.flags.DEFINE_string("target_embeddings_path", None,
                       "Pretrained embeddings to initialize the target embeddings matrix.")

tf.flags.DEFINE_boolean("fix_pretrained", False,
                        "If true fix pretrained embeddings.")

# --- Recurrent cell type -----------------------------------------------------
tf.flags.DEFINE_boolean("use_lstm", False,
                        "If true use LSTM cells. Otherwise use GRU cells.")


# Handle through which the rest of the notebook reads the flag values.
FLAGS = tf.flags.FLAGS

In [None]:
def eval_epoch(sess, data_iterator, threshold, batch_size=128, summary_writer=None):
    """Evaluate the model on one pass over a data set and print the metrics.

    Relies on the module-level graph objects built in the "Build graph" cell
    (`x_source`, `x_target`, `labels`, `source_seq_length`, `target_seq_length`,
    `decision_threshold`, `mean_loss`, `accuracy`, `precision`, `recall`,
    `summaries`).

    Args:
        sess: TensorFlow session in which the graph is run.
        data_iterator: object exposing `size`, `next_batch(batch_size)` and
            `global_step`.
        threshold: value fed to the `decision_threshold` placeholder.
        batch_size: number of examples requested from the iterator per step.
        summary_writer: optional summary writer. When given, the merged summaries
            are written every `FLAGS.steps_per_checkpoint` evaluation steps.

    Returns:
        None. The mean batch loss and the accumulated accuracy, precision,
        recall and F1 are printed.
    """
    # Reset the local variables so the streaming metrics start from scratch.
    sess.run(tf.local_variables_initializer())
    # float() keeps this a true division even without `from __future__ import division`.
    num_iter = int(np.ceil(data_iterator.size / float(batch_size)))
    if num_iter == 0:
        # Nothing to evaluate; avoid dividing by zero or reading undefined metrics.
        print("  Testing:  no examples to evaluate.")
        return
    epoch_loss = 0
    for step in xrange(num_iter):
        source, target, label = data_iterator.next_batch(batch_size)
        source_len = utils.sequence_length(source)
        target_len = utils.sequence_length(target)
        # Dropout placeholders are not fed, so they keep their default of 1.0.
        feed_dict = {x_source: source, x_target: target, labels: label,
                     source_seq_length: source_len, target_seq_length: target_len,
                     decision_threshold: threshold}
        # Index [1] of each metric is its update op: running it accumulates the
        # batch and returns the metric value over all batches seen so far.
        loss_value, epoch_accuracy,\
        epoch_precision, epoch_recall = sess.run([mean_loss, accuracy[1],
                                                  precision[1], recall[1]],
                                                  feed_dict=feed_dict)
        epoch_loss += loss_value
        if summary_writer and step % FLAGS.steps_per_checkpoint == 0:
            summary = sess.run(summaries, feed_dict=feed_dict)
            summary_writer.add_summary(summary, global_step=data_iterator.global_step)
    # Average over the number of batches. The last loop index is num_iter - 1,
    # so dividing by it overstates the loss and fails when there is one batch.
    epoch_loss /= num_iter
    epoch_f1 = utils.f1_score(epoch_precision, epoch_recall)
    print("  Testing:  Loss = {:.6f}, Accuracy = {:.4f}, "
          "Precision = {:.4f}, Recall = {:.4f}, F1 = {:.4f}"
          .format(epoch_loss, epoch_accuracy,
                  epoch_precision, epoch_recall, epoch_f1))
    
    
def average_pooling(rnn_outputs, seq_length):
    """Average the concatenated RNN outputs over axis 1.

    The tensors in `rnn_outputs` are concatenated along axis 2, summed over
    axis 1 and divided by `seq_length` (cast to float32 and expanded to a column
    so the division broadcasts across the feature axis).
    """
    # assumes each output is (batch, time, units) - TODO confirm against caller
    joined_outputs = tf.concat(rnn_outputs, axis=2)
    summed_outputs = tf.reduce_sum(joined_outputs, axis=1)
    float_lengths = tf.cast(seq_length, tf.float32)
    length_column = tf.expand_dims(float_lengths, axis=1)
    return tf.divide(summed_outputs, length_column)


def max_pooling(rnn_outputs):
    """Take the element-wise maximum of the concatenated RNN outputs over axis 1.

    The tensors in `rnn_outputs` are concatenated along axis 2 before the
    reduction.
    """
    # NOTE(review): no sequence lengths are used here, so padded positions (if
    # any) take part in the maximum - confirm this is intended.
    joined_outputs = tf.concat(rnn_outputs, axis=2)
    pooled = tf.reduce_max(joined_outputs, axis=1)
    return pooled

### Load data

In [None]:
# Locations of the parallel training corpus and of the vocabulary files.
source_train_path, target_train_path = "data/europarl.en", "data/europarl.fr"
source_vocab_path, target_vocab_path = "data/vocabulary.en", "data/vocabulary.fr"

# Size passed to utils.create_vocabulary for each language.
source_vocab_size = target_vocab_size = 200000

# Build the vocabulary files from the training corpus, source side first.
utils.create_vocabulary(source_vocab_path, source_train_path, source_vocab_size)
utils.create_vocabulary(target_vocab_path, target_train_path, target_vocab_size)

# Load each vocabulary together with its reverse lookup.
source_vocab, rev_source_vocab = utils.initialize_vocabulary(source_vocab_path)
target_vocab, rev_target_vocab = utils.initialize_vocabulary(target_vocab_path)

# Training pairs.
parallel_data = utils.read_data(source_train_path, target_train_path,
                                source_vocab, target_vocab)

# Validation pairs, truncated to the first n_eval entries.
source_valid_path, target_valid_path = "data/newstest2012.en", "data/newstest2012.fr"
n_eval = 1000
valid_data = utils.read_data(source_valid_path, target_valid_path,
                             source_vocab, target_vocab)[:n_eval]

In [None]:
# Sanity check: show the last training pair as stored, then map both sides
# back to text through the reverse vocabularies.
print(parallel_data[-1])
print()
print(" ".join(rev_source_vocab[token_id] for token_id in parallel_data[-1][0]))
print()
print(" ".join(rev_target_vocab[token_id] for token_id in parallel_data[-1][1]))

In [None]:
# Same sanity check for the last validation pair.
print(valid_data[-1])
print(" ".join(rev_source_vocab[token_id] for token_id in valid_data[-1][0]))
print()
print(" ".join(rev_target_vocab[token_id] for token_id in valid_data[-1][1]))

### Build graph

In [None]:
# Builds the whole model graph: placeholders, embeddings, a siamese bidirectional
# RNN encoder shared by both languages, a small feed-forward prediction layer,
# streaming evaluation metrics, the loss, the optimizer and the summaries.
# The names defined here (placeholders, `probs`, `predicted_class`, `mean_loss`,
# `train_op`, `summaries`, `saver`, ...) are used by the training and evaluation cells.
utils.reset_graph()

# Params
source_vocab_size = len(source_vocab)
target_vocab_size = len(target_vocab)
learning_rate = FLAGS.learning_rate
embedding_size = FLAGS.embedding_size
state_size = FLAGS.state_size
batch_size = FLAGS.batch_size
num_layers = FLAGS.num_layers
max_gradient_norm = FLAGS.max_gradient_norm
use_lstm = FLAGS.use_lstm
fix_pretrained = FLAGS.fix_pretrained
source_embeddings_path = FLAGS.source_embeddings_path
target_embeddings_path = FLAGS.target_embeddings_path

# Sentence representation: with both switches off, the concatenated final
# forward/backward states are used.
use_average_pooling = False
use_max_pooling = False
pred_hidden_size = 128

# Placeholders.
x_source = tf.placeholder(tf.int32, [None, None], name="x_source")
source_seq_length = tf.placeholder(tf.int32, [None], name="source_seq_length")

x_target = tf.placeholder(tf.int32, [None, None], name="x_target")
target_seq_length = tf.placeholder(tf.int32, [None], name="target_seq_length")

labels = tf.placeholder(tf.float32, [None], name="labels")

# Keep probabilities default to 1.0, i.e. dropout is off unless explicitly fed.
input_dropout = tf.placeholder_with_default(1.0, [], name="input_dropout")
output_dropout = tf.placeholder_with_default(1.0, [], name="output_dropout")

decision_threshold = tf.placeholder_with_default(0.5, [], name="decision_threshold")

# Embedding layer.
with tf.variable_scope("embeddings"):
    if source_embeddings_path is not None and target_embeddings_path is not None:
        # NOTE(review): `get_pretrained_embeddings` is neither defined nor imported
        # in the visible cells; confirm where it comes from (possibly `utils`).
        source_pretrained_embeddings,\
        target_pretrained_embeddings = get_pretrained_embeddings(
            source_embeddings_path,
            target_embeddings_path,
            source_vocab,
            target_vocab)
        assert source_pretrained_embeddings.shape[1] == target_pretrained_embeddings.shape[1]
        # The pretrained vectors dictate the embedding size.
        embedding_size = source_pretrained_embeddings.shape[1]
        # `fix_pretrained` freezes the matrices; otherwise they are fine-tuned.
        source_embeddings = tf.get_variable(
            "source_embeddings_matrix",
            [source_vocab_size, embedding_size],
            initializer=tf.constant_initializer(source_pretrained_embeddings),
            trainable=not fix_pretrained)
        target_embeddings = tf.get_variable(
            "target_embeddings_matrix",
            [target_vocab_size, embedding_size],
            initializer=tf.constant_initializer(target_pretrained_embeddings),
            trainable=not fix_pretrained)
    else:
        source_embeddings = tf.get_variable("source_embeddings_matrix",
                                            [source_vocab_size, embedding_size])
        target_embeddings = tf.get_variable("target_embeddings_matrix",
                                            [target_vocab_size, embedding_size])
    source_rnn_inputs = tf.nn.embedding_lookup(source_embeddings, x_source)
    target_rnn_inputs = tf.nn.embedding_lookup(target_embeddings, x_target)
    source_rnn_inputs = tf.nn.dropout(source_rnn_inputs, keep_prob=input_dropout,
                                      name="source_seq_embeddings")
    target_rnn_inputs = tf.nn.dropout(target_rnn_inputs, keep_prob=input_dropout,
                                      name="target_seq_embeddings")

# Siamese BiRNN.
with tf.variable_scope("siamese_birnn") as scope:
    def _build_cell(num_units):
        """Create one recurrent cell (LSTM or GRU) wrapped with output dropout."""
        if use_lstm:
            cell = tf.contrib.rnn.LSTMCell(num_units, use_peepholes=True)
        else:
            cell = tf.contrib.rnn.GRUCell(num_units)
        return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=output_dropout)

    if num_layers > 1:
        # Every layer needs its own cell object: repeating one instance makes the
        # layers share weights or raises, depending on the TensorFlow version.
        cell_fw = tf.contrib.rnn.MultiRNNCell([_build_cell(state_size) for _ in xrange(num_layers)])
        cell_bw = tf.contrib.rnn.MultiRNNCell([_build_cell(state_size) for _ in xrange(num_layers)])
    else:
        cell_fw = _build_cell(state_size)
        cell_bw = _build_cell(state_size)

    with tf.variable_scope(scope):
        source_rnn_outputs, source_final_state = tf.nn.bidirectional_dynamic_rnn(cell_fw=cell_fw,
                                                                                 cell_bw=cell_bw,
                                                                                 inputs=source_rnn_inputs,
                                                                                 sequence_length=source_seq_length,
                                                                                 dtype=tf.float32)
    # Second pass re-enters the same scope with reuse=True so the target encoder
    # shares the source encoder's variables.
    with tf.variable_scope(scope, reuse=True):
        target_rnn_outputs, target_final_state = tf.nn.bidirectional_dynamic_rnn(cell_fw=cell_fw,
                                                                                 cell_bw=cell_bw,
                                                                                 inputs=target_rnn_inputs,
                                                                                 sequence_length=target_seq_length,
                                                                                 dtype=tf.float32)
    # Forward and backward representations are concatenated below, so the
    # sentence vector is twice the cell size.
    state_size *= 2
    # Mean pooling only works for 1 layer bidirectional GRU.
    if use_average_pooling:
        source_final_state = average_pooling(source_rnn_outputs, source_seq_length)
        target_final_state = average_pooling(target_rnn_outputs, target_seq_length)
    elif use_max_pooling:
        source_final_state = max_pooling(source_rnn_outputs)
        target_final_state = max_pooling(target_rnn_outputs)
    else:
        source_final_state_fw, source_final_state_bw = source_final_state
        target_final_state_fw, target_final_state_bw = target_final_state
        if num_layers > 1:
            # Keep only the top layer's final state.
            source_final_state_fw = source_final_state_fw[-1]
            source_final_state_bw = source_final_state_bw[-1]
            target_final_state_fw = target_final_state_fw[-1]
            target_final_state_bw = target_final_state_bw[-1]
        if use_lstm:
            # LSTM states carry both c and h; use the hidden output h.
            source_final_state_fw = source_final_state_fw.h
            source_final_state_bw = source_final_state_bw.h
            target_final_state_fw = target_final_state_fw.h
            target_final_state_bw = target_final_state_bw.h
        source_final_state = tf.concat([source_final_state_fw, source_final_state_bw], axis=1)
        target_final_state = tf.concat([target_final_state_fw, target_final_state_bw], axis=1)

# Prediction layer.
with tf.variable_scope("prediction"):
    # Element-wise product and absolute difference of the two sentence vectors.
    h_multiply = tf.multiply(source_final_state, target_final_state)
    h_abs_diff = tf.abs(tf.subtract(source_final_state, target_final_state))

    W_1 = tf.get_variable("W_1", [state_size, pred_hidden_size])
    W_2 = tf.get_variable("W_2", [state_size, pred_hidden_size])
    b_1 = tf.get_variable("b_1", [pred_hidden_size], initializer=tf.constant_initializer(0.0))

    h_semantic = tf.tanh(tf.matmul(h_multiply, W_1) + tf.matmul(h_abs_diff, W_2) + b_1)

    W_3 = tf.get_variable("W_3", [pred_hidden_size, 1]) # try initializer
    b_2 = tf.get_variable("b_2", [1], initializer=tf.constant_initializer(0.0))

    logits = tf.matmul(h_semantic, W_3) + b_2
    # Squeeze only the trailing unit axis: an axis-less squeeze would also drop
    # the batch axis when the batch holds a single example.
    logits = tf.squeeze(logits, axis=[1], name="logits")

# Sigmoid values and evaluation metrics.
with tf.name_scope("prediction_evaluation"):
    probs = tf.sigmoid(logits, name="probs")
    predicted_class = tf.cast(tf.greater(probs, decision_threshold), tf.float32, name="predicted_class")
    # Each tf.metrics call returns a (value, update_op) pair backed by local variables.
    accuracy = tf.metrics.accuracy(labels, predicted_class, name="accuracy")
    precision = tf.metrics.precision(labels, predicted_class, name="precision")
    recall = tf.metrics.recall(labels, predicted_class, name="recall")

# Loss.
with tf.name_scope("cross_entropy"):
    losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels,
                                                     name="cross_entropy_per_sequence")
    mean_loss = tf.reduce_mean(losses, name="cross_entropy_loss")

# Optimization.
with tf.name_scope("optimization"):
    global_step = tf.Variable(0, trainable=False, name="global_step")
    optimizer = tf.train.AdamOptimizer(learning_rate)
    trainable_variables = tf.trainable_variables()
    gradients = tf.gradients(mean_loss, trainable_variables, name="gradients")
    clipped_gradients, global_norm = tf.clip_by_global_norm(gradients, max_gradient_norm,
                                                            name="clipped_gradients")
    train_op = optimizer.apply_gradients(zip(clipped_gradients, trainable_variables), global_step=global_step)

# Summaries.
tf.summary.scalar("loss", mean_loss)
# Index [0] is the metric value tensor; reading it does not update the metric.
tf.summary.scalar("accuracy", accuracy[0])
tf.summary.scalar("precision", precision[0])
tf.summary.scalar("recall", recall[0])
tf.summary.scalar("logits" + "/sparsity", tf.nn.zero_fraction(logits))
tf.summary.histogram("logits" + "/activations", logits)
tf.summary.histogram("probs", probs)

# Add histogram for trainable variables.
for var in trainable_variables:
    tf.summary.histogram(var.op.name, var)

# Add histogram for gradients.
for grad, var in zip(clipped_gradients, trainable_variables):
    if grad is not None:
        tf.summary.histogram(var.op.name + "/gradients", grad)
tf.summary.scalar("global_norm", global_norm)

summaries = tf.summary.merge_all()
tf.add_to_collection(tf.GraphKeys.SUMMARY_OP, summaries)

saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)

### Train model

In [None]:
# Trains the model for FLAGS.num_epochs passes over the training iterator.
# After each completed epoch it prints the training metrics, saves a checkpoint
# and evaluates on the validation set. The session is left open on purpose: the
# evaluation cell below reuses `sess`.
sess = tf.Session()

sess.run(tf.global_variables_initializer())
sess.run(tf.local_variables_initializer())

train_iterator = utils.TrainingIteratorRandom(parallel_data, FLAGS.n_negative)
valid_iterator = utils.EvalIterator(valid_data)

train_summary_writer = tf.summary.FileWriter(os.path.join(FLAGS.checkpoint_dir, "train"), sess.graph)
valid_summary_writer = tf.summary.FileWriter(os.path.join(FLAGS.checkpoint_dir, "valid"), sess.graph)

epoch_loss = 0
epoch_completed = 0
batch_completed = 0

# Total number of training steps over all epochs.
num_iter = int(np.ceil(train_iterator.size / FLAGS.batch_size * FLAGS.num_epochs))
start_time = time.time()
print("Training model for {} sentence pairs & validating on {} sentence pairs".
      format(train_iterator.size, valid_iterator.size))

# try/finally so the summary writers are flushed and closed even when training
# raises or is interrupted from the notebook.
try:
    for step in xrange(num_iter):
        source, target, label = train_iterator.next_batch(FLAGS.batch_size)
        source_len = utils.sequence_length(source)
        target_len = utils.sequence_length(target)
        feed_dict = {x_source: source, x_target: target, labels: label,
                     source_seq_length: source_len, target_seq_length: target_len,
                     input_dropout: FLAGS.keep_prob_input, output_dropout: FLAGS.keep_prob_output,
                     decision_threshold: FLAGS.decision_threshold}

        # The metric update ops return the value accumulated since the last reset
        # of the local variables, i.e. since the start of the current epoch.
        _, loss_value, epoch_accuracy,\
        epoch_precision, epoch_recall = sess.run([train_op, mean_loss, accuracy[1],
                                                  precision[1], recall[1]], feed_dict=feed_dict)
        epoch_loss += loss_value
        batch_completed += 1
        # Write the model's training summaries.
        if step % FLAGS.steps_per_checkpoint == 0:
            summary = sess.run(summaries, feed_dict=feed_dict)
            train_summary_writer.add_summary(summary, global_step=step)
        # End of current epoch.
        if train_iterator.epoch_completed > epoch_completed:
            epoch_time = time.time() - start_time
            epoch_loss /= batch_completed
            epoch_f1 = utils.f1_score(epoch_precision, epoch_recall)
            epoch_completed += 1
            print("Epoch {} in {:.0f} sec\n"
                  "  Training: Loss = {:.6f}, Accuracy = {:.4f}, "
                  "Precision = {:.4f}, Recall = {:.4f}, F1 = {:.4f}"
                  .format(epoch_completed, epoch_time, epoch_loss, epoch_accuracy,
                          epoch_precision, epoch_recall, epoch_f1))
            # Save a checkpoint.
            checkpoint_path = os.path.join(FLAGS.checkpoint_dir, "model.ckpt")
            saver.save(sess, checkpoint_path, global_step=step)
            # Evaluate model on the validation set.
            eval_epoch(sess, valid_iterator, FLAGS.decision_threshold, 1000, valid_summary_writer)
            # Initialize local variables for new epoch.
            batch_completed = 0
            epoch_loss = 0
            sess.run(tf.local_variables_initializer())
            start_time = time.time()

    # Reading the step counter needs no feeds, and must not depend on the loop
    # having run at least once.
    print("Training done with {} steps.".format(sess.run(global_step)))
finally:
    train_summary_writer.close()
    valid_summary_writer.close()

### Eval model

In [None]:
def inference(sess, data_iterator, probs_op, predicted_class_op, threshold, placeholders,
              batch_size=None):
    """Get probability and predicted class of the examples in a data set.

    Args:
        sess: TensorFlow session in which the ops are run.
        data_iterator: object exposing `size` and `next_batch(batch_size)`.
        probs_op: tensor producing the predicted probabilities.
        predicted_class_op: tensor producing the thresholded class.
        threshold: value fed to the decision-threshold placeholder.
        placeholders: sequence of six placeholders, in the order
            (x_source, source_seq_length, x_target, target_seq_length,
            labels, decision_threshold).
        batch_size: examples requested per step; defaults to `FLAGS.batch_size`.

    Returns:
        Tuple `(probs, predicted_class)` of NumPy arrays, each truncated to
        `data_iterator.size` entries; `predicted_class` has integer dtype.
    """
    x_source, source_seq_length,\
    x_target, target_seq_length,\
    labels, decision_threshold = placeholders

    if batch_size is None:
        batch_size = FLAGS.batch_size

    num_iter = int(np.ceil(data_iterator.size / float(batch_size)))
    probs = []
    predicted_class = []
    for _ in xrange(num_iter):
        source, target, label = data_iterator.next_batch(batch_size)
        source_len = utils.sequence_length(source)
        target_len = utils.sequence_length(target)

        feed_dict = {x_source: source, x_target: target, labels: label,
                     source_seq_length: source_len, target_seq_length: target_len,
                     decision_threshold: threshold}
        batch_probs, batch_predicted_class = sess.run([probs_op, predicted_class_op], feed_dict=feed_dict)
        probs.extend(batch_probs.tolist())
        predicted_class.extend(batch_predicted_class.tolist())
    # Drop any surplus entries beyond the data set size.
    probs = np.array(probs[:data_iterator.size])
    # Builtin `int` replaces the `np.int` alias, which newer NumPy releases removed.
    predicted_class = np.array(predicted_class[:data_iterator.size], dtype=int)
    return probs, predicted_class


def evaluate(sess, data_iterator, probs_op, predicted_class_op, threshold, placeholders, save, plot):
    """Print precision, recall and F1 at a given threshold and at the best-F1 threshold.

    Args:
        sess: TensorFlow session in which the ops are run.
        data_iterator: evaluation iterator; its `epoch_data[:, 2]` column is read
            as the true labels.
        probs_op: tensor producing the predicted probabilities.
        predicted_class_op: tensor producing the thresholded class.
        threshold: decision threshold used for the first set of metrics.
        placeholders: placeholders forwarded to `inference`.
        save: if truthy, passed to `np.savez` as the destination for all metrics.
        plot: if truthy, the precision-recall and F1-threshold curves are plotted.

    Returns:
        None. Results are printed, and optionally saved and plotted.
    """
    probs, predicted_class = inference(sess, data_iterator, probs_op, predicted_class_op, threshold, placeholders)
    true_class = data_iterator.epoch_data[:, 2].astype(int)

    # Evaluation at given decision threshold.
    precision_value = precision_score(true_class, predicted_class, pos_label=1)
    recall_value = recall_score(true_class, predicted_class, pos_label=1)
    f1_value = f1_score(true_class, predicted_class, pos_label=1)

    print("Evaluation metrics at decision threshold = {:.4f}\n"
          "Precision = {:.2f}, Recall = {:.2f}, F1 = {:.2f}\n"
          "-------------------------------------------------"
          .format(threshold, 100*precision_value, 100*recall_value, 100*f1_value))

    # Evaluation at best F1 value.
    precision_seq, recall_seq, threshold_seq = precision_recall_curve(true_class, probs, pos_label=1)
    f1_seq = utils.f1_score(precision_seq, recall_seq)
    # precision_recall_curve returns one more precision/recall point than it
    # returns thresholds (the final point has no threshold), so search only the
    # points that can be mapped back to a threshold.
    # NOTE(review): if utils.f1_score yields NaN where precision + recall == 0,
    # argmax will select that entry - confirm how the helper handles zeros.
    index = int(np.argmax(f1_seq[:len(threshold_seq)]))
    precision_best = precision_seq[index]
    recall_best = recall_seq[index]
    f1_best = f1_seq[index]
    threshold_best = threshold_seq[index]

    print("Best scores if decision threshold = {:.4f}\n"
          "Precision = {:.2f}, Recall = {:.2f}, F1 = {:.2f}"
          .format(threshold_best, 100*precision_best, 100*recall_best, 100*f1_best))

    if save:
        np.savez(save, precision_value=precision_value, recall_value=recall_value,
                 f1_value=f1_value, threshold_value=threshold,
                 precision_best=precision_best, recall_best=recall_best,
                 f1_best=f1_best, threshold_best=threshold_best,
                 precision_seq=precision_seq, recall_seq=recall_seq,
                 f1_seq=f1_seq, threshold_seq=threshold_seq)
    if plot:
        utils.plot_precision_recall_curve(precision_seq, recall_seq, "")
        utils.plot_f1_threshold_curve(threshold_seq, f1_seq, "")

        
# Run evaluation on the first n_eval test pairs, using the graph and the
# session built in the cells above.
source_test_path, target_test_path = "data/newstest2013.en", "data/newstest2013.fr"
n_eval = 1000
test_data = utils.read_data(source_test_path, target_test_path,
                            source_vocab, target_vocab)[:n_eval]
test_iterator = utils.EvalIterator(test_data)
# Order must match the unpacking done inside inference().
placeholders = [
    x_source, source_seq_length,
    x_target, target_seq_length,
    labels, decision_threshold,
]
# Neither saving nor plotting is requested here.
evaluate(sess, test_iterator, probs, predicted_class, FLAGS.decision_threshold,
         placeholders, save=None, plot=None)