In [1]:
import numpy as np
import numpy.matlib
import math
import random
import os
import sys
import time
import tensorflow as tf
import pickle
import numpy.matlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from utils import *
%matplotlib inline

In [2]:
# Hyper-parameters for the notebook; the original note says these mirror the
# declaration used at inference time.

# --- text / vocabulary ---
alphabet = ' abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
v_len = 1 + len(alphabet)  # one extra slot for the <UNK> token

# --- sequence geometry ---
tsteps = 150
tsteps_per_ascii = 25
text_length = tsteps // tsteps_per_ascii
batch_size = 32
placeholder_shape = [None, tsteps, 3]
data_scale = 50  # forwarded to DataLoader

# --- network size ---
rnn_size = 400
num_layers = 3
kmixtures = 1  # mixture components of the soft window (3 coefficients each)
nmixtures = 8  # Gaussians in the mixture density output (6 parameters each)

# --- optimisation ---
optimizer = "rmsprop"
grad_clip = 10.0
dropout = 0.85  # not referenced by the cells visible here
train = True    # not referenced by the cells visible here

# --- sampling ---
eos_prob = 0.4  # threshold probability for ending a stroke

# --- file locations ---
data_dir = './data'
save_path = './saved/model.ckpt'

In [3]:
# logger = Logger(log_dir, train)
# Build the data loader from the configured data directory, alphabet, batch
# size, sequence length, data scale and timesteps-per-character.
# NOTE(review): DataLoader is not defined in this notebook; presumably it comes
# from `from utils import *` - confirm the positional argument order there.
dataloader = DataLoader(data_dir, alphabet, batch_size, tsteps, data_scale, tsteps_per_ascii)

Training data found, proceed to loading
	loaded dataset:
		11315 train individual data points
		595 valid individual data points
		353 batches


In [4]:
# Truncated-normal initializer shared by the LSTM kernels and the dense
# projections (the notebook attributes this choice to the Alex Graves paper).
LSTM_initializer = tf.truncated_normal_initializer(mean=0.0, stddev=0.075, dtype=tf.float32)

# Initializer for the soft-window bias, centred on -3.
window_b_initializer = tf.truncated_normal_initializer(mean=-3.0, stddev=0.25, dtype=tf.float32)

# One independent LSTM cell per layer.
cell = [
    tf.contrib.rnn.LSTMCell(rnn_size, state_is_tuple=True, initializer=LSTM_initializer)
    for _ in range(num_layers)
]


In [5]:
# Network input and training target; both use placeholder_shape.
input_data = tf.placeholder(tf.float32, placeholder_shape)
output_data = tf.placeholder(tf.float32, placeholder_shape)

# Per-layer bookkeeping: zero initial state, per-timestep outputs, final state.
istate_cell = [
    layer_cell.zero_state(batch_size=batch_size, dtype=tf.float32)
    for layer_cell in cell
]
outs_cell = [None] * num_layers
fstate_cell = [None] * num_layers

# Split the input along axis 1 into tsteps slices and drop that axis, giving a
# list with one tensor per timestep.
inputs = [tf.squeeze(step, [1]) for step in tf.split(input_data, tsteps, 1)]

# Unroll the first LSTM layer over the timestep list.
outs_cell[0], fstate_cell[0] = tf.contrib.legacy_seq2seq.rnn_decoder(
    inputs, istate_cell[0], cell[0], loop_function=None, scope='cell0')

In [6]:
#attention mechanism
def get_phi(length, a, b, k):
    """Evaluate the soft-window weights over `length` positions.

    For every position u in 0..length-1 this computes
    a * exp(-b * (k - u)^2) and sums the result over axis 1 (kept as a
    size-1 axis).

    Args:
        length: number of positions to evaluate.
        a, b, k: window coefficients; they must broadcast against a vector
            of `length` positions.

    Returns:
        The summed weights, with axis 1 reduced to size 1.
    """
    positions = np.linspace(0, length - 1, num=length)
    squared_offset = tf.square(tf.subtract(k, positions))
    exponent = tf.multiply(b, -squared_offset)
    per_mixture = tf.multiply(a, tf.exp(exponent))
    return tf.reduce_sum(per_mixture, 1, keep_dims=True)

# get the soft window 
def get_window(coef):
    """Compute the soft window from a coefficient tuple.

    Args:
        coef: sequence (a, b, k, c) as returned by get_coef, where c is the
            character sequence tensor whose axis-1 size is the number of
            items in the sequence.

    Returns:
        (window, phi): phi is the weight tensor from get_phi; window is
        matmul(phi, c) with axis 1 squeezed away.
    """
    alpha, beta, kappa, chars = coef
    n_items = chars.get_shape()[1].value  # number of items in the sequence
    phi = get_phi(n_items, alpha, beta, kappa)
    weighted = tf.matmul(phi, chars)
    return tf.squeeze(weighted, [1]), phi

# soft window parameters 
def get_coef(i, out_cell, kmixtures, prev_k, char_seq, reuse=True):
    """Project an LSTM output onto the soft-window coefficients.

    Args:
        i: timestep index; not used inside the function.
        out_cell: LSTM output for one timestep (rank 2; axis 1 is the
            feature size).
        kmixtures: number of window mixture components.
        prev_k: previous window position, added to the new offset.
        char_seq: character sequence tensor, passed through unchanged.
        reuse: whether the 'window' variable scope reuses existing
            variables (pass False on the first call to create them).

    Returns:
        (a, b, k, char_seq), where a, b and k come from exponentiating the
        dense projection and k has prev_k added to it.
    """
    n_out = 3 * kmixtures
    hidden = out_cell.get_shape()[1]
    with tf.variable_scope('window', reuse=reuse):
        window_w = tf.get_variable("window_w", [hidden, n_out], initializer=LSTM_initializer)
        window_b = tf.get_variable("window_b", [n_out], initializer=window_b_initializer)
    projected = tf.nn.xw_plus_b(out_cell, window_w, window_b)
    # exp keeps all three coefficient groups positive
    abk = tf.exp(tf.reshape(projected, [-1, n_out, 1]))
    a, b, k_offset = tf.split(abk, 3, 1)
    return a, b, k_offset + prev_k, char_seq


In [7]:
# Initial parameters for the soft window.
# init_kappa is fed with the starting window position(s), one per window mixture.
init_kappa = tf.placeholder(dtype=tf.float32, shape=[None, kmixtures, 1]) 
# char_seq holds the text to attend over, shaped [batch, text_length, v_len].
# NOTE(review): presumably a one-hot encoding of the characters - confirm in DataLoader.
char_seq = tf.placeholder(dtype=tf.float32, shape=[None, text_length, v_len])
wavg_prev_kappa = init_kappa
# First character row of every sequence; not referenced again in the cells visible here.
prev_window = char_seq[:,0,:]

# Add the soft window on top of the first LSTM layer, one timestep at a time.
# `reuse` is False only for the first timestep so the 'window' variables are
# created once and shared by every later timestep.
reuse = False
for i in range(len(outs_cell[0])):
    coef = get_coef(i, outs_cell[0][i], kmixtures, wavg_prev_kappa, char_seq,  reuse=reuse)
    (_, _, next_kappa, _) = coef
    window, phi = get_window(coef)
    # Replace the first-layer output for this timestep (in place) with the
    # concatenation of that output, the soft window, and the network input
    # for the same timestep.
    outs_cell[0][i] = tf.concat((outs_cell[0][i], window, inputs[i]), 1)
    # The averaged kappa becomes the previous position for the next timestep.
    wavg_prev_kappa = tf.reduce_mean(next_kappa, reduction_indices=1, keep_dims=True) # mean along kmixtures dimension
    reuse = True

# Coefficients of the last timestep; `window` and `phi` also keep their
# last-timestep values after the loop.
(alpha, beta, next_kappa, _) = coef

In [8]:
# ----- build the remaining recurrent layers: layer i consumes the
# per-timestep output list of layer i-1
for i in range(1, num_layers):
    scope = 'cell{}'.format(i)
    outs_cell[i], fstate_cell[i] = tf.contrib.legacy_seq2seq.rnn_decoder(
        outs_cell[i-1], istate_cell[i], cell[i],
        loop_function=None, scope=scope)  # use scope from training

# Concatenate the top layer's per-timestep outputs along axis 1, then reshape
# into rows of width rnn_size so one dense layer can process them all at once.
r_out = tf.reshape(tf.concat(outs_cell[-1], 1), [-1, rnn_size])

In [9]:
# Dense layer on top of the RNN stack; its output is the raw parameter vector
# consumed by the mixture density network.
n_out = 6 * nmixtures + 1  # 6 parameters per Gaussian + 1 end-of-stroke value
with tf.variable_scope('mdn_dense'):
    output_w = tf.get_variable("output_w", shape=[rnn_size, n_out], initializer=LSTM_initializer)
    output_b = tf.get_variable("output_b", shape=[n_out], initializer=LSTM_initializer)

# r_out @ output_w + output_b
output = tf.nn.xw_plus_b(r_out, output_w, output_b)


In [10]:
# MDN above the last LSTM layer
def gaussian2d(x1, x2, m1, m2, s1, s2, r):
    """Bivariate Gaussian density (eq 24, 25 from http://arxiv.org/abs/1308.0850).

    Args:
        x1, x2: coordinates at which the density is evaluated.
        m1, m2: means along each coordinate.
        s1, s2: standard deviations along each coordinate.
        r: correlation between the two coordinates.

    Returns:
        The density value(s), broadcast over the argument shapes.
    """
    (sub1, sub2) = (tf.subtract(x1, m1), tf.subtract(x2, m2))
    sum_1 = tf.square(tf.div(sub1, s1)) + tf.square(tf.div(sub2, s2))
    # Use the `r` argument here. The previous code read a module-level `rho`,
    # which only worked when a global of that name happened to hold the same
    # tensor as the argument.
    Z = sum_1 - 2*tf.div(tf.multiply(r, tf.multiply(sub1, sub2)), tf.multiply(s1, s2))
    one_minus_r2 = 1 - tf.square(r)
    reg = 2*np.pi*tf.multiply(tf.multiply(s1, s2), tf.sqrt(one_minus_r2))
    gaussian = tf.div(tf.exp(tf.div(-Z, 2*one_minus_r2)), reg)
    return gaussian

In [11]:
# Coefficients for the MDN layer; the transforms are described in the paper cited above (http://arxiv.org/abs/1308.0850)
def get_mdn_coef(Z):
    """Split the dense-layer output into mixture density parameters.

    Column 0 of Z is the end-of-stroke value; the remaining columns are split
    into six equal groups along axis 1: mixture weights, two means, two
    standard deviations and the correlation.

    Args:
        Z: rank-2 tensor whose axis 1 has 1 + 6 * (number of mixtures) columns.

    Returns:
        A two-element list:
          [eos, pi, mu1, mu2, sigma1, sigma2, rho] - transformed parameters
          [pi_hat, m1_hat, m2_hat, s1_hat, s2_hat, r_hat] - raw split outputs
    """
    # The raw splits are also published as module-level names; kept so that
    # any code reading those globals keeps working.
    global pi_hat, m1_hat, m2_hat, s1_hat, s2_hat, r_hat
    pi_hat, m1_hat, m2_hat, s1_hat, s2_hat, r_hat = tf.split(Z[:, 1:], 6, 1)
    eos = tf.sigmoid(-1*Z[:, 0:1])
    pi = tf.nn.softmax(pi_hat)  # mixture weights sum to 1
    s1 = tf.exp(s1_hat)  # exp keeps the sigmas positive
    s2 = tf.exp(s2_hat)
    r = tf.tanh(r_hat)  # tanh squashes rho between -1 and 1

    # The means are used as they are. Each transform is built once and reused
    # in the result instead of being recomputed there.
    return [[eos, pi, m1_hat, m2_hat, s1, s2, r], [pi_hat, m1_hat, m2_hat, s1_hat, s2_hat, r_hat]]

In [12]:
# loss function from the paper
def get_loss(pi, x1_data, x2_data, eos_data, mu1, mu2, sigma1, sigma2, rho, eos):
    """Summed negative log-likelihood of the targets under the MDN output.

    Args:
        pi: mixture weights.
        x1_data, x2_data: target coordinates.
        eos_data: target end-of-stroke indicator (0 or 1).
        mu1, mu2, sigma1, sigma2, rho: parameters of each Gaussian component.
        eos: predicted end-of-stroke probability.

    Returns:
        Scalar: the sum over all rows of the coordinate term and the
        end-of-stroke term.
    """
    gaussian = gaussian2d(x1_data, x2_data, mu1, mu2, sigma1, sigma2, rho)
    # Coordinate term: -log of the mixture likelihood, clamped away from 0.
    term1 = tf.reduce_sum(tf.multiply(gaussian, pi), 1, keep_dims=True)
    term1 = -tf.log(tf.maximum(term1, 1e-20))
    # End-of-stroke term: -log of the probability given to the observed value.
    # Clamp it the same way as term1; without the clamp a saturated prediction
    # on the wrong side gives log(0) and an infinite loss.
    eos_likelihood = tf.multiply(eos, eos_data) + tf.multiply(1-eos, 1-eos_data)
    term2 = -tf.log(tf.maximum(eos_likelihood, 1e-20))
    return tf.reduce_sum(term1 + term2)

In [13]:
# Flatten the targets to rows of 3 and split them column-wise into
# x1, x2 and end-of-stroke.
flat_target_data = tf.reshape(output_data, [-1, 3])
x1_data, x2_data, eos_data = tf.split(flat_target_data, num_or_size_splits=3, axis=1)

# Transformed MDN parameters first, raw (pre-activation) parameters second.
retval = get_mdn_coef(output)
eos, pi, mu1, mu2, sigma1, sigma2, rho = retval[0]
pi_hat, mu1_hat, mu2_hat, sigma1_hat, sigma2_hat, rho_hat = retval[1]

In [14]:
# Summed loss, then normalised by the number of (sequence, timestep) pairs.
loss = get_loss(pi, x1_data, x2_data, eos_data, mu1, mu2, sigma1, sigma2, rho, eos)
cost = loss / (batch_size * tsteps)

# Optimiser hyper-parameters live in non-trainable variables so the training
# cell can assign their values after the graph is built.
m_learning_rate = tf.Variable(0.0, trainable=False)
m_decay = tf.Variable(0.0, trainable=False)
m_momentum = tf.Variable(0.0, trainable=False)

# Gradients of the cost for every trainable variable, clipped by global norm.
tvars = tf.trainable_variables()
grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), grad_clip)

if optimizer == 'rmsprop':
    m_optimizer = tf.train.RMSPropOptimizer(
        learning_rate=m_learning_rate, decay=m_decay, momentum=m_momentum)
elif optimizer == 'adam':
    m_optimizer = tf.train.AdamOptimizer(learning_rate=m_learning_rate)
else:
    raise ValueError("Optimizer type not recognized")
train_op = m_optimizer.apply_gradients(zip(grads, tvars))

# Load the validation data and build the matching feed entries.
# The arrays get their own names: unpacking into `input` / `output` would
# shadow the Python builtin and overwrite the `output` tensor of the network
# defined in an earlier cell.
valid_x, valid_y, _, valid_seq = dataloader.validation_data()
valid_inputs = {input_data: valid_x, output_data: valid_y, char_seq: valid_seq}

# Open an interactive session and create a saver over all global variables
# that exist at this point.
sess = tf.InteractiveSession()
saver = tf.train.Saver(var_list=tf.global_variables())

In [None]:
print("start training...")
# misc parameters for training
momentum = 0.9
decay = 0.95
remember_rate = 0.99
nepochs = 100
learning_rate = 1e-4
lr_decay = 1.0
nbatches = 500
save = 500        # write a checkpoint every `save` steps
log_every = 10    # report progress and costs every `log_every` steps
total_step = nepochs * nbatches

# Build the hyper-parameter assignment ops once and feed the value in.
# Calling tf.assign inside the epoch loop would add a new op to the graph on
# every epoch.
hparam_value = tf.placeholder(tf.float32, shape=[])
set_learning_rate = tf.assign(m_learning_rate, hparam_value)
set_decay = tf.assign(m_decay, hparam_value)
set_momentum = tf.assign(m_momentum, hparam_value)

# initialize the network
sess.run(tf.global_variables_initializer())
sess.run(set_decay, {hparam_value: decay})
sess.run(set_momentum, {hparam_value: momentum})

# Zero initial LSTM states and window position. They are never modified
# during training, so evaluate them once instead of once per epoch.
c = [None] * num_layers
h = [None] * num_layers
for counter in range(num_layers):
    c[counter] = sess.run(istate_cell[counter].c)
    h[counter] = sess.run(istate_cell[counter].h)
kappa = np.zeros((batch_size, kmixtures, 1))

# Validation feed: validation arrays plus the same zero states and kappa.
valid_feed = dict(valid_inputs)
valid_feed[init_kappa] = np.zeros((batch_size, kmixtures, 1))
for j in range(num_layers):
    valid_feed[istate_cell[j].c] = c[j]
    valid_feed[istate_cell[j].h] = h[j]

for e in range(0, nepochs):
    sess.run(set_learning_rate, {hparam_value: learning_rate * (lr_decay ** e)})

    for b in range(nbatches):

        # current step
        i = e * nbatches + b

        # save the model at every checkpoint interval
        if i % save == 0 and (i != 0):
            saver.save(sess, save_path, global_step=i)
            print("model saved at " + str(i))

        # load next batch for training
        x, y, s, ch = dataloader.next_batch()

        # feed the training batch into the network
        feed = {input_data: x, output_data: y, char_seq: ch, init_kappa: kappa}
        for j in range(num_layers):
            feed[istate_cell[j].c] = c[j]
            feed[istate_cell[j].h] = h[j]

        # one optimisation step; keep the training cost for reporting
        train_cost = sess.run([cost, train_op], feed)[0]

        # Report progress. The validation cost is evaluated only here, since
        # it is needed for nothing except the report.
        if i % log_every == 0:
            valid_cost = sess.run(cost, valid_feed)
            print(str(i) + '/' + str(total_step)
                  + '  train cost: ' + str(train_cost)
                  + '  valid cost: ' + str(valid_cost))

# Save the final weights; the periodic save skips the steps after the last
# multiple of `save`.
saver.save(sess, save_path, global_step=total_step)
print("model saved at " + str(total_step))

start training...
0/50000
