# Char-RNN
This example is a character-level Recurrent Neural Network. We intended to feeding the network some sequence of character and it should predict the next character in the sequences.

## Imports

In [1]:
import tensorflow as tf
from tensorflow.contrib import rnn
import numpy as np
import helper
import time
from datetime import timedelta
from helper import txt_reader, nn_layer_util
import os,sys

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%matplotlib inline
%load_ext autoreload
%autoreload 2

## Load Data
Try to load some data from <b>input.txt</b>. Using helper functions in <b>txt_reader</b>, the file was converted into a list of words.
After that, we calculate the total number of unique words in the dataset. To convenient the Neural Network, we then convert every words in the dataset into indices.

In [2]:
data_path = "data/oneday/input.txt"

# convert txt to list of characters, type=str
data_char = helper.txt_reader.txt2char(data_path)
# calculating the numbers of unique characters in the list
chars, vocab_size = helper.txt_reader.unique_element(data_char)
print "Total chars: {}, Num vocabulary : {}".format(len(data_char), vocab_size)

# building a map between chars & indices
chars_indices = dict((w,i) for i, w in enumerate(chars))
indices_chars = dict((i,w) for i, w in enumerate(chars))
# converting the dataset to indices
data_idx = [chars_indices[w] for w in data_char]

Total chars: 109892, Num vocabulary : 77


### Print data
Try to print some data to check the convertion.

In [3]:
print data_char[200:300]
print data_idx[200:300]

id.

	'Ah - you mean we have to change the world?' the boy replied.

	'No, not all of the world,
[59, 54, 12, 3, 2, 3, 2, 1, 7, 26, 58, 4, 11, 4, 75, 65, 71, 4, 63, 55, 51, 64, 4, 73, 55, 4, 58, 51, 72, 55, 4, 70, 65, 4, 53, 58, 51, 64, 57, 55, 4, 70, 58, 55, 4, 73, 65, 68, 62, 54, 25, 7, 4, 70, 58, 55, 4, 52, 65, 75, 4, 68, 55, 66, 62, 59, 55, 54, 12, 3, 2, 3, 2, 1, 7, 39, 65, 10, 4, 64, 65, 70, 4, 51, 62, 62, 4, 65, 56, 4, 70, 58, 55, 4, 73, 65, 68, 62, 54, 10]


## Setting Variables


In [4]:
# hyperparameters
hidden_size = 100 # size of hidden layer of neurons
seq_length = 64 # number of steps to unroll the RNN for
learning_rate = 1e-4

### Placeholder variables

Everytime we execute the TensorFlow computational graph, we can feed different values to the Placeholder variables. These Placeholder variable are multi-dimensional array called <b>tensor</b> and the data-type is set to <b>float32</b>. 

In [5]:
# Output: [seq_length, vocab_size]
x = tf.placeholder(tf.float32, shape=[seq_length, vocab_size], name='x')
x2 = tf.placeholder(tf.float32, shape=[2, vocab_size], name='x_test')
# Label y associated with X 
# Output: [seq_length, vocab_size]
# Each example's class is represent in vector e.g. For class 5 = [0,0,0,0,0,1,0,0,0,0]
y_true = tf.placeholder(tf.float32, shape=[seq_length, vocab_size], name='y_true')
y2 = tf.placeholder(tf.float32, shape=[2, vocab_size], name='y_test')

# Class-number y associated with X 
# Output: [seq_length]
# Each example's class is represent in number e.g. [5]
y_true_cls = tf.argmax(y_true, dimension=1)

In [6]:
# some tensorflow variables
global_step = tf.Variable(initial_value=0, name='global_step', trainable=False)
hprev = tf.Variable(tf.zeros(shape=(hidden_size, seq_length), name='hprev', dtype=tf.float32), trainable=True)


### Helper-function - RNN layer
Create a new rnn layer, using the function: W*x + W*(x-1) + b
<br><b>hprev</b> is the output of the function from previous layer.

In [7]:
def new_rnn_layer(inputs,
                  hprev,
                  vocab_size,
                  hidden_size,
                  batch_size = 32):
    with tf.variable_scope('rnn_layer') as scope:
        # Creating a simple rnn cell
        # Create new weights, for filters with the given shape.
        weights_x = helper.nn_layer_util.new_weights(shape=(vocab_size, hidden_size))
        weights_h = helper.nn_layer_util.new_weights(shape=(hidden_size, hidden_size))
        weights_y = helper.nn_layer_util.new_weights(shape=(hidden_size, vocab_size))

        # Create new biases, one for each filter.
        biases_h = helper.nn_layer_util.new_biases(length=hidden_size)
        biases_y = helper.nn_layer_util.new_biases(length=vocab_size)
        
        hyp, outputs = {}, {}
        hyp[-1] = np.copy(hprev)
        layer = tf.matmul(inputs, weights_x) + tf.transpose(tf.matmul(weights_h, hprev))
        hyp = tf.tanh(tf.nn.bias_add(layer, biases_h))
        outputs = tf.nn.bias_add(tf.matmul(hyp, weights_y), biases_y)
    return outputs

### Helper-function - Get batch sequence
Since we are not feeding the whole data file into the RNN at once. Therefore, we are going feed batches with size of <b>seq_length</b> to the RNN each time. Similar to CNN, instead of feeding an image and predict its class, we feed a character from the sequence at once to predict the next character in the sequence. That's why, the <b>y_batch</b> will be starting from <font color="red">p+1 ~ p+seq_length+1</font>.

In [8]:
p = 0 # Global seq. counter p
def get_batch(hidden_size, data, seq_length, d2i):
    global p
    # Number of images in the training-set.
    if p+seq_length+1 >= len(data) or global_step == 0:
        hprev = np.zeros((hidden_size, 1)) # Reset RNN memory
        p = 0 # Go to the start of data
    x_tmp = [d2i[i] for i in data[p:p+seq_length]]
    y_tmp = [d2i[i] for i in data[p+1:p+seq_length+1]]
    x_batch = np.zeros((seq_length, vocab_size))
    y_batch = np.zeros((seq_length, vocab_size))
    x_batch[np.arange(seq_length), x_tmp] = 1
    y_batch[np.arange(seq_length), y_tmp] = 1
    
    return x_batch, y_batch

## Architecture (RNN part)
This part is the implementation of the whole RNN.

CNN-Architecture: <br>
RNN - Output(Softmax)

In [9]:
# RNN layer
layer_rnn = new_rnn_layer(inputs=x, hprev= hprev, vocab_size=vocab_size, hidden_size=hidden_size)

In [10]:
# Class vector e.g. For class 5 = [0,0,0,0,0,1,0,0,0,0] 
y_pred = tf.nn.softmax(layer_rnn)
# Use argmax to convert y from class vector to class labels e.g. 5
y_pred_cls = tf.argmax(y_pred, dimension=1)

# Architecture (Optimisation Part)

### Cost-function to be optimised

In [11]:
# Using Logistic Cost Function
# - [y * log(h(x)) + (1-y) * log(1-h(x))]
# A cost is output for each image
# Output: [num of image,]
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=layer_rnn,labels=y_true)
# Computes the mean of all cost, resulting a single value
cost = tf.reduce_mean(cross_entropy)
# Collecting accuracy for TensorBoard
tf.summary.scalar("cost", cost)

<tf.Tensor 'cost:0' shape=() dtype=string>

### Optimization Method
AdamOptimizer which is an advanced form of Gradient Descent we can use for minimise the cost.
<br>
**Note that optimization is not performed at this point. In fact, nothing is calculated at all, we just add the optimizer-object to the TensorFlow graph for later execution.

In [12]:
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost, global_step=global_step)

### Performance Measures
We need a few more performance measures to display the progress to the user.
This is a vector of booleans whether the predicted class equals the true class of each image.

In [13]:
# Vector of booleans whether the predicted class equals the true class of each image.
# Output: [num of image,]
correct_prediction = tf.equal(y_pred_cls, y_true_cls)
# Cast booleans to floats, False = 0, True = 1
# Calculate the average of these number
# Output: float32
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# Collecting accuracy for TensorBoard
tf.summary.scalar("accuracy", accuracy)

<tf.Tensor 'accuracy:0' shape=() dtype=string>

### Saver
Save variables of the neural network to reloaded quickly without having to train the network again.
<br>Doc: https://github.com/Hvass-Labs/TensorFlow-Tutorials/blob/master/04_Save_Restore.ipynb

**Note that nothing is actually saved at this point, which will be done further below.

In [14]:
saver = tf.train.Saver()

## Main (Execute)

In [15]:
# Create TensorFlow Session to execute the TensorFlow graph
session = tf.Session()
# Collecting accuracy for TensorBoard
# Summaries such as scalars can be seen by running the command below
# tensorboard --logdir="./log"
# http://192.168.0.150:6006
merged = tf.summary.merge_all()
writer = tf.summary.FileWriter('./log', session.graph)
# Initialise weights and bias
session.run(tf.global_variables_initializer())

### Executing-function - Perform optimization iterations
This function is called to execute the training process of RNN. A number of optimization iterations so as to gradually improve the variables of the network layers. Each iteration, new batch of data is selected from the training-set and TensorFlow executes the optimizer using them. 

Input:<br>
num_iterations - Number of optimization iterations<br>
Output:<br>
None<br>
*weights is updated with-in the model and message of <b>Training Accuracy</b> is printed every 100 iterations

In [16]:
loss = 0
def optimize(num_iterations):
    global loss
    # Start-time used for printing time-usage below.
    start_time = time.time()

    for i in range(num_iterations):

        # Get a batch of training examples.
        # x_batch - batch of image
        # y_true_batch - labels of x_batch
        x_batch, y_true_batch = get_batch(hidden_size=hidden_size,
                                          data=data_char, 
                                          seq_length=seq_length, 
                                          d2i=chars_indices)

        # Put the batch into a dict with the proper names
        # for placeholder variables in the TensorFlow graph.
        feed_dict_train = {x: x_batch, y_true: y_true_batch}
        
        # Run the optimizer using this batch of training data.
        # TensorFlow assigns the variables in feed_dict_train
        # to the placeholder variables and then runs the optimizer.
        # !!! Add [merged, opt] for TensorBoard !!! Very Important, Order has no effect
        layer_pred ,layer, y_p, yt, yc, ty_output, i_global, summary, _ = session.run([y_pred ,layer_rnn ,correct_prediction, y_true_cls, y_pred_cls, layer_rnn, global_step, merged, optimizer], feed_dict=feed_dict_train)
        # useless stuff just for testing
        prob = np.exp(ty_output) / np.sum(np.exp(ty_output))
        pp = prob[np.arange(seq_length),yt]
        loss += np.sum(-np.log(pp)) * 0.001
        # Add summary to TensorBoard
        writer.add_summary(summary, i)

        # Print status to screen every 100 iterations (and last).
        if (i_global % 100 == 0) or (i == num_iterations - 1):
            # Calculate the accuracy on the training-batch.
            batch_acc = session.run(accuracy, feed_dict=feed_dict_train)

            # Print status.
            msg = "Global Step: {0:>6}, Training Batch Accuracy: {1:>6.1%}"
            print(msg.format(i_global, batch_acc))
        # Save a checkpoint to disk every 1000 iterations (and last).
        if (i_global % 1000 == 0) or (i == num_iterations - 1):
            # Save all variables of the TensorFlow graph to a
            # checkpoint. Append the global_step counter
            # to the filename so we save the last several checkpoints.
            saver.save(session,
                       save_path=save_path,
                       global_step=global_step)

            print("Saved checkpoint.")

    # Ending time.
    end_time = time.time()

    # Difference between start and end-times.
    time_dif = end_time - start_time

    # Print the time-usage.
    print("Time usage: " + str(timedelta(seconds=int(round(time_dif)))))

### Restore or initialize variables
Training this neural network may take a long time, especially if you do not have a GPU. We therefore save checkpoints during training so we can continue training at another time (e.g. during the night), and also for performing analysis later without having to train the neural network every time we want to use it.

If you want to restart the training of the neural network, you have to delete the checkpoints first.

This is the directory used for the checkpoints.

In [17]:
save_dir = 'checkpoints/'
# Create directory if not exist
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
save_path = os.path.join(save_dir, 'cifar10_cnn')

# Try to restore the latest checkpoint. 
# If checkpoint doesn't exist or TensorFlow graph has been modified, exception will raise.
# When exception appears, initialise will be made
try:
    print("Trying to restore last checkpoint ...")

    # Use TensorFlow to find the latest checkpoint - if any.
    last_chk_path = tf.train.latest_checkpoint(checkpoint_dir=save_dir)

    # Try and load the data in the checkpoint.
    saver.restore(session, save_path=last_chk_path)

    # If we get to this point, the checkpoint was successfully loaded.
    print("Restored checkpoint from:", last_chk_path)
except:
    # If the above failed for some reason, simply
    # initialise all the variables for the TensorFlow graph.
    print("Failed to restore checkpoint. Initializing variables instead.")
    session.run(tf.global_variables_initializer())

Trying to restore last checkpoint ...
Failed to restore checkpoint. Initializing variables instead.


In [18]:
optimize(num_iterations=1000)

Global Step:    100, Training Batch Accuracy:  26.6%
Global Step:    200, Training Batch Accuracy:  65.6%
Global Step:    300, Training Batch Accuracy:  53.1%
Global Step:    400, Training Batch Accuracy:  57.8%
Global Step:    500, Training Batch Accuracy:  73.4%
Global Step:    600, Training Batch Accuracy: 100.0%
Global Step:    700, Training Batch Accuracy: 100.0%
Global Step:    800, Training Batch Accuracy: 100.0%
Global Step:    900, Training Batch Accuracy: 100.0%
Global Step:   1000, Training Batch Accuracy: 100.0%
Saved checkpoint.
Time usage: 0:00:05
