In [1]:
'''
load module and dataset
'''
import os
import time
from collections import namedtuple

import numpy as np
import tensorflow as tf

#read the whole training corpus into memory as one string
with open('anna.txt', 'r') as f:
    text = f.read()

#the vocabulary is every distinct character of the corpus, in sorted order
vocab = sorted(set(text))
#character -> integer id, the id being the position of the character in vocab
vocabToInt = dict(zip(vocab, range(len(vocab))))
#integer id -> character, the inverse lookup
intToVocab = {i : char for i, char in enumerate(vocab)}
#the corpus as an int32 array holding one id per character
encoded = np.fromiter((vocabToInt[c] for c in text), dtype = np.int32, count = len(text))

In [2]:
'''
print first 100 characters and print the encoded one
'''
#show the first 100 characters, a blank line, then the same span as integer ids
print(text[ : 100], '', 'The encoded text above are:', encoded[ : 100], sep = '\n')

#the network has one output class per distinct character
print('Number of classes that our network has to pick from is {}'.format(len(vocab)))

Chapter 1


Happy families are all alike; every unhappy family is unhappy in its own
way.

Everythin

The encoded text above are:
[31 64 57 72 76 61 74  1 16  0  0  0 36 57 72 72 81  1 62 57 69 65 68 65
 61 75  1 57 74 61  1 57 68 68  1 57 68 65 67 61 26  1 61 78 61 74 81  1
 77 70 64 57 72 72 81  1 62 57 69 65 68 81  1 65 75  1 77 70 64 57 72 72
 81  1 65 70  1 65 76 75  1 71 79 70  0 79 57 81 13  0  0 33 78 61 74 81
 76 64 65 70]
Number of classes that our network has to pick from is 83


In [3]:
'''
making training mini-batches
----------------------------
The first thing we need to do is discard some of the text so we only have completely full batches.
Each batch contains N x M characters, where N is the batch size (the number of sequences) and M is the number of steps.
K is the number of batches, so the total number of characters to keep from arr is N * M * K.

The idea is that each batch is an N x M window on the N x (M * K) array; for each subsequent batch the window moves over
by M steps. Remember that the targets are the inputs shifted over by one character.
'''

def getBatches(arr, batchSize, nSteps):
    '''
    generator that walks over arr and yields (x, y) pairs of shape batchSize X nSteps
    
    arguments
    ---------
    arr: 1-D numpy array to cut into batches
    batchSize: batch size, the number of sequences (rows) per batch
    nSteps: number of sequence steps (columns) per batch
    
    yields
    ------
    x: the features, a batchSize X nSteps window of the reshaped array
    y: the targets, the same window moved one step to the right; in the last
       batch the final column has no following character and stays zero
    '''
    
    #only whole batches are used, anything left over at the end of arr is dropped
    charsPerBatch = batchSize * nSteps
    usable = (len(arr) // charsPerBatch) * charsPerBatch
    
    #lay the kept characters out as batchSize long rows
    grid = arr[ : usable].reshape((batchSize, -1))
    
    for start in range(0, grid.shape[1], nSteps):
        stop = start + nSteps
        features = grid[ : , start : stop]
        #same window moved one step to the right; one column short for the last batch
        shifted = grid[ : , start + 1 : stop + 1]
        
        #start from zeros of the full size so a short window still gives a full sized target
        targets = np.zeros(features.shape, dtype = features.dtype)
        targets[ : , : shifted.shape[1]] = shifted
        
        yield features, targets
        
#smoke test of the generator: 10 sequences per batch, 50 steps each
batches = getBatches(encoded, batchSize = 10, nSteps = 50)
x, y = next(batches)

#only the top-left 10 X 10 corner of each array is shown; every row of y
#should be the matching row of x moved over by one character
print('Make sure the data is shifted over one step for y')
print('x\n', x[0 : 10, 0 : 10])
print('\ny\n', y[0 : 10, 0 : 10])

Make sure the data is shifted over one step for y
x
 [[31 64 57 72 76 61 74  1 16  0]
 [ 1 57 69  1 70 71 76  1 63 71]
 [78 65 70 13  0  0  3 53 61 75]
 [70  1 60 77 74 65 70 63  1 64]
 [ 1 65 76  1 65 75 11  1 75 65]
 [ 1 37 76  1 79 57 75  0 71 70]
 [64 61 70  1 59 71 69 61  1 62]
 [26  1 58 77 76  1 70 71 79  1]
 [76  1 65 75 70  7 76 13  1 48]
 [ 1 75 57 65 60  1 76 71  1 64]]

y
 [[64 57 72 76 61 74  1 16  0  0]
 [57 69  1 70 71 76  1 63 71 65]
 [65 70 13  0  0  3 53 61 75 11]
 [ 1 60 77 74 65 70 63  1 64 65]
 [65 76  1 65 75 11  1 75 65 74]
 [37 76  1 79 57 75  0 71 70 68]
 [61 70  1 59 71 69 61  1 62 71]
 [ 1 58 77 76  1 70 71 79  1 75]
 [ 1 65 75 70  7 76 13  1 48 64]
 [75 57 65 60  1 76 71  1 64 61]]


In [4]:
'''
building the model, using tensorflow
'''
def buildInputs(batchSize, numSteps):
    '''
    create the placeholders used to feed data into the graph
    
    arguments
    ---------
    batchSize: batch size, number of sequences per batch
    numSteps: number of sequence steps in a batch
    
    returns
    -------
    inputs: int32 placeholder of shape [batchSize, numSteps], named 'inputs'
    targets: int32 placeholder of shape [batchSize, numSteps], named 'targets'
    dropoutProb: float32 placeholder named 'dropoutProb', created without a fixed shape
    '''
    
    #inputs and targets share one layout: a row per sequence, a column per step
    batchShape = [batchSize, numSteps]
    inputs = tf.placeholder(tf.int32, shape = batchShape, name = 'inputs')
    targets = tf.placeholder(tf.int32, shape = batchShape, name = 'targets')
    
    #value used by the dropout wrappers, fed at run time
    dropoutProb = tf.placeholder(tf.float32, name = 'dropoutProb')
    
    return inputs, targets, dropoutProb

In [5]:
'''
build long short term memory
The idea behind this is building a stack of LSTM cell
but we still need to define LSTM cell
'''

def buildLSTM(LSTMSize, numLayer, batchSize, dropoutProb):
    '''
    build a stack of LSTM cells with dropout and its all-zero starting state
    
    arguments
    ---------
    LSTMSize: size of the hidden layer in the LSTM cells
    numLayer: number of LSTM layers
    batchSize: batch size
    dropoutProb: scalar tensor(tf.placeholder); it is handed to the dropout
                 wrapper as output_keep_prob, i.e. the probability of keeping an output
    
    returns
    -------
    the stacked cell and the zero state created for batchSize sequences
    '''
    
    #one basic LSTM cell per layer, each wrapped so that dropout is applied to its output
    layers = []
    for _ in range(numLayer):
        basicCell = tf.contrib.rnn.BasicLSTMCell(LSTMSize)
        layers.append(tf.contrib.rnn.DropoutWrapper(basicCell, output_keep_prob = dropoutProb))
    
    #stack the layers into one multi layer cell
    stackedCell = tf.contrib.rnn.MultiRNNCell(layers)
    zeroState = stackedCell.zero_state(batchSize, tf.float32)
    
    return stackedCell, zeroState

In [6]:
'''
RNN output
-----------
We need to connect the output of the RNN cells to a fully connected layer with a softmax output.
We are using the same fully connected layer, with the same weights, for each of the outputs.
We should reshape the output into a 2D tensor with shape (M * N) x L, where L is the LSTM size. That is one row for
each sequence and step, where the values of each row are the output from the LSTM cells.
'''

def buildOutput(LSTMOutput, inSize, outSize):
    '''
    put a fully connected softmax layer on top of the RNN output
    
    arguments
    ---------
    LSTMOutput: output tensor(s) of the RNN layers
    inSize: size of the input tensor, size of the LSTM cells
    outSize: size of the softmax layer
    
    returns
    -------
    the softmax probabilities (op named 'predictions') and the logits they are computed from
    '''
    
    #join the RNN output along axis 1, then flatten it into rows of inSize values,
    #one row for each step of each sequence
    joined = tf.concat(LSTMOutput, axis = 1)
    flatRows = tf.reshape(joined, [-1, inSize])
    
    #weights and bias of the softmax layer; the same layer is applied to every row
    with tf.variable_scope('softmax'):
        weights = tf.Variable(tf.truncated_normal((inSize, outSize), stddev = 0.1))
        biases = tf.Variable(tf.zeros(outSize))
        
    #one row of logits for each row of RNN output
    logits = tf.matmul(flatRows, weights) + biases
    
    #probabilities for the predicted characters
    probabilities = tf.nn.softmax(logits, name = 'predictions')
    
    return probabilities, logits


In [7]:
'''
training loss
-------------
We get the targets as encoded characters (integer ids), so we need to one-hot encode them.
Then we reshape them into a 2D tensor of size (M * N) x C, where C is the number of classes.
'''

def buildLoss(logits, targets, LSTMSize, numClasses):
    '''
    mean softmax cross entropy between the logits and the targets
    
    arguments
    ---------
    logits: logits from final fully connected layer
    targets: targets for supervised learning, given as integer class ids
    LSTMSize: number of LSTM hidden units (not used in the computation)
    numClasses: number of classes in targets
    
    returns
    -------
    scalar tensor holding the loss averaged over all rows of logits
    '''
    
    #one hot encode the targets, then give them the same static shape as the logits
    oneHotTargets = tf.one_hot(targets, numClasses)
    labels = tf.reshape(oneHotTargets, logits.get_shape())
    
    #cross entropy per row, averaged into a single number
    perRowLoss = tf.nn.softmax_cross_entropy_with_logits(labels = labels, logits = logits)
    
    return tf.reduce_mean(perRowLoss)

In [8]:
'''
optimizer for the exploding gradient problem
--------------------------------------------
We clip the gradients above some threshold: if the global norm of the gradients is larger than the threshold,
the gradients are scaled down so that the norm equals the threshold.
This ensures the gradients never grow overly large.
'''

def buildOptimizer(loss, learningRate, gradClip):
    '''
    build the training op: Adam applied to gradients clipped by global norm
    
    arguments
    ---------
    loss: network loss
    learningRate: learning rate for optimizer
    gradClip: clipping threshold passed to tf.clip_by_global_norm
    
    returns
    -------
    the op that applies the clipped gradients to all trainable variables
    '''
    
    #gradients of the loss with respect to every trainable variable
    trainableVars = tf.trainable_variables()
    rawGrads = tf.gradients(loss, trainableVars)
    
    #clip them together to keep exploding gradients under control
    clippedGrads, _ = tf.clip_by_global_norm(rawGrads, gradClip)
    
    adam = tf.train.AdamOptimizer(learningRate)
    
    return adam.apply_gradients(zip(clippedGrads, trainableVars))

In [9]:
'''
build network
-------------
Create a class for the RNN. The data is run through the network with tf.nn.dynamic_rnn. This function passes the hidden
and cell states across LSTM cells appropriately; it returns the output for each LSTM cell at each step for each sequence
in the mini-batch and also gives the final LSTM state.

This class uses all the functions created before.
'''

class charRNN:
    '''
    character level RNN assembled from the build* functions of this notebook
    
    Creating an instance resets the default graph and builds a new network in it.
    
    attributes
    ----------
    inputs, targets, dropoutProb: placeholders to feed
    initialState, finalState: LSTM state before and after running a batch
    prediction, logits: softmax output and logits of the output layer
    loss, optimizer: training loss and the op that performs one training step
    '''
    def __init__(self, numClasses, batchSize = 64, numSteps = 50, 
                       LSTMSize = 128, numLayer = 2, learningRate = 0.0001, 
                       gradClip = 5, sampling = False):
        
        #for sampling the network is fed one character at a time,
        #otherwise the sizes that were passed in are used as they are
        if sampling == True:
            batchSize = numSteps = 1
            
        tf.reset_default_graph()
        
        #placeholders (buildInputs) and the stacked LSTM cell (buildLSTM)
        self.inputs, self.targets, self.dropoutProb = buildInputs(batchSize, numSteps)
        stackedCell, self.initialState = buildLSTM(LSTMSize, numLayer, batchSize, self.dropoutProb)
        
        #one hot encode the input tokens before they go through the RNN layers
        oneHotInputs = tf.one_hot(self.inputs, numClasses)
        
        #run every sequence step through the RNN, keeping the outputs and the final state
        rnnOutputs, self.finalState = tf.nn.dynamic_rnn(stackedCell, oneHotInputs, 
                                                        initial_state = self.initialState)
        
        #softmax predictions and logits (buildOutput)
        self.prediction, self.logits = buildOutput(rnnOutputs, LSTMSize, numClasses)
        
        #loss (buildLoss) and the training op with gradient clipping (buildOptimizer)
        self.loss = buildLoss(self.logits, self.targets, LSTMSize, numClasses)
        self.optimizer = buildOptimizer(self.loss, learningRate, gradClip)
        

In [10]:
'''
hyperparameter
--------------
written separately on purpose to make it simpler to understand
'''

#values used by the training cell to build and train the network
batchSize = 100 #number of sequences per batch
numSteps = 100 #number of sequence steps per batch
LSTMSize = 512 #size of the hidden layer in each LSTM cell
numLayer = 2 #number of stacked LSTM layers
learningRate = 0.001 #learning rate handed to the optimizer
dropoutProb = 0.5 #fed to the dropoutProb placeholder while training; buildLSTM uses that placeholder as output_keep_prob, so this is the probability of keeping an output


In [11]:
'''
training the network
'''
epochs = 20
printEveryN = 50
saveEveryN = 200

#checkpoint to continue from when it is available
resumeCheckpoint = 'checkpoints/i3960_l512.ckpt'

#create the object RNN network based on charRNN class
model = charRNN(len(vocab), batchSize = batchSize, numSteps = numSteps, 
               LSTMSize = LSTMSize, numLayer = numLayer, learningRate = learningRate)

saver = tf.train.Saver(max_to_keep = 100)

#saver.save does not create the target directory, so make sure it exists
os.makedirs('checkpoints', exist_ok = True)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    #NOTE: the step counter starts from 0 even when a checkpoint is restored,
    #so checkpoints saved by an earlier run under the same step number are overwritten
    counter = 0
    
    #continue from the saved checkpoint when there is one; on a fresh run
    #(no checkpoint on disk yet) train from the freshly initialized weights
    if tf.train.checkpoint_exists(resumeCheckpoint):
        saver.restore(sess, resumeCheckpoint)
        
    for e in range(epochs):
        #every epoch starts from the zero state, inside an epoch the state is carried from batch to batch
        newState = sess.run(model.initialState)
        
        for x, y in getBatches(encoded, batchSize, numSteps):
            counter += 1
            start = time.time()
            feed = {model.inputs: x, 
                    model.targets: y, 
                    model.dropoutProb: dropoutProb, 
                    model.initialState: newState}
            
            #one training step; the final state becomes the initial state of the next batch
            batchLoss, newState, _ = sess.run([model.loss,
                                               model.finalState, 
                                               model.optimizer], 
                                              feed_dict = feed)
            
            if counter % printEveryN == 0:
                end = time.time()
                print('Epochs: {}/{}...'.format(e + 1, epochs), 
                      'Training step: {}...'.format(counter), 
                      'Training loss: {}...'.format(batchLoss), 
                      '{:.4f}sec/batch'.format((end - start)))
                
            if counter % saveEveryN == 0:
                saver.save(sess, "checkpoints/i{}_l{}.ckpt".format(counter, LSTMSize))
            
    #always keep the weights of the very last step
    saver.save(sess, "checkpoints/i{}_l{}.ckpt".format(counter, LSTMSize))
    
#saved checkpoint
tf.train.get_checkpoint_state('checkpoints')

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.

INFO:tensorflow:Restoring parameters from checkpoints/i3960_l512.ckpt
Epochs: 1/20... Training step: 50... Training loss: 1.0613077878952026... 0.3165sec/batch
Epochs: 1/20... Training step: 100... Training loss: 1.023555874824524... 0.3196sec/batch
Epochs: 1/20... Training step: 150... Training loss: 1.033983588218689... 0.3199sec/batch
Epochs: 2/20... Training step: 200... Training loss: 1.0533851385116577... 0.3191sec/batch
Epochs: 2/20... Training step: 250... Training loss: 1.0888053178787231... 0.3157sec/batch
Epochs: 2/20... Training step: 300... Training loss: 1.0326857566833496... 0.3183sec/batch
Epochs: 2/20... Training step: 350... Training loss: 1.0351042747497559... 0.3182sec/batch
Epochs: 3/20... Training step: 400... Training loss: 1.0727630853652954... 0.3194sec/batch
Epochs: 3/20... 

model_checkpoint_path: "checkpoints/i3960_l512.ckpt"
all_model_checkpoint_paths: "checkpoints/i200_l512.ckpt"
all_model_checkpoint_paths: "checkpoints/i400_l512.ckpt"
all_model_checkpoint_paths: "checkpoints/i600_l512.ckpt"
all_model_checkpoint_paths: "checkpoints/i800_l512.ckpt"
all_model_checkpoint_paths: "checkpoints/i1000_l512.ckpt"
all_model_checkpoint_paths: "checkpoints/i1200_l512.ckpt"
all_model_checkpoint_paths: "checkpoints/i1400_l512.ckpt"
all_model_checkpoint_paths: "checkpoints/i1600_l512.ckpt"
all_model_checkpoint_paths: "checkpoints/i1800_l512.ckpt"
all_model_checkpoint_paths: "checkpoints/i2000_l512.ckpt"
all_model_checkpoint_paths: "checkpoints/i2200_l512.ckpt"
all_model_checkpoint_paths: "checkpoints/i2400_l512.ckpt"
all_model_checkpoint_paths: "checkpoints/i2600_l512.ckpt"
all_model_checkpoint_paths: "checkpoints/i2800_l512.ckpt"
all_model_checkpoint_paths: "checkpoints/i3000_l512.ckpt"
all_model_checkpoint_paths: "checkpoints/i3200_l512.ckpt"
all_model_checkpoint_pa

In [22]:
'''
sampling
--------
The idea is that we pass in a character, then the network will predict the next character.
We can use the new one to predict the next one
'''

def pickTopN(preds, vocabSize, topN = 5):
    '''
    sample a class id from the topN most probable predictions
    
    arguments
    ---------
    preds: array of class probabilities; it is squeezed before use, so it
           has to hold one distribution with vocabSize values. It is not modified.
    vocabSize: number of classes, the sampled id lies in range(vocabSize)
    topN: how many of the most probable classes are kept as candidates
    
    returns
    -------
    the sampled class id
    '''
    #np.squeeze returns a view of preds; copy it so that zeroing entries below
    #does not change the caller's array
    p = np.squeeze(preds).copy()
    #argsort is ascending, so everything except the last topN indices is dropped
    p[np.argsort(p)[ : -topN]] = 0
    #renormalise the remaining probabilities so that they sum to 1
    p = p / np.sum(p)
    c = np.random.choice(vocabSize, 1, p = p)[0]
    
    return c

def sample(checkpoint, nSamples, LSTMSize, vocabSize, prime = "The "):
    '''
    generate text with a trained network, one character at a time
    
    arguments
    ---------
    checkpoint: path of the checkpoint the weights are restored from
    nSamples: number of sampling steps run after the first predicted character
    LSTMSize: size of the hidden layer used to rebuild the network the checkpoint is restored into
    vocabSize: number of classes the network picks from
    prime: text used to warm up the network state, must not be empty
    
    returns
    -------
    one string: prime followed by nSamples + 1 generated characters
    
    raises
    ------
    ValueError: if prime is empty, because there would be no prediction to start sampling from
    '''
    if not prime:
        raise ValueError('prime must contain at least one character')
    
    samples = list(prime)
    #sampling = True builds the network for a single character per step
    model = charRNN(vocabSize, LSTMSize = LSTMSize, sampling = True)
    saver = tf.train.Saver()
    
    with tf.Session() as sess:
        saver.restore(sess, checkpoint)
        newState = sess.run(model.initialState)
        
        #buffer for the single character fed at each step, same dtype as the inputs placeholder
        x = np.zeros((1, 1), dtype = np.int32)
        
        def step(charId, state):
            #feed one character with dropout switched off (keep probability 1),
            #return the predictions and the new LSTM state
            x[0, 0] = charId
            feed = {model.inputs: x, 
                    model.dropoutProb: 1., 
                    model.initialState: state}
            
            return sess.run([model.prediction, model.finalState], 
                            feed_dict = feed)
        
        #warm up the state with the prime text; only the last prediction is used
        for c in prime:
            preds, newState = step(vocabToInt[c], newState)
            
        c = pickTopN(preds, vocabSize)
        samples.append(intToVocab[c])
        
        #feed every sampled character back in to predict the next one
        for _ in range(nSamples):
            preds, newState = step(c, newState)
            
            c = pickTopN(preds, vocabSize)
            samples.append(intToVocab[c])
            
    return ''.join(samples)

In [24]:
'''
testing
'''
#show the path of the most recent checkpoint found in the 'checkpoints' directory
tf.train.latest_checkpoint('checkpoints')


'checkpoints/i3960_l512.ckpt'

We will test the trained model by letting it predict the sequence of characters that follows a given piece of text. The network receives an incomplete phrase and tries to predict how the sequence continues.

In [54]:
#sampling 1
#prime the network restored from the most recent checkpoint with "part" and let it continue the text
print("First sampling, we try to predict the sequence of '''part''' words")
checkpoint = tf.train.latest_checkpoint('checkpoints')
samp = sample(checkpoint, nSamples = 20, LSTMSize = LSTMSize, vocabSize = len(vocab), prime = "part")
print("predicted 20 sequence of characters based on words given is: ", ' ', samp, sep = '\n')

First sampling, we try to predict the sequence of '''part''' words
INFO:tensorflow:Restoring parameters from checkpoints/i3960_l512.ckpt
predicted 20 sequence of characters based on words given is: 
 
partly, who had been
take


In [53]:
#sampling 2 
#same experiment with an early checkpoint (training step 200) and the prime "bec"
print("Second sampling, we try to predict the sequence of '''bec''' words")
checkpoint = 'checkpoints/i200_l512.ckpt'
samp = sample(checkpoint, 20, LSTMSize, len(vocab), prime = "bec")
print("predicted 20 sequence of characters based on words given is: ")
print(' ')
print(samp)

First sampling, we try to predict the sequence of '''bec''' words
INFO:tensorflow:Restoring parameters from checkpoints/i200_l512.ckpt
predicted 20 sequence of characters based on words given is: 
 
become of the world. He 


In [52]:
#sampling 3
#same experiment with the checkpoint of training step 600 and the prime "fas"
print("Third sampling, we try to predict the sequence of '''fas''' words")
checkpoint = 'checkpoints/i600_l512.ckpt'
samp = sample(checkpoint, 20, LSTMSize, len(vocab), prime = "fas")
print("predicted 20 sequence of characters based on words given is: ")
print(' ')
print(samp)

First sampling, we try to predict the sequence of '''fas''' words
INFO:tensorflow:Restoring parameters from checkpoints/i600_l512.ckpt
predicted 20 sequence of characters based on words given is: 
 
fast that he saw in her



In [50]:
#sampling 4
#same experiment with the checkpoint of training step 1200 and the prime "Far"
print("Fourth sampling, we try to predict the sequence of '''Far''' words")
checkpoint = 'checkpoints/i1200_l512.ckpt'
samp = sample(checkpoint, 20, LSTMSize, len(vocab), prime = "Far")
print("predicted 20 sequence of characters based on words given is: ")
print(' ')
print(samp)

First sampling, we try to predict the sequence of '''Far''' words
INFO:tensorflow:Restoring parameters from checkpoints/i1200_l512.ckpt
predicted 20 sequence of characters based on words given is: 
 
Farritaly. He was not at
