## import modules

In [1]:
import os
import time
from collections import namedtuple

import numpy as np
import tensorflow as tf

## tokenizer
> Convert characters to and from integers.

In [2]:
# Read the whole corpus as text. The encoding is pinned explicitly so decoding
# does not depend on the platform's default code page.
# NOTE(review): assumes west.txt is stored as UTF-8 -- confirm for your copy.
with open('west.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Character-level vocabulary: every distinct character in the corpus, sorted
# so that the character <-> id mapping is the same on every run.
vocab = sorted(set(text))
vocab_to_int = {c: i for i, c in enumerate(vocab)}
int_to_vocab = dict(enumerate(vocab))

# The full corpus as a 1-D int32 array of character ids.
encoded = np.array([vocab_to_int[c] for c in text], dtype=np.int32)

check out the first 100 characters

In [3]:
text[:100]

'\n（明）吴承恩著\n第一回\n灵根育孕源流出心性修持大道生\n诗曰：\n混沌未分天地乱，茫茫渺渺无人见。\n自从盘古破鸿蒙，开辟从兹清浊辨。\n覆载群生仰至仁，发明万物皆成善。\n欲知造化会元功，须看西游释厄传。\n'

In [4]:
len(vocab)

4325

In [5]:
len(text)

658298

Average number of occurrences per distinct character (corpus length / vocabulary size)

In [6]:
len(text)/len(vocab)

152.20763005780347

encoded integers

In [7]:
encoded[:100]

array([   0, 4319, 1695, 4320,  564, 1425, 1304, 3224,    0, 2738,   19,
        730,    0, 2230, 1856, 3008,  947, 2176, 2094,  354, 1248, 1284,
        222, 1493,  833, 3787, 2443,    0, 3499, 1742, 4322,    0, 2147,
       2034, 1763,  362,  834,  754,   81, 4321, 3155, 3155, 2167, 2167,
       1677,  112, 3441,   16,    0, 3079,  123, 2526,  518, 2590, 4262,
       3239, 4321, 1190, 3722,  123,  306, 2150, 2098, 3724,   16,    0,
       3440, 3704, 2939, 2443,  137, 3081,  115, 4321,  509, 1695,   22,
       2319, 2505, 1382,  672,   16,    0, 1962, 2574, 3775,  437,  153,
        278,  412, 4321, 4089, 2539, 3438, 2166, 3847,  484,  156,   16,
          0], dtype=int32)

In [8]:
int_to_vocab

{0: '\n',
 1: ' ',
 2: '!',
 3: '"',
 4: '#',
 5: '*',
 6: ',',
 7: '.',
 8: '—',
 9: '‘',
 10: '’',
 11: '“',
 12: '”',
 13: '□',
 14: '⻊',
 15: '、',
 16: '。',
 17: '《',
 18: '》',
 19: '一',
 20: '丁',
 21: '七',
 22: '万',
 23: '丈',
 24: '三',
 25: '上',
 26: '下',
 27: '不',
 28: '与',
 29: '丑',
 30: '专',
 31: '且',
 32: '丕',
 33: '世',
 34: '丘',
 35: '丙',
 36: '业',
 37: '丛',
 38: '东',
 39: '丝',
 40: '丞',
 41: '丢',
 42: '两',
 43: '严',
 44: '丧',
 45: '个',
 46: '丫',
 47: '中',
 48: '丰',
 49: '串',
 50: '临',
 51: '丸',
 52: '丹',
 53: '为',
 54: '主',
 55: '丽',
 56: '举',
 57: '乃',
 58: '久',
 59: '么',
 60: '义',
 61: '之',
 62: '乌',
 63: '乍',
 64: '乎',
 65: '乏',
 66: '乐',
 67: '乒',
 68: '乓',
 69: '乔',
 70: '乖',
 71: '乘',
 72: '乙',
 73: '乜',
 74: '九',
 75: '乞',
 76: '也',
 77: '习',
 78: '乡',
 79: '书',
 80: '买',
 81: '乱',
 82: '乳',
 83: '乾',
 84: '了',
 85: '予',
 86: '争',
 87: '事',
 88: '二',
 89: '于',
 90: '亏',
 91: '云',
 92: '互',
 93: '五',
 94: '井',
 95: '亘',
 96: '亚',
 97: '些',
 98: '亡',
 99: '亢',
 100: '交'

## Making training mini-batches

In [15]:
def get_batches(arr, batch_size, n_steps):
    '''
    Create a generator that returns (x, y) batches of size
       batch_size x n_steps from arr.

       arr is truncated to a whole number of batches and split into
       batch_size contiguous rows; each yielded x is an n_steps-wide window
       of those rows and y holds, for every position of x, the character
       that follows it in arr.

       The target of the very last used character is the next element of
       arr when truncation left one over, otherwise it wraps around to
       arr[0]. (Previously the final column of the last batch was filled
       with 0, which is itself a valid character id.)

       If arr is too short to fill a single batch, nothing is yielded.

       Arguments
       ---------
       arr: Array you want to make batches from
       batch_size: Batch size, the number of sequences per batch
       n_steps: Number of sequence steps per batch
    '''
    arr = np.asarray(arr)
    characters_per_batch = batch_size * n_steps
    n_batches = len(arr) // characters_per_batch

    # Not enough data for even one batch: reshape(batch_size, -1) would
    # raise on an empty array, so stop cleanly instead.
    if n_batches == 0:
        return

    n_used = characters_per_batch * n_batches
    inputs = arr[:n_used].reshape(batch_size, -1)

    # Targets are the inputs shifted left by one position in the flat
    # sequence, so row boundaries also get their true next character.
    shifted = np.empty(n_used, dtype=arr.dtype)
    shifted[:-1] = arr[1:n_used]
    shifted[-1] = arr[n_used] if n_used < len(arr) else arr[0]
    targets = shifted.reshape(batch_size, -1)

    for n in range(0, inputs.shape[1], n_steps):
        yield inputs[:, n:n + n_steps], targets[:, n:n + n_steps]

test the function

In [16]:
batchs = get_batches(encoded, 5, 10)
x, y = next(batchs)

In [17]:
print(x)
print(y)

[[   0 4319 1695 4320  564 1425 1304 3224    0 2738]
 [3465   27 1239 4321 3518 1707  114   59  842  112]
 [3379 2963 3787 4322   11    0    0 2741 1383 2539]
 [ 748  268 4321   19   45   45 3456 3531 2150 4022]
 [1249 1707  877 2805 2504  174  822   84 4321 1383]]
[[4319 1695 4320  564 1425 1304 3224    0 2738   19]
 [  27 1239 4321 3518 1707  114   59  842  112 4317]
 [2963 3787 4322   11    0    0 2741 1383 2539 2539]
 [ 268 4321   19   45   45 3456 3531 2150 4022 4321]
 [1707  877 2805 2504  174  822   84 4321 1383   31]]


## Building the model

### inputs

In [18]:
def build_inputs(batch_size, num_steps):
    ''' Create the graph placeholders that are fed on every run.

        Arguments
        ---------
        batch_size: Batch size, number of sequences per batch
        num_steps: Number of sequence steps in a batch

        Returns
        -------
        (inputs, targets, keep_prob): two int32 placeholders of shape
        [batch_size, num_steps] and a float32 placeholder (no shape given)
        for the dropout keep probability.
    '''
    batch_shape = (batch_size, num_steps)

    # Created in the same order as before so op names stay identical.
    inputs = tf.placeholder(tf.int32, shape=batch_shape, name='inputs')
    targets = tf.placeholder(tf.int32, shape=batch_shape, name='targets')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    return inputs, targets, keep_prob

### LSTM cell

In [36]:
def build_lstm(lstm_size, num_layers, batch_size, keep_prob):
    ''' Build a stacked LSTM cell with dropout on each layer's output.

        Arguments
        ---------
        lstm_size: Size of the hidden layers in the LSTM cells
        num_layers: Number of LSTM layers
        batch_size: Batch size
        keep_prob: Scalar tensor (tf.placeholder) for the dropout keep probability

        Returns
        -------
        (cell, init_state): the MultiRNNCell and its all-zero float32
        initial state for batch_size sequences.
    '''
    layers = []
    for _ in range(num_layers):
        # Each layer gets its own cell object, wrapped so that dropout is
        # applied to the layer's outputs only.
        base_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size)
        layers.append(tf.contrib.rnn.DropoutWrapper(base_cell, output_keep_prob=keep_prob))

    stacked = tf.contrib.rnn.MultiRNNCell(layers)
    return stacked, stacked.zero_state(batch_size, dtype=tf.float32)

### RNN Output

In [37]:
def build_output(lstm_output, in_size, out_size):
    '''
        Build a softmax layer on top of the LSTM outputs.

        Arguments
        ---------
        lstm_output: Output tensor(s) from the LSTM layer; concatenated along
                     axis 1 and then flattened to rows of width in_size
        in_size: Size of the input tensor, for example, size of the LSTM cells
        out_size: Size of this softmax layer

        Returns
        -------
        (out, logits): the softmax probabilities (op named 'predictions')
        and the raw logits, both with one row per flattened input row and
        out_size columns.
    '''
    flat = tf.reshape(tf.concat(lstm_output, axis=1), [-1, in_size])

    # Variables are created inside the 'softmax' scope, weights first, so
    # their names match existing checkpoints.
    with tf.variable_scope('softmax'):
        weights = tf.Variable(tf.truncated_normal([in_size, out_size], stddev=1.0))
        bias = tf.Variable(tf.zeros(out_size))

    logits = tf.matmul(flat, weights) + bias

    return tf.nn.softmax(logits, name='predictions'), logits

### Training loss

In [38]:
def build_loss(logits, targets, lstm_size, num_classes):
    ''' Calculate the mean cross-entropy loss from the logits and the targets.

        The integer targets are fed directly to the sparse cross-entropy op
        instead of first being expanded to a dense one-hot tensor with
        num_classes columns; for in-range targets the loss value is the same.

        Arguments
        ---------
        logits: Logits from final fully connected layer, one row per
                (sequence, step) position
        targets: Integer class ids for supervised learning; flattened here so
                 they line up row-for-row with logits
        lstm_size: Number of LSTM hidden units (unused; kept for interface
                   compatibility)
        num_classes: Number of classes in targets (unused; kept for interface
                     compatibility -- the class count is taken from logits)

        Returns
        -------
        Scalar tensor named 'loss'.
    '''
    labels = tf.reshape(targets, [-1])

    per_position = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                                  logits=logits)
    loss = tf.reduce_mean(per_position, name='loss')

    return loss

### Optimizer

In [39]:
def build_optimizer(loss, learning_rate, grad_clip):
    ''' Build the training op: Adam with global-norm gradient clipping.

        Arguments
        ---------
        loss: Network loss
        learning_rate: Learning rate for optimizer
        grad_clip: Clipping threshold passed to tf.clip_by_global_norm

        Returns
        -------
        The op that applies the clipped gradients to all trainable variables.
    '''
    trainable = tf.trainable_variables()

    # Clip the gradients jointly by their global norm before applying them.
    raw_grads = tf.gradients(loss, trainable)
    clipped_grads, _ = tf.clip_by_global_norm(raw_grads, grad_clip)

    adam = tf.train.AdamOptimizer(learning_rate)
    return adam.apply_gradients(zip(clipped_grads, trainable))

### Build the network

In [40]:
class CharRNN:
    ''' Character-level RNN: one-hot inputs -> stacked LSTM -> softmax.

        Constructing an instance resets the default TensorFlow graph and
        builds the whole network into it.

        Arguments
        ---------
        num_classes: Number of distinct characters (one-hot depth and output size)
        batch_size: Sequences per batch
        num_steps: Sequence steps per batch
        lstm_size: Hidden units per LSTM layer
        num_layers: Number of stacked LSTM layers
        learning_rate: Learning rate for the optimizer
        grad_clip: Global-norm gradient clipping threshold
        sampling: If truthy, batch_size and num_steps are both forced to 1 so
                  the network can be fed one character at a time

        Attributes
        ----------
        inputs, targets, keep_prob: placeholders to feed
        initial_state, final_state: LSTM state before / after the batch
        prediction, logits: softmax output and raw logits
        predition: alias of `prediction` (original misspelled name, kept so
                   existing code keeps working)
        loss, optimizer: training loss and the op that performs one update
    '''
    def __init__(self, num_classes, batch_size=64, num_steps=50,
                 lstm_size=128, num_layers=2, learning_rate=0.001,
                 grad_clip=5, sampling=False):
        # Sampling feeds a single character per run.
        if sampling:
            batch_size, num_steps = 1, 1

        tf.reset_default_graph()

        self.inputs, self.targets, self.keep_prob = build_inputs(batch_size, num_steps)

        cell, self.initial_state = build_lstm(lstm_size, num_layers, batch_size, self.keep_prob)

        x_one_hot = tf.one_hot(self.inputs, num_classes)

        outputs, state = tf.nn.dynamic_rnn(cell, x_one_hot, initial_state=self.initial_state)
        self.final_state = state

        self.prediction, self.logits = build_output(outputs, lstm_size, num_classes)
        # Backward-compatible alias for the old misspelled attribute name.
        self.predition = self.prediction

        self.loss = build_loss(self.logits, self.targets, lstm_size, num_classes)
        self.optimizer = build_optimizer(self.loss, learning_rate, grad_clip)

### Hyperparameters

In [41]:
batch_size = 10         # Number of sequences in each training batch
num_steps = 50          # Number of characters (time steps) per sequence in a batch
lstm_size = 128         # Number of hidden units in each LSTM layer
num_layers = 2          # Number of stacked LSTM layers
learning_rate = 0.01    # Learning rate handed to the optimizer
keep_prob = 0.5         # Dropout keep probability fed during training

### Training

In [43]:
epochs = 20
# Print losses every N iterations
print_every_n = 50

# Save every N iterations
save_every_n = 200

# Make sure the checkpoint directory exists before training starts, so the
# first save does not fail on a fresh checkout.
os.makedirs('west_checkpoints', exist_ok=True)

model = CharRNN(len(vocab), batch_size=batch_size, num_steps=num_steps,
                lstm_size=lstm_size, num_layers=num_layers, 
                learning_rate=learning_rate)

saver = tf.train.Saver(max_to_keep=100)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    # Use the line below to load a checkpoint and resume training
    #saver.restore(sess, 'checkpoints/______.ckpt')
    counter = 0
    for e in range(epochs):
        # Train network; the LSTM state is reset to zeros at the start of
        # every epoch and then carried over from batch to batch.
        new_state = sess.run(model.initial_state)
        for x, y in get_batches(encoded, batch_size, num_steps):
            counter += 1
            start = time.time()
            feed = {model.inputs: x,
                    model.targets: y,
                    model.keep_prob: keep_prob,
                    model.initial_state: new_state}
            batch_loss, new_state, _ = sess.run([model.loss, 
                                                 model.final_state, 
                                                 model.optimizer], 
                                                 feed_dict=feed)
            if (counter % print_every_n == 0):
                end = time.time()
                print('Epoch: {}/{}... '.format(e+1, epochs),
                      'Training Step: {}... '.format(counter),
                      'Training loss: {:.4f}... '.format(batch_loss),
                      '{:.4f} sec/batch'.format((end-start)))
        
            if (counter % save_every_n == 0):
                saver.save(sess, "west_checkpoints/i{}_l{}.ckpt".format(counter, lstm_size))
    
    # Always keep a checkpoint of the final weights.
    saver.save(sess, "west_checkpoints/i{}_l{}.ckpt".format(counter, lstm_size))

Epoch: 1/20...  Training Step: 50...  Training loss: 5.9304...  0.6918 sec/batch


KeyboardInterrupt: 

### Saved checkpoints

In [44]:
tf.train.get_checkpoint_state('west_checkpoints')

### Sampling

In [None]:
def pick_top_n(preds, vocab_size, top_n=5):
    ''' Sample one class id from the top_n most probable entries of preds.

        All but the top_n largest probabilities are set to zero, the rest is
        renormalised to sum to 1, and one id is drawn from that distribution.
        The work is done on a float64 copy, so the caller's preds array is
        left untouched.

        Arguments
        ---------
        preds: Array of class probabilities; squeezed to 1-D, so it must hold
               exactly vocab_size values
        vocab_size: Number of classes to draw from
        top_n: How many of the most probable classes stay eligible

        Returns
        -------
        The sampled class id.
    '''
    # np.squeeze returns a view; copy so zeroing below cannot leak into preds.
    p = np.array(np.squeeze(preds), dtype=np.float64)
    # argsort is ascending, so everything except the last top_n is dropped.
    p[np.argsort(p)[:-top_n]] = 0
    p = p / np.sum(p)
    c = np.random.choice(vocab_size, 1, p=p)[0]
    return c

In [None]:
def sample(checkpoint, n_samples, lstm_size, vocab_size, prime="The "):
    ''' Generate text from a trained checkpoint, one character at a time.

        The network is first fed every character of prime to warm up its
        state, then repeatedly fed its own last sampled character.

        Arguments
        ---------
        checkpoint: Path of the checkpoint to restore
        n_samples: Number of sampling iterations after the first sampled
                   character (the result holds n_samples + 1 generated
                   characters after prime)
        lstm_size: Hidden units per LSTM layer; must match the checkpoint
        vocab_size: Number of distinct characters; used for the model's
                    output size and for sampling
        prime: Seed text; must be non-empty and consist only of characters
               present in vocab_to_int

        Returns
        -------
        prime followed by the generated characters, as one string.

        Raises
        ------
        ValueError: if prime is empty
        KeyError: if prime contains a character that is not in the vocabulary
    '''
    # Validate the seed before building the graph / restoring the checkpoint.
    if not prime:
        raise ValueError('prime must contain at least one character')
    unknown = [ch for ch in prime if ch not in vocab_to_int]
    if unknown:
        raise KeyError('prime contains characters missing from the vocabulary: '
                       '{!r}'.format(''.join(unknown)))

    samples = list(prime)
    model = CharRNN(vocab_size, lstm_size=lstm_size, sampling=True)
    # Accept either spelling of the prediction attribute on the model.
    prediction_op = getattr(model, 'prediction', None)
    if prediction_op is None:
        prediction_op = model.predition

    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, checkpoint)
        new_state = sess.run(model.initial_state)

        # Single-character input buffer, reused for every step.
        x = np.zeros((1, 1), dtype=np.int32)

        # Warm up the LSTM state on the seed text.
        for c in prime:
            x[0, 0] = vocab_to_int[c]
            feed = {model.inputs: x,
                    model.keep_prob: 1.,
                    model.initial_state: new_state}
            preds, new_state = sess.run([prediction_op, model.final_state],
                                        feed_dict=feed)

        c = pick_top_n(preds, vocab_size)
        samples.append(int_to_vocab[c])

        # Feed each sampled character back in to get the next one.
        for _ in range(n_samples):
            x[0, 0] = c
            feed = {model.inputs: x,
                    model.keep_prob: 1.,
                    model.initial_state: new_state}
            preds, new_state = sess.run([prediction_op, model.final_state],
                                        feed_dict=feed)

            c = pick_top_n(preds, vocab_size)
            samples.append(int_to_vocab[c])

    return ''.join(samples)

In [None]:
tf.train.latest_checkpoint('west_checkpoints')

In [None]:
checkpoint = tf.train.latest_checkpoint('west_checkpoints')
samp = sample(checkpoint, 2000, lstm_size, len(vocab), prime="孙")
print(samp)

In [None]:
# Sample from one specific saved checkpoint rather than the latest one.
# NOTE(review): this path must name a checkpoint that actually exists in
# west_checkpoints/ -- confirm the step number against your own training run.
checkpoint = 'west_checkpoints/i65600_l128.ckpt'
# The prime has to be made of characters from the corpus vocabulary; the
# previous Latin prime "Far" is not in it and raised KeyError.
samp = sample(checkpoint, 1000, lstm_size, len(vocab), prime="第")
print(samp)