In [1]:
# Path to the corpus text file (despite the "_dir" suffix it is opened as a file, not a directory).
ci_dir = 'clear_songci_data.txt'

In [3]:
import numpy as np
import os
import json
import tensorflow as tf
import time

In [4]:
# Read the corpus into a list with one entry per line of the file.
# Lines are stored exactly as read, i.e. with their trailing line break.
ci = []

with open(ci_dir, 'r', encoding = 'utf-8') as corpus_file:
    ci.extend(corpus_file)

In [5]:
# Number of lines read from the corpus file.
len(ci)

21042

In [6]:
# Flatten the corpus into one list of single characters, in reading order.
# Every entry of `ci` is a string, so iterating over it already yields its
# characters one by one (line-break characters included).
all_word = [char for text_line in ci for char in text_line]

In [7]:
# Sorted list of the distinct characters in the corpus; a character's position
# in this list is used as its integer id further down.
all_word_dict = sorted(set(all_word))

In [8]:
# Vocabulary size: number of distinct characters in the corpus.
len(all_word_dict)

5328

In [9]:
# all_word      : every character of the corpus, in order (with repeats)
# all_word_dict : the sorted distinct characters

# Lookup tables between a character and its integer id
# (the id is the character's index in all_word_dict).
int_to_word = {idx: char for idx, char in enumerate(all_word_dict)}
word_to_int = {char: idx for idx, char in int_to_word.items()}

In [10]:
# Encode the whole corpus as one 1-D array of integer character ids.
encoded = np.array([word_to_int[i] for i in all_word])

In [11]:
def next_batch(input_data, num_sequence, steps):
    """Yield (x, y) training batches cut from a 1-D stream of token ids.

    The stream is truncated to a whole number of batches and laid out as
    ``num_sequence`` rows, each row being one contiguous slice of the stream.
    Every batch is a ``steps``-wide window over all rows.

    Args:
        input_data: 1-D array-like of integer token ids.
        num_sequence: number of parallel sequences (rows) per batch.
        steps: number of time steps (columns) per batch.

    Yields:
        Tuples ``(x, y)`` of arrays with shape ``(num_sequence, steps)``.
        ``y`` is ``x`` shifted one position ahead in the stream, so
        ``y[r, t]`` is the token that follows ``x[r, t]``.
    """
    input_data = np.asarray(input_data)

    # Number of tokens consumed by one batch, and how many full batches fit.
    char_per_batch = num_sequence * steps
    num_batches = len(input_data) // char_per_batch

    # Drop the tail that does not fill a whole batch so the reshape is exact.
    usable = char_per_batch * num_batches
    inputs = input_data[:usable]

    # Targets are the same stream shifted left by one token.  The token that
    # follows the very last input is the next token of the original data when
    # one exists; otherwise wrap around to the first token.  (Zero-filling
    # that slot would teach the model a wrong label, since 0 is a valid id.)
    if usable < len(input_data):
        targets = input_data[1:usable + 1]
    else:
        targets = np.roll(inputs, -1)

    # Shape (num_sequence, steps * num_batches): one contiguous slice per row.
    inputs = np.reshape(inputs, (num_sequence, -1))
    targets = np.reshape(targets, (num_sequence, -1))

    for i in range(0, num_batches * steps, steps):
        x = inputs[:, i:i + steps]
        # Copy so that callers get an independent target array.
        y = targets[:, i:i + steps].copy()

        yield x, y

In [12]:
# Worked example of the batching arithmetic: with 100 sequences per batch and
# 50 steps, one batch holds 100 * 50 = 5000 characters, so a stream of 20000
# characters would be split into 20000 / 5000 = 4 batches.
# Total number of encoded characters in the corpus:
len(encoded)

1583790

In [13]:
# next_batch is a generator function.
# With num_sequence = 100 and steps = 100 one batch holds 100 * 100 = 10000 characters,
# so the generator yields len(encoded) // 10000 batches in total
# (158 for the 1583790 characters reported above); only the first one is taken here.

test_x, test_y = next(next_batch(encoded, num_sequence = 100, steps = 100))

In [14]:
# Show the first 10 steps of the first 3 sequences: inputs first, then targets.
# Each target row should be the matching input row shifted by one position.
print(test_x[:3, :10])
print('')
print(test_y[:3, :10])

[[3123 2396 1781   77 1121   42 2017 3153 2036 2373]
 [4102 2450 2595 2062   42 2247 3221  478 2336 5314]
 [5314 3766 1134 3971   42  595 3847 3379 2091 1342]]

[[2396 1781   77 1121   42 2017 3153 2036 2373 1141]
 [2450 2595 2062   42 2247 3221  478 2336 5314 4177]
 [3766 1134 3971   42  595 3847 3379 2091 1342 5314]]


In [34]:
def create_placeholder(sequence_size, steps):
    """Create the graph inputs.

    Args:
        sequence_size: first dimension of the input/target placeholders.
        steps: second dimension of the input/target placeholders.

    Returns:
        A tuple ``(x, y, keep_prob)``: two int32 placeholders of shape
        ``[sequence_size, steps]`` named 'input_x' and 'target_y', and a
        float32 placeholder named 'keep_prob' with no fixed shape.
    """
    batch_shape = [sequence_size, steps]
    inputs = tf.placeholder(tf.int32, batch_shape, name = 'input_x')
    targets = tf.placeholder(tf.int32, batch_shape, name = 'target_y')
    dropout_keep = tf.placeholder(tf.float32, name = 'keep_prob')
    return inputs, targets, dropout_keep

In [35]:
def create_lstm_cell(num_units, keep_prob, sequence_size, num_layers):
    """Build a stacked LSTM cell with output dropout and its zero state.

    Args:
        num_units: number of units of every BasicLSTMCell.
        keep_prob: value passed as ``output_keep_prob`` to each DropoutWrapper.
        sequence_size: batch size used to build the zero state.
        num_layers: how many dropout-wrapped LSTM cells to stack.

    Returns:
        A tuple ``(cell, initial_state)``: the MultiRNNCell and its float32
        zero state for ``sequence_size`` sequences.
    """
    # One dropout-wrapped LSTM cell per layer.
    layers = []
    for _ in range(num_layers):
        lstm = tf.nn.rnn_cell.BasicLSTMCell(num_units)
        layers.append(tf.nn.rnn_cell.DropoutWrapper(lstm, output_keep_prob = keep_prob))

    stacked = tf.nn.rnn_cell.MultiRNNCell(layers)
    zero_state = stacked.zero_state(sequence_size, tf.float32)

    return stacked, zero_state

In [36]:
def create_output(cell, input_size, output_size):
    """Project RNN outputs to per-class logits and softmax probabilities.

    Args:
        cell: the RNN output(s) to project. NOTE(review): despite the name, the
            caller in this file passes the output tensor of tf.nn.dynamic_rnn,
            presumably shaped (batch, steps, input_size) — TODO confirm.
        input_size: width of one RNN output vector; rows of the weight matrix.
        output_size: number of classes; columns of the weight matrix.

    Returns:
        A tuple ``(outputs, logits)`` where ``logits`` has ``output_size``
        columns and ``outputs`` is ``tf.nn.softmax(logits)``.
    """
    
    # Join the RNN outputs along axis 1, then flatten to rows of width input_size
    rnn_cell = tf.concat(cell, axis = 1)
    rnn_cell = tf.reshape(rnn_cell, shape = (-1, input_size))
    
    with tf.variable_scope('softmax'): 
        
        # input_size = lstm_size; 
        # output_size = num_classes
        
        softmax_w = tf.Variable(tf.truncated_normal(shape = [input_size, output_size], stddev = 0.1))
        softmax_b = tf.Variable(tf.zeros(shape = [output_size]))
        
    logits = tf.matmul(rnn_cell, softmax_w) + softmax_b
    outputs = tf.nn.softmax(logits)
    
    return outputs, logits

In [37]:
def loss(logits, targets, lstm_size, num_classes):
    """Mean softmax cross-entropy between logits and integer targets.

    Args:
        logits: 2-D logits tensor; its static shape is used for the labels.
        targets: integer class ids, one-hot encoded to depth ``num_classes``.
        lstm_size: not used by this function.
        num_classes: depth of the one-hot encoding.

    Returns:
        A scalar tensor: the cross-entropy averaged over all positions.
    """
    # One-hot encode the targets and flatten them to the logits' shape,
    # i.e. (-1, num_classes).
    labels = tf.reshape(tf.one_hot(targets, num_classes), logits.get_shape())

    per_position = tf.nn.softmax_cross_entropy_with_logits_v2(labels = labels, logits = logits)
    return tf.reduce_mean(per_position)

In [38]:
def build_optimizer(loss, grad_clip, learning_rate):
    """Create an Adam training op with global-norm gradient clipping.

    Args:
        loss: scalar tensor to minimise.
        grad_clip: clip norm passed to ``tf.clip_by_global_norm``.
        learning_rate: learning rate of the Adam optimizer.

    Returns:
        The op that applies the clipped gradients to all trainable variables.
    """
    # Every trainable variable of the current graph takes part in the update.
    variables = tf.trainable_variables()
    raw_grads = tf.gradients(loss, variables)
    clipped, _ = tf.clip_by_global_norm(raw_grads, grad_clip)

    adam = tf.train.AdamOptimizer(learning_rate)
    return adam.apply_gradients(zip(clipped, variables))

In [39]:
class CharRNN():
    """Character-level multi-layer LSTM language model (TensorFlow 1.x graph).

    Constructing an instance resets the default graph and builds a new one.

    Attributes:
        x_input, y_target: int32 placeholders for input and target ids.
        keep_prob: float32 placeholder for the dropout keep probability; it
            must be fed on every run (e.g. 1.0 when sampling).
        initial_state: zero state of the stacked LSTM; can be fed to carry
            state across batches.
        final_state: LSTM state after processing the fed batch.
        prediction: softmax probabilities over the classes.
        logits: unnormalised class scores.
        loss: mean cross-entropy between logits and targets.
        optimizer: training op (Adam with gradient clipping).
    """

    def __init__(self, learning_rate, num_classes, sequence_size = 64, steps = 100,
                 num_units = 256, num_layers = 2, grad_clip = 5,sampling = False):
        """Build the graph.

        Args:
            learning_rate: learning rate for the Adam optimizer.
            num_classes: vocabulary size (one-hot depth and output width).
            sequence_size: number of sequences per batch.
            steps: number of time steps per batch.
            num_units: units per LSTM layer.
            num_layers: number of stacked LSTM layers.
            grad_clip: global-norm clipping threshold for the gradients.
            sampling: when true, the graph is built for one character at a
                time (sequence_size and steps are both forced to 1).
        """
        if sampling:
            sequence_size, steps = 1, 1

        tf.reset_default_graph()

        # Graph inputs
        self.x_input, self.y_target, self.keep_prob = create_placeholder(sequence_size, steps)

        # Wire dropout to the placeholder, not to a module-level constant, so
        # that the value fed at run time (training vs. sampling) takes effect.
        cell, self.initial_state = create_lstm_cell(num_units, self.keep_prob, sequence_size, num_layers)

        # One-hot encode the input ids
        x_one_hot = tf.one_hot(self.x_input, depth = num_classes)

        # Unroll the RNN over the time dimension
        outputs, state = tf.nn.dynamic_rnn(cell, x_one_hot, initial_state = self.initial_state)
        self.final_state = state

        # Output layer
        self.prediction, self.logits = create_output(cell = outputs, input_size = num_units, output_size = num_classes)

        # Loss and training op
        self.loss = loss(self.logits, self.y_target, num_units, num_classes)
        self.optimizer = build_optimizer(self.loss, grad_clip, learning_rate)
        

In [40]:
# Training hyperparameters.
sequence_size = 64                 # sequences per batch
steps = 80                         # time steps per batch
num_units = 512                    # units per LSTM layer
num_layers = 4                     # stacked LSTM layers
learning_rate = 0.0023             # Adam learning rate
keep_prob = 0.9                    # dropout keep probability used for training
num_classes = len(word_to_int)     # vocabulary size

In [32]:
# Build the training graph with the hyperparameters defined above.
model = CharRNN(
    learning_rate,
    num_classes,
    sequence_size = sequence_size,
    steps = steps,
    num_units = num_units,
    num_layers = num_layers,
    grad_clip = 5,
    sampling = False,
)

In [33]:
epochs = 20

# Make sure the checkpoint directory exists before the first save
# (harmless when it is already there).
os.makedirs('Songci_checkpoint', exist_ok = True)

saver = tf.train.Saver(max_to_keep=100)

with tf.Session() as sess:

    sess.run(tf.global_variables_initializer())
    counter = 0  # number of batches processed so far, across all epochs

    for e in range(epochs):
        # Every epoch starts from the zero state; within an epoch the final
        # state of one batch is fed as the initial state of the next.
        # NOTE: do not bind a variable called `loss` here — it would replace
        # the module-level loss() function that CharRNN calls when a model is
        # built again later (e.g. for sampling).
        new_state = sess.run(model.initial_state)

        for x, y in next_batch(encoded, sequence_size, steps):
            counter += 1
            start = time.time()
            feed = {model.x_input: x, model.y_target: y, model.keep_prob: keep_prob,
                   model.initial_state: new_state}
            batch_loss, new_state, _ = sess.run([model.loss, model.final_state, model.optimizer], feed_dict = feed)

            # Progress report every 64 batches
            if counter % 64 == 0:
                end = time.time()
                print('Epoch: {}/{}\n'.format(e+1, epochs),
                     'Training Steps: {}...\n'.format(counter),
                     'Training loss: {:.4f}\n'.format(batch_loss),
                     '{:.4f} sec/batch\n'.format(end-start))

            # Intermediate checkpoint every 1000 batches
            if counter % 1000 == 0:
                saver.save(sess, 'Songci_checkpoint/i{}.ckpt'.format(counter))

    # Final checkpoint after the last epoch
    saver.save(sess, 'Songci_checkpoint/i{}.ckpt'.format(counter))

Epoch: 1/20
 Training Steps: 64...
 Training loss: 6.4185
 0.7593 sec/batch

Epoch: 1/20
 Training Steps: 128...
 Training loss: 6.1022
 0.7610 sec/batch

Epoch: 1/20
 Training Steps: 192...
 Training loss: 6.0954
 0.7589 sec/batch

Epoch: 1/20
 Training Steps: 256...
 Training loss: 5.9429
 0.7584 sec/batch

Epoch: 2/20
 Training Steps: 320...
 Training loss: 5.7035
 0.7616 sec/batch

Epoch: 2/20
 Training Steps: 384...
 Training loss: 5.5873
 0.7613 sec/batch

Epoch: 2/20
 Training Steps: 448...
 Training loss: 5.5276
 0.7610 sec/batch

Epoch: 2/20
 Training Steps: 512...
 Training loss: 5.3831
 0.7627 sec/batch

Epoch: 2/20
 Training Steps: 576...
 Training loss: 5.2447
 0.7623 sec/batch

Epoch: 3/20
 Training Steps: 640...
 Training loss: 5.1006
 0.7624 sec/batch

Epoch: 3/20
 Training Steps: 704...
 Training loss: 5.0606
 0.7676 sec/batch

Epoch: 3/20
 Training Steps: 768...
 Training loss: 5.0820
 0.7667 sec/batch

Epoch: 3/20
 Training Steps: 832...
 Training loss: 4.9792
 0.763

In [70]:
def pick_top_n(preds, vocab_size , top_n = 15):
    """Sample a class id from the ``top_n`` most probable entries of ``preds``.

    Args:
        preds: array of class probabilities; size-1 dimensions are squeezed
            away. It is not modified.
        vocab_size: number of classes to draw from; must equal the number of
            probabilities in ``preds``.
        top_n: how many of the highest-probability classes stay eligible.

    Returns:
        The sampled class id (an integer drawn with ``np.random.choice``).
    """
    # Work on a private float64 copy: np.squeeze alone returns a view, so
    # zeroing entries in place would overwrite the caller's array.
    p = np.array(np.squeeze(preds), dtype = np.float64)

    # Zero out everything except the top_n largest probabilities ...
    p[np.argsort(p)[:-top_n]] = 0
    # ... and renormalise so the remaining ones sum to 1.
    p = p / p.sum()

    c = np.random.choice(vocab_size, 1, p = p)[0]
    return c

In [42]:
# Hyperparameters for sampling. This cell rebinds the module-level names used
# for training; sample() reads learning_rate, num_classes, steps, num_layers
# and sequence_size from these globals when it rebuilds the model.
# NOTE(review): num_units and num_layers presumably have to match the values
# the checkpoint was trained with — confirm before changing them.
sequence_size = 64
steps = 50
num_units = 512
num_layers = 4
learning_rate = 0.0001
keep_prob = 0.95
num_classes = len(word_to_int)

In [43]:
def sample(checkpoints, n_samples, num_units, vocab_size, prime = '我'):
    """Generate text from a trained checkpoint, one character at a time.

    The model is rebuilt in sampling mode, the checkpoint is restored, the
    characters of ``prime`` are fed to warm up the LSTM state, and then
    characters are drawn with ``pick_top_n`` and fed back in.

    Args:
        checkpoints: path of the checkpoint to restore.
        n_samples: number of sampling iterations after the first generated
            character.
        num_units: NOTE(review): currently ignored — the model is always
            built with 512 units. Left as is because the existing call in
            this notebook passes a different value.
        vocab_size: NOTE(review): currently ignored — ``len(word_to_int)``
            is used instead.
        prime: non-empty seed text; every character must be a key of
            ``word_to_int``.

    Returns:
        ``prime`` followed by ``n_samples + 1`` generated characters.

    Raises:
        ValueError: if ``prime`` is empty (there would be no prediction to
            start sampling from).
    """
    if not prime:
        raise ValueError('prime must contain at least one character')

    samples = [c for c in prime]
    model = CharRNN(learning_rate = learning_rate, num_classes=num_classes, steps = steps, num_layers = num_layers,
                    num_units = 512, sequence_size = sequence_size, sampling = True)
    saver = tf.train.Saver()

    # Single reusable 1x1 input buffer, matching the int32 input placeholder.
    x = np.zeros((1, 1), dtype = np.int32)

    with tf.Session() as sess:
        saver.restore(sess, checkpoints)
        new_state = sess.run(model.initial_state)

        def run_step(token_id, state):
            """Feed one character id; return (probabilities, next state)."""
            x[0, 0] = token_id
            feed = {model.x_input: x, model.keep_prob: 1.0, model.initial_state: state}
            return sess.run([model.prediction, model.final_state], feed_dict = feed)

        # Warm up the state on the seed text; only the last prediction is used.
        for c in prime:
            preds, new_state = run_step(word_to_int[c], new_state)

        c = pick_top_n(preds, len(word_to_int))
        samples.append(int_to_word[c])

        # Feed each sampled character back in to get the next one.
        for _ in range(n_samples):
            preds, new_state = run_step(c, new_state)

            c = pick_top_n(preds, len(word_to_int))
            samples.append(int_to_word[c])

    return ''.join(samples)

In [44]:
# Look up the most recent checkpoint recorded in the checkpoint directory.
tf.train.latest_checkpoint('Songci_checkpoint')

'Songci_checkpoint/i6180.ckpt'

In [79]:
# Generate text from the final checkpoint, seeded with the two characters of `prime`.
# NOTE(review): 2048 is passed as num_units, but sample() builds the model with
# num_units = 512 regardless, so this argument currently has no effect.
checkpoint = 'Songci_checkpoint/i6180.ckpt'
samp = sample(checkpoint, 500, 2048, len(word_to_int), prime='黄鹂')
print('')
print(samp)

INFO:tensorflow:Restoring parameters from Songci_checkpoint/i6180.ckpt

黄鹂。今年春事，早向一帘残雪。待著春时多意了，又管得、桃花红萼。
嫩红初试胭脂透。玉妃下、玉容侧。玉人端向花钿薄。念小雨、低斜细细。好夜逐、东郊更暖。
天外莺声，燕飞花下梅花瘦。卷帘成影。几点斜阳雨。一日归时，只解穿阑柳。休相恋。乱山千嶂。欲向人千里。
绿叶红萸，满城芳草花中树。碧纱窗外。燕燕双声起。一点清霜，恼得江南去。知何处。海棠开了。总是花间主。
春色如何，不胜老来风景。为谁怜我。一片闲云暖。一掬清凉，一叶清明月。还相恋。乱山何处。明月花开遍。
画阁帘栊，月淡疏疏，更霜渐过。向晓寒疏影，香风冉冉，人间梦渡，还听单丝。梦里春风，又教人去，独倚阑干春水西。沈沈久，望长安市上，月影回文。愁肠无限奚人。似小院朱帘十二愁。恨流莺入曲，青芜空远，东风又暖，花影初晴。最好重阳，花枝月下，约略不堪吹到行。还休道，这一回一点，一片相思。
五百三宫，花信争妍，花前似梅。看紫纱宫殿，云收锦阁，冰绡袖稳，绿野红堆。玉瑟重腰，鹅花捣润，件曲柔人尤可怜。空思处，又残蝉更过，却是愁愁。离愁。一点尘埃。况此去人生第一回。况暮云无赖，残烟冉冉，云横雁翅，落日云收。梦入楼涯，梦归还是，千里楼台烟浪声。长沙路，望天涯雁远，


In [None]:
# Leftover inspection cell: displays the whole character -> id mapping.
word_to_int