In [1]:
import tensorflow as tf
import numpy as np
from xpinyin import Pinyin
from tensorflow.keras.preprocessing.text import Tokenizer
import jieba

  from ._conv import register_converters as _register_converters


In [2]:
# 用小青龙的 time 一首歌做的 Seq2Seq 歌词生成 demo
# 主要参照 https://github.com/ematvey/tensorflow-seq2seq-tutorials

# pipeline: 
# 1. 上下两句作为 x 和 y
# 2. jieba 切词
# 3. word -> idx, padding, 记录词表
# 4. 定义模型 seq2seq MLE
# 5. 训练



In [3]:
# load dataset
x = []
y =[]
i = 0
with open("time.txt", "r") as f:
    for l in f:
        if i % 2 == 0:
            x.append(l.strip())
        else:
            y.append(l.strip())
        i+=1

In [4]:
# 拼音模块 押韵可能要用到
# p = Pinyin()
# for w in seg_list:
#     if w != ' ':
#         print(w, p.get_pinyin(w))

In [5]:
# Tokenizer 
tokenizer = Tokenizer(oov_token='<UNK>')
words = []
for s in x:
    words.extend(jieba.cut(s))
for s in y:
    words.extend(jieba.cut(s))
    
tokenizer.fit_on_texts(words)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/0l/3x73_lfs6czgjngbxbtn1vrh0000gn/T/jieba.cache
Loading model cost 1.297 seconds.
Prefix dict has been built succesfully.


In [6]:
train = []
for s in x:
    xx = tokenizer.texts_to_sequences(list(jieba.cut(s)))
#     print(list(jieba.cut(s)))
#     print(s)
    if len(xx) > 0:
        idx = [ xxx[0] for xxx in xx if xxx != []]
        train.append(idx)
test = []
for s in y:
    xx = tokenizer.texts_to_sequences(list(jieba.cut(s)))
#     print(list(jieba.cut(s)))
#     print(s)
    if len(xx) > 0:
        idx = [ xxx[0] for xxx in xx if xxx != []]
        test.append(idx)

In [7]:
train_padded = tf.keras.preprocessing.sequence.pad_sequences(train, maxlen=12, padding='post', truncating='post')

In [8]:
test_padded = tf.keras.preprocessing.sequence.pad_sequences(test, maxlen=12, padding='post', truncating='post')

In [9]:
test_padded

array([[ 28, 120, 121,  20,   1, 122,   4, 123,   0,   0,   0,   0],
       [124,  29,  18, 125, 126, 127, 128,   0,   0,   0,   0,   0],
       [129, 130,   2, 131,   6,  10, 132,  13,   2,  21,   0,   0],
       [133, 134, 135,  30,   5, 136,   0,   0,   0,   0,   0,   0],
       [  1,  14,   1,  15,   7, 137, 138,   0,   0,   0,   0,   0],
       [ 29, 139,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [140,   5, 141,   2, 142,   0,   0,   0,   0,   0,   0,   0],
       [ 31,   4, 143,  32,   1, 144, 145, 146, 147, 148,   0,   0],
       [149,   1,  31,   4, 150, 151,   0,   0,   0,   0,   0,   0],
       [152,  24, 153,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  1, 154,   9,  33,  34, 155,  32,   9,  33,  34, 156,   0],
       [ 35, 157, 158, 159,   0,   0,   0,   0,   0,   0,   0,   0],
       [ 19, 160, 161, 162,   0,   0,   0,   0,   0,   0,   0,   0],
       [163, 164, 165,   2, 166, 167,   0,   0,   0,   0,   0,   0],
       [  1, 168,  36, 169,  12,  

In [10]:
vocab_size = len(tokenizer.word_docs) + 2 # unk + oov

In [11]:
vocab_size

210

In [12]:
## define model 
hidden = 16
embed_size = 32
graph = tf.Graph()

with graph.as_default():
    encoder_inputs = tf.placeholder(shape=(None, None), dtype=tf.int32, name='encoder_inputs')
    decoder_targets = tf.placeholder(shape=(None, None), dtype=tf.int32, name='decoder_targets')
    with tf.name_scope("embedding"):
        embeddings = tf.Variable(tf.random_uniform([vocab_size, embed_size], -1., 1.), dtype=tf.float32)
        encoder_inputs_embedded = tf.nn.embedding_lookup(embeddings, encoder_inputs)
        decoder_inputs_embedded = tf.nn.embedding_lookup(embeddings, decoder_targets)
    
    with tf.name_scope("encoder"):
        encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden)
        encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(encoder_cell, encoder_inputs_embedded, 
                                                                 dtype=tf.float32, time_major=True,)
        del encoder_outputs
    
    with tf.name_scope("decoder"):
        decoder_cell = tf.contrib.rnn.LSTMCell(hidden)
        # pass encoder final state as inital state
        decoder_outputs, decoder_final_state = tf.nn.dynamic_rnn(
                                                    decoder_cell, decoder_inputs_embedded,
                                                        initial_state=encoder_final_state,
                                                            dtype=tf.float32, time_major=True, scope="plain_decoder")
        decoder_logits = tf.contrib.layers.linear(decoder_outputs, vocab_size)
        decoder_prediction = tf.argmax(decoder_logits, 2)
#         print(decoder_logits)
    
    stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
                                labels=tf.one_hot(decoder_targets, depth=vocab_size, dtype=tf.float32),
                                    logits=decoder_logits,)

    loss = tf.reduce_mean(stepwise_cross_entropy)
    train_op = tf.train.AdamOptimizer().minimize(loss)

In [13]:
# 把输入变形为 time major
def batch(inputs, max_sequence_length=None):
    """
    Args:
        inputs:
            list of sentences (integer lists)
        max_sequence_length:
            integer specifying how large should `max_time` dimension be.
            If None, maximum sequence length would be used
    
    Outputs:
        inputs_time_major:
            input sentences transformed into time-major matrix 
            (shape [max_time, batch_size]) padded with 0s
        sequence_lengths:
            batch-sized list of integers specifying amount of active 
            time steps in each input sequence
    """
    
    sequence_lengths = [len(seq) for seq in inputs]
    batch_size = len(inputs)
    
    if max_sequence_length is None:
        max_sequence_length = max(sequence_lengths)
    
    inputs_batch_major = np.zeros(shape=[batch_size, max_sequence_length], dtype=np.int32) # == PAD
    
    for i, seq in enumerate(inputs):
        for j, element in enumerate(seq):
            inputs_batch_major[i, j] = element

    # [batch_size, max_time] -> [max_time, batch_size]
    inputs_time_major = inputs_batch_major.swapaxes(0, 1)

    return inputs_time_major, sequence_lengths



In [14]:
# translate indexes to sentence
word2idx = tokenizer.word_index
id2word = {k: v for v, k in zip(word2idx.keys(), word2idx.values())}
def translate(word_indexs):
    words = []
    for idx in word_indexs:
        word = id2word.get(idx)
#         print(word)
        if word:
            words.append(id2word.get(idx))
        else:
            words.append("<UNK>")
    return "".join(words)
#     print(words)
#     print("".join(words))
#     print("Finish")
    
print(translate([214, 1, 2,24 ,23, 3, 209, 214]))

<UNK>我的不用过在<UNK><UNK>


In [15]:
# s = "当我没日没夜工作从长水起飞"
# s_idx = [ i[0] for i in tokenizer.texts_to_sequences(jieba.cut(s))]
# print(s_idx)
# ss = [s_idx]
# test_input = tf.keras.preprocessing.sequence.pad_sequences(ss, maxlen=12, padding='post', truncating='post')
# print(test_input)
# print(test_input)
# battle = batch(test_input, max_sequence_length=12)
# print(battle[0])

In [16]:
train_time, _ = batch(train_padded, max_sequence_length=12)
test_time, _ = batch(test_padded, max_sequence_length=12)
print(train_time.shape)

(12, 26)


In [17]:
import random

epochs = 10000

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    for e in range(epochs):
        l, _ = sess.run([loss, train_op], feed_dict={
            encoder_inputs: train_time,
            decoder_targets: test_time,
        })
        if e % 50 == 0:
        
            
            pred = sess.run(decoder_prediction, feed_dict={
                encoder_inputs: train_time,
                decoder_targets: test_time
            })
            
#             print(translate(np.transpose(pred)[0]))

            rand_idx = (int) (random.random() * (train_time.shape[1]) )
            print("epoch: %d loss : %f" %( e, l))
            print("input: ", translate(np.transpose(train_time)[rand_idx]))
            print("predict: ", translate(np.transpose(pred)[rand_idx]))
            print("ground truth: ",translate(np.transpose(test_time)[rand_idx]) )
            print("-----------------------------------------")
#             print("Input: ")
            
            
    

epoch: 0 loss : 5.290242
input:  课桌上面摆着老师送的铅笔刀<UNK><UNK><UNK><UNK><UNK>
predict:  那年我那年以前包里包里包里包里包里包里包里包里
ground truth:  她教我如何起跑<UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
-----------------------------------------
epoch: 50 loss : 4.125021
input:  特别的怀念<UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
predict:  <UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
ground truth:  回到童年<UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
-----------------------------------------
epoch: 100 loss : 3.103010
input:  校门口美味的路边摊还在不在<UNK><UNK><UNK><UNK>
predict:  <UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
ground truth:  包里塞的小吃那是外婆给的爱<UNK><UNK>
-----------------------------------------
epoch: 150 loss : 2.670379
input:  那心酸的滋味我只能独自体会<UNK><UNK><UNK><UNK>
predict:  <UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
ground truth:  想要回到过去但是时光不能倒退<UNK><UNK><UNK><UNK><UNK>
-----------------------------------------
epoch: 200 loss : 2.397929
input:  所以我努力的跑<UNK><UNK><UNK><

epoch: 1800 loss : 0.039838
input:  我想让我爱的人过的更好<UNK><UNK>
predict:  所以才去战斗<UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
ground truth:  所以才去战斗<UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
-----------------------------------------
epoch: 1850 loss : 0.037470
input:  就算我是个战神<UNK><UNK><UNK><UNK><UNK><UNK><UNK>
predict:  可我心也一样会疼<UNK><UNK><UNK><UNK><UNK><UNK>
ground truth:  可我心也一样会疼<UNK><UNK><UNK><UNK><UNK><UNK>
-----------------------------------------
epoch: 1900 loss : 0.035296
input:  于是日复一日年复一年<UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
predict:  那个美丽的春天早已变成我的纪念<UNK><UNK><UNK>
ground truth:  那个美丽的春天早已变成我的纪念<UNK><UNK><UNK>
-----------------------------------------
epoch: 1950 loss : 0.033296
input:  那心酸的滋味我只能独自体会<UNK><UNK><UNK><UNK>
predict:  想要回到过去但是时光不能倒退<UNK><UNK><UNK><UNK><UNK>
ground truth:  想要回到过去但是时光不能倒退<UNK><UNK><UNK><UNK><UNK>
-----------------------------------------
epoch: 2000 loss : 0.031455
input:  校门口美味的路边摊还在不在<UNK><UNK><UNK><UNK>
predict:  包里塞的小吃那是外婆给的爱<UNK><UNK>
ground truth:  

epoch: 3650 loss : 0.007815
input:  街边的落日和小时候的复读机<UNK><UNK><UNK><UNK><UNK>
predict:  度过了末日但我要比以前有出息<UNK><UNK>
ground truth:  度过了末日但我要比以前有出息<UNK><UNK>
-----------------------------------------
epoch: 3700 loss : 0.007555
input:  我闭上双眼祈祷<UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
predict:  我知道努力学习以后才能把歌写好<UNK><UNK><UNK>
ground truth:  我知道努力学习以后才能把歌写好<UNK><UNK><UNK>
-----------------------------------------
epoch: 3750 loss : 0.007306
input:  我渐渐长大成人<UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
predict:  眼看着世界沉沦<UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
ground truth:  眼看着世界沉沦<UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
-----------------------------------------
epoch: 3800 loss : 0.007066
input:  那年生日同桌攒钱给我买的卡带<UNK><UNK><UNK>
predict:  包装粗糙海报写着jay<UNK><UNK><UNK><UNK><UNK><UNK>
ground truth:  包装粗糙海报写着jay<UNK><UNK><UNK><UNK><UNK><UNK>
-----------------------------------------
epoch: 3850 loss : 0.006836
input:  校门口美味的路边摊还在不在<UNK><UNK><UNK><UNK>
predict:  包里塞的小吃那是外婆给的爱<UNK><UNK>
ground truth:  包里塞的小吃那

epoch: 5500 loss : 0.002553
input:  特别的怀念<UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
predict:  回到童年<UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
ground truth:  回到童年<UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
-----------------------------------------
epoch: 5550 loss : 0.002483
input:  那心酸的滋味我只能独自体会<UNK><UNK><UNK><UNK>
predict:  想要回到过去但是时光不能倒退<UNK><UNK><UNK><UNK><UNK>
ground truth:  想要回到过去但是时光不能倒退<UNK><UNK><UNK><UNK><UNK>
-----------------------------------------
epoch: 5600 loss : 0.002416
input:  当我没日没夜工作从长水起飞<UNK><UNK><UNK><UNK><UNK>
predict:  时间反复催促让我忘了疲惫<UNK><UNK><UNK><UNK>
ground truth:  时间反复催促让我忘了疲惫<UNK><UNK><UNK><UNK>
-----------------------------------------
epoch: 5650 loss : 0.002350
input:  我渐渐长大成人<UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
predict:  眼看着世界沉沦<UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
ground truth:  眼看着世界沉沦<UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
-----------------------------------------
epoch: 5700 loss : 0.002287
input:  课桌上面摆着老师送的铅笔刀<UNK><UNK>

epoch: 7300 loss : 0.000976
input:  回忆是每个人的财富<UNK><UNK><UNK><UNK><UNK><UNK>
predict:  一定记住他<UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
ground truth:  一定记住他<UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
-----------------------------------------
epoch: 7350 loss : 0.000951
input:  到现在<UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
predict:  我依旧做说唱还在饿着肚子<UNK><UNK><UNK><UNK><UNK>
ground truth:  我依旧做说唱还在饿着肚子<UNK><UNK><UNK><UNK><UNK>
-----------------------------------------
epoch: 7400 loss : 0.000927
input:  回忆是每个人的财富<UNK><UNK><UNK><UNK><UNK><UNK>
predict:  一定记住他<UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
ground truth:  一定记住他<UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
-----------------------------------------
epoch: 7450 loss : 0.000904
input:  所以我努力的跑<UNK><UNK><UNK><UNK><UNK><UNK><UNK>
predict:  把一切全部看透<UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
ground truth:  把一切全部看透<UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
-----------------------------------------
epoch: 7500 loss : 0.000881
input: 

epoch: 9100 loss : 0.000389
input:  特别的怀念<UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
predict:  回到童年<UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
ground truth:  回到童年<UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
-----------------------------------------
epoch: 9150 loss : 0.000379
input:  那心酸的滋味我只能独自体会<UNK><UNK><UNK><UNK>
predict:  想要回到过去但是时光不能倒退<UNK><UNK><UNK><UNK><UNK>
ground truth:  想要回到过去但是时光不能倒退<UNK><UNK><UNK><UNK><UNK>
-----------------------------------------
epoch: 9200 loss : 0.000370
input:  街边的落日和小时候的复读机<UNK><UNK><UNK><UNK><UNK>
predict:  度过了末日但我要比以前有出息<UNK><UNK>
ground truth:  度过了末日但我要比以前有出息<UNK><UNK>
-----------------------------------------
epoch: 9250 loss : 0.000360
input:  脱掉了曾经<UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
predict:  认为很时尚的大肥裤子<UNK><UNK><UNK><UNK><UNK><UNK>
ground truth:  认为很时尚的大肥裤子<UNK><UNK><UNK><UNK><UNK><UNK>
-----------------------------------------
epoch: 9300 loss : 0.000351
input:  当我没日没夜工作从长水起飞<UNK><UNK><UNK><UNK><UNK>
predict:  时间反复催促

In [None]:
print()