In [1]:
import random

import jieba
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from xpinyin import Pinyin

  from ._conv import register_converters as _register_converters


In [None]:
# 用小青龙的 time 一首歌做的 Seq2Seq 歌词生成 demo
# 主要参照 https://github.com/ematvey/tensorflow-seq2seq-tutorials

# pipeline: 
# 1. 上下两句作为 x 和 y
# 2. jieba 切词
# 3. word -> idx, padding, 记录词表
# 4. 定义模型 seq2seq MLE
# 5. 训练



In [2]:
# Load the dataset: the file is read as alternating lines. Lines at even
# positions (0, 2, 4, ...) become the inputs `x`; the lines that follow them
# (1, 3, 5, ...) become the paired targets `y`.
# The encoding is given explicitly because the platform default is not UTF-8
# everywhere, and non-ASCII lyrics would then fail to decode or be garbled.
# NOTE(review): assumes time.txt is stored as UTF-8 -- confirm.
with open("time.txt", "r", encoding="utf-8") as f:
    lines = [line.strip() for line in f]

x = lines[0::2]
y = lines[1::2]

In [None]:
# 拼音模块 押韵可能要用到
# p = Pinyin()
# for w in seg_list:
#     if w != ' ':
#         print(w, p.get_pinyin(w))

In [3]:
# Build the vocabulary: segment every input line, then every target line,
# with jieba and fit the Keras Tokenizer on the resulting tokens.
tokenizer = Tokenizer(oov_token='<UNK>')
words = [token for sentence in x + y for token in jieba.cut(sentence)]
tokenizer.fit_on_texts(words)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/0l/3x73_lfs6czgjngbxbtn1vrh0000gn/T/jieba.cache
Loading model cost 0.914 seconds.
Prefix dict has been built succesfully.


In [4]:
def _encode_sentences(sentences):
    """Turn raw sentences into lists of vocabulary indices.

    Each sentence is segmented with jieba and every token is passed through
    the fitted ``tokenizer``. Tokens for which the tokenizer returns an empty
    sequence are dropped, and only the first index of each token's sequence
    is kept. A sentence that yields no tokens at all is skipped entirely.

    Args:
        sentences: iterable of strings.

    Returns:
        A list with one list of integer indices per retained sentence.
    """
    encoded = []
    for sentence in sentences:
        token_seqs = tokenizer.texts_to_sequences(list(jieba.cut(sentence)))
        if token_seqs:
            encoded.append([seq[0] for seq in token_seqs if seq])
    return encoded


# `train` holds the encoded input lines (x) and `test` the encoded target
# lines (y); despite its name, `test` is not a held-out evaluation split.
# NOTE(review): a sentence skipped in one list but not in the other would
# shift the x/y pairing -- confirm the data contains no blank lines.
train = _encode_sentences(x)
test = _encode_sentences(y)

In [5]:
# Pad or truncate every encoded input line to exactly 12 indices; both the
# padding and the truncation are applied at the end of the sequence.
train_padded = tf.keras.preprocessing.sequence.pad_sequences(
    train,
    maxlen=12,
    padding='post',
    truncating='post',
)

In [6]:
# Pad or truncate every encoded target line to exactly 12 indices; both the
# padding and the truncation are applied at the end of the sequence.
test_padded = tf.keras.preprocessing.sequence.pad_sequences(
    test,
    maxlen=12,
    padding='post',
    truncating='post',
)

In [7]:
# Inspect the padded target matrix: one row per encoded target sentence,
# padded at the end up to the fixed length.
test_padded

array([[ 28, 120, 121,  20,   1, 122,   4, 123,   0,   0,   0,   0],
       [124,  29,  18, 125, 126, 127, 128,   0,   0,   0,   0,   0],
       [129, 130,   2, 131,   6,  10, 132,  13,   2,  21,   0,   0],
       [133, 134, 135,  30,   5, 136,   0,   0,   0,   0,   0,   0],
       [  1,  14,   1,  15,   7, 137, 138,   0,   0,   0,   0,   0],
       [ 29, 139,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [140,   5, 141,   2, 142,   0,   0,   0,   0,   0,   0,   0],
       [ 31,   4, 143,  32,   1, 144, 145, 146, 147, 148,   0,   0],
       [149,   1,  31,   4, 150, 151,   0,   0,   0,   0,   0,   0],
       [152,  24, 153,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  1, 154,   9,  33,  34, 155,  32,   9,  33,  34, 156,   0],
       [ 35, 157, 158, 159,   0,   0,   0,   0,   0,   0,   0,   0],
       [ 19, 160, 161, 162,   0,   0,   0,   0,   0,   0,   0,   0],
       [163, 164, 165,   2, 166, 167,   0,   0,   0,   0,   0,   0],
       [  1, 168,  36, 169,  12,  

In [8]:
# Number of distinct ids the model must handle. `word_index` maps every
# known token (including the '<UNK>' out-of-vocabulary entry) to an id
# starting at 1, so one extra slot is added for id 0, which is used as
# padding by the padded sequences.
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# Display the computed vocabulary size.
vocab_size

In [None]:
## define model
hidden = 16      # number of LSTM units, used by both the encoder and the decoder cell
embed_size = 32  # width of each token embedding vector
graph = tf.Graph()

with graph.as_default():
    # Integer token-id matrices. Both RNNs below run with time_major=True,
    # so the expected layout is [max_time, batch_size].
    encoder_inputs = tf.placeholder(shape=(None, None), dtype=tf.int32, name='encoder_inputs')
    decoder_targets = tf.placeholder(shape=(None, None), dtype=tf.int32, name='decoder_targets')

    # Teacher-forcing inputs: the targets shifted one step to the right, with
    # a leading step of id 0 acting as the start symbol. Feeding the
    # unshifted targets would hand the decoder, at step t, the very token it
    # is asked to predict at step t, making the task a trivial copy.
    go_step = tf.zeros_like(decoder_targets[:1])
    decoder_inputs = tf.concat([go_step, decoder_targets[:-1]], axis=0)

    with tf.name_scope("embedding"):
        # One embedding table shared by the encoder and the decoder.
        embeddings = tf.Variable(tf.random_uniform([vocab_size, embed_size], -1., 1.), dtype=tf.float32)
        encoder_inputs_embedded = tf.nn.embedding_lookup(embeddings, encoder_inputs)
        decoder_inputs_embedded = tf.nn.embedding_lookup(embeddings, decoder_inputs)

    with tf.name_scope("encoder"):
        encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden)
        encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
            encoder_cell, encoder_inputs_embedded,
            dtype=tf.float32, time_major=True,
        )
        # Only the final state is passed on to the decoder.
        del encoder_outputs

    with tf.name_scope("decoder"):
        decoder_cell = tf.contrib.rnn.LSTMCell(hidden)
        # pass encoder final state as initial state
        decoder_outputs, decoder_final_state = tf.nn.dynamic_rnn(
            decoder_cell, decoder_inputs_embedded,
            initial_state=encoder_final_state,
            dtype=tf.float32, time_major=True, scope="plain_decoder",
        )
        # Project every decoder output onto the vocabulary.
        decoder_logits = tf.contrib.layers.linear(decoder_outputs, vocab_size)
        # Greedy prediction per step. It is still teacher-forced: computing it
        # requires `decoder_targets` to be fed.
        decoder_prediction = tf.argmax(decoder_logits, 2)

    stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=tf.one_hot(decoder_targets, depth=vocab_size, dtype=tf.float32),
        logits=decoder_logits,
    )

    # Mean over every (time step, example) position; positions holding
    # padding ids are not masked out.
    loss = tf.reduce_mean(stepwise_cross_entropy)
    train_op = tf.train.AdamOptimizer().minimize(loss)

In [None]:
# Reshape the input into time-major layout
def batch(inputs, max_sequence_length=None):
    """Pack integer sequences into a zero-padded, time-major matrix.

    Args:
        inputs:
            sized collection of sentences (integer sequences)
        max_sequence_length:
            integer specifying how large the `max_time` dimension should be.
            If None, the longest sequence length is used (0 for an empty
            batch). Sequences longer than this value are truncated at the
            end instead of raising IndexError.

    Outputs:
        inputs_time_major:
            input sentences transformed into a time-major int32 matrix
            (shape [max_time, batch_size]) padded with 0s
        sequence_lengths:
            batch-sized list of integers specifying the number of active
            time steps kept for each input sequence
    """
    sequence_lengths = [len(seq) for seq in inputs]
    batch_size = len(inputs)

    if max_sequence_length is None:
        # default=0 keeps max() from raising on an empty batch
        max_sequence_length = max(sequence_lengths, default=0)
    else:
        # Report the lengths actually stored after truncation.
        sequence_lengths = [min(length, max_sequence_length) for length in sequence_lengths]

    inputs_batch_major = np.zeros(shape=[batch_size, max_sequence_length], dtype=np.int32)  # == PAD

    for row, (seq, length) in enumerate(zip(inputs, sequence_lengths)):
        inputs_batch_major[row, :length] = seq[:length]

    # [batch_size, max_time] -> [max_time, batch_size]
    inputs_time_major = inputs_batch_major.swapaxes(0, 1)

    return inputs_time_major, sequence_lengths



In [None]:
# Lookup tables for turning indices back into text: `word2idx` is the
# tokenizer's word -> index mapping and `id2word` is its inverse.
word2idx = tokenizer.word_index
id2word = {index: word for word, index in word2idx.items()}
def translate(word_indexs):
    """Decode a sequence of vocabulary indices into one string.

    Every index is looked up in ``id2word``; an index with no entry (or an
    empty entry) is rendered as "<UNK>". The decoded words are concatenated
    without any separator.
    """
    return "".join(id2word.get(index) or "<UNK>" for index in word_indexs)
    
# Quick manual check of translate() on a handful of hand-picked indices.
print(
    translate(
        [214, 1, 2, 24, 23, 3, 209, 214]
    )
)

In [None]:
# s = "当我没日没夜工作从长水起飞"
# s_idx = [ i[0] for i in tokenizer.texts_to_sequences(jieba.cut(s))]
# print(s_idx)
# ss = [s_idx]
# test_input = tf.keras.preprocessing.sequence.pad_sequences(ss, maxlen=12, padding='post', truncating='post')
# print(test_input)
# print(test_input)
# battle = batch(test_input, max_sequence_length=12)
# print(battle[0])

In [None]:
# Convert the padded [batch, time] matrices to the time-major layout that is
# fed to the model. Only the matrix returned by batch() is needed, so the
# result is indexed rather than unpacked into `_`: assigning to `_` would
# shadow IPython's "last output" variable in the notebook.
train_time = batch(train_padded, max_sequence_length=12)[0]
test_time = batch(test_padded, max_sequence_length=12)[0]
print(train_time.shape)

In [None]:
epochs = 1000

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())

    # The whole dataset is fed as one batch on every step.
    feed = {
        encoder_inputs: train_time,
        decoder_targets: test_time,
    }

    for epoch in range(epochs):
        # train_op produces no value of interest; keep only the loss.
        loss_value = sess.run([loss, train_op], feed_dict=feed)[0]

        # Report progress on every 50th step only.
        if epoch % 50 != 0:
            continue

        # Predictions are computed with the ground-truth targets fed to the
        # decoder, exactly as during training.
        pred = sess.run(decoder_prediction, feed_dict=feed)

        # Pick one example (a column of the time-major matrices) at random.
        rand_idx = random.randrange(train_time.shape[1])
        print("epoch: %d loss : %f" % (epoch, loss_value))
        print("input: ", translate(train_time[:, rand_idx]))
        print("predict: ", translate(pred[:, rand_idx]))
        print("ground truth: ", translate(test_time[:, rand_idx]))
        print("-----------------------------------------")
            
            
    

In [None]:
# Leftover scratch cell: prints an empty line.
print()