# 詞神林夕養成計畫(cntk)

![md_images](../Images/charrnn.png)

In [1]:
import os
import io
import sys
import math
import codecs
import numpy as np
import random

from cntk.initializer import *
from cntk.layers import *
from cntk.layers.models.attention import *
from cntk.layers.typing import *
from cntk.learners import adam, momentum_as_time_constant_schedule, learning_rate_schedule, UnitType
from cntk.logging import log_number_of_parameters, ProgressPrinter
from cntk.losses import *
from cntk.metrics import classification_error
from cntk.ops import *
from cntk.train import Trainer
from cntk.device import try_set_default_device, cpu,gpu

# 是否使用GPU
is_gpu = True

if is_gpu:
    try_set_default_device(gpu(0))
else:
    try_set_default_device(cpu())


In [2]:
with io.open('lingxi.txt', encoding='utf-8-sig') as f:
    text = f.read().lower()
print('corpus length:', len(text))


corpus length: 52647


In [3]:
#把每個字去重複
chars = sorted(list(set(text)))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))


total chars: 2114


In [4]:
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen-1, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i+1:i + maxlen+1])
print('nb sequences:', len(sentences))
print(sentences[:3])
print(next_chars[:3])


nb sequences: 17536
['你說你 從來未愛戀過\n但很珍惜 跟我在消磨\n我笑我 原來是我的錯\n裂開的心 還未', ' 從來未愛戀過\n但很珍惜 跟我在消磨\n我笑我 原來是我的錯\n裂開的心 還未算清楚', '未愛戀過\n但很珍惜 跟我在消磨\n我笑我 原來是我的錯\n裂開的心 還未算清楚\n如此']
['說你 從來未愛戀過\n但很珍惜 跟我在消磨\n我笑我 原來是我的錯\n裂開的心 還未算', '從來未愛戀過\n但很珍惜 跟我在消磨\n我笑我 原來是我的錯\n裂開的心 還未算清楚\n', '愛戀過\n但很珍惜 跟我在消磨\n我笑我 原來是我的錯\n裂開的心 還未算清楚\n如此天']


In [5]:
idx=0
def word2onehot(w):
    idx=char_indices[w]
    arr=np.zeros(len(chars),dtype=np.float32)
    arr[idx]=1
    return arr


def get_next_minibatch(minibatch_size=16):
    global idx
    features=[]
    labels=[]
    groundtruths=[]
    while len(features)<minibatch_size:
        features.append([word2onehot(s) for s in list(sentences[idx])]) 
        labels.append([word2onehot(s) for s in list(next_chars[idx])])
        groundtruths.append( list(sentences[idx]))
        idx+=1
        if idx>len(sentences)-1:
            idx=0
    return features,labels,groundtruths

#print(get_next_minibatch(3))

In [6]:
def focal_loss(output, target, gamma=2, axis=-1):
    return negate(reduce_sum(target * C.pow(1 - output, gamma) * log(output), axis))


In [7]:
def create_model(input_sequence, vocab_dim=len(chars),num_layers=2, hidden_dim=512):
    with default_options(enable_self_stabilization=True,init=he_uniform(0.02)):
        rnn = Sequential([
            C.layers.Embedding(hidden_dim),
            For(range(num_layers), lambda: 
                Sequential([Stabilizer(), 
                            Recurrence(LSTM(hidden_dim), go_backwards=False),
                           C.layers.BatchNormalization()])),
            Dropout(0.5),
            Dense(vocab_dim,activation=softmax)
            ])
    return rnn(input_sequence)

In [8]:
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds+10e-14) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / (np.sum(exp_preds)+10e-14)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


In [9]:
def write_something(epoch,z):
    print()
    print('----- 第Epoch: %d後自動寫詞' % epoch)
    z.save("Models/LingXi_%d.lstm" % epoch)
    z.save("Models/LingXi.lstm")
    start_index = random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.8,1.0]:
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- 根據以下詞彙發想: "' + sentence + '"')
        sys.stdout.write(generated)
        
        for i in range(200):
            try:
                x_pred =[word2onehot(s) for s in list(sentence)]   
                preds = z([x_pred])[0][-1]
                next_index = sample(preds, diversity)
                next_char = indices_char[next_index]

                generated += next_char
                sentence = sentence[1:] + next_char

                sys.stdout.write(next_char)
                sys.stdout.flush()
            except:
                generated+=''
        

In [None]:
learning_rate=0.001
num_epochs=10
minibatch_size=32
#訓練漢字層級的下一個字預測模型
def train():
    global learning_rate,minibatch_size

    #定義序列軸
    input_seq_axis = Axis('inputAxis')
    #定義輸入變數
    input_sequence = sequence.input_variable(shape=len(chars), sequence_axis=input_seq_axis)
    label_sequence = sequence.input_variable(shape=len(chars), sequence_axis=input_seq_axis)
    
    #定義模型輸出
    z = create_model(input_sequence)
    if os.path.exists("Models/LingXi.lstm"):
        model=Function.load("Models/LingXi.lstm")
        z = model(input_sequence)

    #定義進行訓練的損失函數以及錯誤率計算
    #loss=cross_entropy_with_softmax(z,label_sequence)
    loss= cross_entropy_with_softmax(z,label_sequence)#+0.1*focal_loss(z,label_sequence)
    errs = classification_error(z, label_sequence)
    
    # 列印模型參數
    log_number_of_parameters(z);
    print()
    
    # 定義訓練器
    num_trained_samples_since_last_report = 0
    progress_printer = ProgressPrinter(freq=20, tag='Training', num_epochs=300)
    learner = adam(z.parameters,
                    lr=learning_rate_schedule([learning_rate], UnitType.sample, 300),
                    momentum=momentum_as_time_constant_schedule([minibatch_size / -math.log(0.95)], epoch_size=300),
                    l2_regularization_weight=5e-4)
    trainer = Trainer(z, (loss, errs), learner, progress_printer)
    
    for epoch in range(num_epochs):
        mbs = 0
        progress_printer.update_with_trainer(trainer, with_metric=True)
        num_trained_samples = 0
        while mbs<1000:
            features, labels, truths=get_next_minibatch(minibatch_size)
            #進行訓練
            trainer.train_minibatch({input_sequence: features, label_sequence: labels})
 
            if mbs%200==0 and mbs>0:
                write_something(epoch,z)
                learning_rate*=0.75
            mbs += 1
        #回報每個epoch訓練進度以及相關指標
        trainer.summarize_training_progress()
        #寫作測試
        write_something(epoch,z)
        
  

In [None]:
train()

Training 7402056 parameters in 24 parameter tensors.

Learning rate per 1 samples: 0.001
 Minibatch[   1-  20]: loss = 7.654861 * 25600, metric = 95.98% * 25600;
 Minibatch[  21-  40]: loss = 7.598824 * 25600, metric = 92.89% * 25600;
 Minibatch[  41-  60]: loss = 7.583510 * 25600, metric = 92.00% * 25600;
 Minibatch[  61-  80]: loss = 7.590808 * 25600, metric = 93.12% * 25600;
 Minibatch[  81- 100]: loss = 7.571462 * 25600, metric = 90.58% * 25600;
 Minibatch[ 101- 120]: loss = 7.596580 * 25600, metric = 93.59% * 25600;
 Minibatch[ 121- 140]: loss = 7.584307 * 25600, metric = 92.45% * 25600;
 Minibatch[ 141- 160]: loss = 7.584023 * 25600, metric = 92.52% * 25600;
 Minibatch[ 161- 180]: loss = 7.587266 * 25600, metric = 92.80% * 25600;
 Minibatch[ 181- 200]: loss = 7.591128 * 25600, metric = 93.21% * 25600;

----- 第Epoch: 0後自動寫詞
----- diversity: 0.8
----- 根據以下詞彙發想: "巴不得我四季放暑假
誰又不需要呵護 和自我競賽多痛苦
沿途幸有你製造憧憬中所有夢"
巴不得我四季放暑假
誰又不需要呵護 和自我競賽多痛苦
沿途幸有你製造憧憬中所有夢                                       