# Character-level language models

This tutorial shows how to train a character-level language model with a multilayer recurrent neural network. In particular, we will train a multilayer LSTM network that is able to generate text.



## Import necessary packages

Import the necessary packages.

In [1]:
import os
import urllib

## Prepare data
We first choose the target text file from which the vocabulary will be built.

In [2]:
# Path of the text corpus that the vocabulary is built from.
# Alternative corpus file (uncomment to use it instead):
# target_file = './data/shediao.txt'
target_file = './data/shediao_11_17.txt'

Then we define a few utility functions to pre-process the dataset.

In [3]:
# Maximum number of characters kept in one segment when re-wrapping the corpus.
word_per_line = 64


def read_content_seperate(path):
    """Read a UTF-8 text file and re-wrap its lines into short segments.

    Each line of the file (including its trailing newline, if any) is decoded
    as UTF-8. A line shorter than ``word_per_line`` characters is kept whole.
    A longer line is cut into consecutive ``word_per_line``-character
    segments followed by the remaining tail; the tail is emitted even when it
    is empty, i.e. when the line length is an exact multiple of
    ``word_per_line``. Every emitted piece is prefixed with ``'\\n'`` and all
    pieces are concatenated.

    The file is opened in binary mode so that ``decode`` is applied to bytes
    under both Python 2 and Python 3.

    Args:
        path: Path of the UTF-8 encoded text file.

    Returns:
        The re-wrapped text as a single string (``''`` for an empty file).
    """
    max_word_number = word_per_line
    pieces = []
    with open(path, 'rb') as ins:
        for raw_line in ins:
            line = raw_line.decode('utf-8')
            if len(line) < max_word_number:
                pieces.append(line)
                continue
            # End offset of the last full-sized segment.
            full_end = (len(line) // max_word_number) * max_word_number
            for start in range(0, full_end, max_word_number):
                pieces.append(line[start:start + max_word_number])
            # Remainder after the full segments; may be the empty string.
            pieces.append(line[full_end:])
    # Joining once avoids the quadratic cost of repeated string concatenation.
    return ''.join('\n' + piece for piece in pieces)

def read_content_whole(path):
    """Return the entire content of a UTF-8 encoded file as one string.

    The file is opened in binary mode so that ``decode`` is applied to bytes
    under both Python 2 and Python 3.

    Args:
        path: Path of the UTF-8 encoded text file.

    Returns:
        The decoded file content.
    """
    with open(path, 'rb') as ins:
        return ins.read().decode('utf-8')
    
def read_content(path):
    """Load the corpus at ``path`` through the segmenting reader.

    This is the single entry point used when building the vocabulary; it
    delegates to ``read_content_seperate``.
    """
    segmented_text = read_content_seperate(path)
    return segmented_text
        
        
# Return a dict which maps each char to a unique int id
def build_vocab(path):
    """Build the character vocabulary of the corpus stored at ``path``.

    Characters are numbered in order of first appearance, starting at 1;
    id 0 is left unused so it can serve as the zero-padding id.

    Args:
        path: Path of the corpus file, read through ``read_content``.

    Returns:
        A dict mapping each distinct character to its integer id.
    """
    the_vocab = {}
    next_id = 1  # 0 is left for zero-padding
    for ch in read_content(path):
        # Skip empty items and characters that already have an id.
        if len(ch) == 0 or ch in the_vocab:
            continue
        the_vocab[ch] = next_id
        next_id += 1
    return the_vocab

# Encode a sentence as a list of int ids
def text2id(sentence, the_vocab):
    """Translate every item of ``sentence`` into its vocabulary id.

    Args:
        sentence: Iterable of characters (typically a string).
        the_vocab: Mapping from character to integer id.

    Returns:
        The list of ids, in order; empty items are skipped.

    Raises:
        KeyError: If a character is missing from ``the_vocab``.
    """
    ids = []
    for ch in list(sentence):
        if len(ch) == 0:
            continue
        ids.append(the_vocab[ch])
    return ids
            
# Build the char vocabulary from the input corpus; `vocab` is the module-level
# char -> id mapping used by the inference cells below.
vocab = build_vocab(target_file)
print('vocab size = %d' %(len(vocab)))

vocab size = 3042


In [4]:
import mxnet as mx
import numpy as np

In [5]:
# Checkpoint prefix handed to mx.model.load_checkpoint further down.
# Alternative checkpoint prefix (uncomment to use it instead):
# checkpoint_path = './checkpoint/shediao'
checkpoint_path = './checkpoint/shediao_part'

In [6]:
# Network hyper-parameters passed to LSTMInferenceModel below.
# NOTE(review): presumably these must match the values used when the
# checkpoint was trained -- confirm against the training script.
num_lstm_layer = 3  # number of stacked LSTM layers
num_hidden = 512  # presumably the hidden-state size of each layer -- TODO confirm
num_embed = 256  # presumably the character-embedding size -- TODO confirm

# Inference

We first define some utility functions to help us make inferences:

In [7]:
from rnn_model import LSTMInferenceModel


# Helper structure for prediction
def MakeRevertVocab(vocab):
    """Return the inverse of ``vocab``: a dict mapping each value to its key.

    Args:
        vocab: Dict mapping characters to integer ids.

    Returns:
        A new dict mapping ids back to characters.
    """
    return {idx: char for char, idx in vocab.items()}

# Make input from char
def MakeInput(char, vocab, arr):
    """Write the vocabulary id of ``char`` into the buffer ``arr`` in place.

    Args:
        char: Character to encode.
        vocab: Mapping from character to integer id.
        arr: One-element array-like buffer; its content is overwritten via
            slice assignment.

    Raises:
        KeyError: If ``char`` is missing from ``vocab``.
    """
    arr[:] = np.array([vocab[char]], dtype=np.float64)

# helper function for random sample 
def _cdf(weights):
    total = sum(weights)
    result = []
    cumsum = 0
    for w in weights:
        cumsum += w
        result.append(cumsum / total)
    return result

def _choice(population, weights):
    assert len(population) == len(weights)
    cdf_vals = _cdf(weights)
    x = random.random()
    idx = bisect.bisect(cdf_vals, x)
    return population[idx]

# We can use random output or fixed output by choosing the largest probability
def MakeOutput(prob, vocab, sample=False, temperature=1.):
    """Turn a probability matrix into the next output character.

    Args:
        prob: 2-D array-like of probabilities; only row 0 is used.
        vocab: Mapping from integer id to character (the reverted vocab).
        sample: When false, return the character with the highest
            probability. When true, draw a character at random from the
            temperature-rescaled distribution.
        temperature: Divisor applied to the log-probabilities before
            re-normalising; only used when ``sample`` is true.

    Returns:
        The chosen character, or ``''`` when the chosen id has no entry in
        ``vocab`` (for example the padding id 0).
    """
    if not sample:
        idx = np.argmax(prob, axis=1)[0]
        # Only a failed lookup means "no character"; other errors propagate.
        try:
            return vocab[idx]
        except (KeyError, IndexError):
            return ''

    # Position i of fix_dict holds the char with id i; slot 0 (padding) is ''.
    fix_dict = [""] + [vocab[i] for i in range(1, len(vocab) + 1)]
    # Clip away exact 0 and 1 so the logarithm below stays finite.
    scale_prob = np.clip(prob, 1e-6, 1 - 1e-6)
    rescale = np.exp(np.log(scale_prob) / temperature)
    rescale[:] /= rescale.sum()
    return _choice(fix_dict, rescale[0, :])

In [8]:
def generate(input_string, seq_length=600, random_sample=False):
    """Print text produced by the global ``model`` from a starter string.

    The starter string is fed to the model one character at a time. Once it
    is exhausted, the most recently produced character is fed back in. The
    accumulated text is printed at the end; nothing is returned.

    Relies on the module-level names ``model``, ``vocab`` and ``mx``.

    Args:
        input_string: Starter text. A byte string is decoded as UTF-8;
            already-decoded text is used as is.
        seq_length: Number of model steps to run (default 600).
        random_sample: Passed to ``MakeOutput`` as its ``sample`` argument;
            false (the default) picks the most probable character each step.

    Raises:
        KeyError: If a character fed to the model is missing from ``vocab``.
    """
    # Only byte strings need decoding; decoded text has no usable .decode()
    # on Python 3 and would be mis-handled by an implicit encode on Python 2.
    if isinstance(input_string, bytes):
        output = input_string.decode('utf-8')
    else:
        output = input_string

    input_ndarray = mx.nd.zeros((1,))
    revert_vocab = MakeRevertVocab(vocab)
    # NOTE(review): the second argument of model.forward presumably resets
    # the recurrent state at the start of a sentence -- confirm in rnn_model.
    new_sentence = True

    ignore_length = len(output)

    for i in range(seq_length):
        if i < ignore_length:
            # Still consuming the starter string.
            MakeInput(output[i], vocab, input_ndarray)
        else:
            # Feed back the last character produced so far.
            MakeInput(output[-1], vocab, input_ndarray)
        prob = model.forward(input_ndarray, new_sentence)
        new_sentence = False
        next_char = MakeOutput(prob, revert_vocab, random_sample)
        if next_char == '':
            new_sentence = True
        # Predictions made before the last starter character are discarded.
        if i >= ignore_length - 1:
            output += next_char
    print(output)

Then we create the inference model:

In [36]:
import rnn_model 

# Number identifying which saved checkpoint to load.
# NOTE(review): presumably the training epoch -- confirm against the files
# stored under checkpoint_path.
checkpoint_number = 295

# Load from checkpoint; only the second returned value (arg_params) is used.
_, arg_params, __ = mx.model.load_checkpoint(checkpoint_path, checkpoint_number)

# Build an inference model. The sizes are len(vocab) + 1 because id 0 is
# reserved for zero-padding and real characters start at id 1.
model = rnn_model.LSTMInferenceModel(
    num_lstm_layer,
    len(vocab) + 1,
    num_hidden=num_hidden,
    num_embed=num_embed,
    num_label=len(vocab) + 1, 
    arg_params=arg_params, 
    ctx=mx.gpu(), 
    dropout=0.2)

Now we can generate a sequence of 600 characters from each of the start words set below.

In [37]:
# Starter strings; one piece of text is generated for each entry.
output_list = [
    '黄蓉',
    '郭靖',
    '同时',
    '柯震恶',
    '江南七怪',
    '朱聪伸手',
    '《九阴真经》',
    '《九阴',
    '全真教',
    '丘处机',
    '黄药师',
    '打狗棍',
    '完颜',
    '铜尸',
    '铁尸',
    '梅超风使出',
]


In [38]:
# Generate and print one text per starter string, framed by a header line
# that shows the starter and a trailing separator line.
for start_word in output_list:
    print(start_word + '--->')
    generate(start_word)
    print('-----------------------')

黄蓉--->
黄蓉道：“你把杨大哥那柄匕首给穆姊姊罢。”郭靖道：“正是。”从怀中掏出那柄朱聪从旗杆上去。
-----------------------
郭靖--->
郭靖道：“我只当她是妹子，是好朋友，可不要她做妻子。”丘处机喜道：“好孩子，有志气，有志气。管他甚么大汗不大汗，公主不公主。你还是一帮，不用从堂，大由了一拳，更是高人，便他胸膝之际，眼气无动，大厅间跃起竹杖，伸臂长击，肩头飞出。黄蓉笑道：“这一招可老叫化从未在世间，这其下可是听得了？”黄蓉道：“是啊，原来是道。”郭靖道：“兄弟既敢有妻子要了。”黄蓉道：“我去不迟路，只见她打着微阵，似乎微轻振慢慢来：“啊哟，这不好啦？”杨康奇道：“谁的谁？快是我们的模样？”穆念慈道：“我们打了七公来，立时，郭靖已惊觉危险，左手向后窜出，落在地下之上，连掌上拳，立时使劲，但是那毒龙手掌法”的急数。欧阳克听她明掌之际，心中大急，猛恨挡近，双臂飞弯，他双臂带了，幸红洞外，双手搂住他的肩头，只是未曾叩谢您老恩德。”说着跪了下去，砰砰砰的连磕了几个响头。洪七公脸色一变，喝道：“住长，我说甚么？”黄蓉道：“你双手，郭靖再行拜见的十余掌，心中忽然另能了武林中数分，才要这才手作招武功，他这时中他取法的玄风，郭靖登时抵御江南，这可不敢。”
-----------------------
同时--->
同时听到六位师哥，一起桃花岛了，还归故谢。”黄蓉道：“美得不错。我老人家不肖干吗？”洪七公道：“你们已伤了多少人？”欧阳克道：“我也不能再活师父，咱俩前再亲亲，心中更有先领。”黄蓉道：“我们自己大哥，你们两人武功罢不成，兄弟就难道？”郭靖道：“兄弟跟你不多，兄弟子，好让他们能修习下盘的内功之后，得以回复行走。只是他素来要强好胜，虽然内心后悔，口上却不肯说，因此这套内功明明是全部新创，仍是用得无发，就惜他这般古怪古怪，心想已是不通的亢龙知使。”郭靖又是一生。黄蓉不喜，心道：“那也用了我的朋友。”洪七公道：“你不肯立誓也罢，我只摔你不能要嫁我，这时候我立时会不见’。”郭靖点点头，说道：“我再不知道，这时已是老毒物了。”周伯通笑道：“难他们师兄弟四人一齐震断了。完颜康抱了半遍，方把九般看来来过，却见他武功未高，他自己在练成二句一方，郭靖只听得说郭靖呼中有异，但此时收了他。”洪七公笑道：“你若是另的武功，既有甚么恩师之物，该能万学一条道以大传？”郭