# 2-字符型语言模型
## 构建模型学习不同的名称模式，随机生成新名字

In [1]:
import numpy as np
import random
import time 
import cllm_utils

### (1) 数据集与预处理

In [2]:
# Load the whole training corpus. The context manager closes the file
# handle as soon as the text has been read instead of leaving it open.
with open("dinos.txt","r") as corpus_file:
    data = corpus_file.read()

data = data.lower() # normalize everything to lowercase

chars = list(set(data)) # unique characters; a set has no defined order

data_size,vocab_size = len(data),len(chars)

# Report the number of distinct characters and the total character count.
print("共有%d种字符%d个" % (vocab_size,data_size))

共有27种字符19909个


In [3]:
# Build the two lookup tables between characters and integer indices.
# Indices are assigned in sorted character order.
ix_to_char = dict(enumerate(sorted(chars)))
char_to_ix = {ch:i for i,ch in ix_to_char.items()}

print(char_to_ix,ix_to_char)

{'\n': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26} {0: '\n', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z'}


### (2) 梯度修剪

In [26]:
def clip(gradients,maxValue):
    """Clip every gradient element to the range [-maxValue, maxValue].

    Args:
        gradients: dict holding the arrays "dWaa", "dWax", "dWya", "db"
            and "dby". The arrays are clipped in place.
        maxValue: bound applied symmetrically around zero.

    Returns:
        A new dict containing only the five keys above, mapped to the same
        (now clipped) array objects. Any other key in `gradients` is not
        carried over.
    """
    names = ("dWaa","dWax","dWya","db","dby")
    clipped = {name: gradients[name] for name in names}

    for grad in clipped.values():
        # out=grad writes the result back into the caller's array.
        np.clip(grad,-maxValue,maxValue,out=grad)

    return clipped

In [27]:
# Sanity-check gradient clipping on randomly generated gradients.
# The seed is fixed and the arrays are drawn in this exact order, so the
# values are reproducible from run to run.
np.random.seed(3)
dWax = np.random.randn(5,3)*10
dWaa = np.random.randn(5,5)*10
dWya = np.random.randn(2,5)*10
db = np.random.randn(5,1)*10
dby = np.random.randn(2,1)*10
gradients = {"dWaa":dWaa,"dWax":dWax,"dWya":dWya,"db":db,"dby":dby}
gradients = clip(gradients,10)


# Notebook display expression: shapes of the two bias gradients.
gradients["db"].shape,gradients["dby"].shape

((5, 1), (2, 1))

### (3) 采样

In [9]:
def sample(params,char_to_ix,seed):
    """Sample a sequence of character indices from the RNN.

    Generation starts from an all-zero input vector and an all-zero hidden
    state. It stops when the newline index is drawn or after 50 characters.

    Args:
        params: dict with the arrays "Waa", "Wax", "Wya", "by" and "b".
        char_to_ix: mapping from character to index; must contain "\n".
        seed: base value for reseeding NumPy's global RNG at each step.

    Returns:
        List of sampled indices that ends with exactly one newline index.
        If the length cap is hit without drawing a newline, one is
        appended.

    Note:
        Each step calls np.random.seed(counter+seed) and then increments
        both values, so the global NumPy RNG state is overwritten and the
        effective seed advances by 2 per generated character.
    """
    Waa,Wax,Wya,by,b = params['Waa'],params['Wax'],params['Wya'],params['by'],params['b']
    vocab_size = by.shape[0]
    n_a = Waa.shape[1]
    max_chars = 50

    x = np.zeros((vocab_size,1))
    a_prev = np.zeros((n_a,1))
    indices = []
    idx = -1  # placeholder used before the first draw
    counter = 0
    newline_char = char_to_ix["\n"]
    candidates = list(range(vocab_size))  # loop-invariant, built once

    while idx != newline_char and counter < max_chars:
        # Forward step of the RNN cell.
        a = np.tanh(np.dot(Wax,x) + np.dot(Waa,a_prev) + b)
        z = np.dot(Wya,a) + by
        # Presumably a probability column vector over the vocabulary;
        # np.random.choice requires it to sum to 1. TODO confirm in cllm_utils.
        y = cllm_utils.softmax(z)

        np.random.seed(counter+seed)
        idx = np.random.choice(candidates,p=y.ravel())
        indices.append(idx)

        # One-hot encode the drawn character as the next input.
        x = np.zeros((vocab_size,1))
        x[idx] = 1

        a_prev = a

        seed += 1
        counter += 1

    # Terminate the sequence only if it does not already end in a newline.
    # Checking `counter == 50` instead would append a second newline when
    # the newline is drawn on the very last permitted step.
    if idx != newline_char:
        indices.append(newline_char)

    return indices

In [11]:
# Try out sampling with randomly initialized parameters.
# The seed is fixed and the arrays are drawn in this exact order, so the
# sampled sequence is reproducible.
np.random.seed(2)
n_a = 100 
Wax,Waa,Wya = np.random.randn(n_a,vocab_size),np.random.randn(n_a,n_a),np.random.randn(vocab_size,n_a)
b,by = np.random.randn(n_a,1),np.random.randn(vocab_size,1)
params = {"Wax":Wax,"Waa":Waa,"Wya":Wya,"b":b,"by":by}

indices = sample(params,char_to_ix,0)
# Show the raw indices, then the characters they map to.
print(indices)
print([ix_to_char[i] for i in indices])

[12, 17, 24, 14, 13, 9, 10, 22, 24, 6, 13, 11, 12, 6, 21, 15, 21, 14, 3, 2, 1, 21, 18, 24, 7, 25, 6, 25, 18, 10, 16, 2, 3, 8, 15, 12, 11, 7, 1, 12, 10, 2, 7, 7, 11, 17, 24, 12, 3, 1, 0]
['l', 'q', 'x', 'n', 'm', 'i', 'j', 'v', 'x', 'f', 'm', 'k', 'l', 'f', 'u', 'o', 'u', 'n', 'c', 'b', 'a', 'u', 'r', 'x', 'g', 'y', 'f', 'y', 'r', 'j', 'p', 'b', 'c', 'h', 'o', 'l', 'k', 'g', 'a', 'l', 'j', 'b', 'g', 'g', 'k', 'q', 'x', 'l', 'c', 'a', '\n']


### (4) 梯度下降优化

In [28]:
def optimize(X,Y,a_prev,params,alpha=0.01):
    """Run one training step: forward, backward, clip, parameter update.

    Args:
        X: input sequence passed to cllm_utils.rnn_forward / rnn_backward.
        Y: target sequence passed alongside X.
        a_prev: hidden state to start the forward pass from.
        params: model parameters handed to the cllm_utils helpers.
        alpha: learning rate forwarded to cllm_utils.update_parameters.

    Returns:
        Tuple (loss, gradients, a_last): the forward-pass loss, the
        gradients after clipping to [-5, 5], and the hidden-state entry at
        index len(X)-1.

    Note:
        The return value of update_parameters is not used, so the update
        only reaches the caller if that helper mutates `params` in place
        (presumably it does; verify in cllm_utils).
    """
    # Forward pass
    loss,cache = cllm_utils.rnn_forward(X,Y,a_prev,params)

    # Backward pass
    gradients,hidden_states = cllm_utils.rnn_backward(X,Y,params,cache)

    # Clip gradients before they are applied
    clipped = clip(gradients,5)

    # Apply the update
    cllm_utils.update_parameters(params,clipped,alpha)

    last_step = len(X)-1
    return loss,clipped,hidden_states[last_step]

In [29]:
# Try out a single optimization step with random parameters.
# The seed is fixed and the arrays are drawn in this exact order, so the
# displayed gradient entry is reproducible.
np.random.seed(1)
vocab_size,n_a = 27,100
a_prev = np.random.randn(n_a,1)
Wax,Waa,Wya = np.random.randn(n_a,vocab_size),np.random.randn(n_a,n_a),np.random.randn(vocab_size,n_a)
b,by = np.random.randn(n_a,1),np.random.randn(vocab_size,1)
params = {"Wax":Wax,"Waa":Waa,"Wya":Wya,"b":b,"by":by}
# Hand-picked index sequences used as input and target.
X = [12,3,5,11,22,3]
Y = [4,14,11,22,25,26]

loss,gradients,a_last = optimize(X,Y,a_prev,params,alpha=0.01)
# Notebook display expression: one entry of the clipped dWaa gradient.
gradients["dWaa"][1][2]

0.19470931534725341

### (5) 训练模型

In [32]:
def model(data,ix_to_char,char_to_ix,epochs=3500,n_a=50,dino_names=7,vocab_size=27):
    """Train the character-level model and periodically print sampled names.

    Args:
        data: not used by this function; the training names are re-read
            from "dinos.txt".
        ix_to_char: index-to-character mapping used when printing samples.
        char_to_ix: character-to-index mapping used to encode each name.
        epochs: number of training steps; each step uses one name.
        n_a: hidden-state size.
        dino_names: how many names to sample at each report.
        vocab_size: vocabulary size, used as both input and output size.

    Returns:
        The parameters object created by cllm_utils.initialize_parameters,
        after all training steps have run.
    """
    params = cllm_utils.initialize_parameters(n_a,vocab_size,vocab_size)
    loss = cllm_utils.get_initial_loss(vocab_size,dino_names)

    with open("dinos.txt") as f:
        examples = [line.lower().strip() for line in f.readlines()]

    # Fixed seed so the shuffled order is reproducible.
    np.random.seed(0)
    np.random.shuffle(examples)

    a_prev = np.zeros((n_a,1))

    for step in range(epochs):
        # Walk through the shuffled names, wrapping around at the end.
        name = examples[step % len(examples)]

        # X starts with None as a placeholder for the first input; Y is X
        # shifted left by one with the newline index as the final target.
        X = [None]
        X.extend(char_to_ix[ch] for ch in name)
        Y = X[1:] + [char_to_ix["\n"]]

        curr_loss,_,a_prev = optimize(X,Y,a_prev,params)

        loss = cllm_utils.smooth(loss,curr_loss)

        if step % 2000 != 0:
            continue

        # Progress report: step number, smoothed loss, then sampled names.
        print("迭代%d几次，误差为%f" % (step+1,loss))

        for seed in range(dino_names):
            sampled_indices = sample(params,char_to_ix,seed)
            cllm_utils.print_sample(sampled_indices,ix_to_char)

        print("\n")

    return params

In [33]:
# Train with the default settings; progress and sampled names are printed
# by model() itself, and the returned parameters are kept in `params`.
params = model(data,ix_to_char,char_to_ix)

迭代1几次，误差为23.087336
Nkzxwtdmfqoeyhsqwasjkjvu
Kneb
Kzxwtdmfqoeyhsqwasjkjvu
Neb
Zxwtdmfqoeyhsqwasjkjvu
Eb
Xwtdmfqoeyhsqwasjkjvu


迭代2001几次，误差为27.884160
Liusskeomnolxeros
Hmdaairus
Hytroligoraurus
Lecalosapaus
Xusicikoraurus
Abalpsamantisaurus
Tpraneronxeros


