# 作业5：RNN 生成模型

以 `data/names.txt` 中的英文名作为训练集，利用 RNN 或 LSTM 等方法对字母序列数据进行建模，然后使用拟合的模型随机生成20个名字。本次作业为开放式，不指定各类超参数（如网络结构、学习率、迭代次数等），但需提供必要的输出和诊断结果支持你的选择（如模型是否收敛、效果评价等）。

提示：可以参照 `lec12-rnn-generation.zip` 中的代码，但注意英文名不需要像中文那样构建字典，因为可以直接使用26个字母作为字典。

In [4]:
import time
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import itertools
import collections
import matplotlib.pyplot as plt

## 1. 处理数据

In [5]:
# load txt file

def read_txt_file(file_path):
    with open(file_path, 'r') as file:
        content = file.read().split()
        return content

dat = read_txt_file('data/names.txt')
print(dat[:5])

['abbas', 'abbey', 'abbott', 'abdi', 'abel']


In [6]:
# construct dictionary
charset_size = 27 # 26 letters  + 1 <EOS>
dictionary = list('abcdefghijklmnopqrstuvwxyz') + ['<EOS>'] 
print(dictionary)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '<EOS>']


In [7]:
# names to tensor
def char2index(char):
    """Transform a character to its index in the dictionary
    Args:
        char (str): a character
        
    Returns:
       int: the index of the character in the dictionary
        
    """
    return dictionary.index(char.lower()) 

def names2tensor(NameList):
    """Transform a list of names to one-hot tensor
    Args:
        NameList (array): a list of names
        
    Returns:
        tensor: a tensor of shape (LongestNameLength, NumberOfNames, charset_size=27), storing the one-hot representation of names
        array: a numpy array of shape (NumberOfNames), storing each name's length
        target: a tensor of shape (LongestNameLength, NumberOfNames), storing the index of the next letter
        
    """
    names_num = len(NameList) # number of names
    names_lens = [len(name) for name in NameList] # a list storing each name's length
    max_name_len = max(names_lens) # the longest name's length
    
    tensor = torch.zeros(max_name_len, names_num, charset_size) # (each char in a name, each name, one-hot vector)
    target = torch.zeros(max_name_len, names_num, dtype=int) + charset_size - 1 # initialize with <EOS>
    
    for name_i in range(names_num): # for each name(idx) in data set
        name = NameList[name_i] # get the name
        for char_i in range(names_lens[name_i]): # for each char(idx) in the name
            # set tensor
            tensor[char_i, name_i, char2index(name[char_i])] = 1 # set the corresponding one-hot vector
            # set target
            if char_i < names_lens[name_i] - 1: # if not the last char (here note that python index starts from 0)
                target[char_i, name_i] = char2index(name[char_i + 1]) # target for name_i, char_i is char_i+1
                
    return tensor, np.array(names_lens), target

# test names2tensor
names2tensor(["leon","rachel"])

(tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
          [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
 
         [[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
          [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
 
         [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
          [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
 
         [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
          [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,

## 2. 构建LSTM模型类

![](https://michael-1313341240.cos.ap-shanghai.myqcloud.com/202312042025464.png)

Parameters Definition:

- Suppose there are $h$ hidden units, batch size is $n$, number of inputs is $d$. Thus, $X_t \in \mathbb{R}^{n\times h}, H_t = \mathbb{R}^{n\times h}.$
- Define Gates (at time $t$):
    - Input gate ($I_t \in \mathbb{R}^{n\times h}$) : $I_t = \text{sigmoid}(X_t W_{xi} + H_{t-1} W_{hi} + b_i)$
    - Forget gate ($F_t \in \mathbb{R}^{n\times h}$) : $F_t = \text{sigmoid}({X_t W_{xf} + H_{t-1} W_{hf} + b_f)}$
    - Output gate ($O_t \in  \mathbb{R}^{n\times h}$) : $O_t = \text{sigmoid}{X_t W_{xo} + H_{t-1} W_{ho} + b_o)}$
  
  where $W_{x,\cdot} \in \mathbb{R}^{d\times h}$, $W_{h,\cdot} \in \mathbb{R}^{h\times h}$, $b_{\cdot} \in \mathbb{R}^{1\times h}$.

- Define Candidate Memory Cell $\tilde C$:

$$
  \tilde C_t = \text{tanh}(X_t W_{xc} + H_{t-1} W_{hc} + b_c)
$$

> *Reference*
>
> *1. Dive Into Deep Learning (https://zh.d2l.ai/chapter_recurrent-modern/lstm.html)*
>

In [42]:
# Build LSTM
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(LSTM,self).__init__()
        
        self.hidden_size = hidden_size #?
        
        self.X2F = nn.Linear(in_features = input_size + hidden_size,
                             out_features = hidden_size )
        self.X2I = nn.Linear(in_features = input_size + hidden_size,
                             out_features = hidden_size )
        self.X2O = nn.Linear(in_features = input_size + hidden_size,
                             out_features = hidden_size )
        
        self.X2Ct = nn.Linear(in_features = input_size + hidden_size,
                             out_features = hidden_size )
        
        self.O2O = nn.Linear(in_features = hidden_size,
                             out_features = hidden_size)
        
        self.dropout = nn.Dropout(0.1)
        self.logsoftmax = nn.LogSoftmax(dim=1)
        
    def forward(self, input, hidden, MemCell):
        
        input_combined = torch.cat((input,hidden),1)     #? how
        
        ForgetGate = torch.sigmoid(self.X2F(input_combined))   
        InputGate = torch.sigmoid(self.X2I(input_combined))
        OutputGate = torch.sigmoid(self.X2O(input_combined))
        
        CandidateMemCell = torch.tanh(self.X2Ct(input_combined))
        
        MemCell = ForgetGate * MemCell + InputGate * CandidateMemCell
        
        hidden = OutputGate * torch.tanh(MemCell)
        print(f"hidden:{hidden.shape}")
        
        output = self.O2O(hidden)
        output = self.dropout(output)
        output = self.logsoftmax(output)
        
        return output, hidden, MemCell
    
    def init_hidden(self, batch_size, device):
        return torch.zeros((batch_size, self.hidden_size), device = device)

    def init_MemCell(self, batch_size, device):
        return torch.zeros((batch_size, self.hidden_size), device = device)


In [35]:
n_hidden = 64
lstm = LSTM(charset_size, n_hidden)
def name2tensor(name):
    """将名字转换为 one-hot 编码的张量"""
    tensor = torch.zeros(len(name), 1, charset_size) #一个tensor（其实是2d矩阵的感觉），第一个维度是名字的长度（名字中的各个字符），第二个维度是1，第三个维度是每个字符的onehot
    for i, char in enumerate(name): #enmuerate
        tensor[i, 0, char2index(char)] = 1
    return tensor

input = name2tensor("leon")
hidden = lstm.init_hidden(batch_size=1, device='cpu')
MemCell = lstm.init_MemCell(batch_size=1, device='cpu')


In [43]:
output, next_hidden = lstm(input[0], hidden, MemCell)
print(output)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\xinby\AppData\Roaming\Python\Python311\site-packages\IPython\core\interactiveshell.py", line 3508, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\xinby\AppData\Local\Temp\ipykernel_18768\1103372874.py", line 1, in <module>
    output, next_hidden = lstm(input[0], hidden, MemCell)
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\xinby\.conda\envs\win_DL_Q\Lib\site-packages\torch\nn\modules\module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xinby\AppData\Local\Temp\ipykernel_18768\3351133768.py", line 38, in forward
    output = self.O2O(hidden)
             ^^^^^^^^^^^^^^^^
  File "c:\Users\xinby\.conda\envs\win_DL_Q\Lib\site-packages\torch\nn\modules\module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\xinby\