# LSTM

参考：https://blog.floydhub.com/a-beginners-guide-on-recurrent-neural-networks-with-pytorch/

In [1]:
import torch
from torch import nn

import numpy as np

In [5]:
text = ['hey how are you',
        'good i am fine',
        'have a nice day']

# 所有字母去重放入chars中
chars = set(''.join(text))

In [4]:
chars

{' ',
 'a',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'm',
 'n',
 'o',
 'r',
 'u',
 'v',
 'w',
 'y'}

In [8]:
# 建立一个字典，key是index，value是字母
int2char = dict(enumerate(chars))

# 建立一个字典，key是字母，value是数字，反过来
char2int = {char: ind for ind, char in int2char.items()}

In [11]:
char2int

{'i': 0,
 'c': 1,
 'd': 2,
 'n': 3,
 'y': 4,
 'u': 5,
 'h': 6,
 'o': 7,
 'w': 8,
 'r': 9,
 ' ': 10,
 'g': 11,
 'v': 12,
 'e': 13,
 'm': 14,
 'f': 15,
 'a': 16}

### 输入处理

我们会padding句子保证所有句子都是一样长的。

In [14]:
# 15个字符
maxlen = len(max(text, key=len))

print(f"The longest string has {maxlen} characters")

The longest string has 15 characters


In [15]:
# Padding

# 遍历句子增加空格
for i in range(len(text)):
    while len(text[i]) < maxlen:
        text[i] += ' '

- 输入数据
  - 最后一个字符排除在外，因为不需要输入模型
- Target
  - 输入数据之前的一个时间步，这是将模型中于输入数据对应的每个时间步的正确答案？

In [68]:
# Creating lists that will hold our input and target sequences
input_seq = []
target_seq = []

for i in range(len(text)):
    # 移除输入句子的最后一个字母
    input_seq.append(text[i][:-1])
    
    # 移除目标句子的第一个字母
    target_seq.append(text[i][1:])
    print(f"Input Sequence: {input_seq[i]}\nTarget Sequence: {target_seq[i]}")

Input Sequence: hey how are yo
Target Sequence: ey how are you
Input Sequence: good i am fine
Target Sequence: ood i am fine 
Input Sequence: have a nice da
Target Sequence: ave a nice day


使用之前建立的字典，把这些字母变成数字。

In [69]:
for i in range(len(text)):
    input_seq[i] = [char2int[character] for character in input_seq[i]]
    target_seq[i] = [char2int[character] for character in target_seq[i]]

In [70]:
input_seq

[[6, 13, 4, 10, 6, 7, 8, 10, 16, 9, 13, 10, 4, 7],
 [11, 7, 7, 2, 10, 0, 10, 16, 14, 10, 15, 0, 3, 13],
 [6, 16, 12, 13, 10, 16, 10, 3, 0, 1, 13, 10, 2, 16]]

In [71]:
target_seq

[[13, 4, 10, 6, 7, 8, 10, 16, 9, 13, 10, 4, 7, 5],
 [7, 7, 2, 10, 0, 10, 16, 14, 10, 15, 0, 3, 13, 10],
 [16, 12, 13, 10, 16, 10, 3, 0, 1, 13, 10, 2, 16, 4]]

定义三个关键变量：

- *dict_size*：文本中唯一字符的数量，用来确定 onehot 矢量的大小。
- *seq_len*：输入模型的序列的长度。
- *batch_size*：batch，句子的数量。

In [72]:
dict_size = len(char2int)
seq_len = maxlen - 1
batch_size = len(text)

def one_hot_encode(sequence, dict_size, seq_len, batch_size):
    # 创建具有所需输出形状的全为零的多维数组
    # 这里就是基于字母级别的，全部变为0
    features = np.zeros((batch_size, seq_len, dict_size), dtype=np.float32)
    
    # 把对应字母的位置置为1|
    for i in range(batch_size):
        for u in range(seq_len):
            features[i, u, sequence[i][u]] = 1
    return features

In [73]:
input_seq = one_hot_encode(input_seq,  # 见上面
                           dict_size,  # 17个字母
                           seq_len,  # 最长14个字母一句话
                           batch_size)  # 一个batch三句话

print("Input shape: {} --> (Batch Size, Sequence Length, One-Hot Encoding Size)".format(input_seq.shape))

Input shape: (3, 14, 17) --> (Batch Size, Sequence Length, One-Hot Encoding Size)


可以看出已经变成onehot了，embedding完成。

In [74]:
input_seq

array([[[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
         0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
         0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
         0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
         0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
         0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
         0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
         0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.

In [75]:
input_seq = torch.from_numpy(input_seq)
target_seq = torch.Tensor(target_seq)

放入GPU

In [76]:
# torch.cuda.is_available() checks and returns a Boolean True if a GPU is available, else it'll return False
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU is available


### 模型结构

In [77]:
class Model(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, n_layers):
        super(Model, self).__init__()

        # Defining some parameters
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        #Defining the layers
        # RNN Layer
        self.rnn = nn.RNN(input_size,  # input_size 
                          hidden_dim,  # hidden_size
                          n_layers,  # 层数
                          batch_first=True)  # 
        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, output_size)
    
    def forward(self, x):
        
        batch_size = x.size(0)  # 输入的第一个维度，就是len(text)

        #Initializing hidden state for first input using method defined below
        hidden = self.init_hidden(batch_size)  # 隐藏层初始状态

        # Passing in the input and hidden state into the model and obtaining outputs
        out, hidden = self.rnn(x, hidden)  # 隐藏层初始状态
        
        # Reshaping the outputs such that it can be fit into the fully connected layer
        # contiguous()用于整块内存调用，优化的操作
        out = out.contiguous().view(-1, self.hidden_dim)
        out = self.fc(out)  # 全连接输出
        
        return out, hidden
    
    def init_hidden(self, batch_size):
        # 隐藏层初始状态
        # This method generates the first hidden state of zeros which we'll use in the forward pass
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(device)
         # We'll send the tensor holding the hidden state to the device we specified earlier as well
        return hidden

定义超参数：

- n_epochs：训练整个数据集的次数
- lr：学习率


In [78]:
# Instantiate the model with hyperparameters
model = Model(input_size=dict_size,  # 字典大小
              output_size=dict_size,  # 字典大小
              hidden_dim=12,  # 隐藏层size
              n_layers=1)  # rnn层数
# We'll also set the model to the device that we defined earlier (default is CPU)

model = model.to(device)

# Define hyperparameters
n_epochs = 100
lr=0.01

# Define Loss, Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

训练模型：

In [79]:
# Training Run
input_seq = input_seq.to(device)
for epoch in range(1, n_epochs + 1):
    optimizer.zero_grad() # Clears existing gradients from previous epoch
    #input_seq = input_seq.to(device)
    output, hidden = model(input_seq)
    output = output.to(device)
    target_seq = target_seq.to(device)
    loss = criterion(output, target_seq.view(-1).long())
    loss.backward() # Does backpropagation and calculates gradients
    optimizer.step() # Updates the weights accordingly
    
    if epoch%10 == 0:
        print('Epoch: {}/{}.............'.format(epoch, n_epochs), end=' ')
        print("Loss: {:.4f}".format(loss.item()))

Epoch: 10/100............. Loss: 2.4097
Epoch: 20/100............. Loss: 2.0579
Epoch: 30/100............. Loss: 1.6524
Epoch: 40/100............. Loss: 1.2506
Epoch: 50/100............. Loss: 0.8827
Epoch: 60/100............. Loss: 0.5947
Epoch: 70/100............. Loss: 0.3979
Epoch: 80/100............. Loss: 0.2752
Epoch: 90/100............. Loss: 0.1996
Epoch: 100/100............. Loss: 0.1527
