In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [19]:
# 读取processed_data
x_train_bow = np.load('./processed_data/xtrain_bow.npy')
x_test_bow = np.load('./processed_data/xtest_bow.npy')
y_train_bow_tf = np.load('./processed_data/ytrain.npy')
y_test_bow_tf = np.load('./processed_data/ytest.npy')
x_train_tf = np.load('./processed_data/xtrain_tf.npy')
x_test_tf = np.load('./processed_data/xtest_tf.npy')
x_train_vec = np.load('./processed_data/xtrain_vec.npy')
x_test_vec = np.load('./processed_data/xtest_vec.npy')
y_train_vec = np.load('./processed_data/ytrain_vec.npy')
y_test_vec = np.load('./processed_data/ytest_vec.npy')

In [20]:
# 读取word2idx文件
import pickle
with open('./models/word2idx.pkl','rb') as f:
    word2idx = pickle.load(f)

In [21]:
# 查看一下规模
print(x_train_bow.shape)
print(x_train_tf.shape)
print(x_train_vec.shape)

(5600, 15244)
(5600, 15244)
(5600, 50)


In [22]:
# 查看前十行
print(x_train_bow[:10])
print(x_train_tf[:10])
print(x_train_vec[:10])

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0  304   15 2682  281    3  141    4  725
     9 4158  214 5767  281    0    0    6   76    3  966   70  281  195
  4061 1618  117  912 3048    5 2682  281]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0   23   89   59  111   23    2 1706   33  290  430 1315
     3   53    4    7    5  332 1706  968   81    6  115 3395   10   21
     4  234    0    0 1058   49  663   70]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0  969    0   83   23 

In [23]:
# 查看最底层数字的数据类型
print(x_train_bow[0][0].dtype)
print(x_train_tf[0][0].dtype)
print(x_train_vec[0][0].dtype)

int64
float64
int32


# 训练LSTM

In [24]:
# 需要下载pytorch GPU版本，配合cuda，不然训练速度可能较慢
# 参考：https://zhuanlan.zhihu.com/p/106133822 进行安装相应版本

import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
# 使用GPU
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU is not available, CPU is used") 

GPU is available


In [25]:
# TensorDataset 就类似zip变为元组形式
# DataLoader 就是类似list进行不过就是将一个batch打包在一起加速，元素变为一个batch了

# 从 x_test_vec 中使用随机数切分为验证集和测试集

# 随机数种子
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

# 随机数切分
from sklearn.model_selection import train_test_split
x_test_vec, x_val_vec, y_test_vec, y_val_vec = train_test_split(x_test_vec, y_test_vec, test_size=0.1, random_state=seed)

# 转换为TensorDataset
train_data_vec = TensorDataset(torch.from_numpy(x_train_vec).type(torch.Tensor), torch.from_numpy(y_train_vec).type(torch.Tensor))
val_data_vec = TensorDataset(torch.from_numpy(x_val_vec).type(torch.Tensor), torch.from_numpy(y_val_vec).type(torch.Tensor))
test_data_vec = TensorDataset(torch.from_numpy(x_test_vec).type(torch.Tensor), torch.from_numpy(y_test_vec).type(torch.Tensor))

batch_size = 40

# 设置drop_last 防止最后一个batch不够batch_size，导致数据量不对
train_loader_vec = DataLoader(train_data_vec, shuffle=True, batch_size=batch_size,drop_last=True)
val_loader_vec = DataLoader(val_data_vec, shuffle=True, batch_size=batch_size,drop_last=True)
test_loader_vec = DataLoader(test_data_vec, shuffle=True, batch_size=batch_size,drop_last=True)

In [26]:
# 查看数据训练的规模
print(train_loader_vec.dataset.tensors[0].shape)
print(val_loader_vec.dataset.tensors[0].shape)
print(test_loader_vec.dataset.tensors[0].shape)

torch.Size([5600, 50])
torch.Size([240, 50])
torch.Size([2160, 50])


In [27]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        super(LSTM, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x, hidden):
        batch_size = x.size(0)
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        out = self.dropout(lstm_out)
        out = self.fc(out)
        out = self.sigmoid(out)
        
        out = out.view(batch_size, -1)
        out = out[:,-1]
        return out, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device))
        return hidden

In [35]:
vocab_size = len(word2idx) + 1
output_size = 1
embedding_dim = 400
hidden_dim = 256
n_layers = 2

model = LSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
model.to(device)

lr=0.005
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [36]:
epochs = 3
counter = 0
clip = 5
valid_loss_min = np.Inf

if(torch.cuda.is_available()):
    model.cuda()

model.train()

for i in range(epochs):    
    h = model.init_hidden(batch_size)
    
    for inputs, labels in train_loader_vec:
        counter += 1
        h = tuple([e.data for e in h])
        inputs, labels = inputs.to(device), labels.to(device)
        model.zero_grad()
        output, h = model(inputs, h)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        if counter % 100 == 0 :
            val_h = model.init_hidden(batch_size)
            val_losses = []
            model.eval()
            for inp, lab in val_loader_vec:
                val_h = tuple([each.data for each in val_h])
                inp, lab = inp.to(device), lab.to(device)
                out, val_h = model(inp, val_h)
                val_loss = criterion(out.squeeze(), lab.float())
                val_losses.append(val_loss.item())
                
            model.train()
            print("Epoch: {}/{}...".format(i+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))
            if np.mean(val_losses) <= valid_loss_min:
                torch.save(model.state_dict(), './models/state_dict.pt')
                print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,np.mean(val_losses)))
                valid_loss_min = np.mean(val_losses)
        

Epoch: 1/3... Step: 100... Loss: 0.501291... Val Loss: 0.549179
Validation loss decreased (inf --> 0.549179).  Saving model ...
Epoch: 2/3... Step: 200... Loss: 0.348684... Val Loss: 0.484888
Validation loss decreased (0.549179 --> 0.484888).  Saving model ...
Epoch: 3/3... Step: 300... Loss: 0.422833... Val Loss: 0.618378
Epoch: 3/3... Step: 400... Loss: 0.208310... Val Loss: 0.497837


In [37]:
# 读取最优模型
model.load_state_dict(torch.load('./models/state_dict.pt'))


# 检测模型的准确率，用于微调模型，初步查看以下
test_losses = []
num_correct = 0
h = model.init_hidden(batch_size)

model.eval()
for inputs, labels in test_loader_vec:
    h = tuple([each.data for each in h])
    inputs, labels = inputs.to(device), labels.to(device)
    output, h = model(inputs, h)
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    pred = torch.round(output.squeeze())  # Rounds the output to 0/1
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)

print("Test loss: {:.3f}".format(np.mean(test_losses)))
test_acc = num_correct/len(test_loader_vec.dataset)
print("Test accuracy: {:.3f}%".format(test_acc*100))

Test loss: 0.494
Test accuracy: 79.537%
