### 加载数据集

In [44]:
from utils import load_corpus, stopwords

# Locations of the weibo2018 corpus splits, relative to the notebook directory.
TRAIN_PATH = "./data/weibo2018/train.txt"   # training split
TEST_PATH = "./data/weibo2018/test.txt"     # test split

In [45]:
# Load the training split first, then the test split, with the project's corpus loader.
train_data, test_data = [load_corpus(path) for path in (TRAIN_PATH, TEST_PATH)]

In [46]:
import pandas as pd

# Wrap both splits in DataFrames with identical ("text", "label") columns.
df_train, df_test = (
    pd.DataFrame(rows, columns=["text", "label"])
    for rows in (train_data, test_data)
)
df_train.head()

Unnamed: 0,text,label
0,书中 自有 黄金屋 书中 自有 颜如玉 沿着 岁月 的 长河 跋涉 或是 风光旖旎 或是 姹...,1
1,这是 英超 被 黑 的 最惨 的 一次 二哈 二哈 十几年来 中国 只有 孙继海 董方卓 郑...,0
2,中国 远洋 海运 集团 副总经理 俞曾 港 月 日 在 上 表示 中央 企业 走 出去 是 ...,1
3,看 流星花园 其实 也 还好 啦 现在 的 观念 以及 时尚 眼光 都 不一样 了 或许 十...,1
4,汉武帝 的 罪己 诏 的 真实性 尽管 存在 着 争议 然而 轮台 罪己 诏 作为 中国 历...,1


### 训练词向量

In [47]:
# Word2Vec expects each sentence as a list of tokens; the text column holds
# space-separated tokens, so a plain split on " " is sufficient.
wv_input = df_train["text"].map(lambda sentence: sentence.split(" "))
wv_input.head()

0    [书中, 自有, 黄金屋, 书中, 自有, 颜如玉, 沿着, 岁月, 的, 长河, 跋涉, ...
1    [这是, 英超, 被, 黑, 的, 最惨, 的, 一次, 二哈, 二哈, 十几年来, 中国,...
2    [中国, 远洋, 海运, 集团, 副总经理, 俞曾, 港, 月, 日, 在, 上, 表示, ...
3    [看, 流星花园, 其实, 也, 还好, 啦, 现在, 的, 观念, 以及, 时尚, 眼光,...
4    [汉武帝, 的, 罪己, 诏, 的, 真实性, 尽管, 存在, 着, 争议, 然而, 轮台,...
Name: text, dtype: object

In [50]:
from gensim import models

# Word2Vec
import gensim

# gensim 4.0 renamed two constructor keywords (size -> vector_size, iter -> epochs);
# the old names raise TypeError there. Pick the spelling the installed release accepts.
if int(gensim.__version__.split(".")[0]) >= 4:
    _w2v_kwargs = {"vector_size": 64, "epochs": 1000}
else:
    _w2v_kwargs = {"size": 64, "iter": 1000}

# 64-dimensional vectors trained for 1000 passes over the corpus.
# min_count=1 keeps every token because the corpus is small.
word2vec = models.Word2Vec(wv_input, min_count=1, **_w2v_kwargs)

查找近义词, 直观感受训练得到的word2vec效果

In [51]:
# Ten nearest neighbours of "哈哈" by cosine similarity.
word2vec.wv.most_similar(positive=["哈哈"], topn=10)

[('哈哈哈', 0.7510852813720703),
 ('啦', 0.5347838997840881),
 ('差真萌', 0.5331387519836426),
 ('可爱', 0.4959038496017456),
 ('本柔', 0.4910407066345215),
 ('哈哈哈哈', 0.484031081199646),
 ('图留', 0.47534051537513733),
 ('李洛书', 0.4709894359111786),
 ('啊', 0.4615936875343323),
 ('今天', 0.45326608419418335)]

In [52]:
# Ten nearest neighbours of "伤心" by cosine similarity.
word2vec.wv.most_similar(positive=["伤心"], topn=10)

[('难过', 0.7424141764640808),
 ('哭', 0.6389803290367126),
 ('想', 0.6264787912368774),
 ('痛苦', 0.5951669812202454),
 ('真的', 0.5945923328399658),
 ('也', 0.592292308807373),
 ('对不起', 0.5748379826545715),
 ('孤独', 0.5715669393539429),
 ('的', 0.5705571174621582),
 ('遗憾', 0.5697544813156128)]

### 神经网络

In [53]:
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence,pad_packed_sequence
from torch.utils.data import Dataset, DataLoader

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [54]:
# Hyperparameters
# -- optimisation --
learning_rate = 0.0005   # Adam step size
num_epoches = 5          # full passes over the training set
batch_size = 100         # samples per mini-batch

# -- model --
input_size = 768         # NOTE(review): not referenced by any visible cell - confirm it can go
embed_size = 64          # LSTM input width; must equal the Word2Vec vector dimension
hidden_size = 64         # LSTM hidden units per direction
num_layers = 2           # stacked LSTM layers

In [55]:
# Dataset
class MyDataset(Dataset):
    """Sentences converted to sequences of Word2Vec vectors, paired with labels.

    :param df: DataFrame with a "text" column (space-separated tokens) and a
        "label" column.

    Each sample is a ``(seq_len, vector_size)`` float tensor built from the
    tokens that the module-level ``word2vec`` model knows; unknown tokens are
    dropped. A sentence with no known token is represented by a single zero
    vector so that every sample has length >= 1 (zero-length sequences cannot
    be padded together with 2-D samples or packed for the LSTM).
    """

    def __init__(self, df):
        self.data = []
        self.label = df["label"].tolist()
        keyed_vectors = word2vec.wv
        for s in df["text"].tolist():
            # `w in keyed_vectors` is a hash lookup; scanning the `index2word`
            # list costs O(vocabulary) per token and the attribute is gone in gensim 4.
            vectors = [keyed_vectors[w] for w in s.split(" ") if w in keyed_vectors]
            if vectors:
                self.data.append(torch.Tensor(vectors))
            else:
                self.data.append(torch.zeros(1, keyed_vectors.vector_size))

    def __getitem__(self, index):
        """Return the ``(vectors, label)`` pair stored at ``index``."""
        return self.data[index], self.label[index]

    def __len__(self):
        """Number of samples."""
        return len(self.label)

def collate_fn(data):
    """
    Collate variable-length samples into one padded batch.

    :param data: list of samples; element 0 of each sample is the sequence
        tensor, element 1 is its label
    :return: ``(padded, labels, lengths)`` - the zero-padded batch-first
        tensor, the labels as a float32 tensor, and the list of true sequence
        lengths. All three are ordered by decreasing sequence length, which is
        what pack_padded_sequence expects by default.
    """
    # sorted() returns a new list, so the caller's batch is left untouched.
    ordered = sorted(data, key=lambda sample: len(sample[0]), reverse=True)
    sequences = [sample[0] for sample in ordered]
    labels = [sample[1] for sample in ordered]
    data_length = [len(seq) for seq in sequences]
    # Padding to a common length is required before the batch can be packed for the RNN.
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded, torch.tensor(labels, dtype=torch.float32), data_length


# Training set: reshuffled every epoch.
# Note: `train_data` / `test_data` are rebound here from the raw corpora to Dataset objects.
train_data = MyDataset(df_train)
train_loader = DataLoader(train_data, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)

# Test set: evaluation metrics do not depend on sample order, so keep the order fixed.
test_data = MyDataset(df_test)
test_loader = DataLoader(test_data, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)

In [56]:
# Network architecture
class LSTM(nn.Module):
    """Bidirectional multi-layer LSTM followed by a sigmoid-activated linear unit.

    :param input_size: dimension of each input vector
    :param hidden_size: hidden units per direction
    :param num_layers: number of stacked LSTM layers
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, 1)  # bidirectional, so the feature width doubles
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, lengths):
        """Return one probability per sequence.

        :param x: padded batch-first tensor ``(batch, max_len, input_size)``
        :param lengths: true length of every sequence, in decreasing order
            (pack_padded_sequence is called with its default enforce_sorted=True)
        :return: tensor of shape ``(batch, 1)`` with values in (0, 1)
        """
        packed_input = torch.nn.utils.rnn.pack_padded_sequence(input=x, lengths=lengths, batch_first=True)
        # nn.LSTM starts from zero hidden/cell states when none are passed, created on the
        # module's own device - no hand-built h0/c0 and no dependence on a global `device`.
        _, (h_n, _) = self.lstm(packed_input)

        # h_n[-2] / h_n[-1] are the final hidden states of the top layer's forward and
        # backward directions; because the input is packed, padding never reaches them.
        lstm_out = torch.cat([h_n[-2], h_n[-1]], 1)
        out = self.fc(lstm_out)
        return self.sigmoid(out)

# Build the classifier and move its parameters to the device the batches are sent to;
# without this a CUDA run fails because inputs and weights live on different devices.
lstm = LSTM(embed_size, hidden_size, num_layers).to(device)

In [57]:
from sklearn import metrics

# Evaluate on the test set
def test():
    """Run the global ``lstm`` over ``test_loader`` and print evaluation metrics.

    Prints sklearn's classification report, the accuracy and the ROC AUC.
    Predictions are thresholded at 0.5. The model is switched to eval mode for
    the pass and restored to its previous mode afterwards.
    """
    prob_chunks, label_chunks = [], []

    was_training = lstm.training
    lstm.eval()
    try:
        with torch.no_grad():
            for x, labels, lengths in test_loader:
                outputs = lstm(x.to(device), lengths)          # forward pass
                # flatten to (batch,) and bring back to the CPU: sklearn cannot read CUDA tensors
                prob_chunks.append(outputs.view(-1).cpu())
                label_chunks.append(labels)
    finally:
        lstm.train(was_training)

    y_prob = torch.cat(prob_chunks).numpy()
    y_true = torch.cat(label_chunks).numpy()
    y_pred = (y_prob > 0.5).astype(y_prob.dtype)   # 1.0 above the threshold, 0.0 otherwise

    print(metrics.classification_report(y_true, y_pred))
    print("准确率:", metrics.accuracy_score(y_true, y_pred))
    print("AUC:", metrics.roc_auc_score(y_true, y_prob) )

In [58]:
# Loss and optimiser: binary cross-entropy on the model's sigmoid outputs, minimised with Adam.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(params=lstm.parameters(), lr=learning_rate)

In [59]:
# Training loop
import os

log_every = 10   # report the mean loss every `log_every` steps
for epoch in range(num_epoches):
    lstm.train()
    total_loss = 0.0
    for i, (x, labels, lengths) in enumerate(train_loader):
        x = x.to(device)
        labels = labels.to(device)
        outputs = lstm(x, lengths)          # forward pass
        probs = outputs.view(-1)            # flatten (batch, 1) -> (batch,); sigmoid outputs, not logits
        loss = criterion(probs, labels)     # binary cross-entropy
        # .item() detaches the value; summing the tensor itself would keep every
        # step's autograd graph alive until the running total is reset.
        total_loss += loss.item()
        optimizer.zero_grad()               # clear gradients from the previous step
        loss.backward()                     # each graph is used once, so retain_graph is not needed
        optimizer.step()                    # parameter update
        if (i+1) % log_every == 0:
            print("epoch:{}, step:{}, loss:{}".format(epoch+1, i+1, total_loss/log_every))
            total_loss = 0.0

    # evaluate after every epoch
    test()

    # save the whole model object (the inference cell loads it with torch.load)
    model_path = "./model/lstm_{}.model".format(epoch+1)
    os.makedirs(os.path.dirname(model_path), exist_ok=True)   # do not lose an epoch to a missing directory
    torch.save(lstm, model_path)
    print("saved model: ", model_path)

epoch:1, step:10, loss:0.6904367208480835
epoch:1, step:20, loss:0.6728327870368958
epoch:1, step:30, loss:0.6429343223571777
epoch:1, step:40, loss:0.5999282002449036
epoch:1, step:50, loss:0.514881432056427
epoch:1, step:60, loss:0.5138906240463257
epoch:1, step:70, loss:0.47093087434768677
epoch:1, step:80, loss:0.4633955955505371
epoch:1, step:90, loss:0.4423191547393799
epoch:1, step:100, loss:0.4551025331020355
              precision    recall  f1-score   support

         0.0       0.68      0.85      0.76       155
         1.0       0.92      0.82      0.87       345

    accuracy                           0.83       500
   macro avg       0.80      0.84      0.81       500
weighted avg       0.85      0.83      0.83       500

准确率: 0.83
AUC: 0.9116409537166901
saved model:  ./model/lstm_1.model
epoch:2, step:10, loss:0.4156826138496399
epoch:2, step:20, loss:0.4121144711971283
epoch:2, step:30, loss:0.41760390996932983
epoch:2, step:40, loss:0.4073079526424408
epoch:2, step:

### 手动输入句子，判断情感倾向（1正/0负）

In [40]:
# Load the checkpoint written after epoch 5 (described by the author as the best one).
# map_location lets a model saved on a GPU load on a CPU-only machine; eval() puts the
# module in inference mode.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True, which
# rejects fully pickled modules - confirm against the installed version.
net = torch.load("./model/lstm_5.model", map_location=device).eval()

In [43]:
from utils import processing

strs = ["我想说我会爱你多一点点", "日有所思梦感伤"]

data = []
for s in strs:
    # Replace every known token by its word vector (hash lookup on the KeyedVectors).
    vectors = [word2vec.wv[w] for w in processing(s).split(" ") if w in word2vec.wv]
    if vectors:
        data.append(torch.Tensor(vectors))
    else:
        # no known token: a single zero vector keeps the sequence length >= 1
        data.append(torch.zeros(1, word2vec.wv.vector_size))

# collate_fn reorders the batch by decreasing length. Passing each sentence's original
# position as its "label" lets the predictions be put back in the order of `strs`.
x, order, lengths = collate_fn(list(zip(data, range(len(strs)))))

net.to(device)
net.eval()
with torch.no_grad():
    x = x.to(device)
    sorted_outputs = net(x, lengths).view(-1).cpu()   # use the loaded checkpoint, flattened to (batch,)

outputs = torch.empty_like(sorted_outputs)
outputs[order.long()] = sorted_outputs                # outputs[i] now belongs to strs[i]
outputs

tensor([0.9937, 0.7252])