**尝试了 tfidf + 各种机器学习模型，分数一直在0.93+ 左右**

**尝试了 BiRNN 和 TextCNN，其中TextCNN 分数在0.94～0.95徘徊，可能是没有仔细调参的原因**

**最后梭哈的单模型 BiRNN，采用不同学习率，最后一分钟提交的最高分数0.95686，排行榜第9，侥幸杀进Top10**

## Best Score Code - BiRNN

In [24]:
import collections
import os
import random
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
from tqdm import tqdm
import time
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd
import jieba


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [25]:
# 读取数据
def read_comments(fname):
    data = []
    with open(fname, 'r', encoding="utf-8") as f:
        lines = f.readlines()
        for line in lines:
            label, text = line.strip().split('\t')
            data.append([text, int(label)])
    random.shuffle(data)
    return data
total_data = read_comments("./data/Comments/train_shuffle.txt")
train_data = total_data[:16000]  # 调参完毕改成全部的数据，调参时用all_data[:14000]，划分训练集和验证集
valid_data = total_data[14000:]

In [26]:
max_len = 19  # 最大长度
# 预处理数据
def get_tokenized_comments(data):
    def tokenizer(text):
        return list(text)
    return [tokenizer(comment) for comment, _ in data]

In [27]:
def get_vocab_comments(data):
    tokenized_data = get_tokenized_comments(data)
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    return Vocab.Vocab(counter, min_freq=2)


def preprocess_comments(data, vocab, train_data=True):
    def pad(x): 
        # 0 为 <unk> 即未登录次，1为<pad>, 即填充。但是wiki预训练的词向量中 <unk>和<pad>均是全0
        # 故这里填0 或 1 均可，严格来说应该填 1
        return x[:max_len] if len(x) > max_len else x + [1] * (max_len - len(x))
    if train_data:
        tokenized_data = get_tokenized_comments(data)
        features = torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data])
        labels = torch.tensor([score for _, score in data])
    else:
        tokenized_data = [list(line.strip()) for line in data]
        features = torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data])
        labels = None
    return features, labels

In [28]:
# 创建数据集
vocab = get_vocab_comments(train_data)
print(len(vocab))
batch_size = 32
train_set = Data.TensorDataset(*preprocess_comments(train_data, vocab))
valid_set = Data.TensorDataset(*preprocess_comments(valid_data, vocab))
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
valid_iter = Data.DataLoader(valid_set, batch_size)

for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
    break
'#batches:', len(train_iter)

2015
X torch.Size([32, 19]) y torch.Size([32])


('#batches:', 500)

In [29]:
# RNN
class BiRNN(torch.nn.Module):
    def __init__(self, vocab, embed_size, num_hiddens, num_layers):
        super(BiRNN, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)
        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=num_hiddens,
                               num_layers=num_layers,
                               bidirectional=True)
        self.dropout = nn.Dropout(p=0.1)
        self.decoder = nn.Linear(4*num_hiddens, 2)

    def forward(self, inputs):
        embeddings = self.embedding(inputs.permute(1, 0))
        outputs, _ = self.encoder(embeddings) # output, (h, c)
        encoding = torch.cat((outputs[0], outputs[-1]), -1)
        dropout = self.dropout(encoding)
        outs = self.decoder(dropout)
        return outs

In [30]:
embed_size, num_hiddens, num_layers = 300, 50, 2
net = BiRNN(vocab, embed_size, num_hiddens, num_layers)  # 构建model

# gensim 加载预训练的词向量

In [31]:
import gensim
from gensim.models import KeyedVectors

# 用load_word2vec_format快速打开词向量
w2v = KeyedVectors.load_word2vec_format('./data/Comments/sgns.wiki.word')

In [32]:
def load_pretrained_embedding(words, pretrained_vocab):
    '''
    @params:
        words: 需要加载词向量的词语列表，以 itos (index to string) 的词典形式给出
        pretrained_vocab: 预训练词向量
    @return:
        embed: 加载到的词向量
    '''
    embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0]) # 初始化为0
    oov_count = 0 # out of vocabulary
    for i, word in enumerate(words):
        if word not in pretrained_vocab:
            embed[i,:] = torch.Tensor([0]*pretrained_vocab.vectors[0].shape[0])
        else:
            embed[i,:] = torch.Tensor(pretrained_vocab[word])
    if oov_count > 0:
        print("There are %d oov words." % oov_count)
    else:
        print("Yeah... There is NO oov word.")
    return embed

In [33]:
net.embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, w2v))
net.embedding.weight.requires_grad = False # 直接加载预训练好的, 所以不需要更新它 -- 也可以使embedding matrix一起更新

Yeah... There is NO oov word.


# 训练Model

In [34]:
# 准确度评价指标
def evaluate_accuracy(data_iter, net, device=None):
    if device is None and isinstance(net, torch.nn.Module):
        # 如果没指定device就使用net的device
        device = list(net.parameters())[0].device
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
            net.eval() # 评估模式, 这会关闭dropout
            acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
            net.train() # 改回训练模式
            n += y.shape[0]
    return acc_sum / n

In [35]:
def softmax(X):
    X_exp = X.exp()
    partition = X_exp.sum(dim=1, keepdim=True)
    return X_exp / partition


def evaluate_auc(data_iter, net, device=None):
    if device is None:
        device = list(net.parameters())[0].device
    y_true, y_hat = np.zeros(0), np.zeros(0)
    with torch.no_grad():
        for X, y in data_iter:
            net.eval() # 评估模式, 这会关闭dropout
            y_hat = np.concatenate([y_hat, softmax(net(X.to(device)).detach().cpu())[:,1].numpy()])
            y_true = np.concatenate([y_true, y.cpu().numpy()])
            net.train() # 改回训练模式
    return roc_auc_score(y_true, y_hat), y_hat

In [36]:
# 训练函数
def train(train_iter, test_iter, net, loss, optimizer, device, num_epochs):
    net = net.to(device)
    print("training on ", device)
    batch_count = 0
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for X, y in train_iter:
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        train_auc, _ = evaluate_auc(train_iter, net)
        test_acc = evaluate_accuracy(test_iter, net)
        test_auc, _ = evaluate_auc(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, train auc %.3f, test auc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, train_auc, test_auc, time.time() - start))

# 调参

### 即尝试不同的超参的值

简单说明几点：

- 文本长度 max_len 也属于超参，之所以调整文本长度，是为了更好的并行
- 一般假设训练数据和测试数据满足**独立同分布**，这里训练集和测试集都是短文本，最大长度为19
- 按字embedding 比切词embedding要好一些，可能的数据量较小的原因
- 也可能是数据量小的原因，模型迭代3次便开始过拟合
- batch_size 一般是越大梯度下降的越精确，但是本任务中发现32的batch 略优于 64的batch
- wiki预训练的词向量作为参数跟着一起train 略优于 fixed这些词向量
- 用sigmoid输出数据（直接输出一个概率值，表示为1的概率） 分数略低于 softmax处理（分别输出0、1的概率）
- 学习率的调整：先用0.001的学习率 train 3次，在用0.0002的学习率train2次
- lstm层数
- 尝试了Attention机制，但是由于当时时间比较紧迫，第一次的分数约在0.94+，就没深入调参。

# 调参完成

In [37]:
# 0.001的学习率先训练3次
lr, num_epochs = 0.001, 3
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
train(train_iter, valid_iter, net, loss, optimizer, device, num_epochs)  # 全部的数据参与训练，验证集还是最后2k的数据

training on  cpu
epoch 1, loss 0.3619, train acc 0.836, test acc 0.890, train auc 0.953, test auc 0.955, time 21.9 sec
epoch 2, loss 0.1299, train acc 0.892, test acc 0.912, train auc 0.964, test auc 0.966, time 21.3 sec
epoch 3, loss 0.0763, train acc 0.909, test acc 0.920, train auc 0.972, test auc 0.972, time 21.3 sec


In [38]:
# 0.0002的学习率再训练2次
net.train()
lr, num_epochs = 0.0002, 2
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
train(train_iter, valid_iter, net, loss, optimizer, device, num_epochs)  # 全部的数据参与训练，验证集还是最后2k的数据

training on  cpu
epoch 1, loss 0.1881, train acc 0.926, test acc 0.928, train auc 0.978, test auc 0.979, time 21.0 sec
epoch 2, loss 0.0876, train acc 0.932, test acc 0.940, train auc 0.981, test auc 0.982, time 21.0 sec


In [39]:
net.eval()

# 预测函数
def predict_sentiment(net, vocab, sentence):
    """sentence是词语的列表"""
    device = list(net.parameters())[0].device
    sentence = torch.tensor([vocab.stoi[word] for word in sentence], device=device)
    label = torch.argmax(net(sentence.view((1, -1))), dim=1)
    return label.item()

# 输出结果
with open("./data/Comments/test_handout.txt", 'r', encoding="utf-8") as f:
    lines = f.readlines()
test_X, _ = preprocess_comments(lines, vocab, train_data=False)


# test_y = torch.nn.Sigmoid()(net(test_X.to(device))[:, 1]).detach().cpu().numpy()
test_y = softmax(net(test_X.to(device))).detach().cpu()[:,1].numpy()
pd.DataFrame({"ID":range(0,len(test_y)),"Prediction":test_y}).to_csv("./data/Comments/final_submission.csv", index=False)