In [22]:
import os
import random
import re
import tarfile

import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.nn.functional as F
import pandas as pd
import requests
from paddle.nn import Embedding
from paddle.nn import LSTM, Embedding, Dropout, Linear
# Load the tab-separated train/test files. Neither file has a header row
# (header=None), so the column names are assigned explicitly below.
train_data = pd.read_csv('train.csv', sep='\t', header=None)
test_data = pd.read_csv('test.csv', sep='\t', header=None)
# Training rows hold the text and its label; test rows hold the text only.
train_data.columns = ["text","label"]
test_data.columns = ["text"]

In [79]:
import jieba
def load_data(data,train=True):
    """Segment every entry of ``data["text"]`` with jieba (precise mode).

    Args:
        data: DataFrame with a "text" column and, when ``train`` is true,
            a "label" column.
        train: when true, pair each segmented sentence with its label.

    Returns:
        A list in row order. With ``train=True`` each item is a
        ``(space_joined_tokens, label)`` tuple; otherwise each item is the
        space-joined token string alone.
    """
    def _segment(raw_text):
        # cut_all=False selects jieba's precise segmentation mode.
        return " ".join(jieba.cut(raw_text, cut_all=False))

    if not train:
        return [_segment(raw_text) for raw_text in data["text"]]
    return [
        (_segment(raw_text), row_label)
        for raw_text, row_label in zip(data["text"], data["label"])
    ]

In [134]:
# Segment the training set into (space-joined tokens, label) pairs.
train_corpus = load_data(train_data)

In [135]:
# Segment the test set; train=False yields bare token strings without labels.
test_corpus = load_data(test_data,train=False)

In [107]:
# Wrap the training corpus in a DataFrame for inspection (column 0 = sentence, column 1 = label).
# NOTE(review): despite its name, df_test is built from train_corpus, not from the test set.
df_test = pd.DataFrame(train_corpus)

In [109]:
# Summary statistics of len() applied to column 0.
# NOTE(review): what this measures depends on what train_corpus held when the previous
# cell ran — characters of the space-joined string if it came straight from load_data,
# token counts if it had already been converted to id lists. Execution counts are out
# of order here, so confirm which one the saved output reflects.
df_test[0].apply(len).describe()

count    12100.000000
mean         9.006281
std          3.160494
min          2.000000
25%          7.000000
50%          9.000000
75%         11.000000
max         31.000000
Name: 0, dtype: float64

In [25]:
# Vocabulary mapping token -> integer id; populated by a later cell.
word2id_dict = dict()

In [150]:
# Pre-trained open-source word2vec vectors, from https://github.com/Flywolfs/Chinese-Word-Vectors
# Forward slashes keep the path usable on both Windows and POSIX systems.
word2vec_path = "word2vec/sgns.zhihu.word/sgns.zhihu.word"
# Expected number of components per vector; every data row is checked against it.
zhi_vec_depth = 300
import numpy as np
# Maps each word of the embedding file to its vector (float64 numpy array).
zhihu_w2v = {}
with open(word2vec_path,"r",encoding="utf-8") as f:
    # The first line is skipped, exactly as the original `readlines()[1:]` did;
    # iterating the handle avoids holding the whole file in memory at once.
    next(f, None)
    for line_no, line in enumerate(f, start=2):
        seps = line.rstrip().split(" ")
        word, vec = seps[0], seps[1:]
        if not word and not vec:
            # Blank line: nothing to store.
            continue
        if len(vec) != zhi_vec_depth:
            # Fail here with a precise location instead of later, when the
            # ragged rows are stacked into one embedding matrix.
            raise ValueError(
                "line %d of %s has %d vector components, expected %d"
                % (line_no, word2vec_path, len(vec), zhi_vec_depth)
            )
        zhihu_w2v[word] = np.array(vec,dtype=float)

In [151]:
# Ids 0 and 1 are reserved for the out-of-vocabulary and padding tokens;
# the pre-trained words are numbered from 2 in the order they were read.
word2id_dict.update({'[oov]': 0, '[pad]': 1})
for offset, word in enumerate(zhihu_w2v):
    word2id_dict[word] = offset + 2
# Next free id, kept for parity with the running counter used previously.
index = len(zhihu_w2v) + 2

In [152]:
# Random starting vectors for the two special tokens: 300 components each,
# drawn uniformly between -0.1 and 0.1, stored as plain Python lists.
# NOTE(review): no random seed is set in the visible cells, so these differ between runs.
oov_embedding_init = np.random.uniform(low=-0.1, high=0.1, size=300).tolist()
pad_embeeding_init = np.random.uniform(low=-0.1, high=0.1, size=300).tolist()

In [153]:
# Row order must follow the ids in word2id_dict: the two special tokens first,
# then every pre-trained word in vocabulary order.
embedding_init = [oov_embedding_init,pad_embeeding_init]
embedding_init.extend(
    zhihu_w2v[word].tolist()
    for word in word2id_dict
    if word not in ("[oov]", "[pad]")
)

In [159]:
# Stack the per-word lists into a single float32 array (one row per vocabulary entry).
embedding_init = np.array(embedding_init,dtype="float32")

In [154]:
# Distinct label names in order of first appearance in the training data;
# each name's class id is its position in that list.
labels = train_data["label"].unique().tolist()
label_dict = {label_name: class_id for class_id, label_name in enumerate(labels)}

In [155]:
# Display the label-name -> class-id mapping.
label_dict

{'Travel-Query': 0,
 'Music-Play': 1,
 'FilmTele-Play': 2,
 'Video-Play': 3,
 'Radio-Listen': 4,
 'HomeAppliance-Control': 5,
 'Weather-Query': 6,
 'Alarm-Update': 7,
 'Calendar-Query': 8,
 'TVProgram-Play': 9,
 'Audio-Play': 10,
 'Other': 11}

In [133]:
def convert_corpus_to_id(corpus, word2id_dict, train=True, label2id=None):
    """Replace every token of every sentence with its vocabulary id.

    Sentences are split on single spaces; tokens missing from ``word2id_dict``
    are mapped to the id of ``'[oov]'``. It is worth checking the share of
    out-of-vocabulary tokens in the test set: a high share points to too little
    training data or a segmentation mismatch.

    Args:
        corpus: with ``train=True`` an iterable of ``(sentence, label_name)``
            pairs, otherwise an iterable of sentence strings.
        word2id_dict: token -> id mapping; must contain ``'[oov]'`` whenever an
            unknown token is encountered.
        train: whether ``corpus`` carries labels.
        label2id: optional label-name -> class-id mapping. Defaults to the
            module-level ``label_dict``, which was the only option before.

    Returns:
        With ``train=True`` a list of ``(id_list, class_id)`` tuples, where an
        unknown label name becomes ``-1``; otherwise a list of id lists.
    """
    def _encode(sentence):
        # Shared by both branches; the '[oov]' id is only looked up when needed.
        return [word2id_dict[word] if word in word2id_dict
                else word2id_dict['[oov]'] for word in sentence.split(" ")]

    if not train:
        return [_encode(sentence) for sentence in corpus]

    if label2id is None:
        label2id = label_dict
    return [(_encode(sentence), label2id.get(sentence_label, -1))
            for sentence, sentence_label in corpus]

In [137]:
# Replace each (segmented text, label name) pair with (token-id list, class id).
# NOTE(review): this overwrites train_corpus, so re-running the cell on its own output
# would call .split on a list of ids — re-run the load_data cell first.
train_corpus = convert_corpus_to_id(train_corpus, word2id_dict)

In [138]:
# Replace each segmented test sentence with its token-id list (no labels).
# NOTE(review): overwrites test_corpus, so this cell is not safe to re-run on its own output.
test_corpus =  convert_corpus_to_id(test_corpus, word2id_dict,train=False)

In [139]:
def build_batch(word2id_dict, corpus, batch_size, epoch_num, max_seq_len, shuffle = True, drop_last = True, train=True):
    """Yield fixed-length mini-batches of token ids (plus labels when training).

    Every sentence is truncated to ``max_seq_len`` ids or right-padded with the
    id of ``'[pad]'``, then each id is wrapped in its own one-element list.

    Args:
        word2id_dict: vocabulary; ``'[pad]'`` is looked up when padding is needed.
        corpus: with ``train=True`` a sequence of ``(id_list, class_id)`` pairs,
            otherwise a sequence of id lists.
        batch_size: number of sentences per yielded batch.
        epoch_num: number of passes over ``corpus`` (training only).
        max_seq_len: length every sentence is truncated/padded to.
        shuffle: reshuffle before every training epoch. The shuffle works on a
            private copy, so the caller's ``corpus`` keeps its order.
        drop_last: training only; when false, the partial batch left after the
            final epoch is yielded as well.
        train: selects the labelled (training) or unlabelled (inference) mode.

    Yields:
        Training: ``(sentences, labels)`` as int64 arrays of shape
        ``[batch_size, max_seq_len, 1]`` and ``[batch_size, 1]``.
        Inference: int64 ``sentences`` arrays in corpus order; the final batch
        may be smaller than ``batch_size`` and is always yielded.

    Note:
        Training batches are not reset at epoch boundaries, so one batch may
        contain the tail of one epoch and the head of the next.
    """
    def _fit_length(sentence):
        # Slicing copies, so the caller's id list is never modified by padding.
        clipped = list(sentence[:max_seq_len])
        missing = max_seq_len - len(clipped)
        if missing > 0:
            clipped.extend([word2id_dict['[pad]']] * missing)
        return [[word_id] for word_id in clipped]

    def _as_int64(rows):
        return np.array(rows).astype("int64")

    if train:
        if shuffle:
            # Shuffling this copy draws the same random sequence as shuffling the
            # argument would, without reordering the caller's list as a side effect.
            corpus = list(corpus)

        sentence_batch = []
        sentence_label_batch = []
        for _epoch in range(epoch_num):
            # Reshuffle before every epoch; inference data is never shuffled.
            if shuffle:
                random.shuffle(corpus)

            for sentence, sentence_label in corpus:
                sentence_batch.append(_fit_length(sentence))
                sentence_label_batch.append([sentence_label])

                if len(sentence_batch) == batch_size:
                    yield _as_int64(sentence_batch), _as_int64(sentence_label_batch)
                    sentence_batch = []
                    sentence_label_batch = []

        if not drop_last and len(sentence_batch) > 0:
            yield _as_int64(sentence_batch), _as_int64(sentence_label_batch)
    else:
        sentence_batch = []
        for sentence in corpus:
            sentence_batch.append(_fit_length(sentence))

            if len(sentence_batch) == batch_size:
                yield _as_int64(sentence_batch)
                sentence_batch = []

        # Every test sentence needs a prediction, so the remainder is always emitted.
        if len(sentence_batch) > 0:
            yield _as_int64(sentence_batch)

In [160]:
import paddle.fluid as fluid
class IntentClassifier(paddle.nn.Layer):
    """LSTM-based intent classifier over pre-segmented token ids.

    Pipeline: embedding lookup -> optional dropout -> LSTM -> linear layer on
    the last layer's final hidden state -> unnormalised class scores (logits).

    Note:
        The embedding table is initialised from the module-level
        ``embedding_init`` array; presumably its shape has to be
        ``[vocab_size, embedding_size]`` — confirm when changing either value.
    """

    def __init__(self, hidden_size, vocab_size, embedding_size, class_num=12, num_steps=128, num_layers=1, init_scale=0.1, dropout_rate=None):
        """Build the embedding, LSTM, dropout and classification layers.

        Args:
            hidden_size: size of the LSTM hidden and cell state vectors.
            vocab_size: number of rows in the embedding table.
            embedding_size: dimensionality of each word vector.
            class_num: number of intent classes scored by the output layer.
            num_steps: fixed sequence length the inputs are padded/truncated to.
            num_layers: number of stacked LSTM layers.
            init_scale: range for random parameter initialisation. It is only
                stored on the instance; just the commented-out alternative
                embedding below refers to it.
            dropout_rate: dropout probability applied to the embeddings;
                ``None`` or ``0`` disables dropout.
        """
        super(IntentClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.class_num = class_num
        self.num_steps = num_steps
        self.num_layers = num_layers
        self.dropout_rate = dropout_rate
        self.init_scale = init_scale

        # LSTM that condenses each sentence into a vector. Its input is the
        # embedding output, so input_size must be embedding_size; the previous
        # input_size=hidden_size only worked while both values were equal.
        self.simple_lstm_rnn = paddle.nn.LSTM(input_size=embedding_size, hidden_size=hidden_size, num_layers=num_layers)

        # Embedding layer turning token ids into vectors, initialised from the
        # pre-trained matrix held in the module-level `embedding_init`.
        self.embedding = paddle.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size, sparse=False,
                                    weight_attr=fluid.initializer.NumpyArrayInitializer(embedding_init))

        # Alternative: random uniform initialisation instead of pre-trained vectors.
        # self.embedding = paddle.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size, sparse=False,
        #                             weight_attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Uniform(low=-init_scale, high=init_scale)))

        # Linear layer mapping the sentence vector to one score per class.
        self.cls_fc = paddle.nn.Linear(in_features=self.hidden_size, out_features=self.class_num,
                             weight_attr=None, bias_attr=None)

        # Dropout applied to the word embeddings to reduce over-fitting. The layer
        # is only invoked when dropout_rate > 0, so 0.0 is a safe stand-in that
        # avoids handing None to the Dropout constructor.
        self.dropout_layer = paddle.nn.Dropout(p=dropout_rate if dropout_rate is not None else 0.0,
                                               mode='upscale_in_train')

    def forward(self, inputs):
        """Compute class logits for a mini-batch of token ids.

        Args:
            inputs: integer tensor of token ids whose first dimension is the
                batch size. After the embedding lookup the result is reshaped
                to ``[-1, num_steps, embedding_size]``.

        Returns:
            Logits tensor of shape ``[batch_size, class_num]``.
        """
        batch_size = inputs.shape[0]

        # Zero-initialised hidden and cell state for every LSTM layer.
        init_hidden_data = np.zeros(
            (self.num_layers, batch_size, self.hidden_size), dtype='float32')
        init_cell_data = np.zeros(
            (self.num_layers, batch_size, self.hidden_size), dtype='float32')

        # Turn them into tensors and mark them as non-trainable so they are
        # never updated during training.
        init_hidden = paddle.to_tensor(init_hidden_data)
        init_hidden.stop_gradient = True
        init_cell = paddle.to_tensor(init_cell_data)
        init_cell.stop_gradient = True

        # Look up word vectors and flatten to [batch_size, num_steps, embedding_size].
        x_emb = self.embedding(inputs)
        x_emb = paddle.reshape(x_emb, shape=[-1, self.num_steps, self.embedding_size])
        # Apply dropout to the word vectors when enabled.
        if self.dropout_rate is not None and self.dropout_rate > 0.0:
            x_emb = self.dropout_layer(x_emb)

        # Run the LSTM; last_hidden has shape [num_layers, batch_size, hidden_size].
        rnn_out, (last_hidden, last_cell) = self.simple_lstm_rnn(x_emb, (init_hidden, init_cell))
        # Use the top layer's final hidden state as the sentence vector,
        # shape [batch_size, hidden_size].
        last_hidden = paddle.reshape(last_hidden[-1], shape=[-1, self.hidden_size])

        # Project the sentence vector onto the classes: [batch_size, class_num].
        logits = self.cls_fc(last_hidden)

        return logits

In [168]:
# Training hyperparameters
# Number of passes over the training corpus
epoch_num = 50
batch_size = 128 

learning_rate = 0.0005
dropout_rate = 0.1
num_layers = 1
hidden_size = 300
# Same dimensionality as the pre-trained vectors (zhi_vec_depth = 300)
embedding_size = 300
# The length statistics shown earlier report a maximum of 31, so 32 avoids heavy padding
max_seq_len =  32
# Counts the two special tokens [oov] and [pad] as well as the pre-trained words
vocab_size = len(word2id_dict)

In [169]:
# Prefer the GPU whenever Paddle reports one as the current device.
use_gpu = paddle.get_device().startswith("gpu")
if use_gpu:
    paddle.set_device('gpu:0')

intent_clf = IntentClassifier(
    hidden_size,
    vocab_size,
    embedding_size,
    num_steps=max_seq_len,
    num_layers=num_layers,
    dropout_rate=dropout_rate,
)
optimizer = paddle.optimizer.Adam(
    learning_rate=learning_rate,
    beta1=0.9,
    beta2=0.999,
    parameters=intent_clf.parameters(),
)
# Per-step loss history, appended to by train().
losses = []
steps = []
def train(model):
    """Train ``model`` on the module-level ``train_corpus``.

    Batches come from ``build_batch`` using the module-level ``batch_size``,
    ``epoch_num`` and ``max_seq_len``; parameters are updated with the
    module-level ``optimizer``. Each step's loss is appended to ``losses``,
    its index to ``steps``, and both are printed.

    Args:
        model: network mapping a batch of token ids to per-class logits.
    """
    # Switch the model to training mode.
    model.train()

    # Generator yielding one (sentences, class ids) mini-batch per iteration.
    train_loader = build_batch(word2id_dict, train_corpus, batch_size, epoch_num, max_seq_len)
    # step counts mini-batches across all epochs.
    for step, (sentences, labels) in enumerate(train_loader):
        # Convert the numpy batches to tensors.
        sentences = paddle.to_tensor(sentences)
        labels = paddle.to_tensor(labels)
        logits = model(sentences)

        # Cross-entropy on the integer class ids ([batch_size, 1]); the default
        # 'mean' reduction already gives the batch average. This replaces the
        # one-hot + soft-label detour and its hard-coded class count of 12.
        loss = F.cross_entropy(input=logits, label=labels)

        # Back-propagate, update the parameters, then clear the gradients.
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()

        # float() works whether the loss has shape [1] or is 0-dimensional,
        # unlike indexing loss.numpy() with [0].
        loss_value = float(loss)
        losses.append(loss_value)
        steps.append(step)
        print("step %d, loss %.3f" % (step, loss_value))
# Train the model
train(intent_clf)

step 0, loss 2.490
step 1, loss 2.476
step 2, loss 2.456
step 3, loss 2.458
step 4, loss 2.435
step 5, loss 2.419
step 6, loss 2.388
step 7, loss 2.400
step 8, loss 2.409
step 9, loss 2.375
step 10, loss 2.376
step 11, loss 2.385
step 12, loss 2.345
step 13, loss 2.359
step 14, loss 2.446
step 15, loss 2.354
step 16, loss 2.318
step 17, loss 2.426
step 18, loss 2.283
step 19, loss 2.365
step 20, loss 2.327
step 21, loss 2.373
step 22, loss 2.336
step 23, loss 2.370
step 24, loss 2.355
step 25, loss 2.368
step 26, loss 2.305
step 27, loss 2.369
step 28, loss 2.376
step 29, loss 2.345
step 30, loss 2.326
step 31, loss 2.345
step 32, loss 2.388
step 33, loss 2.387
step 34, loss 2.340
step 35, loss 2.390
step 36, loss 2.336
step 37, loss 2.437
step 38, loss 2.357
step 39, loss 2.343
step 40, loss 2.399
step 41, loss 2.381
step 42, loss 2.345
step 43, loss 2.326
step 44, loss 2.306
step 45, loss 2.379
step 46, loss 2.357
step 47, loss 2.402
step 48, loss 2.411
step 49, loss 2.429
step 50, l

step 397, loss 2.382
step 398, loss 2.305
step 399, loss 2.373
step 400, loss 2.379
step 401, loss 2.427
step 402, loss 2.445
step 403, loss 2.292
step 404, loss 2.346
step 405, loss 2.456
step 406, loss 2.355
step 407, loss 2.310
step 408, loss 2.376
step 409, loss 2.335
step 410, loss 2.284
step 411, loss 2.345
step 412, loss 2.353
step 413, loss 2.365
step 414, loss 2.349
step 415, loss 2.383
step 416, loss 2.420
step 417, loss 2.343
step 418, loss 2.418
step 419, loss 2.396
step 420, loss 2.391
step 421, loss 2.418
step 422, loss 2.335
step 423, loss 2.357
step 424, loss 2.365
step 425, loss 2.360
step 426, loss 2.332
step 427, loss 2.344
step 428, loss 2.322
step 429, loss 2.372
step 430, loss 2.371
step 431, loss 2.344
step 432, loss 2.338
step 433, loss 2.412
step 434, loss 2.388
step 435, loss 2.323
step 436, loss 2.325
step 437, loss 2.420
step 438, loss 2.302
step 439, loss 2.363
step 440, loss 2.372
step 441, loss 2.254
step 442, loss 2.358
step 443, loss 2.292
step 444, los

step 796, loss 2.307
step 797, loss 2.357
step 798, loss 2.356
step 799, loss 2.370
step 800, loss 2.348
step 801, loss 2.351
step 802, loss 2.382
step 803, loss 2.372
step 804, loss 2.294
step 805, loss 2.322
step 806, loss 2.293
step 807, loss 2.346
step 808, loss 2.337
step 809, loss 2.335
step 810, loss 2.393
step 811, loss 2.342
step 812, loss 2.318
step 813, loss 2.340
step 814, loss 2.338
step 815, loss 2.360
step 816, loss 2.395
step 817, loss 2.271
step 818, loss 2.384
step 819, loss 2.331
step 820, loss 2.388
step 821, loss 2.289
step 822, loss 2.462
step 823, loss 2.349
step 824, loss 2.335
step 825, loss 2.337
step 826, loss 2.302
step 827, loss 2.330
step 828, loss 2.353
step 829, loss 2.375
step 830, loss 2.299
step 831, loss 2.420
step 832, loss 2.368
step 833, loss 2.406
step 834, loss 2.317
step 835, loss 2.396
step 836, loss 2.339
step 837, loss 2.337
step 838, loss 2.390
step 839, loss 2.352
step 840, loss 2.339
step 841, loss 2.359
step 842, loss 2.346
step 843, los

step 1178, loss 1.235
step 1179, loss 1.016
step 1180, loss 1.255
step 1181, loss 1.037
step 1182, loss 1.082
step 1183, loss 1.199
step 1184, loss 1.142
step 1185, loss 1.061
step 1186, loss 1.125
step 1187, loss 1.204
step 1188, loss 1.140
step 1189, loss 0.952
step 1190, loss 1.117
step 1191, loss 1.060
step 1192, loss 1.041
step 1193, loss 1.167
step 1194, loss 1.178
step 1195, loss 1.199
step 1196, loss 0.909
step 1197, loss 1.163
step 1198, loss 1.045
step 1199, loss 0.910
step 1200, loss 0.955
step 1201, loss 1.094
step 1202, loss 0.998
step 1203, loss 0.953
step 1204, loss 0.822
step 1205, loss 1.038
step 1206, loss 0.992
step 1207, loss 0.893
step 1208, loss 0.993
step 1209, loss 0.937
step 1210, loss 1.029
step 1211, loss 0.804
step 1212, loss 1.017
step 1213, loss 0.787
step 1214, loss 0.602
step 1215, loss 0.846
step 1216, loss 0.647
step 1217, loss 0.907
step 1218, loss 0.847
step 1219, loss 0.779
step 1220, loss 0.863
step 1221, loss 0.637
step 1222, loss 0.712
step 1223,

step 1564, loss 0.261
step 1565, loss 0.235
step 1566, loss 0.273
step 1567, loss 0.342
step 1568, loss 0.193
step 1569, loss 0.283
step 1570, loss 0.274
step 1571, loss 0.232
step 1572, loss 0.394
step 1573, loss 0.326
step 1574, loss 0.300
step 1575, loss 0.193
step 1576, loss 0.201
step 1577, loss 0.161
step 1578, loss 0.308
step 1579, loss 0.344
step 1580, loss 0.191
step 1581, loss 0.224
step 1582, loss 0.371
step 1583, loss 0.228
step 1584, loss 0.223
step 1585, loss 0.365
step 1586, loss 0.305
step 1587, loss 0.347
step 1588, loss 0.243
step 1589, loss 0.396
step 1590, loss 0.218
step 1591, loss 0.333
step 1592, loss 0.274
step 1593, loss 0.286
step 1594, loss 0.196
step 1595, loss 0.416
step 1596, loss 0.260
step 1597, loss 0.290
step 1598, loss 0.290
step 1599, loss 0.243
step 1600, loss 0.262
step 1601, loss 0.137
step 1602, loss 0.367
step 1603, loss 0.230
step 1604, loss 0.280
step 1605, loss 0.364
step 1606, loss 0.200
step 1607, loss 0.290
step 1608, loss 0.336
step 1609,

step 1949, loss 0.161
step 1950, loss 0.099
step 1951, loss 0.079
step 1952, loss 0.237
step 1953, loss 0.140
step 1954, loss 0.099
step 1955, loss 0.186
step 1956, loss 0.205
step 1957, loss 0.096
step 1958, loss 0.103
step 1959, loss 0.173
step 1960, loss 0.130
step 1961, loss 0.153
step 1962, loss 0.033
step 1963, loss 0.142
step 1964, loss 0.281
step 1965, loss 0.089
step 1966, loss 0.273
step 1967, loss 0.107
step 1968, loss 0.061
step 1969, loss 0.108
step 1970, loss 0.106
step 1971, loss 0.124
step 1972, loss 0.059
step 1973, loss 0.115
step 1974, loss 0.108
step 1975, loss 0.176
step 1976, loss 0.127
step 1977, loss 0.135
step 1978, loss 0.030
step 1979, loss 0.058
step 1980, loss 0.052
step 1981, loss 0.193
step 1982, loss 0.043
step 1983, loss 0.128
step 1984, loss 0.088
step 1985, loss 0.168
step 1986, loss 0.155
step 1987, loss 0.088
step 1988, loss 0.112
step 1989, loss 0.121
step 1990, loss 0.187
step 1991, loss 0.073
step 1992, loss 0.023
step 1993, loss 0.106
step 1994,

step 2338, loss 0.058
step 2339, loss 0.058
step 2340, loss 0.090
step 2341, loss 0.221
step 2342, loss 0.122
step 2343, loss 0.128
step 2344, loss 0.125
step 2345, loss 0.111
step 2346, loss 0.055
step 2347, loss 0.050
step 2348, loss 0.102
step 2349, loss 0.052
step 2350, loss 0.149
step 2351, loss 0.068
step 2352, loss 0.095
step 2353, loss 0.169
step 2354, loss 0.040
step 2355, loss 0.166
step 2356, loss 0.133
step 2357, loss 0.061
step 2358, loss 0.123
step 2359, loss 0.113
step 2360, loss 0.087
step 2361, loss 0.050
step 2362, loss 0.111
step 2363, loss 0.060
step 2364, loss 0.110
step 2365, loss 0.086
step 2366, loss 0.109
step 2367, loss 0.092
step 2368, loss 0.173
step 2369, loss 0.133
step 2370, loss 0.106
step 2371, loss 0.107
step 2372, loss 0.046
step 2373, loss 0.116
step 2374, loss 0.074
step 2375, loss 0.145
step 2376, loss 0.141
step 2377, loss 0.103
step 2378, loss 0.135
step 2379, loss 0.130
step 2380, loss 0.149
step 2381, loss 0.082
step 2382, loss 0.119
step 2383,

step 2711, loss 0.132
step 2712, loss 0.055
step 2713, loss 0.044
step 2714, loss 0.122
step 2715, loss 0.012
step 2716, loss 0.101
step 2717, loss 0.061
step 2718, loss 0.145
step 2719, loss 0.145
step 2720, loss 0.019
step 2721, loss 0.140
step 2722, loss 0.110
step 2723, loss 0.105
step 2724, loss 0.046
step 2725, loss 0.034
step 2726, loss 0.152
step 2727, loss 0.099
step 2728, loss 0.081
step 2729, loss 0.144
step 2730, loss 0.050
step 2731, loss 0.135
step 2732, loss 0.076
step 2733, loss 0.031
step 2734, loss 0.121
step 2735, loss 0.014
step 2736, loss 0.135
step 2737, loss 0.071
step 2738, loss 0.031
step 2739, loss 0.090
step 2740, loss 0.028
step 2741, loss 0.074
step 2742, loss 0.065
step 2743, loss 0.053
step 2744, loss 0.017
step 2745, loss 0.074
step 2746, loss 0.057
step 2747, loss 0.130
step 2748, loss 0.030
step 2749, loss 0.081
step 2750, loss 0.008
step 2751, loss 0.113
step 2752, loss 0.158
step 2753, loss 0.085
step 2754, loss 0.115
step 2755, loss 0.113
step 2756,

step 3095, loss 0.008
step 3096, loss 0.009
step 3097, loss 0.043
step 3098, loss 0.020
step 3099, loss 0.033
step 3100, loss 0.011
step 3101, loss 0.036
step 3102, loss 0.017
step 3103, loss 0.075
step 3104, loss 0.072
step 3105, loss 0.012
step 3106, loss 0.021
step 3107, loss 0.011
step 3108, loss 0.038
step 3109, loss 0.004
step 3110, loss 0.015
step 3111, loss 0.008
step 3112, loss 0.061
step 3113, loss 0.046
step 3114, loss 0.098
step 3115, loss 0.039
step 3116, loss 0.047
step 3117, loss 0.066
step 3118, loss 0.058
step 3119, loss 0.040
step 3120, loss 0.010
step 3121, loss 0.031
step 3122, loss 0.055
step 3123, loss 0.006
step 3124, loss 0.082
step 3125, loss 0.046
step 3126, loss 0.044
step 3127, loss 0.037
step 3128, loss 0.025
step 3129, loss 0.055
step 3130, loss 0.036
step 3131, loss 0.004
step 3132, loss 0.030
step 3133, loss 0.097
step 3134, loss 0.024
step 3135, loss 0.007
step 3136, loss 0.011
step 3137, loss 0.018
step 3138, loss 0.065
step 3139, loss 0.011
step 3140,

step 3484, loss 0.039
step 3485, loss 0.018
step 3486, loss 0.018
step 3487, loss 0.005
step 3488, loss 0.022
step 3489, loss 0.004
step 3490, loss 0.026
step 3491, loss 0.033
step 3492, loss 0.070
step 3493, loss 0.006
step 3494, loss 0.023
step 3495, loss 0.033
step 3496, loss 0.015
step 3497, loss 0.012
step 3498, loss 0.002
step 3499, loss 0.007
step 3500, loss 0.062
step 3501, loss 0.062
step 3502, loss 0.024
step 3503, loss 0.018
step 3504, loss 0.004
step 3505, loss 0.034
step 3506, loss 0.041
step 3507, loss 0.017
step 3508, loss 0.047
step 3509, loss 0.046
step 3510, loss 0.005
step 3511, loss 0.044
step 3512, loss 0.009
step 3513, loss 0.006
step 3514, loss 0.014
step 3515, loss 0.053
step 3516, loss 0.046
step 3517, loss 0.034
step 3518, loss 0.026
step 3519, loss 0.007
step 3520, loss 0.066
step 3521, loss 0.019
step 3522, loss 0.006
step 3523, loss 0.010
step 3524, loss 0.024
step 3525, loss 0.004
step 3526, loss 0.011
step 3527, loss 0.003
step 3528, loss 0.015
step 3529,

step 3871, loss 0.025
step 3872, loss 0.002
step 3873, loss 0.011
step 3874, loss 0.133
step 3875, loss 0.025
step 3876, loss 0.042
step 3877, loss 0.094
step 3878, loss 0.007
step 3879, loss 0.006
step 3880, loss 0.002
step 3881, loss 0.037
step 3882, loss 0.004
step 3883, loss 0.005
step 3884, loss 0.061
step 3885, loss 0.006
step 3886, loss 0.091
step 3887, loss 0.032
step 3888, loss 0.005
step 3889, loss 0.010
step 3890, loss 0.027
step 3891, loss 0.002
step 3892, loss 0.013
step 3893, loss 0.057
step 3894, loss 0.028
step 3895, loss 0.012
step 3896, loss 0.035
step 3897, loss 0.003
step 3898, loss 0.038
step 3899, loss 0.020
step 3900, loss 0.019
step 3901, loss 0.006
step 3902, loss 0.004
step 3903, loss 0.003
step 3904, loss 0.008
step 3905, loss 0.017
step 3906, loss 0.035
step 3907, loss 0.022
step 3908, loss 0.037
step 3909, loss 0.002
step 3910, loss 0.020
step 3911, loss 0.012
step 3912, loss 0.062
step 3913, loss 0.002
step 3914, loss 0.008
step 3915, loss 0.032
step 3916,

step 4252, loss 0.002
step 4253, loss 0.047
step 4254, loss 0.039
step 4255, loss 0.009
step 4256, loss 0.014
step 4257, loss 0.022
step 4258, loss 0.005
step 4259, loss 0.048
step 4260, loss 0.002
step 4261, loss 0.039
step 4262, loss 0.003
step 4263, loss 0.053
step 4264, loss 0.027
step 4265, loss 0.068
step 4266, loss 0.055
step 4267, loss 0.004
step 4268, loss 0.004
step 4269, loss 0.003
step 4270, loss 0.002
step 4271, loss 0.022
step 4272, loss 0.003
step 4273, loss 0.010
step 4274, loss 0.061
step 4275, loss 0.032
step 4276, loss 0.071
step 4277, loss 0.008
step 4278, loss 0.002
step 4279, loss 0.008
step 4280, loss 0.014
step 4281, loss 0.020
step 4282, loss 0.017
step 4283, loss 0.025
step 4284, loss 0.026
step 4285, loss 0.018
step 4286, loss 0.053
step 4287, loss 0.002
step 4288, loss 0.085
step 4289, loss 0.034
step 4290, loss 0.019
step 4291, loss 0.055
step 4292, loss 0.002
step 4293, loss 0.016
step 4294, loss 0.009
step 4295, loss 0.015
step 4296, loss 0.004
step 4297,

step 4639, loss 0.007
step 4640, loss 0.010
step 4641, loss 0.002
step 4642, loss 0.007
step 4643, loss 0.042
step 4644, loss 0.045
step 4645, loss 0.039
step 4646, loss 0.001
step 4647, loss 0.005
step 4648, loss 0.011
step 4649, loss 0.037
step 4650, loss 0.014
step 4651, loss 0.011
step 4652, loss 0.004
step 4653, loss 0.004
step 4654, loss 0.004
step 4655, loss 0.019
step 4656, loss 0.032
step 4657, loss 0.004
step 4658, loss 0.025
step 4659, loss 0.005
step 4660, loss 0.001
step 4661, loss 0.003
step 4662, loss 0.001
step 4663, loss 0.001
step 4664, loss 0.053
step 4665, loss 0.035
step 4666, loss 0.007
step 4667, loss 0.002
step 4668, loss 0.007
step 4669, loss 0.046
step 4670, loss 0.010
step 4671, loss 0.063
step 4672, loss 0.013
step 4673, loss 0.008
step 4674, loss 0.002
step 4675, loss 0.001
step 4676, loss 0.004
step 4677, loss 0.014
step 4678, loss 0.033
step 4679, loss 0.002
step 4680, loss 0.050
step 4681, loss 0.026
step 4682, loss 0.004
step 4683, loss 0.002
step 4684,

In [170]:
# Predicted class id for every test sentence, in test-set order.
final_labels = []
def test(model):
    """Predict a class id for every sentence in the module-level ``test_corpus``.

    Args:
        model: trained network mapping a batch of token ids to per-class logits.

    Returns:
        The module-level ``final_labels`` list, refilled with one predicted
        class id per test sentence (earlier callers ignored the return value).
    """
    model.eval()
    # Start from an empty list so calling test() again replaces the predictions
    # instead of appending a second copy.
    final_labels.clear()
    test_loader = build_batch(word2id_dict, test_corpus, batch_size, epoch_num, max_seq_len, train=False)
    # No gradients are needed for inference.
    with paddle.no_grad():
        for sentences in test_loader:
            logits = model(paddle.to_tensor(sentences))
            # Softmax is monotonic, so the arg-max of the logits is the same class.
            final_labels.extend(paddle.argmax(logits, axis=-1).numpy().tolist())
    return final_labels

In [171]:
# Run inference on the test set; predictions are collected in final_labels.
test(intent_clf)

In [172]:
# Invert label_dict (name -> id) once; setdefault keeps the first name found
# for an id, matching a first-match scan over the dictionary.
id_to_name = {}
for name, class_id in label_dict.items():
    id_to_name.setdefault(class_id, name)

# Predicted ids without a known name are left out.
final_labels_str = [id_to_name[class_id] for class_id in final_labels if class_id in id_to_name]
print(len(final_labels_str))
test_data["pred_by_w2v_emb_lstm"] = final_labels_str

3000


In [173]:
from collections import Counter
# Number of test sentences predicted for each class id.
Counter(final_labels)

Counter({2: 444,
         5: 216,
         10: 204,
         7: 253,
         1: 252,
         8: 220,
         3: 301,
         0: 220,
         9: 192,
         4: 286,
         6: 242,
         11: 170})

In [166]:
from collections import Counter
# Number of test sentences predicted for each class id.
# NOTE(review): same code as the previous cell, but its execution count (166) precedes
# the test() cells (170-171) and the saved counts differ, so this output presumably
# belongs to an earlier run — confirm and label which model it describes.
Counter(final_labels)

Counter({10: 208,
         5: 199,
         7: 239,
         2: 381,
         1: 287,
         8: 235,
         3: 319,
         0: 236,
         9: 233,
         4: 308,
         6: 214,
         11: 141})

In [167]:
# Score on https://competition.coggle.club/: 0.664667
# These are the predictions of the LSTM with a randomly initialised embedding.
# NOTE(review): no visible cell creates the "pred_by_init_emb_lstm" column, so this
# cell presumably relies on an earlier random-initialisation run — confirm.
# Reading the column before opening the file means a missing column raises
# KeyError without truncating an existing results file.
random_init_predictions = test_data["pred_by_init_emb_lstm"]
os.makedirs("results", exist_ok=True)
with open(os.path.join("results", "lstm_random_init.txt"),"w") as f:
    f.write("ID,Target\n")
    # IDs are 1-based row positions.
    for row_id, target in enumerate(random_init_predictions, start=1):
        f.write(str(row_id)+","+target+"\n")

In [174]:
# Score on https://competition.coggle.club/: 0.784000
# These are the predictions of the LSTM whose embedding was initialised from
# the open-source word2vec vectors.
# Reading the column before opening the file means a missing column raises
# KeyError without truncating an existing results file.
w2v_init_predictions = test_data["pred_by_w2v_emb_lstm"]
os.makedirs("results", exist_ok=True)
with open(os.path.join("results", "lstm_w2v_init.txt"),"w") as f:
    f.write("ID,Target\n")
    # IDs are 1-based row positions.
    for row_id, target in enumerate(w2v_init_predictions, start=1):
        f.write(str(row_id)+","+target+"\n")

In [175]:
#经过试验得出结论：
#1.Embedding层的精度与初始化方式相关吗？是相关的：随机初始化的结果（0.664667）远低于使用一个好的w2v作为初始化的结果（0.784000）。
#2.LSTM模型精度与文本最大长度是否相关？是相关的：根据数据分析得知，该数据集文本的最大长度是31个分词，因此如果文本最大长度取得比较大（比如128），那么整条文本大部分是pad，
# 这会严重影响最终的结果；把最大长度调整成32之后，最终的效果提升了很多。