# 任务描述：
社交媒体的发展在加速信息传播的同时，也带来了虚假谣言信息的泛滥，往往会引发诸多不安定因素，并对经济和社会产生巨大的影响。

2016年美国总统大选期间，受访选民平均每人每天接触到4篇虚假新闻，虚假新闻被认为影响了2016年美国大选和英国脱欧的投票结果；近期，在新型冠状病毒感染的肺炎疫情防控的关键期，在全国人民都为疫情揪心时，网上各种有关疫情防控的谣言接连不断，从“广州公交线路因新型冠状病毒肺炎疫情停运”到“北京市为防控疫情采取封城措施”，从“钟南山院士被感染”到“10万人感染肺炎”等等，这些不切实际的谣言，“操纵”了舆论感情，误导了公众的判断，更影响了社会稳定。

人们常说“流言止于智者”，要想不被网上的流言和谣言盅惑、伤害，首先需要对其进行科学甄别，而时下人工智能正在尝试担任这一角色。那么，在打假一线AI技术如何做到去伪存真？

传统的谣言检测模型一般根据谣言的内容、用户属性、传播方式人工地构造特征，而人工构建特征存在考虑片面、浪费人力等现象。本次实践使用基于循环神经网络（RNN）的谣言检测模型，将文本中的谣言事件向量化，通过循环神经网络的学习训练来挖掘表示文本深层的特征，避免了特征构建的问题，并能发现那些不容易被人发现的特征，从而产生更好的效果。

数据集介绍：

本次实践所使用的数据是从新浪微博不实信息举报平台抓取的中文谣言数据，数据集中共包含1538条谣言和1849条非谣言。如下图所示，每条数据均为json格式，其中text字段代表微博原文的文字内容。

更多数据集介绍请参考https://github.com/thunlp/Chinese_Rumor_Dataset。

![](https://ai-studio-static-online.cdn.bcebos.com/30665456670941acaf0ad4bfa78252e8d44f296dda8d48dea2ada26a5f10ef1a)

# 一、环境设置
本示例基于飞桨开源框架2.0版本。

In [1]:
import paddle as pd
import paddle.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer
from sklearn.model_selection import train_test_split
print(pd.__version__)

  from collections import MutableMapping
  from collections import Iterable, Mapping
  from collections import Sized


2.3.2


# 二、数据准备

（1）解压数据，读取并解析数据，生成all_data.txt

（2）生成数据字典，即dict.txt

（3）生成数据列表，并进行训练集与验证集的划分，train_list.txt 、eval_list.txt

（4）定义训练数据集提供器

In [2]:
import os, zipfile
src_path="data/data170001/Rumor_Dataset.zip"
target_path="/home/aistudio/data/Chinese_Rumor_Dataset-master"
if(not os.path.isdir(target_path)):
    z = zipfile.ZipFile(src_path, 'r')
    z.extractall(path=target_path)
    z.close()

In [3]:
import io
import os
import random
import json

#谣言数据文件路径
rumor_class_dirs = os.listdir(target_path+"/Chinese_Rumor_Dataset-master/CED_Dataset/rumor-repost/")

#非谣言数据文件路径
non_rumor_class_dirs = os.listdir(target_path+"/Chinese_Rumor_Dataset-master/CED_Dataset/non-rumor-repost/")

original_microblog = target_path+"/Chinese_Rumor_Dataset-master/CED_Dataset/original-microblog/"

#谣言标签为0，非谣言标签为1
rumor_label="0"
non_rumor_label="1"

#分别统计谣言数据与非谣言数据的总数
rumor_num = 0
non_rumor_num = 0

all_rumor_list = []
all_non_rumor_list = []

#解析谣言数据
for rumor_class_dir in rumor_class_dirs: 
    if(rumor_class_dir != '.DS_Store'):
        #遍历谣言数据，并解析
        with open(original_microblog + rumor_class_dir, 'r') as f:
	        rumor_content = f.read()
        rumor_dict = json.loads(rumor_content)
        all_rumor_list.append(rumor_label+"\t"+rumor_dict["text"]+"\n")
        rumor_num +=1

#解析非谣言数据
for non_rumor_class_dir in non_rumor_class_dirs: 
    if(non_rumor_class_dir != '.DS_Store'):
        with open(original_microblog + non_rumor_class_dir, 'r') as f2:
	        non_rumor_content = f2.read()
        non_rumor_dict = json.loads(non_rumor_content)
        all_non_rumor_list.append(non_rumor_label+"\t"+non_rumor_dict["text"]+"\n")
        non_rumor_num +=1
        
print("谣言数据总量为："+str(rumor_num))
print("非谣言数据总量为："+str(non_rumor_num))

谣言数据总量为：1538
非谣言数据总量为：1849


In [4]:
#全部数据进行乱序后写入all_data.txt

data_list_path="/home/aistudio/data/"
all_data_path=data_list_path + "all_data.txt"

all_data_list = all_rumor_list + all_non_rumor_list

random.shuffle(all_data_list)

#在生成all_data.txt之前，首先将其清空
with open(all_data_path, 'w') as f:
    f.seek(0)
    f.truncate() 
    
with open(all_data_path, 'a') as f:
    for data in all_data_list:
        f.write(data) 

In [5]:
# 生成数据字典
def create_dict(data_path, dict_path):
    with open(dict_path, 'w') as f:
        f.seek(0)
        f.truncate() 

    dict_set = set()
    # 读取全部数据
    with open(data_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    # 把数据生成一个元组
    for line in lines:
        content = line.split('\t')[-1].replace('\n', '')
        for s in content:
            dict_set.add(s)
    # 把元组转换成字典，一个字对应一个数字
    dict_list = []
    i = 0
    for s in dict_set:
        dict_list.append([s, i])
        i += 1
    # 添加未知字符
    dict_txt = dict(dict_list)
    end_dict = {"<unk>": i}
    dict_txt.update(end_dict)
    end_dict = {"<pad>": i+1}
    dict_txt.update(end_dict)
    # 把这些字典保存到本地中
    with open(dict_path, 'w', encoding='utf-8') as f:
        f.write(str(dict_txt))

        
    print("数据字典生成完成！")


In [151]:
# 创建序列化表示的数据,并按照一定比例划分训练数据train_list.txt与验证数据eval_list.txt
def create_data_list(data_list_path):
    #在生成数据之前，首先将eval_list.txt和train_list.txt清空
    with open(os.path.join(data_list_path, 'eval_list.txt'), 'w', encoding='utf-8') as f_eval:
        f_eval.seek(0)
        f_eval.truncate()
        
    with open(os.path.join(data_list_path, 'train_list.txt'), 'w', encoding='utf-8') as f_train:
        f_train.seek(0)
        f_train.truncate() 
    
    with open(os.path.join(data_list_path, 'dict.txt'), 'r', encoding='utf-8') as f_data:
        dict_txt = eval(f_data.readlines()[0])

    with open(os.path.join(data_list_path, 'all_data.txt'), 'r', encoding='utf-8') as f_data:
        lines = f_data.readlines()
    
    i = 0
    maxlen = 0
    with open(os.path.join(data_list_path, 'eval_list.txt'), 'a', encoding='utf-8') as f_eval,open(os.path.join(data_list_path, 'train_list.txt'), 'a', encoding='utf-8') as f_train:
        for line in lines:
            words = line.split('\t')[-1].replace('\n', '')
            maxlen = max(maxlen, len(words))
            label = line.split('\t')[0]
            labs = ""
            # 每8个 抽取一个数据用于验证
            if i % 8 == 0:    # n=5
                for s in words:
                    lab = str(dict_txt[s])
                    labs = labs + lab + ','
                labs = labs[:-1]
                labs = labs + '\t' + label + '\n'
                f_eval.write(labs)
            else:
                for s in words:
                    lab = str(dict_txt[s])
                    labs = labs + lab + ','
                labs = labs[:-1]
                labs = labs + '\t' + label + '\n'
                f_train.write(labs)
            i += 1
        
    print("数据列表生成完成！")
    print(maxlen)

In [152]:
# 把生成的数据列表都放在自己的总类别文件夹中
data_root_path = "/home/aistudio/data/" 
data_path = os.path.join(data_root_path, 'all_data.txt')
dict_path = os.path.join(data_root_path, "dict.txt")

# 创建数据字典
create_dict(data_path, dict_path)

# 创建数据列表
create_data_list(data_root_path)

数据字典生成完成！
数据列表生成完成！
226


In [153]:
def load_vocab(file_path):
    fr = open(file_path, 'r', encoding='utf8')
    vocab = eval(fr.read())   #读取的str转换为字典
    fr.close()

    return vocab

In [154]:
# 打印前2条训练数据
vocab = load_vocab(os.path.join(data_root_path, 'dict.txt'))
label_list = ["0","1"]

def ids_to_str(ids):
    words = []
    for k in ids:
        w = list(vocab.keys())[list(vocab.values()).index(int(k))]
        words.append(w if isinstance(w, str) else w.decode('ASCII'))
    return " ".join(words)

file_path = os.path.join(data_root_path, 'train_list.txt')
with io.open(file_path, "r", encoding='utf8') as fin:
        i = 0
        for line in fin:
            i += 1
            cols = line.strip().split("\t")
            if len(cols) != 2:
                sys.stderr.write("[NOTICE] Error Format Line!")
                continue
            label = int(cols[1])
            wids = cols[0].split(",")
            print(str(i)+":")
            print('sentence list id is:', wids)
            print('sentence list is: ', ids_to_str(wids))
            print('sentence label id is:', label)
            print('---------------------------------')
            
            if i == 2: break

1:
sentence list id is: ['1269', '1785', '184', '2662', '1370', '2351', '3181', '592', '2208', '3924', '1268', '884', '4153', '2363', '1369', '2403', '2662', '1472', '3698', '1153', '2978', '9', '3648', '3658', '2662', '3560', '1522', '339', '4254', '709', '3560', '1522', '2627', '1179', '3241', '3281', '323', '1431', '2480', '2627', '1179', '3832', '2287', '1199', '1816', '3182', '3243', '2662', '433', '1364', '3340', '3925', '2662', '3916', '433', '609', '3560', '3916', '1933', '406', '3673', '2146', '4114', '1878', '4153', '2185', '646', '2480', '1116', '1370', '2351', '3181', '1785', '3181', '3061', '2208', '1221', '2627', '1179', '3560', '1059', '1634', '4052', '2662', '2774', '1392', '2872', '2872', '1199', '3804', '2955', '2662', '3844', '660', '183', '3383', '2332', '1116', '1370', '764', '1785', '3856', '2737', '3273', '2662', '3273', '716', '3832', '4057', '646', '2340', '1370', '2202', '108', '3772', '4218', '2430', '1221', '3632', '3632']
sentence list is:  【 这 事 ， 你 怎 么 看 

In [155]:

vocab = load_vocab(os.path.join(data_root_path, 'dict.txt'))

class RumorDataset(pd.io.Dataset):
    def __init__(self, data_dir):
        self.data_dir = data_dir
        self.all_data = []
       
        with io.open(self.data_dir, "r", encoding='utf8') as fin:
            for line in fin:
                cols = line.strip().split("\t")
                if len(cols) != 2:
                    sys.stderr.write("[NOTICE] Error Format Line!")
                    continue
                label = []
                label.append(int(cols[1]))
                wids = cols[0].split(",")
                if len(wids)>=150:
                    wids = np.array(wids[:150]).astype('int64')     
                else:
                    wids = np.concatenate([wids, [vocab["<pad>"]]*(150-len(wids))]).astype('int64')
                label = np.array(label).astype('int64')
                self.all_data.append((wids, label))

        
    def __getitem__(self, index):
        data, label = self.all_data[index]
        return data, label

    def __len__(self):
        return len(self.all_data)


batch_size = 32
train_dataset = RumorDataset(os.path.join(data_root_path, 'train_list.txt'))
test_dataset = RumorDataset(os.path.join(data_root_path, 'eval_list.txt'))

train_loader = pd.io.DataLoader(train_dataset, places=pd.CPUPlace(), return_list=True,
                                    shuffle=True, batch_size=batch_size, drop_last=True)
test_loader = pd.io.DataLoader(test_dataset, places=pd.CPUPlace(), return_list=True,
                                    shuffle=True, batch_size=batch_size, drop_last=True)




#check

print('=============train_dataset =============') 
for data, label in train_dataset:
    print(data)
    print(np.array(data).shape)
    print(label)
    break


print('=============test_dataset =============') 
for data, label in test_dataset:
    print(data)
    print(np.array(data).shape)
    print(label)
    break




[1269 1785  184 2662 1370 2351 3181  592 2208 3924 1268  884 4153 2363
 1369 2403 2662 1472 3698 1153 2978    9 3648 3658 2662 3560 1522  339
 4254  709 3560 1522 2627 1179 3241 3281  323 1431 2480 2627 1179 3832
 2287 1199 1816 3182 3243 2662  433 1364 3340 3925 2662 3916  433  609
 3560 3916 1933  406 3673 2146 4114 1878 4153 2185  646 2480 1116 1370
 2351 3181 1785 3181 3061 2208 1221 2627 1179 3560 1059 1634 4052 2662
 2774 1392 2872 2872 1199 3804 2955 2662 3844  660  183 3383 2332 1116
 1370  764 1785 3856 2737 3273 2662 3273  716 3832 4057  646 2340 1370
 2202  108 3772 4218 2430 1221 3632 3632 4410 4410 4410 4410 4410 4410
 4410 4410 4410 4410 4410 4410 4410 4410 4410 4410 4410 4410 4410 4410
 4410 4410 4410 4410 4410 4410 4410 4410 4410 4410]
(150,)
[0]
[2354 3963 2332  571 2868  920  427 4147 3925 1876 4025 3762  557 4039
 2421 3672  276 1088 2409 1169   23 2461 1229 2332 2247 4226  922  798
  631 2764 1465  571 1878 2831 2624 2624 1762 4153  607  607  607 2953
  764 1508 240

# 三、模型配置

In [11]:
#参数设置
vocab_size = len(vocab)  
maxlen = 200  
seq_len = 200
batch_size = 32
epochs = 3
pad_id = vocab['<pad>']
embed_dim = 128  # Embedding size for each token
num_heads = 2  # Number of attention heads
feed_dim = 128  # Hidden layer size in feed forward network inside transformer

classes = ['0', '1']

In [12]:
class MultiHeadSelfAttention(nn.Layer):
    def __init__(self, embed_dim, num_heads=8):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = nn.Linear(embed_dim, embed_dim)
        self.key_dense = nn.Linear(embed_dim, embed_dim)
        self.value_dense = nn.Linear(embed_dim, embed_dim)
        self.combine_heads = nn.Linear(embed_dim, embed_dim)

    def attention(self, query, key, value):
        score = pd.matmul(query, key, transpose_y=True)
        dim_key = pd.cast(pd.shape(key)[-1], 'float32')
        scaled_score = score / pd.sqrt(dim_key)
        weights = nn.functional.softmax(scaled_score, axis=-1)
        output = pd.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        x = pd.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return pd.transpose(x, perm=[0, 2, 1, 3])

    def forward(self, inputs):
        # x.shape = [batch_size, seq_len, embedding_dim]
        batch_size = pd.shape(inputs)[0]
        query = self.query_dense(inputs)  # (batch_size, seq_len, embed_dim)
        key = self.key_dense(inputs)  # (batch_size, seq_len, embed_dim)
        value = self.value_dense(inputs)  # (batch_size, seq_len, embed_dim)
        query = self.separate_heads(
            query, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        key = self.separate_heads(
            key, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        value = self.separate_heads(
            value, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        attention, weights = self.attention(query, key, value)
        attention = pd.transpose(
            attention, perm=[0, 2, 1, 3]
        )  # (batch_size, seq_len, num_heads, projection_dim)
        concat_attention = pd.reshape(
            attention, (batch_size, -1, self.embed_dim)
        )  # (batch_size, seq_len, embed_dim)
        output = self.combine_heads(
            concat_attention
        )  # (batch_size, seq_len, embed_dim)
        return output

In [13]:
class PointWiseFeedForwardNetwork(nn.Layer):
    def __init__(self, embed_dim, feed_dim):
        super(PointWiseFeedForwardNetwork, self).__init__()
        self.linear1 = pd.fluid.dygraph.Linear(embed_dim, feed_dim, act='relu')
        self.linear2 = nn.Linear(feed_dim, embed_dim)

    def forward(self, x):
        out = self.linear1(x)
        out = self.linear2(out)
        return out

In [14]:
class TokenAndPositionEmbedding(nn.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = nn.Embedding(vocab_size, embed_dim)
        self.pos_emb = nn.Embedding(maxlen, embed_dim)

    def forward(self, x):
        maxlen = pd.shape(x)[-1]
        positions = pd.arange(start=0, end=maxlen, step=1, dtype='int64')
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [15]:
class TransformerBlock(nn.Layer):
    def __init__(self, embed_dim, num_heads, feed_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = PointWiseFeedForwardNetwork(embed_dim, feed_dim)
        self.layernorm1 = nn.LayerNorm(embed_dim, epsilon=1e-6)
        self.layernorm2 = nn.LayerNorm(embed_dim, epsilon=1e-6)
        self.dropout1 = nn.Dropout(rate)
        self.dropout2 = nn.Dropout(rate)

    def forward(self, inputs):
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)

In [16]:
class MyNet(nn.Layer):
    def __init__(self):
        super(MyNet, self).__init__()
        self.emb = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.trs = TransformerBlock(embed_dim, num_heads, feed_dim)
        self.drop1 = nn.Dropout(0.1)
        self.relu = pd.fluid.dygraph.Linear(feed_dim, 20, act='relu')
        self.drop2 = nn.Dropout(0.1)
        self.soft = pd.fluid.dygraph.Linear(20, 2, act='softmax')

    def forward(self, x):
        x = self.emb(x)
        x = self.trs(x)
        x = pd.mean(x, axis=1)
        x = self.drop1(x)
        x = self.relu(x)
        x = self.drop2(x)
        x = self.soft(x)
        return x

In [22]:
import time

In [186]:
model = pd.Model(MyNet()) # 用 Model封装 MyNet
optim = pd.optimizer.Adam(learning_rate=0.002, parameters=model.parameters())

# 配置模型
model.prepare(optim,pd.nn.CrossEntropyLoss(),metrics=pd.metric.Accuracy())
# # 模型配置
# model.prepare(optimizer=pd.optimizer.Adam(learning_rate=0.001, parameters=model.parameters()),
#               loss=nn.CrossEntropyLoss(),
#               Accuracy())

#visualdl=pd.callbacks.VisualDL(log_dir='visual_log')

# 模型训练
a = time.time()
model.fit(train_loader,
          test_loader,
          epochs=epochs,
          batch_size=batch_size,
          verbose=1,
          save_dir='./model/',save_freq=1)
b = time.time()
print(b-a)


The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/3
save checkpoint at /home/aistudio/model/0
Eval begin...
Eval samples: 416
Epoch 2/3
save checkpoint at /home/aistudio/model/1
Eval begin...
Eval samples: 416
Epoch 3/3
save checkpoint at /home/aistudio/model/2
Eval begin...
Eval samples: 416
save checkpoint at /home/aistudio/model/final
2.8827991485595703


In [187]:
import numpy as np
from sklearn.metrics import confusion_matrix

result = model.predict(test_dataset, batch_size=32, num_workers=0, stack_outputs=True, callbacks=None)

#print(result)
R=result[0]
P=np.argmax(R, axis=1)
print
r = []
for data, label in test_dataset:
    r.append(label)
R = r
confusion_matrix(R, P)

Predict begin...
Predict samples: 424


array([[155,  38],
       [ 14, 217]])

In [188]:
from sklearn.metrics import classification_report
target_names = ["0","1"]
CR=classification_report(R, P, target_names=target_names)
print(CR)

              precision    recall  f1-score   support

           0       0.92      0.80      0.86       193
           1       0.85      0.94      0.89       231

    accuracy                           0.88       424
   macro avg       0.88      0.87      0.87       424
weighted avg       0.88      0.88      0.88       424



import paddle
from paddle.nn import Conv2D, Linear, Embedding
from paddle import to_tensor
import paddle.nn.functional as F

class RNN(paddle.nn.Layer):
    def __init__(self):
        super(RNN, self).__init__()
        self.dict_dim = vocab["<pad>"]
        self.emb_dim = 128
        self.hid_dim = 128
        self.class_dim = 2
        self.embedding = Embedding(
            self.dict_dim + 1, self.emb_dim,
            sparse=False)
        self._fc1 = Linear(self.emb_dim, self.hid_dim)
        self.lstm = paddle.nn.LSTM(self.hid_dim, self.hid_dim)
        self.fc2 = Linear(19200, self.class_dim)

    def forward(self, inputs):
        # [32, 150]
        emb = self.embedding(inputs)
        # [32, 150, 128]
        fc_1 = self._fc1(emb)
        # [32, 150, 128]
        x = self.lstm(fc_1)
        x = paddle.reshape(x[0], [0, -1])
        x = self.fc2(x)
        x = paddle.nn.functional.softmax(x)
        return x

rnn = RNN()
paddle.summary(rnn,(32,150),"int64")

# 四、模型训练

def draw_process(title,color,iters,data,label):
    plt.title(title, fontsize=24)
    plt.xlabel("iter", fontsize=20)
    plt.ylabel(label, fontsize=20)
    plt.plot(iters, data,color=color,label=label) 
    plt.legend()
    plt.grid()
    plt.show()

def train(model):
    model.train()
    opt = pd.optimizer.Adam(learning_rate=0.002, parameters=model.parameters())
    
    steps = 0
    Iters, total_loss, total_acc = [], [], []
    
    for epoch in range(5):
        for batch_id, data in enumerate(train_loader):
            steps += 1
            sent = data[0]
            label = data[1]
            
            logits = model(sent)
            loss = pd.nn.functional.cross_entropy(logits, label)
            acc = pd.metric.accuracy(logits, label)
            recall = pd.metric.Recall()
            precision = pd.metric.Precision()

            if batch_id % 50 == 0:
                Iters.append(steps)
                total_loss.append(loss.numpy()[0])
                total_acc.append(acc.numpy()[0])

                print("epoch: {}, batch_id: {}, loss is: {}".format(epoch, batch_id, loss.numpy()))
            
            loss.backward()
            opt.step()
            opt.clear_grad()

        # evaluate model after one epoch
        model.eval()
        accuracies = []
        losses = []
        
        for batch_id, data in enumerate(test_loader):
            
            sent = data[0]
            label = data[1]

            logits = model(sent)
            loss = pd.nn.functional.cross_entropy(logits, label)
            acc = pd.metric.accuracy(logits, label)
            
            accuracies.append(acc.numpy())
            losses.append(loss.numpy())
        
        avg_acc, avg_loss = np.mean(accuracies), np.mean(losses)

        print("[validation] accuracy: {}, loss: {}".format(avg_acc, avg_loss))
        
        model.train()

    pd.save(model.state_dict(),"model_final.pdparams")
    
    draw_process("trainning loss","red",Iters,total_loss,"trainning loss")
    draw_process("trainning acc","green",Iters,total_acc,"trainning acc")
        
import time
start_time=time.time()
model = MyNet()
train(model)
end_time=time.time()
running_time=end_time-start_time
print(running_time)

# 五、模型评估

In [42]:
'''
模型评估
'''
model_state_dict = pd.load('/home/aistudio/model/final.pdparams')
model = MyNet()
model.set_state_dict(model_state_dict) 
model.eval()
accuracies = []
losses = []
recalles = []
precisiones = []

for batch_id, data in enumerate(test_loader):
    
    sent = data[0]
    label = data[1]

    logits = model(sent)
    loss = pd.nn.functional.cross_entropy(logits, label)
    acc = pd.metric.accuracy(logits, label)

    
    accuracies.append(acc.numpy())
    losses.append(loss.numpy())
    

avg_acc, avg_loss = np.mean(accuracies), np.mean(losses)
print("[validation] accuracy: {}, loss: {}".format(avg_acc, avg_loss))

[validation] accuracy: 0.838942289352417, loss: 0.46633583307266235


# 六、模型预测

In [40]:


label_map = {0:"是", 1:"否"}

model_state_dict = pd.load('/home/aistudio/model/final.pdparams')
model = MyNet()
model.set_state_dict(model_state_dict) 
model.eval()

for batch_id, data in enumerate(test_loader):
    
    sent = data[0]
    results = model(sent)

    predictions = []
    for probs in results:
        # 映射分类label
        idx = np.argmax(probs)
        labels = label_map[idx]
        predictions.append(labels)
    
    for i,pre in enumerate(predictions):
        print('数据: {} \n\n是否谣言: {}'.format(ids_to_str(sent[0]), pre))
        break
    break

1
1
1
1
0
0
1
1
0
1
1
0
1
0
0
1
数据: 【 国 家 工 商 总 局 决 定 停 止 企 业 年 检 】 中 国 国 家 工 商 总 局 决 定 ， 自 今 年 3 月 1 日 起 停 止 对 领 取 营 业 执 照 的 有 限 责 任 公 司 、 股 份 有 限 公 司 、 非 公 司 企 业 法 人 、 合 伙 企 业 、 个 人 独 资 企 业 及 其 分 支 机 构 、 来 华 从 事 经 营 活 动 的 外 国 （ 地 区 ） 企 业 ， 及 其 他 经 营 单 位 的 企 业 年 检 工 作 。 新 华 网 h t t p : / / t . c n / 8 F H w G 7 9   h t t p : / / t . 

是否谣言: 否
