# 一、环境设置
本示例基于飞桨开源框架2.0版本。

In [33]:
import ast
import io, random, json
import os, zipfile
import sys

import matplotlib.pyplot as plt
import numpy as np
import paddle
import paddle.nn.functional as F
from paddle import to_tensor
from paddle.nn import Conv2D, Linear, Embedding
# Show the installed PaddlePaddle version (the recorded run printed 2.0.2).
print(paddle.__version__)



2.0.2


# 二、数据准备

（1）解压数据，读取并解析数据，生成all_data.txt

（2）生成数据字典，即dict.txt

（3）生成数据列表，并划分训练集与验证集，分别保存为 train_list.txt、eval_list.txt

（4）定义训练数据集提供器

In [34]:

# Unpack the raw dataset archive (Rumor_Dataset.zip) into the data directory.
src_path="data/data20519/Rumor_Dataset.zip"
target_path="/home/aistudio/data/Chinese_Rumor_Dataset-master"
# Extraction is skipped when the target directory already exists, so
# re-running the cell does not unpack the archive a second time.
if not os.path.isdir(target_path):
    # The context manager closes the archive even if extraction raises.
    with zipfile.ZipFile(src_path, 'r') as archive:
        archive.extractall(path=target_path)





In [35]:


# Dataset locations. The two "*-repost" directories are only listed to obtain
# file names; the post itself is read from the original-microblog directory
# under the same file name.
rumor_class_dirs = os.listdir(target_path+"/Chinese_Rumor_Dataset-master/CED_Dataset/rumor-repost/")
non_rumor_class_dirs = os.listdir(target_path+"/Chinese_Rumor_Dataset-master/CED_Dataset/non-rumor-repost/")
original_microblog = target_path+"/Chinese_Rumor_Dataset-master/CED_Dataset/original-microblog/"

# Class labels as they are written to the data files.
rumor_label="0"
non_rumor_label="1"


def _read_labeled_samples(file_names, label):
    """Return one sample line per microblog JSON file named in *file_names*.

    Each returned string is *label*, a tab, the record's "text" field and a
    newline. The macOS metadata entry '.DS_Store' is skipped.
    """
    samples = []
    for file_name in file_names:
        if file_name == '.DS_Store':
            continue
        # Decode explicitly as UTF-8 instead of relying on the platform
        # default encoding (assumes the dataset JSON is UTF-8 - TODO confirm).
        with open(original_microblog + file_name, 'r', encoding='utf-8') as f:
            record = json.load(f)
        samples.append(label + "\t" + record["text"] + "\n")
    return samples


# Parse both classes; the counters record how many samples each class has.
all_rumor_list = _read_labeled_samples(rumor_class_dirs, rumor_label)
all_non_rumor_list = _read_labeled_samples(non_rumor_class_dirs, non_rumor_label)
rumor_num = len(all_rumor_list)
non_rumor_num = len(all_non_rumor_list)
        

In [36]:
# Shuffle all samples and write them to all_data.txt.

data_list_path="/home/aistudio/data/"
all_data_path=data_list_path + "all_data.txt"

all_data_list = all_rumor_list + all_non_rumor_list

random.shuffle(all_data_list)

# Mode 'w' already truncates an existing file, so no separate clearing pass is
# needed. The file is written as UTF-8 because it is read back with
# encoding='utf-8' when the dictionary and data lists are generated.
with open(all_data_path, 'w', encoding='utf-8') as f:
    f.writelines(all_data_list)

In [37]:
# Build the character dictionary.
def create_dict(data_path, dict_path):
    """Build a character-to-id vocabulary from *data_path* and save it.

    Every line of *data_path* is split on tabs; the part after the last tab
    (without the trailing newline) is treated as the text and broken into
    single characters.

    The n distinct characters receive ids 0..n-1 in sorted order, then
    "<unk>" gets id n and "<pad>" gets id n+1. The mapping is written to
    *dict_path* as the ``str()`` of a Python dict, overwriting any previous
    content.

    Args:
        data_path: path of the UTF-8 text file holding all samples.
        dict_path: path of the dictionary file to (re)create.
    """
    chars = set()
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            content = line.split('\t')[-1].replace('\n', '')
            chars.update(content)

    # Ids are assigned in sorted order: iteration order of a set of strings
    # changes between interpreter runs, which would silently remap every id
    # and make previously saved model weights meaningless.
    dict_txt = {ch: idx for idx, ch in enumerate(sorted(chars))}
    # Special tokens go after all real characters.
    dict_txt["<unk>"] = len(chars)
    dict_txt["<pad>"] = len(chars) + 1

    # Mode 'w' truncates the file, so no separate clearing step is required.
    with open(dict_path, 'w', encoding='utf-8') as f:
        f.write(str(dict_txt))

    print("数据字典生成完成！")


In [38]:
# 创建序列化表示的数据,并按照一定比例划分训练数据train_list.txt与验证数据eval_list.txt
def load_vocab(file_path):
    """Load the vocabulary stored as a Python dict literal in *file_path*.

    Args:
        file_path: path of a UTF-8 file whose whole content is a dict literal.

    Returns:
        The parsed dictionary.

    Raises:
        ValueError, SyntaxError: if the content is not a plain Python literal.
    """
    # The context manager closes the file even when parsing fails.
    with open(file_path, 'r', encoding='utf8') as fr:
        # literal_eval accepts only literals, so unlike eval() a tampered
        # dictionary file cannot execute arbitrary code.
        vocab = ast.literal_eval(fr.read())

    return vocab

def f_write_txt(words, dict_txt, label):
    """Encode *words* as one line of a data list file.

    Args:
        words: the text; it is iterated character by character.
        dict_txt: mapping from character to integer id.
        label: class label as a string.

    Returns:
        The comma-separated ids, a tab, *label* and a newline. Empty *words*
        yields just the tab, the label and the newline.

    Raises:
        KeyError: if a character is missing from *dict_txt* and *dict_txt*
            has no "<unk>" entry to fall back on.
    """
    unk_id = dict_txt.get("<unk>")
    ids = []
    for s in words:
        if s in dict_txt:
            ids.append(str(dict_txt[s]))
        elif unk_id is not None:
            # Characters absent from the dictionary map to the unknown token
            # instead of aborting the whole conversion.
            ids.append(str(unk_id))
        else:
            raise KeyError(s)
    # join avoids the quadratic cost of growing a string piece by piece.
    return ','.join(ids) + '\t' + label + '\n'

def create_data_list(data_path, train_path, test_path, dict_path):
    """Convert the raw samples to id sequences and split them into two lists.

    Reads every line of *data_path*, encodes its text with the vocabulary
    loaded from *dict_path*, and appends the encoded line either to
    *test_path* or to *train_path*. Both outputs are opened in append mode,
    so existing content is kept. Finally prints a completion message and the
    length of the longest text seen.
    """
    vocab_map = load_vocab(dict_path)
    with open(data_path, 'r', encoding='utf-8') as src:
        samples = src.readlines()

    longest = 0
    with open(test_path, 'a', encoding='utf-8') as eval_out, \
            open(train_path, 'a', encoding='utf-8') as train_out:
        for idx, sample in enumerate(samples):
            fields = sample.split('\t')
            # Text is the part after the last tab, label the part before the first.
            text = fields[-1].replace('\n', '')
            longest = max(longest, len(text))
            encoded = f_write_txt(text, vocab_map, fields[0])
            # Samples at index 0, 7, 14, ... (one in seven) go to the
            # evaluation list; everything else is training data.
            sink = train_out if idx % 7 else eval_out
            sink.write(encoded)
    print("数据列表生成完成！")
    print(longest)




In [39]:
# All generated files live in the same root directory as all_data.txt.
data_root_path = "/home/aistudio/data/" 
data_path = os.path.join(data_root_path, 'all_data.txt')
# Built from data_root_path like the other paths, so this cell does not depend
# on a variable defined in an unrelated cell.
train_path = os.path.join(data_root_path, 'train_list.txt')
test_path = os.path.join(data_root_path, 'eval_list.txt')
dict_path = os.path.join(data_root_path, "dict.txt")

# Build the character dictionary.
create_dict(data_path, dict_path)

# create_data_list opens its outputs in append mode, so both list files are
# emptied first; opening with mode 'w' is enough to truncate them.
for list_path in (test_path, train_path):
    with open(list_path, 'w', encoding='utf-8'):
        pass

# Build the train/eval data lists.
create_data_list(data_path, train_path, test_path, dict_path)



数据字典生成完成！
数据列表生成完成！
226


In [40]:
# Load the vocabulary; it is used to decode ids when the first 2 training
# samples are printed.
vocab = load_vocab(dict_path)

def ids_to_str(ids):
    """Decode an iterable of token ids into a space-separated string.

    Args:
        ids: iterable of values accepted by ``int()`` (e.g. digit strings).

    Returns:
        The vocabulary entries for the ids, joined by single spaces.

    Raises:
        ValueError: if an id has no entry in the global ``vocab``.
    """
    # Build the id -> token table once per call instead of rebuilding the key
    # and value lists of the whole vocabulary for every single id.
    id_to_token = {}
    for token, token_id in vocab.items():
        # setdefault keeps the first token for a repeated id, matching what a
        # linear search over the values would return.
        id_to_token.setdefault(token_id, token)

    words = []
    for k in ids:
        try:
            w = id_to_token[int(k)]
        except KeyError:
            raise ValueError("{} is not in vocab".format(int(k))) from None
        # Byte-string tokens are decoded so that join() only sees str.
        words.append(w if isinstance(w, str) else w.decode('ASCII'))
    return " ".join(words)

# Print the first 2 well-formed samples of the training list, both as raw ids
# and decoded back to characters.
with io.open(train_path, "r", encoding='utf8') as fin:
    shown = 0
    for i, line in enumerate(fin, start=1):
        cols = line.strip().split("\t")
        if len(cols) != 2:
            sys.stderr.write("[NOTICE] Error Format Line!")
            continue
        label = int(cols[1])
        wids = cols[0].split(",")
        # i is the 1-based line number in the file.
        print(str(i)+":")
        print('sentence list id is:', wids)
        print('sentence list is: ', ids_to_str(wids))
        print('sentence label id is:', label)
        print('---------------------------------')

        # Count printed samples rather than lines read: stopping on the line
        # number would never trigger if line 2 were malformed, and the whole
        # file would then be printed.
        shown += 1
        if shown == 2:
            break




1:
sentence list id is: ['1900', '176', '3761', '3249', '4400', '4299', '1634', '3455', '3249', '3247', '1882', '1359', '117', '1108', '2191', '1946', '1760', '1170', '1170', '950', '4235', '2598', '1204', '687', '1082', '557', '1082', '3487', '949', '32', '1672', '2844', '3249', '442', '1859', '3973', '2699', '123', '1442', '4299', '1634', '1615', '1193', '951', '110', '1795', '1247', '2026', '514', '1615', '3847', '3557', '4235', '1870', '4120', '1518', '624', '666', '3814', '1297', '2026', '103', '1442', '3119', '497', '1615', '6', '1647', '4299', '1634', '1041', '2315', '1615', '4299', '1634', '1167', '1647', '3455', '1557', '1615', '465', '4011', '1112', '3352', '3585', '1985', '1134', '3677', '2032', '2635', '1204', '687', '1082', '3480', '3249', '4051', '154', '1598', '666', '1744', '624', '666', '2010', '3455', '51', '1442', '4299', '1634', '167', '4242', '2032', '1082', '3487', '3933', '36', '4012', '1268', '3110', '2598', '3460', '2776', '1615', '2327', '557', '1769', '952', 

In [41]:



class RumorDataset(paddle.io.Dataset):
    """In-memory dataset of (token ids, label) pairs read from a data list file.

    Each line of the file holds comma-separated token ids, a tab and an
    integer label. Lines that do not split into exactly two tab-separated
    columns are reported on stderr and skipped.
    """

    def __init__(self, data_dir, max_len=150, pad_id=None):
        """Read and pad/truncate every sample of the list file.

        Args:
            data_dir: path of the data list file (a file path, despite the name).
            max_len: fixed sequence length; longer samples are cut, shorter
                ones are right-padded.
            pad_id: id used for padding; defaults to the global
                ``vocab["<pad>"]``.
        """
        super().__init__()
        self.data_dir = data_dir
        self.max_len = max_len
        self.all_data = []
        if pad_id is None:
            pad_id = vocab["<pad>"]

        with io.open(self.data_dir, "r", encoding='utf8') as fin:
            for line in fin:
                cols = line.strip().split("\t")
                if len(cols) != 2:
                    sys.stderr.write("[NOTICE] Error Format Line!")
                    continue
                # Truncate first, then convert, so only the ids that are
                # actually kept have to be valid integers.
                token_ids = [int(w) for w in cols[0].split(",")[:max_len]]
                # Pad with integers directly instead of concatenating a
                # string array with an int list and casting afterwards.
                token_ids.extend([pad_id] * (max_len - len(token_ids)))
                wids = np.array(token_ids, dtype='int64')
                label = np.array([int(cols[1])], dtype='int64')
                self.all_data.append((wids, label))

    def __getitem__(self, index):
        """Return the (ids, label) pair stored at *index*."""
        data, label = self.all_data[index]
        return data, label

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.all_data)


batch_size = 32
train_dataset = RumorDataset(train_path)
test_dataset = RumorDataset(test_path)

# drop_last=True discards the final incomplete batch, so every batch holds
# exactly batch_size samples.
train_loader = paddle.io.DataLoader(train_dataset, places=paddle.CPUPlace(), return_list=True,
                                    shuffle=True, batch_size=batch_size, drop_last=True)
# The evaluation loader is not shuffled: together with drop_last=True,
# shuffling would drop a different random set of samples on every pass and
# make the reported metrics vary between evaluations of the same model.
test_loader = paddle.io.DataLoader(test_dataset, places=paddle.CPUPlace(), return_list=True,
                                    shuffle=False, batch_size=batch_size, drop_last=True)


# Sanity check: print the first sample, its shape and its label for each dataset.
for banner, dataset in (('=============train_dataset =============', train_dataset),
                        ('=============test_dataset =============', test_dataset)):
    print(banner)
    for data, label in dataset:
        print(data)
        print(np.array(data).shape)
        print(label)
        break




[1900  176 3761 3249 4400 4299 1634 3455 3249 3247 1882 1359  117 1108
 2191 1946 1760 1170 1170  950 4235 2598 1204  687 1082  557 1082 3487
  949   32 1672 2844 3249  442 1859 3973 2699  123 1442 4299 1634 1615
 1193  951  110 1795 1247 2026  514 1615 3847 3557 4235 1870 4120 1518
  624  666 3814 1297 2026  103 1442 3119  497 1615    6 1647 4299 1634
 1041 2315 1615 4299 1634 1167 1647 3455 1557 1615  465 4011 1112 3352
 3585 1985 1134 3677 2032 2635 1204  687 1082 3480 3249 4051  154 1598
  666 1744  624  666 2010 3455   51 1442 4299 1634  167 4242 2032 1082
 3487 3933   36 4012 1268 3110 2598 3460 2776 1615 2327  557 1769  952
 4223  557 1951 4259  196 2032 2032 2032 1882  487 1108 4410 4410 4410
 4410 4410 4410 4410 4410 4410 4410 4410 4410 4410]
(150,)
[0]
[1882 2087 2868 1108 1882 2087 2868 1108 1900  307 2903 1306 4066 2648
 2244 3828 3650  151 1082  673 1193 1544 4122 3972 4156 2779 3061  955
 4257 3056  221 2191 4270 1748  880 1268 1615 2915  250  307 2903 2779
 1306 4066 263

# 三、模型配置

In [42]:







# Convolutional text classifier.
class Conv_FFD(paddle.nn.Layer):
    """Embedding -> 2-D convolution -> ReLU -> max-pool -> linear classifier.

    Sub-layer names (embedding, hidden1, hidden3, hidden4) are unchanged so
    that previously saved state dicts still load.
    """

    def __init__(self):
        super(Conv_FFD,self).__init__()
        # "<pad>" holds the largest id in the vocabulary, so the embedding
        # table needs dict_dim + 1 rows.
        self.dict_dim = vocab["<pad>"]
        self.emb_dim = 128
        self.hid_dim = 128
        self.fc_hid_dim = 96
        self.class_dim = 2
        self.channels = 1
        self.win_size = [3, self.hid_dim]
        # No longer used by forward(); kept so existing attribute reads work.
        self.batch_size = 32
        self.seq_len = 150
        self.embedding = Embedding(self.dict_dim + 1, self.emb_dim, sparse=False)
        self.hidden1 = paddle.nn.Conv2D(in_channels=self.channels,         # input channels
                                            out_channels=self.hid_dim,        # number of filters
                                            kernel_size=self.win_size,        # filter size
                                            padding=[1, 1]
                                            )
        self.relu1 = paddle.nn.ReLU()
        self.hidden3 = paddle.nn.MaxPool2D(kernel_size=2,         # pooling window
                                            stride=2)             # pooling stride
        # With the sizes above the pooled feature map is
        # [hid_dim, seq_len // 2, 1], i.e. 128 * 75 features per sample.
        self.hidden4 = paddle.nn.Linear(self.hid_dim * (self.seq_len // 2), self.class_dim)

    # Forward pass of the network.
    def forward(self,input):
        """Return class logits for a batch of token-id sequences.

        Args:
            input: integer tensor of token ids; assumed to be
                [batch, seq_len] as produced by the data loader - TODO confirm.

        Returns:
            Tensor of shape [batch, class_dim].
        """
        x = self.embedding(input)
        # Add the channel axis expected by Conv2D. -1 lets the batch size be
        # inferred, so the model is not tied to batches of exactly 32.
        x = paddle.reshape(x, [-1, self.channels, self.seq_len, self.emb_dim])
        x = self.hidden1(x)
        x = self.relu1(x)
        x = self.hidden3(x)
        # Flatten everything except the batch axis for the linear layer.
        x = paddle.flatten(x, start_axis=1)
        out = self.hidden4(x)
        return out

# 四、模型训练

In [32]:
def draw_process(title,color,iters,data,label):
    """Plot *data* against *iters* as one labelled curve and display the figure.

    *title* is the figure title, *color* the line colour, and *label* is used
    both as the legend entry and as the y-axis caption.
    """
    plt.plot(iters, data, color=color, label=label)
    plt.title(title, fontsize=24)
    for set_caption, caption in ((plt.xlabel, "iter"), (plt.ylabel, label)):
        set_caption(caption, fontsize=20)
    plt.legend()
    plt.grid()
    plt.show()

def train(model, epochs=3):
    """Train *model* on the global train_loader and validate after each epoch.

    Uses Adam with learning rate 0.001. Every 50th batch the loss and accuracy
    are recorded and the loss is printed. After each epoch the model is
    evaluated on the global test_loader. At the end the weights are saved to
    "model_final.pdparams" and the recorded curves are drawn with draw_process.

    Args:
        model: the network to train; called as ``model(sent)`` to get logits.
        epochs: number of passes over train_loader (default 3).
    """
    model.train()
    opt = paddle.optimizer.Adam(learning_rate=0.001, parameters=model.parameters())
    steps = 0
    Iters, total_loss, total_acc = [], [], []
    for epoch in range(epochs):
        for batch_id, data in enumerate(train_loader):
            steps += 1
            sent = data[0]
            label = data[1]
            logits = model(sent)
            loss = paddle.nn.functional.cross_entropy(logits, label)
            acc = paddle.metric.accuracy(logits, label)
            if batch_id % 50 == 0:
                Iters.append(steps)
                # .item() reads the single value whether the tensor has shape
                # [1] or is 0-dimensional; indexing with [0] needs shape [1].
                total_loss.append(loss.numpy().item())
                total_acc.append(acc.numpy().item())
                print("epoch: {}, batch_id: {}, loss is: {}".format(epoch, batch_id, loss.numpy()))

            loss.backward()
            opt.step()
            opt.clear_grad()

        # Evaluate the model after each epoch.
        model.eval()
        accuracies = []
        losses = []
        # No gradients are needed for validation; disabling autograd avoids
        # recording a backward graph for every evaluation batch.
        with paddle.no_grad():
            for data in test_loader:
                sent = data[0]
                label = data[1]
                logits = model(sent)
                loss = paddle.nn.functional.cross_entropy(logits, label)
                acc = paddle.metric.accuracy(logits, label)
                accuracies.append(acc.numpy())
                losses.append(loss.numpy())

        avg_acc, avg_loss = np.mean(accuracies), np.mean(losses)
        print("[validation] accuracy: {}, loss: {}".format(avg_acc, avg_loss))
        # Switch back to training mode for the next epoch.
        model.train()

    paddle.save(model.state_dict(),"model_final.pdparams")
    draw_process("trainning loss","red",Iters,total_loss,"trainning loss")
    draw_process("trainning acc","green",Iters,total_acc,"trainning acc")
        
# Build the Conv_FFD classifier and run the training loop
# (an earlier `model = CNN()` alternative was commented out here).
model = Conv_FFD()
train(model)

epoch: 0, batch_id: 0, loss is: [0.6900476]
epoch: 0, batch_id: 50, loss is: [0.7203587]
[validation] accuracy: 0.824999988079071, loss: 0.47299620509147644
epoch: 1, batch_id: 0, loss is: [0.3159663]
epoch: 1, batch_id: 50, loss is: [0.26881313]
[validation] accuracy: 0.8416666388511658, loss: 0.3673580586910248
epoch: 2, batch_id: 0, loss is: [0.11671057]
epoch: 2, batch_id: 50, loss is: [0.05700203]


# 五、模型评估

In [None]:
# Model evaluation: reload the saved weights and report the mean accuracy and
# loss over all batches of test_loader.
model_state_dict = paddle.load('model_final.pdparams')
model = Conv_FFD()
model.set_state_dict(model_state_dict) 
model.eval()
accuracies = []
losses = []

# Inference only: disable autograd so no backward graph is recorded.
with paddle.no_grad():
    for batch_id, data in enumerate(test_loader):
        sent = data[0]
        label = data[1]

        logits = model(sent)
        loss = paddle.nn.functional.cross_entropy(logits, label)
        acc = paddle.metric.accuracy(logits, label)

        accuracies.append(acc.numpy())
        losses.append(loss.numpy())

avg_acc, avg_loss = np.mean(accuracies), np.mean(losses)
print("[validation] accuracy: {}, loss: {}".format(avg_acc, avg_loss))

In [None]:
# Confusion matrix and per-class precision/recall on the evaluation set.
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

model_state_dict = paddle.load('model_final.pdparams')
model = Conv_FFD()
model.set_state_dict(model_state_dict) 
model.eval()
predictions = []
r = []
# Inference only: disable autograd so no backward graph is recorded.
with paddle.no_grad():
    for batch_id, data in enumerate(test_loader):
        sent = data[0]
        # Flatten the label batch to plain ints (presumably shape
        # (batch, 1) - verify against the dataset) so the metrics receive a
        # 1-D list instead of a list of 1-element arrays.
        r.extend(data[1].numpy().reshape(-1).tolist())
        results = model(sent)
        # Predicted class = index of the largest logit in each row.
        predictions.extend(results.numpy().argmax(axis=1).tolist())

# Print the matrix: a bare expression in the middle of a cell is discarded.
print(confusion_matrix(r, predictions))
target_names = ["0","1"]
CR=classification_report(r, predictions, target_names=target_names)
print(CR)

# 六、模型预测

In [None]:


# Display names for the two class ids.
label_map = {0:"谣言", 1:"不是谣言"}

# Restore the trained weights into a fresh model and switch to inference mode.
model_state_dict = paddle.load('model_final.pdparams')
model = Conv_FFD()
model.set_state_dict(model_state_dict) 
model.eval()

# Only the first batch is used, and only its first sample is printed.
for batch_id, data in enumerate(test_loader):
    sent, gt_labels = data[0], data[1].numpy()
    results = model(sent)

    # Map every row of model output to the display name of its arg-max class.
    predictions = [label_map[np.argmax(probs)] for probs in results]

    if predictions:
        # Decode the first sentence, then strip separators and padding tokens.
        decoded = ids_to_str(sent[0]).replace(" ", "").replace("<pad>","")
        print('数据: {} \n\n预测: {} \n原始标签：{}'.format(decoded, predictions[0], label_map[gt_labels[0][0]]))
    break