In [1]:
import json, os
import pickle
from tqdm import tqdm

#PyTorch用的包
import torch
import torch.nn as nn
import torch.optim
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# 自然语言处理相关的包
import re #正则表达式的包
import jieba #结巴分词包
from collections import Counter #搜集器，可以让统计词频更简单

#绘图、计算用的程序包
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# 设置随机种子保证可复现
import random
SEED = 729608
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# os python hash seed, make experiment reproducable
os.environ['PYTHONHASHSEED'] = str(SEED)
# gpu algorithom 
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
# Generator SEED
Generator = torch.Generator()
Generator.manual_seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm


<torch._C.Generator at 0x7f466cf927d0>

In [2]:
# 数据来源文件夹 -- 内含多个json文件
non_rumor = './Chinese_Rumor_Dataset/CED_Dataset/non-rumor-repost'
rumor = './Chinese_Rumor_Dataset/CED_Dataset/rumor-repost'
original = './Chinese_Rumor_Dataset/CED_Dataset/original-microblog'

non_rumor_data = []
rumor_data = []

# 遍历文件夹，读取文本数据
print('开始读取数据')
for file in tqdm(os.listdir(original)):
    try:
        data = json.load(open(os.path.join(original, file), 'rb'))['text']
    except:
        continue

    is_rumor = (file in os.listdir(rumor))
    if is_rumor:
        rumor_data.append(data)
    else:
        non_rumor_data.append(data)

print('结束, 有{}条谣言, 有{}条非谣言!'.format(len(rumor_data), len(non_rumor_data)))
print(non_rumor_data[-2:])
print('-'*20)
print(rumor_data[-2:])


# 把数据储存到指定地方 -- 统一到2个txt文件
pth = './rumor_detection_data'
if not os.path.exists(pth):
    os.makedirs(pth)

good_file = os.path.join(pth, 'non_rumor.txt')
bad_file = os.path.join(pth, 'rumor.txt')

# with open(good_file, 'w', encoding='utf-8') as f:
#     f.write('\n'.join(non_rumor_data))
# with open(bad_file, 'w', encoding='utf-8') as f:
#     f.write('\n'.join(rumor_data))

开始读取数据


100%|██████████| 3389/3389 [00:01<00:00, 2185.01it/s]

结束, 有1538条谣言, 有1849条非谣言!
['＂有时最坚强的人是 - 不因缺陷而不爱，关起门才哭泣，和打没有人知道的仗。＂(Sometimes the strongest people are the one love beyond all faults, cry behind closed doors and fight battles that nobody knows about。)', '想研究外星人么？ 去爱大吧！！ 爱丁堡大学计划今年年底第一次提供寻找外星人的课程。。"Introduction to Astrobiology and the Search for Extraterrestrial Life" 将由学校的星际生物学教授Charles Cockell 讲授！']
--------------------
['【#越南乳瓜#】神奇的越南乳瓜，神似女性的乳房，颜色略呈粉色。乳瓜成熟果实含葡萄糖、果糖、蔗糖、胡萝卜素、维生素C、酒石酸、枸椽酸、苹果酸等。未成熟果实的汁液中含多量的乳瓜蛋白酶、脂肪酶。营养价值也很高。可是这样的瓜你敢吃吗。#我心中的乳神# \u200b', '【阿婆厕所捡婴儿 被计生干部查获后活活摔死】15日下午，刘阿婆从乡财政所厕所粪便中将孩子捞起，简单清洗，剪脐带打针消毒。处理妥当正给孩子喂水时，武汉黄陂区蔡店乡计生办5人出现，夺走孩子掼在地上，用脚踢，放稻田里淹。引起当地群众的公愤。 @上海派對SHClubbing']





In [3]:
# 将文本中的标点符号过滤掉
def filter_punc(sentence):
    sentence = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？?、~@#￥%……&*（）：:；“”】》《-【\][]", "",sentence.strip())
    return sentence

# 扫描所有的文本，分词、建立词典，分出是谣言还是非谣言，is_filter可以过滤是否筛选掉标点符号
def Prepare_data(good_file, bad_file, is_filter = True):
    max_len = 0
    pos_sentences = [] #存储非谣言
    neg_sentences = [] #存储谣言
    with open(good_file, 'r', encoding='utf-8') as fr:
        for idx, line in enumerate(fr):
            if is_filter:
                #过滤标点符号
                line = filter_punc(line)
                if not idx: # 只打印第一个例子看看
                    print('分词前：', line)
            if len(line) > 0:
                pos_sentences.append(line)
                max_len = max(max_len,len(line))
    print('{0} 包含 {1} 行'.format(good_file, idx+1))

    with open(bad_file, 'r', encoding='utf-8') as fr:
        for idx, line in enumerate(fr):
            if is_filter:
                line = filter_punc(line.strip())
            if len(line) > 0:
                neg_sentences.append(line)
                max_len = max(max_len,len(line))
    print('{0} 包含 {1} 行'.format(bad_file, idx+1))
    print(f'最长的句子里的未分词的词数:{max_len}')

    return pos_sentences, neg_sentences


pos_sentences, neg_sentences =  Prepare_data(good_file, bad_file, is_filter=False)

./rumor_detection_data/non_rumor.txt 包含 1849 行
./rumor_detection_data/rumor.txt 包含 1538 行
最长的句子里的未分词的词数:227


In [4]:
labels = [] #标签
sentences = [] #原始句子，未分词

# 处理非谣言
for sentence in pos_sentences:
    labels.append(0) #正标签为0 表示非谣言数据
    sentences.append(sentence)

# 处理谣言
for sentence in neg_sentences:
    labels.append(1) #负标签为1
    sentences.append(sentence)

# 打乱所有的数据顺序，形成数据集
# indices为所有数据下标的一个全排列
indices = np.random.permutation(len(sentences))

#对整个数据集进行划分，分为：训练集、验证集和测试集，这里是2:1:1
test_size = len(sentences) // 4

data = {
    'labels': labels,# 标签
    'sentences': sentences,# 句子
}
split = {
    'train': indices[2*test_size:],
    'vali': indices[:test_size],
    'test': indices[test_size:2*test_size]
}

In [5]:
# 测试一下划分情况
while True:
    tag = True
    for key, indices in split.items():
        count = [0, 0]
        for idx in indices:
            count[labels[idx]] += 1
        # 如果类别过于不平衡，则重新随机化
        cls_ratio = count[0]/count[1]
        ratio_threshold = 1.5
        if cls_ratio > ratio_threshold or cls_ratio < 1/ratio_threshold:
            indices = np.random.permutation(len(bow))
            tag = False
            break
        print(key, '非谣言有{}条，谣言有{}条'.format(count[0], count[1]))
        if key==2:
            tag = True
    if tag:
        break

train 非谣言有938条，谣言有757条
vali 非谣言有478条，谣言有368条
test 非谣言有433条，谣言有413条


In [6]:
# coding: UTF-8
sys.path.append('./Bert_Chinese_Text_Classification_Pytorch/')
import time
import torch
import numpy as np
from Bert_Chinese_Text_Classification_Pytorch.train_eval import train, init_network
from importlib import import_module
import argparse
from Bert_Chinese_Text_Classification_Pytorch.utils import build_rumor_dataset, build_iterator, get_time_dif



def run(data,split,bert_path='./Bert_Chinese_Text_Classification_Pytorch/bert_pretrain',freeze_bert=False):
    split_indices = split
    data = data

    model_name = 'bert'
    x = import_module('models.' + model_name)
    config = x.Rumor_Config(split_indices)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed_all(1)
    torch.backends.cudnn.deterministic = True  # 保证每次结果一样

    start_time = time.time()
    print("Loading data...")
    train_data, dev_data, test_data = build_rumor_dataset(data,config)
    train_iter = build_iterator(train_data, config)
    dev_iter = build_iterator(dev_data, config)
    test_iter = build_iterator(test_data, config)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # train
    model = x.Model(config).to(config.device)
    train(config, model, train_iter, dev_iter, test_iter,freeze_bert=freeze_bert)

run(data,split,freeze_bert=False)
# run(data,split,bert_path='./Bert_Chinese_Text_Classification_Pytorch/cn_bert_wwm',freeze_bert=False)


[3060  297  466 ... 2818 2004  126]
<pytorch_pretrained.tokenization.BertTokenizer object at 0x7f466aba2580>
./Bert_Chinese_Text_Classification_Pytorch/cn_bert_wwm
Loading data...


100%|██████████| 1695/1695 [00:00<00:00, 3365.67it/s]


最长的分词后的词数:156


100%|██████████| 846/846 [00:00<00:00, 3313.79it/s]


最长的分词后的词数:171


100%|██████████| 846/846 [00:00<00:00, 3386.75it/s]


最长的分词后的词数:156
Time usage: 0:00:01
Epoch [1/50]


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:1005.)
  next_m.mul_(beta1).add_(1 - beta1, grad)


Iter:      0,  Train Loss:  0.73,  Train Acc: 43.75%,  Val Loss:  0.68,  Val Acc: 56.86%,  Time: 0:00:05 *
Iter:    100,  Train Loss:  0.21,  Train Acc: 93.75%,  Val Loss:   0.4,  Val Acc: 82.74%,  Time: 0:00:36 *
Epoch [2/50]
Iter:    200,  Train Loss:  0.49,  Train Acc: 81.25%,  Val Loss:  0.43,  Val Acc: 82.27%,  Time: 0:01:07 
Epoch [3/50]
Iter:    300,  Train Loss: 0.074,  Train Acc: 100.00%,  Val Loss:  0.31,  Val Acc: 89.13%,  Time: 0:01:38 *
Epoch [4/50]
Iter:    400,  Train Loss: 0.042,  Train Acc: 100.00%,  Val Loss:  0.35,  Val Acc: 88.42%,  Time: 0:02:08 
Epoch [5/50]
Iter:    500,  Train Loss: 0.0038,  Train Acc: 100.00%,  Val Loss:  0.35,  Val Acc: 90.19%,  Time: 0:02:39 *
Epoch [6/50]
Iter:    600,  Train Loss: 0.035,  Train Acc: 100.00%,  Val Loss:  0.35,  Val Acc: 90.31%,  Time: 0:03:11 *
Epoch [7/50]
Iter:    700,  Train Loss: 0.0016,  Train Acc: 100.00%,  Val Loss:  0.54,  Val Acc: 88.30%,  Time: 0:03:41 
Epoch [8/50]
Iter:    800,  Train Loss: 0.00079,  Train Acc: 1