In [1]:
import os
import re
import jieba
import random
import json
from langconv import *

import nltk
import nltk.metrics
from sklearn.naive_bayes import BernoulliNB
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.metrics import classification_report

import collections

from tqdm import tqdm

# Dataset locations: the tab-separated label file, and the directory that
# holds one "<event id>.json" file per event.
weibo_label_path = "/home/lchen/Datasets/weibo/Weibo.txt"
path = "/home/lchen/Datasets/weibo/Weibo"

class NB:
    """Naive Bayes rumor / non-rumor classifier over Weibo source posts.

    Usage: construct with the dataset locations, call ``load_data()`` to build
    ``self.train`` and ``self.test``, then call ``NaiveBayes(n)`` to train the
    classifier and print its evaluation.
    """

    def __init__(self, path, weibo_label_path):
        """Remember where the dataset lives.

        Args:
            path: directory containing one ``<event id>.json`` file per event.
            weibo_label_path: tab-separated label file parsed by ``load_label``.
        """
        self.path = path
        self.weibo_label_path = weibo_label_path

    def load_label(self, weibo_label_path):     # nonrumor-0 rumor-1
        """Read the label file and group event IDs by label.

        The first two tab-separated fields of each line are expected to be of
        the form ``key:value``; the first value is taken as the event ID and
        the second as the label ('0' = non-rumor, '1' = rumor). Any further
        fields on the line are ignored, as are entries with any other label.

        Args:
            weibo_label_path: path of the label file.

        Returns:
            ``[[nonrumor_id_0, ...], [rumor_id_0, ...]]``

        Raises:
            IndexError: if a non-blank line lacks a second field or a field
                has no ``:`` separator.
        """
        print("---------- load labels ----------")
        id_list = [[], []]
        # Explicit UTF-8, matching how the event JSON files are opened, so the
        # result does not depend on the platform's default encoding.
        with open(weibo_label_path, "r", encoding="utf-8") as f:
            for line in f:
                # A blank line (e.g. a trailing empty line) carries no entry;
                # parsing it would raise IndexError.
                if not line.strip():
                    continue
                # Drop the line terminator first: when the label is the last
                # field on the line it would otherwise be read as "0\n" and
                # the entry would be silently discarded.
                fields = line.rstrip("\r\n").split('\t')[:2]
                ID_label = [item.split(':')[1] for item in fields]
                if ID_label[1] == '0':
                    id_list[0].append(ID_label[0])
                elif ID_label[1] == '1':
                    id_list[1].append(ID_label[0])
        return id_list

    def preprocess(self, text):
        """Convert text to simplified Chinese and cut it into a word list.

        Args:
            text: raw post text.

        Returns:
            List of word strings produced by jieba, without whitespace tokens.
        """
        text = Converter("zh-hans").convert(text)
        # Joining on spaces and splitting again discards whitespace-only
        # tokens (and splits any token that itself contains whitespace).
        return " ".join(jieba.cut(text)).split()

    def _load_events(self, ids, label):
        """Build one ``(feature dict, label)`` pair per event ID in ``ids``."""
        filenames = [os.path.join(self.path, "%s.json" % ID) for ID in ids]
        events = []
        for filename in tqdm(filenames, ncols=50):
            with open(filename, "r", encoding="utf-8") as f:
                data = json.load(f)
            word_list = []
            for item in data[:1]:  # only consider the source weibo
                word_list.extend(self.preprocess(item["text"]))
            # Bag-of-words presence features: every word seen maps to True.
            events.append(({word: True for word in word_list}, label))
        return events

    def load_data(self):
        """Load all events and build ``self.train`` and ``self.test``.

        Each event becomes ``(features, label)`` where ``features`` maps every
        word of the source post to ``True`` and ``label`` is 0 (non-rumor) or
        1 (rumor). Each class is shuffled and split 80% / 20% separately, so
        both sets keep the class proportions; the combined sets are then
        shuffled again. The global ``random`` generator is seeded with 20 so
        the split is reproducible.
        """
        id_list = self.load_label(self.weibo_label_path)
        nonrumor = self._load_events(id_list[0], 0)
        rumor = self._load_events(id_list[1], 1)
        random.seed(20)
        random.shuffle(rumor)
        random.shuffle(nonrumor)
        rumor_cut = int(len(rumor) * 0.8)        # num of rumor train set
        nonrumor_cut = int(len(nonrumor) * 0.8)  # num of nonrumor train set
        self.train = rumor[:rumor_cut] + nonrumor[:nonrumor_cut]
        self.test = rumor[rumor_cut:] + nonrumor[nonrumor_cut:]
        random.shuffle(self.train)
        random.shuffle(self.test)

    def NaiveBayes(self, feature_num):
        """Train an NLTK Naive Bayes classifier and print its evaluation.

        Requires ``load_data()`` to have been called first, since it reads
        ``self.train`` and ``self.test``. Prints the test accuracy, a
        per-class classification report, and the most informative features.

        Args:
            feature_num: how many most-informative features to print.
        """
        classifier = nltk.NaiveBayesClassifier.train(self.train)
        print("Naive Bayes Accuracy:", nltk.classify.accuracy(classifier, self.test))
        y_test = [label for _, label in self.test]
        y_pred = [classifier.classify(feats) for feats, _ in self.test]
        print(classification_report(y_test, y_pred, target_names=["non-rumor", "rumor"], digits=3))
        classifier.show_most_informative_features(feature_num)    # print most informative features
        
        
# Run the whole pipeline: load and split the dataset, then train the
# classifier and print its report with the 50 most informative features.
nb = NB(path=path, weibo_label_path=weibo_label_path)
nb.load_data()
nb.NaiveBayes(feature_num=50)

  0%|                    | 0/2351 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache


---------- load labels ----------


Loading model cost 0.715 seconds.
Prefix dict has been built successfully.
100%|█████████| 2351/2351 [00:29<00:00, 80.63it/s]
100%|█████████| 2313/2313 [00:47<00:00, 49.11it/s]


Naive Bayes Accuracy: 0.880085653104925
             precision    recall  f1-score   support

  non-rumor      0.935     0.820     0.873       471
      rumor      0.837     0.942     0.886       463

avg / total      0.886     0.880     0.880       934

Most Informative Features
                      抽烟 = True                1 : 0      =     26.8 : 1.0
                     袁裕来 = True                1 : 0      =     24.7 : 1.0
                      超速 = True                1 : 0      =     23.4 : 1.0
                      微信 = True                1 : 0      =     22.7 : 1.0
                      号牌 = True                1 : 0      =     22.7 : 1.0
                      遮挡 = True                1 : 0      =     22.7 : 1.0
                      天一 = True                1 : 0      =     22.0 : 1.0
                      小孩 = True                1 : 0      =     21.7 : 1.0
                       ① = True                0 : 1      =     21.2 : 1.0
                       ② = True             