In [8]:
import json
import torch
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from torch.nn.functional import softmax
from model import *
from defaultParameter import*
from utils import *

with open('../dataset/semeval2017-task8/' + 'trainSet.json', 'r') as f:
    content = f.read()
trainSet = json.loads(content)
threads = []
rumorTags = []
stanceTags = []
for threadId in trainSet['threadIds']:
    thread = []
    stanceTag = []
    structure = trainSet['structures'][threadId]
    ids = flattenStructure(structure)
    time2Id = {}
    for id in ids:
        if id in trainSet['posts']:
            time2Id[str(trainSet['posts'][id]['time'])] = id
    # post按照时间先后排序
    time2Id = sorted(time2Id.items(), key=lambda d: d[0])
    for (time, id) in time2Id:
        if id in trainSet['posts'] and id in trainSet['stanceTag']:
            thread.append(trainSet['posts'][id]['text'])
            stanceTag.append(trainSet['label2IndexStance'][trainSet['stanceTag'][id]])
    threads.append(thread)
    rumorTags.append(torch.LongTensor([trainSet['label2IndexRumor'][trainSet['rumorTag'][threadId]]]))
    stanceTags.append(torch.LongTensor(stanceTag))
cropus = []
for thread in threads:
    for text in thread:
        cropus.append(text)
tfidf_vec = TfidfVectorizer()
tfidf_matrix = tfidf_vec.fit_transform(cropus) # 使用train set训练一个tfidf向量化转换器
label2IndexRumor = trainSet['label2IndexRumor']
label2IndexStance = trainSet['label2IndexStance']

device = torch.device('cpu')

model = MTUS(embeddingDim=100, hiddenDim=100, inputDim=tfidf_matrix.shape[1], numGRULayer=2,
             numRumorClass=len(label2IndexRumor), numStanceClass=len(label2IndexStance))
model.load_state_dict(torch.load('./model.pt', map_location=device))
model = model.set_device(device)
loss_func = torch.nn.CrossEntropyLoss(reduction='sum').to(device)

In [32]:
with open('../dataset/semeval2017-task8/' + 'testSet.json', 'r') as f:
    content = f.read()
testSet = json.loads(content)
threads = []
rumorTags = []
stanceTags = []
for threadId in testSet['threadIds']:
    thread = []
    stanceTag = []
    structure = testSet['structures'][threadId]
    ids = flattenStructure(structure)
    time2Id = {}
    for id in ids:
        if id in testSet['posts']:
            time2Id[str(testSet['posts'][id]['time'])] = id
    # post按照时间先后排序
    time2Id = sorted(time2Id.items(), key=lambda d: d[0])
    for (time, id) in time2Id:
        if id in testSet['posts'] and id in testSet['stanceTag']:
            thread.append(testSet['posts'][id]['text'])
            stanceTag.append(testSet['label2IndexStance'][testSet['stanceTag'][id]])
    threads.append(thread)
    rumorTags.append(torch.LongTensor([testSet['label2IndexRumor'][testSet['rumorTag'][threadId]]]))
    stanceTags.append(torch.LongTensor(stanceTag))
cropus = []
for thread in threads:
    for text in thread:
        cropus.append(text)
tfidf_matrix = tfidf_vec.transform(cropus).toarray()


In [33]:
counter = 0
for i in range(len(threads)):
    tfidf = []
    for _ in threads[i]:
        tfidf.append(tfidf_matrix[counter])
        counter += 1
    threads[i] = torch.Tensor(tfidf)

label2IndexRumor = testSet['label2IndexRumor']
label2IndexStance = testSet['label2IndexStance']
testSet = []
for i in range(len(threads)):
    testSet.append((threads[i], rumorTags[i], stanceTags[i]))

In [34]:
model.eval()
rumorTrue = []
stanceTrue = []
rumorPre = []
stancePre = []
totalLossRumor = 0.
totalLossStance = 0.
for i in range(len(testSet)):
    x = testSet[i][0].to(device)
    rumorTag = testSet[i][1].to(device)
    stanceTag = testSet[i][2].to(device)
    rumorTrue += testSet[i][1].tolist()
    stanceTrue += testSet[i][2].tolist()

    pRumor = model.forwardRumor(x)
    pStance = model.forwardStance(x).view(-1, len(label2IndexStance))
    loss = loss_func(pRumor, rumorTag)
    totalLossRumor += loss
    loss = loss_func(pStance, stanceTag)
    totalLossStance += loss
    pRumor = softmax(pRumor, 1)
    rumorPre += pRumor.max(dim=1)[1].tolist()
    pStance = softmax(pStance, 1)
    stancePre += pStance.max(dim=1)[1].tolist()

microF1Rumor = f1_score(rumorTrue, rumorPre, labels=[0,1,2], average='micro')
macroF1Rumor = f1_score(rumorTrue, rumorPre, labels=[0,1,2], average='macro')
microF1Stance = f1_score(stanceTrue, stancePre, labels=[0,1,2], average='micro')
macroF1Stance = f1_score(stanceTrue, stancePre, labels=[0,1,2], average='macro')
accuracyRumor = (np.array(rumorTrue) == np.array(rumorPre)).sum() / len(rumorPre)
accuracyStance = (np.array(stanceTrue) == np.array(stancePre)).sum() / len(stancePre)
print('rumor detection:')
print('average loss: {:f}, accuracy: {:f}, micro-f1: {:f}, macro-f1: {:f}'.format(
    totalLossRumor / len(testSet), accuracyRumor, microF1Rumor, macroF1Rumor
))
print('stance analyze:')
print('average loss: {:f}, accuracy: {:f}, micro-f1: {:f}, macro-f1: {:f}'.format(
    totalLossStance / len(testSet), accuracyStance, microF1Stance, macroF1Stance
))

rumor detection:
average loss: 1.472029, accuracy: 0.321429, micro-f1: 0.321429, macro-f1: 0.271912
stance analyze:
average loss: 164.627884, accuracy: 0.584130, micro-f1: 0.145390, macro-f1: 0.131475
