In [1]:
import json
# Directory holding the SemEval-2017 Task 8 files, relative to this notebook.
dataSetPath = '../dataset/semeval2017-task8/'
# Decode explicitly as UTF-8 (the JSON interchange encoding) instead of relying
# on the platform's locale default, which differs between machines.
with open(dataSetPath + 'trainSet.json', 'r', encoding='utf-8') as f:
    content = f.read()
# `content` (the raw text) is kept as well: the preprocessing cell re-parses it
# to get a fresh dict after `trainSet` has been rebound.
trainSet = json.loads(content)

In [15]:
import torch
from utils import *
from  sklearn.feature_extraction.text import TfidfVectorizer
# Re-parse the raw JSON so this cell always starts from the dict form of the
# dataset, even when it is re-run after `trainSet` was rebound to a list below.
trainSet = json.loads(content)
posts = trainSet['posts']
stanceLabels = trainSet['stanceTag']
label2IndexRumor = trainSet['label2IndexRumor']
label2IndexStance = trainSet['label2IndexStance']

threads = []     # per thread: list of post texts (replaced by a TF-IDF tensor further down)
rumorTags = []   # per thread: LongTensor holding the single rumor class index
stanceTags = []  # per thread: LongTensor of stance class indices, one per kept post
for threadId in trainSet['threadIds']:
    structure = trainSet['structures'][threadId]
    # Keep each known post id once, in the order flattenStructure yields them.
    postIds = [postId for postId in dict.fromkeys(flattenStructure(structure))
               if postId in posts]
    # Sort posts by time. Sorting the ids directly (instead of building a
    # {time: id} dict) keeps posts that share a timestamp; the dict silently
    # dropped all but one of them. The sort is stable, so ties keep their
    # flattened order.
    # NOTE(review): the key is the stringified timestamp, so the order is
    # lexicographic - confirm that matches chronological order for this
    # dataset's time format.
    postIds.sort(key=lambda postId: str(posts[postId]['time']))
    # Only posts that carry a stance label become part of the sample.
    labelledIds = [postId for postId in postIds if postId in stanceLabels]
    threads.append([posts[postId]['text'] for postId in labelledIds])
    rumorTags.append(torch.LongTensor([label2IndexRumor[trainSet['rumorTag'][threadId]]]))
    stanceTags.append(torch.LongTensor([label2IndexStance[stanceLabels[postId]]
                                        for postId in labelledIds]))

# Fit TF-IDF on every post of every thread, in thread order.
corpus = [text for thread in threads for text in thread]
cropus = corpus  # original (misspelled) name, kept in case later cells use it
tfidf_vec = TfidfVectorizer()
tfidf_matrix = tfidf_vec.fit_transform(corpus).toarray()

# Rows of tfidf_matrix follow the corpus order, so each thread owns one
# contiguous slice. Slicing the matrix avoids building a tensor from a Python
# list of ndarrays and gives empty threads a well-formed (0, vocab) tensor.
offset = 0
for i, thread in enumerate(threads):
    end = offset + len(thread)
    threads[i] = torch.tensor(tfidf_matrix[offset:end], dtype=torch.float32)
    offset = end

# From here on `trainSet` is a list of (tfidf, rumorTag, stanceTags) samples and
# no longer the raw dict; use label2IndexRumor / label2IndexStance for the maps.
trainSet = list(zip(threads, rumorTags, stanceTags))


In [3]:
import torch
import torch.nn as nn
from torch.nn.modules.module import Module

class MTUS(nn.Module):
    """Multi-task rumor / stance model with a shared GRU.

    Each task has its own linear "embedding" that projects a TF-IDF vector to a
    sentence embedding and its own linear output head; the GRU between them and
    its learnable initial hidden state are shared by both tasks.

    Args:
        embeddingDim: size of the sentence embedding fed to the GRU.
        hiddenDim: GRU hidden size (per direction).
        inputDim: size of one input (TF-IDF) vector.
        numGRULayer: number of stacked GRU layers.
        numRumorClass: number of rumor classes (one prediction per thread).
        numStanceClass: number of stance classes (one prediction per post).
        batchSize: batch dimension baked into the initial hidden state and the
            reshapes below.
        bidirectional: whether the shared GRU is bidirectional.
    """
    def __init__(self, embeddingDim: int, hiddenDim: int, inputDim: int, 
                 numGRULayer: int, numRumorClass: int, numStanceClass: int,
                 batchSize: int = 1, bidirectional: bool = False):
        super().__init__()  # initialise nn.Module bookkeeping
        # Preferred device (CUDA when available). Only recorded here; the
        # module is actually moved by set_device().
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.embeddingDim = embeddingDim
        self.hiddenDim = hiddenDim
        self.batchSize = batchSize
        self.bidirectional = bidirectional
        self.D = 2 if self.bidirectional else 1  # number of GRU directions

        # Task-specific linear layers turning a TF-IDF vector into a sentence embedding.
        self.embeddingRumor = nn.Linear(inputDim, embeddingDim)
        self.embeddingStance = nn.Linear(inputDim, embeddingDim)

        # GRU shared by both tasks, with a learnable initial hidden state.
        self.shareGRU = nn.GRU(embeddingDim, hiddenDim, numGRULayer, bidirectional = self.bidirectional)
        self.h0 = nn.Parameter(torch.randn((self.D * numGRULayer, self.batchSize, hiddenDim)))

        # Output heads mapping GRU outputs to class scores.
        self.vRumor = nn.Linear(self.D * hiddenDim, numRumorClass)
        # The stance head sees each post's output concatenated with the first post's.
        self.vStance = nn.Linear(self.D * 2 * hiddenDim, numStanceClass)

    def forwardRumor(self, sentences: torch.Tensor):
        """Score one thread for the rumor task.

        Args:
            sentences: input vectors of the thread; the first dimension is the
                thread length (number of posts).

        Returns:
            Unnormalised class scores of shape (batchSize, numRumorClass).
        """
        seqLen = sentences.size()[0]
        # Reshape to the (seqLen, batch, feature) layout the GRU expects.
        embeddings = self.embeddingRumor(sentences).view(seqLen, self.batchSize, self.embeddingDim)
        # The GRU's final hidden state is discarded: threads are independent,
        # so every thread starts again from the learned h0.
        gruOut, _ = self.shareGRU(embeddings, self.h0)
        # GRU output at the last time step: (batchSize, D * hiddenDim).
        lastOut = gruOut[-1].view(self.batchSize, self.D * self.hiddenDim)
        return self.vRumor(lastOut)

    def forwardStance(self, sentences: torch.Tensor):
        """Score every post of one thread for the stance task.

        Args:
            sentences: input vectors of the thread; the first dimension is the
                thread length (number of posts).

        Returns:
            Unnormalised class scores of shape (seqLen, batchSize, numStanceClass).
        """
        seqLen = sentences.size()[0]
        # Use the stance-specific embedding. This previously went through
        # embeddingRumor, which left embeddingStance unused and untrained.
        embeddings = self.embeddingStance(sentences).view(seqLen, self.batchSize, self.embeddingDim)
        gruOut, _ = self.shareGRU(embeddings, self.h0)  # (seqLen, batch, D * hiddenDim)
        # Broadcast the first post's output along the sequence so every post is
        # classified together with the source post's representation.
        firstOut = gruOut[:1].expand_as(gruOut)
        return self.vStance(torch.cat([firstOut, gruOut], dim=2))

    def set_device(self, device: torch.device) -> torch.nn.Module:
        """Move the module to `device`, record it in `self.device`, and return the module."""
        _model = self.to(device)
        _model.device = device
        return _model

    def save(self, path: str):
        """Write the model's parameters (its state_dict) to `path`."""
        torch.save(self.state_dict(), path)

    def load(self, path: str):
        """Load parameters written by `save` from `path` into this model."""
        # Deserialise onto the CPU so a checkpoint written on a GPU machine also
        # loads without one; load_state_dict then copies the values into the
        # existing parameters on whatever device they live on.
        state = torch.load(path, map_location='cpu')
        self.load_state_dict(state)
    
# Size the model from the data rather than from hard-coded numbers: the input
# width is the TF-IDF vocabulary size (also correct when the first thread is
# empty) and the class counts come from the dataset's label maps.
mtus = MTUS(embeddingDim=100, hiddenDim=100, inputDim=tfidf_matrix.shape[1],
            numGRULayer=2, numRumorClass=len(label2IndexRumor),
            numStanceClass=len(label2IndexStance), bidirectional=True)

In [4]:
from torch import optim
from torch.nn.functional import softmax
# Summed (not averaged) cross-entropy, used for both the rumor and stance heads.
loss_func = torch.nn.CrossEntropyLoss(reduction='sum')
# Adagrad over every parameter of the model.
optimizer = optim.Adagrad(params=mtus.parameters(), lr=0.001)

In [5]:
# Rumor-branch smoke test on the first thread.
x, rumorTag, stanceTag = threads[0], rumorTags[0], stanceTags[0]
print(x.size(), rumorTag.size(), stanceTag.size())

# One forward/backward pass; the optimizer step is deliberately left disabled
# so the check does not modify the model.
optimizer.zero_grad()
p1 = mtus.forwardRumor(x)
print(p1.size(), rumorTag)
loss = loss_func(p1, rumorTag)
loss.backward()
#optimizer.step()

# Class probabilities, then the index of the most probable class.
p1 = softmax(p1, dim=1)
print(p1)
p1 = p1.argmax(dim=1).item()
print(p1)

torch.Size([5, 6586]) torch.Size([1]) torch.Size([5])
torch.Size([1, 3]) tensor([0])
tensor([[0.3227, 0.3302, 0.3471]], grad_fn=<SoftmaxBackward>)
2


In [48]:
#stance test
# Stance-branch smoke test on the thread (`x`, `stanceTag`) picked in the rumor test.
optimizer.zero_grad()
# Flatten (seqLen, batch, classes) to (seqLen * batch, classes) for the loss.
# The class count comes from `label2IndexStance`: `trainSet` has been rebound
# to a list of samples by the preprocessing cell, so indexing it with
# 'label2IndexStance' raises TypeError on a fresh top-to-bottom run.
p2 = mtus.forwardStance(x).view(-1, len(label2IndexStance))
print(p2.size(), stanceTag)
loss = loss_func(p2, stanceTag)
loss.backward()
#optimizer.step()
# Per-post class probabilities, then the most probable class for each post.
p2 = softmax(p2, dim=1)
print(p2)
p2 = p2.argmax(dim=1).tolist()
print(p2)

torch.Size([5, 4]) tensor([0, 3, 3, 3, 0])
tensor([[0.5297, 0.0167, 0.0130, 0.4406],
        [0.2648, 0.0127, 0.0096, 0.7128],
        [0.1875, 0.0102, 0.0079, 0.7943],
        [0.2673, 0.0109, 0.0089, 0.7128],
        [0.6245, 0.0136, 0.0116, 0.3503]], grad_fn=<SoftmaxBackward>)
[0, 3, 3, 3, 0]


In [59]:
from sklearn.metrics import f1_score
# Toy check of micro-averaged F1 over three classes.
y_true = [0, 1, 2, 2, 1, 0]
y_pred = [0, 0, 0, 1, 0, 1]
micro_f1 = f1_score(y_true, y_pred, labels=[0, 1, 2], average='micro')
print(micro_f1)

0.16666666666666666
