In [1]:
import gensim
from gensim.models.keyedvectors import KeyedVectors
from data import *
from utils import *

# Relative location of the directory holding the pretrained GloVe embeddings.
GLOVE_PATH = '../glove/'

# File names of the Twitter GloVe vectors, one per embedding dimensionality,
# ordered from the smallest (25d) to the largest (200d).
WORD2VEC_DIMS = ['glove.twitter.27B.{}d.gensim.txt'.format(dim)
                 for dim in (25, 50, 100, 200)]

In [2]:
# Load the training and test splits through the project's reader helpers.
# These functions are not defined in this notebook; presumably they come from
# the star-imports of `data` / `utils` at the top — confirm which module owns each.
# Every reader returns a pair, unpacked here into two variables.

# Training split: post ids + stance tags, thread ids + rumor tags, posts + structures.
trainPostIds, trainStanceTag = readPostIdAndTag()
trainThreadIds, trainRumorTag = readThreadIdAndTag()
trainPosts, structures = readPostsStruct()
# Test split: same three readers in their `Test` variants.
testPostIds, testStanceTag = readTestPostIdAndTag()
testThreadIds, testRumorTag = readTestThreadIdAndTag()
testPosts, testStructures = readTestPostsStruct()

In [3]:
# Load the first file listed in WORD2VEC_DIMS (the 25d Twitter GloVe vectors) from
# GLOVE_PATH as a text-format (binary=False) word2vec file.
# In the cells shown here the result is only used for vocabulary membership tests.
glove25d = KeyedVectors.load_word2vec_format(GLOVE_PATH + WORD2VEC_DIMS[0], binary=False)

In [4]:
# Bundle the loaded ids, tags and structures of each split into one dict.
# Both dicts share the same key layout so they can be handled uniformly later.
trainSet = dict(
    postIds=trainPostIds,
    stanceTag=trainStanceTag,
    threadIds=trainThreadIds,
    rumorTag=trainRumorTag,
    structures=structures,
)
testSet = dict(
    postIds=testPostIds,
    stanceTag=testStanceTag,
    threadIds=testThreadIds,
    rumorTag=testRumorTag,
    structures=testStructures,
)

In [5]:
def _maskUnknownWords(posts, vocab, placeholder='<pad>'):
    """Replace out-of-vocabulary words in every post's text, in place.

    Each post's 'text' is passed through ``fixText`` (a project helper that
    yields the post's words), every word missing from ``vocab`` is swapped
    for ``placeholder``, and the words are joined back with single spaces.

    Args:
        posts: mapping from post key to a dict that holds a 'text' entry.
        vocab: any container supporting ``in`` (here the loaded GloVe vectors).
        placeholder: token written in place of words not found in ``vocab``.

    Returns:
        The same ``posts`` object that was passed in (mutated, not copied).
    """
    for postKey in posts:
        post = posts[postKey]
        words = fixText(post['text'])
        post['text'] = ' '.join(
            word if word in vocab else placeholder for word in words)
    return posts


def _attachPostsAndLabels(dataset, posts):
    """Store the processed posts and the shared label maps on ``dataset``.

    The label maps are the module-level ``label2Index*`` / ``index2Label*``
    objects; they are stored by reference, not copied.
    """
    dataset['posts'] = posts
    dataset['label2IndexRumor'] = label2IndexRumor
    dataset['label2IndexStance'] = label2IndexStance
    dataset['index2LabelRumor'] = index2LabelRumor
    dataset['index2LabelStance'] = index2LabelStance


# NOTE(review): the posts are rewritten in place, so re-running this cell feeds
# already-processed text through fixText a second time. Restart the kernel and
# run all cells rather than re-executing this cell on its own.
_attachPostsAndLabels(trainSet, _maskUnknownWords(trainPosts, glove25d))
_attachPostsAndLabels(testSet, _maskUnknownWords(testPosts, glove25d))

In [6]:
import json
# Write each split to its own JSON file in the current working directory.
# NOTE(review): json.dumps turns non-string dict keys into strings; if any of the
# stored maps are keyed by ints, readers of these files must convert back — confirm.
for outputPath, dataset in (('trainSet.json', trainSet),
                            ('testSet.json', testSet)):
    with open(outputPath, 'w') as outFile:
        outFile.write(json.dumps(dataset))