In [2]:
# read data from raw data
import json

# Locations of the raw dataset files (relative paths, resolved against the
# notebook's working directory).
# Loaded with read_json; its texts become the label-1 training examples.
TRAIN_RUMOUR_JSON_PATH = 'data/raw/train.json'
# Loaded with read_txt (one text per line); its texts become the label-0 training examples.
TRAIN_NONRUMOUR_TXT_PATH = 'data/raw/non-rumor.txt'
# Loaded with read_json; each entry is read for both its 'text' and its 'label'.
DEV_JSON_PATH = 'data/raw/dev.json'
# Loaded with read_json; only the 'text' of each entry is read.
TEST_JSON_PATH = 'data/raw/test-unlabelled.json'

def read_json(path):
    """Load and return the JSON document stored at ``path``.

    The file is decoded as UTF-8 explicitly (the same encoding ``read_txt``
    uses) so that non-ASCII text is read identically on every platform
    instead of depending on the locale's default encoding.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The decoded JSON value (a ``dict`` for the dataset files used here).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

def read_txt(path):
    """Read a UTF-8 text file into the same nested-dict layout as the JSON data.

    Every line of the file (including its trailing newline, if any) becomes
    one entry ``{'text': line}``, keyed by its zero-based line index.

    Args:
        path: Path of the text file to read.

    Returns:
        dict mapping ``int`` line index to ``{'text': <line>}``.
    """
    with open(path, 'r', encoding='utf-8') as txt_file:
        return {index: {'text': line} for index, line in enumerate(txt_file)}
                
# Load every raw split. The JSON files come back as parsed by read_json; the
# plain-text non-rumour file is wrapped by read_txt into {index: {'text': line}}.
train_rumour_json_data = read_json(TRAIN_RUMOUR_JSON_PATH)
train_nonrumour_txt_data = read_txt(TRAIN_NONRUMOUR_TXT_PATH)
dev_json_data, test_json_data = (
    read_json(json_path) for json_path in (DEV_JSON_PATH, TEST_JSON_PATH)
)

In [3]:
# convert dictionary and txt to list with same format
def get_data_list(data):
    """Flatten a ``{key: {'text': ...}}`` mapping into a list of cleaned texts.

    For each entry, in the mapping's iteration order, newlines in the text
    are replaced by a single space and non-breaking spaces (U+00A0) are
    removed.

    Args:
        data: Mapping whose values are dicts containing a ``'text'`` string.

    Returns:
        list of str, one per entry of ``data``.
    """
    return [
        entry['text'].replace('\n', ' ').replace('\u00A0', '')
        for entry in data.values()
    ]

# Turn each loaded split into a flat list of texts, processed in the same
# order as before: rumour train, non-rumour train, dev, test.
(train_rumour_list,
 train_nonrumour_list,
 dev_list,
 test_list) = map(
    get_data_list,
    (train_rumour_json_data, train_nonrumour_txt_data, dev_json_data, test_json_data),
)

In [4]:
# get train list and train label
# Training texts: all rumour texts first, followed by all non-rumour texts.
train_list = [*train_rumour_list, *train_nonrumour_list]
# Parallel labels: 1 for every rumour text, 0 for every non-rumour text.
train_label = [1] * len(train_rumour_list) + [0] * len(train_nonrumour_list)

In [5]:
# get dev label
def get_dev_label(dev_json_data):
    """Collect the ``'label'`` of every dev entry, in the mapping's iteration order.

    Args:
        dev_json_data: Mapping whose values are dicts containing a ``'label'``.

    Returns:
        list of the label values, one per entry.
    """
    return [entry['label'] for entry in dev_json_data.values()]

# Gold labels of the dev split. Both this list and dev_list are produced by
# iterating dev_json_data in order, so index i of each refers to the same entry.
dev_label = get_dev_label(dev_json_data)

In [8]:
# data preprocessing (punctuation/special punctuation/single letter/lemma)
import re
import nltk
import string
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

def clean_text(text, tt, lemmatizer, punc, chinesepunc, letter, stopword_list):
    """Normalise one raw text into a space-separated string of lemmas.

    Steps, applied to every token of the lower-cased text:

    1. strip every character that is not an ASCII letter or a space;
    2. drop the token if nothing but whitespace is left, or if it is found
       (via ``in``) in ``punc``, ``chinesepunc``, ``stopword_list`` or
       ``letter``;
    3. POS-tag the surviving token on its own and lemmatise it with the
       matching WordNet part of speech (noun when the tag maps to none).

    Args:
        text: Raw input string.
        tt: Tokenizer object exposing ``tokenize(str) -> iterable of str``.
        lemmatizer: Object exposing ``lemmatize(word, pos)``.
        punc: Container of punctuation to exclude. Note that when this is a
            ``str`` the ``in`` test is a substring test.
        chinesepunc: Container of additional punctuation tokens to exclude.
        letter: Container of single letters to exclude.
        stopword_list: Container of stop words to exclude.

    Returns:
        The kept lemmas joined by single spaces (``""`` if none survive).
    """
    def get_wordnet_pos(tag):
        """Map a POS tag to the WordNet POS constant by its first letter, else None."""
        if tag.startswith('J'):
            return wordnet.ADJ
        if tag.startswith('V'):
            return wordnet.VERB
        if tag.startswith('N'):
            return wordnet.NOUN
        if tag.startswith('R'):
            return wordnet.ADV
        return None

    def lemma_process(word_token, lemmatizer):
        """Lemmatise a single, already lower-cased token."""
        # The token is tagged in isolation (a one-element sentence), so the
        # tagger gets no surrounding context.
        _, pos_tag = nltk.pos_tag([word_token])[0]
        wordnet_pos = get_wordnet_pos(pos_tag)
        if wordnet_pos is None:
            # Tags outside adjective/verb/noun/adverb are lemmatised as nouns.
            wordnet_pos = wordnet.NOUN
        return lemmatizer.lemmatize(word_token, wordnet_pos)

    new_word_list = []
    for word in tt.tokenize(text.lower()):
        word_only_string = re.sub(r'([^a-zA-Z ]+?)', '', word)
        # Tokens made only of digits/punctuation/emoji collapse to '' here.
        # Skip them (and whitespace-only leftovers) explicitly rather than
        # relying on '' happening to be a substring of a str-typed `punc`.
        if not word_only_string.strip():
            continue
        # NOTE(review): the exclusion lists are compared against the stripped
        # token, so a stop word containing an apostrophe would no longer match
        # its entry in stopword_list — confirm this is intended.
        if (word_only_string in punc
                or word_only_string in chinesepunc
                or word_only_string in stopword_list
                or word_only_string in letter):
            continue
        new_word_list.append(lemma_process(word_only_string, lemmatizer))

    return " ".join(new_word_list)
    
# Shared resources handed to clean_text for every document.
tt = TweetTokenizer()
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

# Exclusion lists: ASCII punctuation, extra typographic/invisible characters,
# the 26 single lower-case letters, and NLTK's English stop words.
punc = string.punctuation
chinesepunc = [
    '“', '”', '‘', '’', '–', '—', '...', '‐', '\u200b', '.\u2009.\u2009.',
    '\uf0b7', '\uf020', '\u200e', '\u2066', '\u2069', '..', '. .', '…',
]
letter = list(string.ascii_lowercase)
stopword_list = set(stopwords.words('english'))

# Run the same cleaning over each split, keeping document order.
preprocessed_train_list = [
    clean_text(raw_text, tt, lemmatizer, punc, chinesepunc, letter, stopword_list)
    for raw_text in train_list
]
preprocessed_dev_list = [
    clean_text(raw_text, tt, lemmatizer, punc, chinesepunc, letter, stopword_list)
    for raw_text in dev_list
]
preprocessed_test_list = [
    clean_text(raw_text, tt, lemmatizer, punc, chinesepunc, letter, stopword_list)
    for raw_text in test_list
]

In [9]:
# CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Unigram bag-of-words counts. The vocabulary is learnt from the training
# texts only; dev and test are projected onto that same vocabulary.
cv = CountVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 1),
    tokenizer=tt.tokenize,
)
train_count_vector = cv.fit_transform(preprocessed_train_list)
dev_count_vector, test_count_vector = (
    cv.transform(documents)
    for documents in (preprocessed_dev_list, preprocessed_test_list)
)

In [10]:
# CountVectorizer + TfidfTransformer = TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw counts as TF-IDF. The transformer is fitted on the
# training counts only and then applied unchanged to dev and test.
tf = TfidfTransformer()
train_tf_vector = tf.fit_transform(train_count_vector)
dev_tf_vector, test_tf_vector = (
    tf.transform(count_matrix)
    for count_matrix in (dev_count_vector, test_count_vector)
)

In [11]:
# save dev label and test label
def _save_labels(prediction_list, path, prefix):
    """Write predictions to ``path`` as ``{"<prefix>-<i>": {"label": value}}`` JSON.

    Shared implementation behind ``save_dev_label`` and ``save_test_label``,
    which previously duplicated this logic and differed only in the prefix.
    """
    labels_by_id = {
        prefix + '-' + str(index): {'label': label}
        for index, label in enumerate(prediction_list)
    }
    # Serialise before opening the file so a non-serialisable label raises
    # without creating or truncating the output file.
    json_str = json.dumps(labels_by_id)
    with open(path, 'w') as json_file:
        json_file.write(json_str)


def save_dev_label(prediction_list, path):
    """Save dev-set predictions as JSON keyed ``dev-0``, ``dev-1``, ...

    Args:
        prediction_list: Iterable of JSON-serialisable labels, in dev order.
        path: Destination file path; its directory must already exist.

    Raises:
        TypeError: If a label is not JSON-serialisable.
        OSError: If the file cannot be opened for writing.
    """
    _save_labels(prediction_list, path, 'dev')


def save_test_label(prediction_list, path):
    """Save test-set predictions as JSON keyed ``test-0``, ``test-1``, ...

    Args:
        prediction_list: Iterable of JSON-serialisable labels, in test order.
        path: Destination file path; its directory must already exist.

    Raises:
        TypeError: If a label is not JSON-serialisable.
        OSError: If the file cannot be opened for writing.
    """
    _save_labels(prediction_list, path, 'test')

In [16]:
# svm
from sklearn import svm
   
# Output files for the linear-kernel run.
# NOTE(review): these names are rebound by other model cells, so the value in
# effect depends on which cell ran last. The save helpers open the path with
# open(path, 'w') and do not create directories, so the folders must exist.
DEV_LABEL_TFIDF_PREDICTION_PATH = 'data/dev/svm/prediction-tfidf-linear.json'
TEST_LABEL_TFIDF_PREDICTION_PATH = 'data/test/svm/test-output-tfidf-linear.json'
# Alternative output files for an RBF-kernel run (use together with kernel='rbf' below).
#DEV_LABEL_TFIDF_PREDICTION_PATH = 'data/dev/svm/prediction-tfidf-rbf.json'
#TEST_LABEL_TFIDF_PREDICTION_PATH = 'data/test/svm/test-output-tfidf-rbf.json'

# Fit a support-vector classifier on the TF-IDF training features.
svm_clf = svm.SVC(kernel='linear') #kernel='rbf'
svm_clf.fit(train_tf_vector, train_label)
# Predict the dev split and store the labels as plain Python values for JSON.
prediction_dev = svm_clf.predict(dev_tf_vector)
prediction_list = prediction_dev.tolist()
save_dev_label(prediction_list, DEV_LABEL_TFIDF_PREDICTION_PATH)
# Same for the unlabelled test split.
prediction_test = svm_clf.predict(test_tf_vector)
prediction_list_test = prediction_test.tolist()
save_test_label(prediction_list_test, TEST_LABEL_TFIDF_PREDICTION_PATH)

In [17]:
# Vocabulary size = number of feature columns in the TF-IDF training matrix.
# The MLP cell passes this value as the input dimension of its first layer.
vocab_size = train_tf_vector.shape[1]
print(vocab_size)

58945


In [41]:
# mlp
from keras.models import Sequential
from keras import layers
import numpy

# Output files for this MLP configuration (2 hidden layers, first one 150 units).
# The *_COUNT_* paths are defined for a count-vector variant; this cell only
# trains on the TF-IDF features and writes to the *_TFIDF_* paths.
DEV_LABEL_COUNT_PREDICTION_PATH = 'data/dev/mlp/prediction-count-2layers150.json'
TEST_LABEL_COUNT_PREDICTION_PATH = 'data/test/mlp/test-output-count-2layers150.json'
DEV_LABEL_TFIDF_PREDICTION_PATH = 'data/dev/mlp/prediction-tfidf-2layers150.json'
TEST_LABEL_TFIDF_PREDICTION_PATH = 'data/test/mlp/test-output-tfidf-2layers150.json'
# Output files used for other experiment variants.
#DEV_LABEL_TFIDF_PREDICTION_PATH = 'data/dev/mlp/prediction-tfidf-1layer20.json'
#TEST_LABEL_TFIDF_PREDICTION_PATH = 'data/test/mlp/test-output-tfidf-1layer20.json'
#DEV_LABEL_TFIDF_PREDICTION_PATH = 'data/dev/mlp/prediction-tfidf-1layer150.json'
#TEST_LABEL_TFIDF_PREDICTION_PATH = 'data/test/mlp/test-output-tfidf-1layer150.json'
#TEST_LABEL_TFIDF_PREDICTION_PATH = 'data/test/mlp/test-output-tfidf-nopre.json'

# Feed-forward network over the bag-of-words features:
# vocab_size -> 150 (relu) -> 20 (relu) -> dropout 0.2 -> 1 (sigmoid).
model = Sequential(name="feedforward-bow-input")
model.add(layers.Dense(150, input_dim=vocab_size, activation='relu'))
# Only the first layer needs an input size; later layers take it from the
# preceding layer's output.
model.add(layers.Dense(20, activation='relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
# Labels are passed as explicit arrays instead of relying on Keras to coerce
# plain Python lists.
model.fit(
    train_tf_vector,
    numpy.asarray(train_label),
    epochs=5,
    verbose=True,
    validation_data=(dev_tf_vector, numpy.asarray(dev_label)),
    batch_size=14,
)
# Sequential.predict_classes is no longer available in current Keras/TensorFlow.
# For a single sigmoid output it was equivalent to thresholding the predicted
# probability at 0.5, which is done explicitly here.
prediction_dev = (model.predict(dev_tf_vector) > 0.5).astype('int32')
prediction_test = (model.predict(test_tf_vector) > 0.5).astype('int32')
# Flatten the (n, 1) arrays into plain lists of Python ints for JSON output.
prediction_list = prediction_dev.ravel().tolist()
save_dev_label(prediction_list, DEV_LABEL_TFIDF_PREDICTION_PATH)
prediction_list_test = prediction_test.ravel().tolist()
save_test_label(prediction_list_test, TEST_LABEL_TFIDF_PREDICTION_PATH)

Model: "feedforward-bow-input"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_44 (Dense)             (None, 150)               8841900   
_________________________________________________________________
dense_45 (Dense)             (None, 20)                3020      
_________________________________________________________________
dropout_22 (Dropout)         (None, 20)                0         
_________________________________________________________________
dense_46 (Dense)             (None, 1)                 21        
Total params: 8,844,941
Trainable params: 8,844,941
Non-trainable params: 0
_________________________________________________________________
Train on 4252 samples, validate on 100 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [37]:
# lr
from sklearn.linear_model import LogisticRegression

# Output files for the logistic-regression run.
# NOTE(review): these names are rebound by other model cells, and the save
# helpers do not create directories, so the folders must already exist.
DEV_LABEL_TFIDF_PREDICTION_PATH = 'data/final/dev/lr/prediction-tfidf.json'
TEST_LABEL_TFIDF_PREDICTION_PATH = 'data/final/test/lr/test-output-tfidf.json'

# Fit logistic regression (library defaults) on the TF-IDF training features.
lr = LogisticRegression()
lr.fit(train_tf_vector, train_label)
# Predict the dev split and store the labels as plain Python values for JSON.
prediction_dev = lr.predict(dev_tf_vector)
prediction_list = prediction_dev.tolist()
save_dev_label(prediction_list, DEV_LABEL_TFIDF_PREDICTION_PATH)
# Same for the unlabelled test split.
prediction_test = lr.predict(test_tf_vector)
prediction_list_test = prediction_test.tolist()
save_test_label(prediction_list_test, TEST_LABEL_TFIDF_PREDICTION_PATH)

In [39]:
# nb
from sklearn.naive_bayes import MultinomialNB

# Output files for the naive-Bayes run.
# NOTE(review): these names are rebound by other model cells, and the save
# helpers do not create directories, so the folders must already exist.
DEV_LABEL_TFIDF_PREDICTION_PATH = 'data/final/dev/nb/prediction-tfidf.json'
TEST_LABEL_TFIDF_PREDICTION_PATH = 'data/final/test/nb/test-output-tfidf.json'

# Fit multinomial naive Bayes (library defaults) on the TF-IDF training features.
nb = MultinomialNB()
nb.fit(train_tf_vector, train_label)
# Predict the dev split and store the labels as plain Python values for JSON.
prediction_dev = nb.predict(dev_tf_vector)
prediction_list = prediction_dev.tolist()
save_dev_label(prediction_list, DEV_LABEL_TFIDF_PREDICTION_PATH)
# Same for the unlabelled test split.
prediction_test = nb.predict(test_tf_vector)
prediction_list_test = prediction_test.tolist()
save_test_label(prediction_list_test, TEST_LABEL_TFIDF_PREDICTION_PATH)