In [1]:
# Load the raw train, dev and unlabelled test sets from their JSON files
import json

# Locations of the raw JSON inputs. The paths are relative, so they resolve
# against the directory the notebook kernel is running in.
TRAIN_RUMOUR_JSON_PATH = 'data/raw/train.json'  # training split
DEV_JSON_PATH = 'data/raw/dev.json'  # development split
TEST_JSON_PATH = 'data/raw/test-unlabelled.json'  # test split (unlabelled, per the file name)

def read_json(path):
    """Read the JSON file at ``path`` and return its parsed contents.

    The file is decoded explicitly as UTF-8 instead of with the platform's
    default encoding, so non-ASCII characters in the text are read the same
    way on every operating system.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The deserialised JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as json_file:
        data = json.loads(json_file.read())
    return data
  
# Parse the three raw files in train / dev / test order.
train_rumour_json_data, dev_json_data, test_json_data = [
    read_json(raw_path)
    for raw_path in (TRAIN_RUMOUR_JSON_PATH, DEV_JSON_PATH, TEST_JSON_PATH)
]

In [2]:
# Convert each split's dictionary into a list of single-line text strings
def get_data_list(data):
    """Extract the ``'text'`` field of every record as a single-line string.

    Args:
        data: Mapping whose values are mappings containing a ``'text'``
            string. The keys are not used; records are visited in the
            mapping's iteration order.

    Returns:
        list[str]: One string per record, with newlines and non-breaking
        spaces each replaced by an ordinary space.
    """
    data_list = []
    for record in data.values():
        # Both '\n' and the non-breaking space (U+00A0) separate words, so each
        # becomes a plain space. Deleting the NBSP outright would glue the
        # words on either side of it into a single token.
        text = record['text'].replace('\n', ' ').replace('\u00A0', ' ')
        data_list.append(text)
    return data_list

# Flatten each split's records into a list of texts (train, dev, test order).
train_rumour_list, dev_list, test_list = map(
    get_data_list,
    (train_rumour_json_data, dev_json_data, test_json_data),
)

In [3]:
# get train list and train label
# Every training text receives the label 1, so the label list is simply a
# run of ones with the same length as the copied text list.
train_list = list(train_rumour_list)
train_label = [1] * len(train_list)

In [4]:
# Data preprocessing: drop punctuation, special punctuation, stop words and single letters, then lemmatize
import nltk
import string
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

def clean_text(text, tt, lemmatizer, punc, chinesepunc, letter, stopword_list):
    """Normalise one document into a space-joined string of lemmas.

    The text is lower-cased and tokenised, the full token sequence is
    POS-tagged in a single call, unwanted tokens are dropped, and each
    remaining token is lemmatised with the WordNet POS derived from its tag.

    Args:
        text: Raw document string.
        tt: Tokenizer exposing ``tokenize(str)``.
        lemmatizer: Object exposing ``lemmatize(word, pos)``.
        punc: Container of punctuation to drop. Membership is tested with
            ``in``, so when this is a ``str`` a token is dropped if it is a
            substring of it.
        chinesepunc: Container of additional punctuation tokens to drop.
        letter: Container of single-letter tokens to drop.
        stopword_list: Container of stop words to drop.

    Returns:
        str: The surviving lemmas joined by single spaces; ``""`` when no
        token survives or the text yields no tokens.
    """
    def get_wordnet_pos(tag):
        """Map a POS tag to a WordNet POS by its first letter (noun by default)."""
        if tag.startswith('J'):
            return wordnet.ADJ
        if tag.startswith('V'):
            return wordnet.VERB
        if tag.startswith('R'):
            return wordnet.ADV
        # 'N...' tags and every unrecognised tag are lemmatised as nouns.
        return wordnet.NOUN

    tokens = list(tt.tokenize(text.lower()))
    if not tokens:
        # Nothing to tag or lemmatise.
        return ""

    # Tag the complete token sequence once, before any filtering, so the
    # tagger sees each word together with its neighbours. Tagging tokens one
    # at a time gives it no context and costs one tagger call per word.
    tagged_tokens = nltk.pos_tag(tokens)

    kept_lemmas = []
    for token, pos_tag in tagged_tokens:
        if (token in punc) or (token in chinesepunc) or (token in stopword_list) or (token in letter):
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token.lower(), get_wordnet_pos(pos_tag)))
    return " ".join(kept_lemmas)
    
# Tokenizer and lemmatizer instances shared by every clean_text call.
tt = TweetTokenizer()
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

# Filter collections handed to clean_text.
punc = string.punctuation
chinesepunc = [
    '“', '”', '‘', '’', '–', '—', '...', '‐', '\u200b', '.\u2009.\u2009.',
    '\uf0b7', '\uf020', '\u200e', '\u2066', '\u2069', '..', '. .', '…',
]
letter = list(string.ascii_lowercase)  # 'a' through 'z'
stopword_list = set(stopwords.words('english'))

# Clean every document of each split with the same resources.
preprocessed_train_list = [
    clean_text(doc, tt, lemmatizer, punc, chinesepunc, letter, stopword_list)
    for doc in train_list
]
preprocessed_dev_list = [
    clean_text(doc, tt, lemmatizer, punc, chinesepunc, letter, stopword_list)
    for doc in dev_list
]
preprocessed_test_list = [
    clean_text(doc, tt, lemmatizer, punc, chinesepunc, letter, stopword_list)
    for doc in test_list
]

In [5]:
# CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Unigram counts: the vectorizer is fitted on the training texts only and the
# fitted instance is then applied unchanged to the dev and test texts.
cv = CountVectorizer(
    tokenizer=tt.tokenize,
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 1),
)
train_count_vector = cv.fit_transform(preprocessed_train_list)
dev_count_vector, test_count_vector = [
    cv.transform(docs) for docs in (preprocessed_dev_list, preprocessed_test_list)
]

In [6]:
# CountVectorizer + TfidfTransformer = TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# TF-IDF weighting: fitted on the training counts only, then applied to the
# dev and test counts with the same fitted transformer.
tf = TfidfTransformer()
train_tf_vector = tf.fit_transform(train_count_vector)
dev_tf_vector, test_tf_vector = [
    tf.transform(counts) for counts in (dev_count_vector, test_count_vector)
]

In [7]:
# save dev label and test label
def _save_labels(prediction_list, path, prefix):
    """Write predictions to ``path`` as JSON keyed ``<prefix>-<index>``.

    Args:
        prediction_list: Iterable of JSON-serialisable labels; the position of
            each label (starting at 0) becomes part of its key.
        path: Destination file. Missing parent directories are created.
        prefix: Key prefix, e.g. ``'dev'`` produces ``'dev-0'``, ``'dev-1'``, ...

    Raises:
        TypeError: If a label is not JSON-serialisable (raised before the
            destination file is opened, so an existing file is left intact).
        OSError: If the directory or file cannot be created or written.
    """
    import os  # local import so this block does not rely on the notebook's import cell

    labels_by_id = {
        prefix + '-' + str(index): {'label': label}
        for index, label in enumerate(prediction_list)
    }
    # Serialise first: a serialisation error must not truncate an existing file.
    json_str = json.dumps(labels_by_id)

    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, 'w') as json_file:
        json_file.write(json_str)


def save_dev_label(prediction_list, path):
    """Save dev predictions to ``path`` as ``{"dev-<i>": {"label": <label>}}``.

    Args:
        prediction_list: Iterable of JSON-serialisable labels, in dev order.
        path: Destination JSON file.
    """
    _save_labels(prediction_list, path, 'dev')


def save_test_label(prediction_list, path):
    """Save test predictions to ``path`` as ``{"test-<i>": {"label": <label>}}``.

    Args:
        prediction_list: Iterable of JSON-serialisable labels, in test order.
        path: Destination JSON file.
    """
    _save_labels(prediction_list, path, 'test')

In [10]:
# one class svm
from sklearn.svm import OneClassSVM

# Output files for the predictions made from the TF-IDF features.
DEV_LABEL_TFIDF_PREDICTION_PATH = 'data/dev/oneclasssvm/prediction-tfidf.json'
TEST_LABEL_TFIDF_PREDICTION_PATH = 'data/test/oneclasssvm/test-output-tfidf.json'

# Fit the one-class model on the training TF-IDF matrix.
one_svm = OneClassSVM(gamma='scale', nu=0.01)
one_svm.fit(train_tf_vector, train_label)

# Dev split: a prediction of -1 is stored as 0, any other value is kept.
prediction_dev = one_svm.predict(dev_tf_vector)
prediction_list = prediction_dev.tolist()
change_prediction_list = [0 if label == (-1) else label for label in prediction_list]
save_dev_label(change_prediction_list, DEV_LABEL_TFIDF_PREDICTION_PATH)

# Test split: same remapping, written to the test output file.
prediction_test = one_svm.predict(test_tf_vector)
prediction_list_test = prediction_test.tolist()
change_prediction_list_test = [0 if label == (-1) else label for label in prediction_list_test]
save_test_label(change_prediction_list_test, TEST_LABEL_TFIDF_PREDICTION_PATH)