In [1]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
import nltk
import spacy
from time import time
from tqdm import tqdm
from pprint import pprint
import pandas as pd
import random
import pickle
import _pickle as cPickle
import os
import torch

In [2]:
# Shared spaCy pipeline: the small English model loaded with the parser,
# tagger and NER components disabled.
nlp = spacy.load(
    'en_core_web_sm',
    disable=['parser', 'tagger', 'ner'],
)

In [3]:
def make_data(body_data_path, stance_data_path, training=True):
    """Read the body and stance CSV files and run every text through ``nlp``.

    Args:
        body_data_path: CSV whose two columns are read as ``BodyID`` and ``Body``.
        stance_data_path: CSV whose columns are read as ``Headline``, ``BodyID``
            and, when ``training`` is true, ``Stance``.
        training: whether the stance file carries a label column.

    Returns:
        A list with one tuple per stance row:
        ``(body_id, nlp(headline), nlp(body), stance)`` when ``training`` is
        true, otherwise ``(body_id, nlp(headline), nlp(body))``.
        Rows that reference the same body id share a single parsed body object.

    Raises:
        KeyError: if a stance row references a body id missing from the body file.
    """
    body_data = pd.read_csv(body_data_path, names=['BodyID', 'Body'])
    # The first row is dropped - presumably the CSV header line, which is read
    # as data because explicit column names are supplied.
    body_data = list(zip(body_data.BodyID.tolist(), body_data.Body.tolist()))[1:]

    stance_columns = ['Headline', 'BodyID']
    if training:
        stance_columns.append('Stance')
    stance_frame = pd.read_csv(stance_data_path, names=stance_columns)
    stance_data = list(zip(*(stance_frame[col].tolist() for col in stance_columns)))[1:]

    id_text = dict(body_data)

    # Many stance rows point at the same body. Parsing each body once (on first
    # use) instead of once per row avoids repeating the most expensive step.
    parsed_bodies = {}
    data = []
    for row in tqdm(stance_data):
        headline = nlp(row[0])
        _id = row[1]
        if _id not in parsed_bodies:
            parsed_bodies[_id] = nlp(id_text[_id])
        body = parsed_bodies[_id]
        if training:
            data.append((_id, headline, body, row[2]))
        else:
            data.append((_id, headline, body))
    return data

def dev_split(data, split=0.8, seed=None):
    """Randomly partition ``data`` into a training part and a dev part.

    The input sequence is left untouched; a shuffled copy is split instead.

    Args:
        data: sequence of examples.
        split: fraction of the examples assigned to the training part.
        seed: optional seed for a private random generator, making the split
            reproducible. When ``None`` the module-level ``random`` state is used.

    Returns:
        ``(train_data, dev_data)`` - two lists that together contain every
        element of ``data`` exactly once.
    """
    shuffled = list(data)  # copy so the caller's list keeps its order
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)
    n = len(shuffled)
    cut = int(split * n)
    train_data = shuffled[:cut]
    dev_data = shuffled[cut:]
    assert len(train_data) + len(dev_data) == n
    return train_data, dev_data

In [4]:
def refuting_features(hl, body):
    """Flag which refuting cue words occur among the headline lemmas.

    Returns a list of 0/1 integers, one per cue word and in cue-word order.
    ``body`` is accepted but not used by this function.
    """
    _refuting_words = [
        'fake',
        'fraud',
        'hoax',
        'false',
        'deny', 'denies',
        # 'refute',
        'not',
        'despite',
        'nope',
        'doubt', 'doubts',
        'bogus',
        'debunk',
        'pranks',
        'retract'
    ]
    headline_lemmas = get_tokenized_lemmas(hl)
    indicators = []
    for cue in _refuting_words:
        if cue in headline_lemmas:
            indicators.append(1)
        else:
            indicators.append(0)
    return indicators

def get_tokenized_lemmas(s):
    """Collect the ``lemma_`` attribute of each token in ``s`` into a list."""
    lemmas = []
    for token in s:
        lemmas.append(token.lemma_)
    return lemmas

def binary_co_occurence(hl, body):
    """Count headline tokens whose text occurs in the body text.

    Returns ``[count, early_count]``: ``count`` checks the whole body text,
    ``early_count`` checks only its first 255 characters. Matching is a
    substring test on the raw text.
    """
    full_text = body.text
    lead_text = full_text[:255]
    hits = 0
    early_hits = 0
    for token in hl:
        word = token.text
        hits += int(word in full_text)
        early_hits += int(word in lead_text)
    return [hits, early_hits]

def binary_co_occurence_stops(hl, body):
    """Count non-stopword headline tokens whose text occurs in the body text.

    Punctuation tokens and stopwords in the headline are ignored.

    Returns ``[count, early_count]``: ``count`` checks the whole body text,
    ``early_count`` checks only its first 255 characters (the same window used
    by ``binary_co_occurence``). Matching is a substring test on the raw text.
    """
    full_text = body.text
    lead_text = full_text[:255]
    bin_count = 0
    bin_count_early = 0
    for token in hl:
        if token.is_punct or token.is_stop:
            continue
        if token.text in full_text:
            bin_count += 1
        # The early count must be tested against the lead window on its own;
        # otherwise it is just a copy of bin_count.
        if token.text in lead_text:
            bin_count_early += 1
    return [bin_count, bin_count_early]

def polarity_features(headline, body):
    """Return ``[headline_polarity, body_polarity]``.

    Each polarity is the parity (0 or 1) of the number of lemmas in the text
    that appear in the refuting-word list.
    """
    _refuting_words = [
        'fake',
        'fraud',
        'hoax',
        'false',
        'deny', 'denies',
        'not',
        'despite',
        'nope',
        'doubt', 'doubts',
        'bogus',
        'debunk',
        'pranks',
        'retract'
    ]

    def calculate_polarity(text):
        # Count refuting lemmas, then keep only whether the count is odd.
        refuting_hits = 0
        for lemma in get_tokenized_lemmas(text):
            if lemma in _refuting_words:
                refuting_hits += 1
        return refuting_hits % 2

    return [calculate_polarity(headline), calculate_polarity(body)]

def chargrams(hl, body, size):
    """Count headline character n-grams of length ``size`` found in the body.

    Every window of ``size`` consecutive characters of ``hl.text`` is tested
    for substring membership in ``body.text``; repeated windows count each time.
    Returns 0 when the headline is shorter than ``size``.
    """
    headline_text = hl.text
    body_text = body.text
    chargram_hits = 0
    for i in range(len(headline_text) - size + 1):
        # The window must come from the headline: a slice taken from the body
        # is trivially contained in the body.
        if headline_text[i:i + size] in body_text:
            chargram_hits += 1
    return chargram_hits

def ngrams(hl, body, size):
    """Count headline token n-grams of length ``size`` found in the body text.

    Each slice of ``size`` consecutive tokens of ``hl.doc`` is tested, via its
    ``.text``, for substring membership in ``body.text``.
    """
    headline_doc = hl.doc
    last_start = len(headline_doc) - size
    hits = 0
    start = 0
    while start <= last_start:
        window = headline_doc[start:start + size]
        if window.text in body.text:
            hits += 1
        start += 1
    return hits

def clean(text):
    """Concatenate the ``lower_`` form of every token in ``text``, with no separator."""
    pieces = []
    for token in text:
        pieces.append(token.lower_)
    return ''.join(pieces)

def jaccard_sim(hline, body):
    """Jaccard similarity between the lowercase token sets of headline and body.

    Args:
        hline: iterable of tokens exposing ``lower_``.
        body: iterable of tokens exposing ``lower_``.

    Returns:
        ``|H & B| / |H | B|`` as a float, or ``0.0`` when the body has no tokens.
    """
    # Build the sets from whole tokens. Turning a concatenated string into a
    # set would compare individual characters instead of words.
    hset = {token.lower_ for token in hline}
    bset = {token.lower_ for token in body}
    if not bset:
        return 0.0
    return len(hset & bset) / len(hset | bset)


def make_features(X):
    """Build the feature matrix for a sequence of ``(id, headline, body)`` triples.

    Per example the row holds, in order: the Jaccard similarity, token n-gram
    hits for sizes 2-6, character n-gram hits for sizes 2, 4, 8 and 16, then
    the polarity, refuting-word and both co-occurrence feature lists.

    Returns a NumPy array with one row per example.
    """
    rows = []
    for (_, h, b) in tqdm(X):
        row = [jaccard_sim(h, b)]
        for token_n in (2, 3, 4, 5, 6):
            row.append(ngrams(h, b, token_n))
        for char_n in (2, 4, 8, 16):
            row.append(chargrams(h, b, char_n))
        row += polarity_features(h, b)
        row += refuting_features(h, b)
        row += binary_co_occurence(h, b)
        row += binary_co_occurence_stops(h, b)
        rows.append(row)
    return np.array(rows)

In [5]:
def save_or_load_file(path, obj=None, load=True):
    """Unpickle an object from ``path`` or pickle ``obj`` to ``path``.

    Args:
        path: file to read from or write to.
        obj: object to store; required (non-``None``) when ``load`` is false.
        load: when true, read and return the stored object; otherwise write ``obj``.

    Returns:
        The unpickled object when loading, ``None`` when saving.

    Raises:
        AssertionError: if asked to save while ``obj`` is ``None``.
    """
    if load:
        # NOTE: unpickling can execute arbitrary code - only load trusted files.
        with open(path, 'rb') as f:
            return cPickle.load(f)

    # Validate before opening: mode 'wb' truncates the file immediately, so a
    # late check would wipe an existing cache on a bad call.
    assert obj is not None, "obj must be provided when load is False"
    with open(path, 'wb') as f:
        cPickle.dump(obj, f)
        
train_path = './data/train/train_data.pkl'
dev_path = './data/train/dev_data.pkl'
test_path = './data/test/test_data.pkl'

# Train/dev: reuse the pickled split only when both halves exist; otherwise
# rebuild from the CSV files and store both halves.
if not (os.path.exists(train_path) and os.path.exists(dev_path)):
    train_data, dev_data = dev_split(
        make_data('./data/train/train_bodies.csv', './data/train/train_stances.csv')
    )
    save_or_load_file(train_path, obj=train_data, load=False)
    save_or_load_file(dev_path, obj=dev_data, load=False)
else:
    train_data = save_or_load_file(train_path)
    dev_data = save_or_load_file(dev_path)

# Test set: same load-or-build pattern, built without a stance column.
if not os.path.exists(test_path):
    test_data = make_data(
        './data/test/test_bodies.csv',
        './data/test/test_stances_unlabeled.csv',
        training=False,
    )
    save_or_load_file(test_path, obj=test_data, load=False)
else:
    test_data = save_or_load_file(test_path)

print("Training data:", len(train_data))
print("Dev data:", len(dev_data))
print("Test data:", len(test_data))

Training data: 39977
Dev data: 9995
Test data: 25413


In [None]:
# Split every training tuple into its inputs (all fields but the last) and its
# label (the last field).
train_X = [example[:-1] for example in train_data]
train_y = [example[-1] for example in train_data]

In [None]:
# Training features: compute and store them on the first run, load the stored
# array afterwards.
if not os.path.exists('./data/train/train_features.npz'):
    feat_train_X = make_features(train_X)
    np.savez('./data/train/train_features.npz', feat_train_X)
else:
    feat_train_X = np.load('./data/train/train_features.npz')['arr_0']

# Dev inputs are all fields but the last; the label is the last field.
dev_X = [example[:-1] for example in dev_data]
dev_y = [example[-1] for example in dev_data]

# Dev features follow the same compute-or-load pattern.
if not os.path.exists('./data/train/dev_features.npz'):
    feat_dev_X = make_features(dev_X)
    np.savez('./data/train/dev_features.npz', feat_dev_X)
else:
    feat_dev_X = np.load('./data/train/dev_features.npz')['arr_0']

In [None]:
class FakeNet(torch.nn.Module):
    """Small feed-forward classifier: 31 input features -> 24 -> 12 -> 6 -> 4 scores.

    ``forward`` returns raw, unnormalised class scores (logits), so the output
    can be passed directly to ``torch.nn.functional.cross_entropy``.
    """

    def __init__(self):
        super(FakeNet, self).__init__()
        self.input = torch.nn.Linear(31, 24)
        self.hidden = torch.nn.Linear(24, 12)
        self.another_hidden = torch.nn.Linear(12, 6)
        self.last = torch.nn.Linear(6, 4)

    def forward(self, X):
        """Compute class logits for a batch.

        Args:
            X: a tensor, or a NumPy array / nested list that is converted to a
               float tensor first. The first layer expects 31 values per row.

        Returns:
            Tensor with 4 logits per input row.
        """
        if isinstance(X, (np.ndarray, list)):
            X = torch.autograd.Variable(torch.FloatTensor(X))
        # NOTE(review): no activation follows the first linear layer, so it
        # composes linearly with `hidden` - confirm this is intended.
        inp_layer = self.input(X)
        hidden1 = torch.nn.functional.relu(self.hidden(inp_layer))
        hidden2 = torch.nn.functional.relu(self.another_hidden(hidden1))
        # No ReLU on the output: clamping logits at zero makes negative scores
        # unrepresentable and blocks their gradient.
        return self.last(hidden2)

# Stance label -> class index.
mapping = {
    'agree': 0,
    'discuss': 1,
    'disagree': 2,
    'unrelated': 3,
}
# Class index -> stance label (inverse of `mapping`).
revmapping = dict((index, label) for label, index in mapping.items())

def train(m, X, y, epochs=200, bs=100, lr=1e-3):
    """Train ``m`` with Adam and class-weighted cross-entropy on mini-batches.

    Args:
        m: a ``torch.nn.Module`` that maps a batch slice of ``X`` to 4 class scores.
        X: sliceable batch of inputs, passed to ``m`` in chunks of ``bs``.
        y: targets aligned with ``X``. A list of stance strings is converted to
           class indices through ``mapping``; anything else is passed to the
           loss unchanged.
        epochs: number of passes over the data.
        bs: batch size. A trailing partial batch is skipped.
        lr: Adam learning rate.

    Prints ``epoch`` and the summed batch loss after every epoch.
    """
    opt = torch.optim.Adam(m.parameters(), lr=lr)
    # Per-class loss weights, indexed like `mapping`.
    # Presumably inverse class frequencies - TODO confirm.
    wts = torch.FloatTensor([1/138, 1/348, 1/36, 1/1478])
    for epoch in range(epochs):
        tloss = 0.0
        for i in range(0, len(y)-bs+1, bs):
            opt.zero_grad()
            pred = m(X[i:i+bs])  # prediction on batch features
            yb = y[i:i+bs]  # batch target
            if isinstance(yb, list):
                # str labels to class indices
                yb = torch.LongTensor([mapping[label] for label in yb])
            loss = torch.nn.functional.cross_entropy(pred, yb, weight=wts)
            loss.backward()
            opt.step()
            # The loss is a 0-dim tensor; .item() extracts the Python float
            # (indexing it with [0] raises on current PyTorch).
            tloss += loss.item()
        print(epoch, tloss)
def predict(model, X):
    """Return the index of the highest-scoring class for every row of ``X``.

    ``model`` is called on ``X`` and is expected to yield one row of class
    scores per example; the argmax is taken along dimension 1.
    """
    # Pass dim explicitly: relying on the implicit dimension is deprecated and
    # emits a warning on every call.
    log_probs = torch.nn.functional.log_softmax(model(X), dim=1)
    return torch.max(log_probs, 1)[1]

In [None]:
# Classifier used by the cells below. The commented-out lines are alternative
# estimators; activate exactly one assignment to switch models.
#clf = MultinomialNB()
#clf = LogisticRegression()
clf = SVC(kernel='rbf')
#clf = GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=True)

In [None]:
# Fit the selected classifier on the training feature matrix and labels.
clf.fit(feat_train_X, train_y)
print(feat_train_X.shape)
# Disabled alternative: train FakeNet instead and map its predicted class
# indices back to stance strings through revmapping.
#f = FakeNet()
#train(f, feat_train_X, train_y)
#dev_pred = list(map(lambda x: revmapping[x], list(predict(f, feat_dev_X).data.numpy())))
# Predict one label per row of the dev feature matrix.
dev_pred = clf.predict(feat_dev_X)
print(len(dev_pred))

In [None]:
# Stance labels; their positions index the confusion-matrix rows and columns.
LABELS = [
    'agree',
    'discuss',
    'disagree',
    'unrelated',
]
LABELS_RELATED = ['unrelated', 'related']
# The first three labels are the ones counted as "related".
RELATED = LABELS[:3]

def score_submission(gold_labels, test_labels):
    """Score predicted stances against gold stances and build a confusion matrix.

    Per pair: +0.25 for an exact match, a further +0.50 when that matching
    label is not 'unrelated', and +0.25 whenever both labels are in RELATED.

    Returns:
        ``(score, cm)`` where ``cm[g][t]`` counts pairs whose gold label has
        index ``g`` and predicted label has index ``t`` in LABELS.
    """
    cm = [[0] * 4 for _ in range(4)]
    score = 0.0
    for gold, predicted in zip(gold_labels, test_labels):
        if gold == predicted:
            score += 0.25
            if gold != 'unrelated':
                score += 0.50
        both_related = gold in RELATED and predicted in RELATED
        if both_related:
            score += 0.25
        gold_idx = LABELS.index(gold)
        pred_idx = LABELS.index(predicted)
        cm[gold_idx][pred_idx] += 1
    return score, cm

def print_confusion_matrix(cm):
    """Print ``cm`` as a bordered text table.

    The header lists LABELS as column titles; each row of ``cm`` is printed
    with the label at the same index as its row title, in centred
    11-character cells.
    """
    row_format = "|{:^11}|{:^11}|{:^11}|{:^11}|{:^11}|"
    header = row_format.format('', *LABELS)
    rule = "-" * len(header)
    lines = [rule, header, rule]
    for i, row in enumerate(cm):
        lines.append(row_format.format(LABELS[i], *row))
        lines.append(rule)
    print('\n'.join(lines))

In [None]:
score, total = 0.0, 0.0
n = len(dev_data)
#predicted = list(clf.predict(feat_dev_X))
actual = dev_y
# Score the dev predictions, and score gold against itself to get the maximum
# attainable value for this dev set.
score, cm = score_submission(actual, dev_pred)
best_score = score_submission(actual, actual)[0]
print_confusion_matrix(cm)
print(''.join([
    "Score: ", str(score),
    " out of ", str(best_score),
    "\t(", str(round(score*100/best_score, 4)), "%)",
]))

In [None]:
# Scoring the gold labels against themselves puts every count on the diagonal,
# so the printed matrix shows how many dev examples carry each label.
print("True distribution")
best_score, cmtrue = score_submission(actual, actual)
print_confusion_matrix(cmtrue)