In [14]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
import nltk
import spacy
from time import time
from tqdm import tqdm
from pprint import pprint
import pandas as pd
import random
import pickle
import _pickle as cPickle
import os
import torch

from feature_engineering import refuting_features, polarity_features, hand_features, gen_or_load_feats
from feature_engineering import word_overlap_features
from utils.dataset import DataSet
from utils.generate_test_splits import kfold_split, get_stances_for_folds
from utils.score import report_score, LABELS, score_submission

In [15]:
def generate_features(X, name):
    h = [i[1] for i in X]
    b = [i[2] for i in X]
    X_overlap = gen_or_load_feats(word_overlap_features, h, b, "features/overlap."+name+".npy")
    X_refuting = gen_or_load_feats(refuting_features, h, b, "features/refuting."+name+".npy")
    X_polarity = gen_or_load_feats(polarity_features, h, b, "features/polarity."+name+".npy")
    X_hand = gen_or_load_feats(hand_features, h, b, "features/hand."+name+".npy")

    X = np.c_[X_hand, X_polarity, X_refuting, X_overlap]
    return X

def make_data(body_data_path, stance_data_path, training=True):
    body_data = pd.read_csv(body_data_path, names=['BodyID', 'Body'])
    body_data = list(zip(body_data.BodyID.tolist(), body_data.Body.tolist()))[1:]
    if training:
        stance_data = pd.read_csv(stance_data_path, names=['Headline', 'BodyID', 'Stance'])
        stance_data = list(zip(stance_data.Headline.tolist(), stance_data.BodyID.tolist(), stance_data.Stance.tolist()))[1:]
    else:
        stance_data = pd.read_csv(stance_data_path, names=['Headline', 'BodyID'])
        stance_data = list(zip(stance_data.Headline.tolist(), stance_data.BodyID.tolist()))[1:]
    id_text = {}
    data = []
    for _id, text in body_data:
        id_text[_id] = text
    for row in tqdm(stance_data):
        headline = row[0]
        _id = row[1]
        if training:
            stance = row[2]
            data.append((_id, headline, id_text[_id], stance))
        else:
            data.append((_id, headline, id_text[_id]))
    return data

def dev_split(data, split=0.8):
    n = len(data)
    random.shuffle(data)
    train_data = data[:int(split*n)]
    dev_data = data[int(split*n):]
    assert len(train_data)+len(dev_data)==n
    return train_data, dev_data

In [16]:
def save_or_load_file(path, obj=None, load=True):
    if load:
        with open(path, 'rb') as f:
            obj = cPickle.load(f)
        return obj
    else:
        with open(path, 'wb') as f:
            assert obj is not None
            cPickle.dump(obj, f)
        
train_path = '../data/train/train_data.pkl'
dev_path = '../data/train/dev_data.pkl'
test_path = '../data/test/test_data.pkl'

if os.path.exists(train_path) and os.path.exists(dev_path):
    train_data = save_or_load_file(train_path)
    dev_data = save_or_load_file(dev_path)
else:
    train_data, dev_data = dev_split(make_data('../data/train/train_bodies.csv', '../data/train/train_stances.csv'))
    save_or_load_file(train_path, train_data, False)
    save_or_load_file(dev_path, dev_data, False)

if os.path.exists(test_path):
    test_data = save_or_load_file(test_path)
else:
    test_data = make_data('../data/test/test_bodies.csv', '../data/test/test_stances_unlabeled.csv', False)
    save_or_load_file(test_path, test_data, False)

print("Training data:", len(train_data))
print("Dev data:", len(dev_data))
print("Test data:", len(test_data))

Training data: 39977
Dev data: 9995
Test data: 25413


In [22]:
train_X, train_y = [i[:-1] for i in train_data], [i[-1] for i in train_data]
feat_train_X = generate_features(train_X, 'train')
dev_X, dev_y = [i[:-1] for i in train_data], [i[-1] for i in train_data]
feat_dev_X = generate_features(dev_X, 'dev')

39977it [02:57, 224.68it/s]
39977it [00:10, 3893.30it/s]
39977it [03:03, 217.50it/s]
39977it [04:03, 164.11it/s]


In [20]:
class FakeNet(torch.nn.Module):
    def __init__(self):
        super(FakeNet, self).__init__()
        self.input = torch.nn.Linear(44, 32)
        self.hidden = torch.nn.Linear(32, 16)
        self.another_hidden = torch.nn.Linear(16, 8)
        #self.yet_another_hidden = torch.nn.Linear(16, 8)
        self.last = torch.nn.Linear(8, 4)

    def forward(self, X):
        if isinstance(X, np.ndarray) or isinstance(X, list):
            X = torch.autograd.Variable(torch.FloatTensor(X))
        inp_layer = self.input(X)
        hidden1 = torch.nn.functional.relu(self.hidden(inp_layer))
        hidden2 = torch.nn.functional.relu(self.another_hidden(hidden1))
        #hidden3 = torch.nn.functional.relu(self.yet_another_hidden(hidden2))
        output = torch.nn.functional.relu(self.last(hidden2))
        return output

    def fit(self, X, y, wts=None):
        mapping = {'agree': 0, 'disagree':1, 'discuss':2, 'unrelated':3}
        revmapping = {v:k for k, v in mapping.items()}

        opt = torch.optim.Adam(self.parameters(), lr=1e-3)
        if wts is not None and isinstance(wts, list):
            wts = torch.FloatTensor(wts)
        for epoch in range(20):
            bs = 100
            tloss = 0.0
            for i in range(0, len(y)-bs+1, bs):
                opt.zero_grad()
                pred = self.forward(X[i:i+bs]) #prediction on batch features
                yb = y[i:i+bs] # batch target
                if isinstance(yb, list):
                    yb = list(map(lambda x: mapping[x], yb)) # str labels to indices
                    yb = torch.autograd.Variable(torch.LongTensor(yb))
                if isinstance(yb, np.ndarray):
                    yb = torch.autograd.Variable(torch.LongTensor(yb))
                loss = torch.nn.functional.cross_entropy(pred, yb, weight=wts)
                tloss += loss.data[0]
                loss.backward()
                opt.step()
            print(epoch, "::", tloss)

    def predict(self, X):
        result = torch.max(torch.nn.functional.log_softmax(self.forward(X)), 1)[1]
        return list(result.data.numpy())

In [21]:
#clf = MultinomialNB()
#clf = LogisticRegression()
clf = SVC(kernel='rbf')
#clf = GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=True)

In [None]:
clf.fit(feat_train_X, train_y)
print(feat_train_X.shape)
#f = FakeNet()
#train(f, feat_train_X, train_y)
#dev_pred = list(map(lambda x: revmapping[x], list(predict(f, feat_dev_X).data.numpy())))
dev_pred = clf.predict(feat_dev_X)
print(len(dev_pred))

In [None]:
LABELS = ['agree', 'discuss', 'disagree', 'unrelated']
LABELS_RELATED = ['unrelated','related']
RELATED = LABELS[0:3]

def score_submission(gold_labels, test_labels):
    score = 0.0
    cm = [[0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0]]

    for i, (g, t) in enumerate(zip(gold_labels, test_labels)):
        g_stance, t_stance = g, t
        if g_stance == t_stance:
            score += 0.25
            if g_stance != 'unrelated':
                score += 0.50
        if g_stance in RELATED and t_stance in RELATED:
            score += 0.25

        cm[LABELS.index(g_stance)][LABELS.index(t_stance)] += 1
    return score, cm

def print_confusion_matrix(cm):
    lines = []
    header = "|{:^11}|{:^11}|{:^11}|{:^11}|{:^11}|".format('', *LABELS)
    line_len = len(header)
    lines.append("-"*line_len)
    lines.append(header)
    lines.append("-"*line_len)

    hit = 0
    total = 0
    for i, row in enumerate(cm):
        hit += row[i]
        total += sum(row)
        lines.append("|{:^11}|{:^11}|{:^11}|{:^11}|{:^11}|".format(LABELS[i],
                                                                   *row))
        lines.append("-"*line_len)
    print('\n'.join(lines))

In [None]:
score = 0.0
total = 0.0
n = len(dev_data)
#predicted = list(clf.predict(feat_dev_X))
actual = dev_y
score, cm = score_submission(actual, dev_pred)
best_score = score_submission(actual, actual)[0]
print_confusion_matrix(cm)
print("Score: " +str(score) + " out of " + str(best_score) + "\t("+str(round(score*100/best_score, 4)) + "%)")

In [None]:
print("True distribution")
best_score, cmtrue = score_submission(actual, actual)
print_confusion_matrix(cmtrue)