In [1]:
import numpy as np
import pandas as pd
import spacy
import networkx as nx
import score as sc
import time
from sklearn.metrics import confusion_matrix, pairwise, f1_score, precision_score
from scipy.spatial import distance
from preprocessing.utils import plot_confusion_matrix
import matplotlib.pyplot as plt
import nltk
from collections import Counter, defaultdict
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import itertools
import torch
import importlib
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer

  from numpy.core.umath_tests import inner1d


In [2]:
import sys
# Re-execute preprocessing.utils so edits to the module are picked up without
# restarting the kernel (the module is already in sys.modules from the first cell).
importlib.reload(sys.modules['preprocessing.utils'])
# Re-import the name: the binding created before the reload still points at the old function object.
from preprocessing.utils import plot_confusion_matrix

In [3]:
# neuralcoref: https://github.com/huggingface/neuralcoref
# NOTE: this NEEDS spaCy 2.0.12 to work! Downgrade with `pip install spacy==2.0.12`
import en_coref_md
# `nlp` is the plain spaCy pipeline; `coref` is the pipeline loaded from en_coref_md
# (its docs expose `._.coref_resolved`, which the body-processing cell relies on).
nlp = spacy.load('en_core_web_sm')
coref = en_coref_md.load()

In [4]:
# Word lists used by the negation / doubt / hedging helpers; they are compared
# against lower-cased lemmas, so every entry is lower case.
negating_words = {
    "n't", "not", "no",
    "never", "nobody", "non", "nope",
}
doubting_words = {
    'fake', 'fraud', 'hoax',
    'false', 'deny', 'denies',
    'despite', 'doubt',
    'bogus', 'debunk', 'prank',
    'retract', 'scam', "withdrawn",
    "misinformation",
}
hedging_words = {
    'allege', 'allegedly', 'apparently',
    'appear', 'claim', 'could',
    'evidently', 'largely', 'likely',
    'mainly', 'may', 'maybe', 'might',
    'mostly', 'perhaps', 'presumably',
    'probably', 'purport', 'purportedly',
    'reported', 'reportedly',
    'rumor', 'rumour', 'rumored', 'rumoured',
    'says', 'seem', 'somewhat',
    'unconfirmed',
}
# "Suspicious" vocabulary: anything that either doubts or hedges.
sus_words = doubting_words | hedging_words

In [5]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# One shared analyzer instance; get_sentiment() below calls its polarity_scores().
vader = SentimentIntensityAnalyzer()

def get_sentiment(sentence):
    """
    score one sentence with the shared VADER analyzer
    input: object with a .text attribute
    output: list [pos, neg, neu, compound] taken from vader.polarity_scores
    """
    scores = vader.polarity_scores(sentence.text)
    return [scores[key] for key in ("pos", "neg", "neu", "compound")]

def get_avg_sentiment(lst):
    """
    average the get_sentiment() vectors of several sentences
    input: iterable of sentences accepted by get_sentiment
    output: list with the column-wise mean of the per-sentence score vectors
    """
    per_sentence = [get_sentiment(item) for item in lst]
    column_means = np.mean(np.array(per_sentence), axis = 0)
    return list(column_means)

def get_diff_sentiment(a,b):
    """
    element-wise difference a - b of two sentiment vectors, returned as a list
    """
    left = np.array(a)
    right = np.array(b)
    return list(left - right)

In [6]:
def train_test_split(bodies, stances, split=0.8):
    """
    split the stance rows into train / validation sets *by article body*, so
    that no body contributes rows to both sets
    input:
      bodies  - DataFrame whose first column holds the body ID
      stances - DataFrame with a "Body ID" column
      split   - fraction of the bodies assigned to the training set
    output: (stances_tr, stances_val) - the rows of `stances` whose body fell
    into the training / validation part
    the shuffle uses numpy's global random state; seed it for a reproducible split
    """
    shuffled = bodies.values[np.random.permutation(len(bodies))]
    # Honour the requested fraction (this used to be hard-coded to 0.8).
    n_train = int(len(shuffled) * split)
    ids_tr = {row[0] for row in shuffled[:n_train]}
    ids_val = {row[0] for row in shuffled[n_train:]}
    body_ids = stances["Body ID"]
    stances_tr = stances.loc[body_ids.isin(ids_tr), :]
    stances_val = stances.loc[body_ids.isin(ids_val), :]
    return stances_tr, stances_val

In [7]:
# Load the stance rows; the path is relative to the notebook's working directory.
train_stances = pd.read_csv("fn_data/train_stances.csv")
print(train_stances.shape)
train_stances.head(10)

(49972, 3)


Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated
4,Spider burrowed through tourist's stomach and ...,1923,disagree
5,'Nasa Confirms Earth Will Experience 6 Days of...,154,agree
6,Accused Boston Marathon Bomber Severely Injure...,962,unrelated
7,Identity of ISIS terrorist known as 'Jihadi Jo...,2033,unrelated
8,Banksy 'Arrested & Real Identity Revealed' Is ...,1739,agree
9,British Aid Worker Confirmed Murdered By ISIS,882,unrelated


In [8]:
# Load the article bodies; the path is relative to the notebook's working directory.
train_bodies = pd.read_csv("fn_data/train_bodies.csv")
print(train_bodies.shape)
train_bodies.head()

(1683, 2)


Unnamed: 0,Body ID,articleBody
0,0,A small meteorite crashed into a wooded area i...
1,4,Last week we hinted at what was to come as Ebo...
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...
3,6,"Posting photos of a gun-toting child online, I..."
4,7,At least 25 suspected Boko Haram insurgents we...


In [9]:
def get_body(n):
    """
    return the articleBody text of the row in train_bodies whose "Body ID" equals n
    (.item() raises unless exactly one row matches)
    """
    matching_rows = train_bodies["Body ID"] == n
    return train_bodies.loc[matching_rows, "articleBody"].item()

In [10]:
def preprocess(text):
    """
    normalise raw text before parsing
    - single quotes next to a space / newline and all double quotes (straight or curly) become spaces
    - ':' and ';' become '. '
    - '...' becomes a space
    the substitutions are applied one after another in the order listed
    """
    substitutions = (
        ("' ", ' '),
        ("'\n", '\n'),
        (" '", ' '),
        ('"', ' '),
        ('“', ' '),
        ('”', ' '),
        (":", ". "),
        (";", ". "),
        ("...", " "),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    return text

In [11]:
def cosine_similarity(x,y):
    """
    cosine similarity of two vectors
    returns 0 when either vector contains only zeros (the cosine is undefined there)
    """
    for vec in (x, y):
        if not any(a != 0 for a in vec):
            return 0
    cos_distance = np.nan_to_num(distance.cosine(x, y))
    return 1 - cos_distance

In [12]:
def get_topics(doc):
    """
    get topics of a sentence
    input: spacy doc
    output: dictionary with nouns as the key, and the set of noun chunks that contain the noun as the value
    - keys are the lower-cased lemma of each noun chunk's root; the lemma itself is always part of its set
    - chunks whose root is 2 characters or shorter, or tagged NUM / SYM / PUNCT, are skipped
    special entry _vocab has the set of all tokens in the dict
    """
    subjs = {}
    for chunk in doc.noun_chunks:
        root = chunk.root
        if len(root.text) > 2 and root.pos_ not in ("NUM", "SYM", "PUNCT"):
            key = root.lemma_.lower()
            subjs.setdefault(key, {key}).add(chunk.text.lower())
    # Collect the vocabulary before "_vocab" is inserted, so it is not iterated as a topic.
    vocab = set()
    for phrases in subjs.values():
        for phrase in phrases:
            vocab.update(phrase.split(" "))
    subjs["_vocab"] = vocab
    # The original returned the undefined name `subjss`, raising NameError on every call.
    return subjs

In [24]:
def get_svos(sent):
    """
    bucket the tokens of a sentence into subjects, verbs and objects by dependency label
    input: Spacy processed sentence
    output: tuple (subjects, verbs, objects) of dicts; each maps the lower-cased lemma to
    the token itself (kept so the tree can be traversed later); a later token with the
    same lemma replaces an earlier one
    - verbs: tokens whose dependency label is ROOT
    - labels follow https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md
    """
    subject_deps = ("nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "compound")
    object_deps = ("dobj", "dative", "attr", "oprd", "pobj")
    subjects, verbs, objects = {}, {}, {}
    for tok in sent:
        label = tok.dep_
        if label == 'ROOT':
            bucket = verbs
        elif label in subject_deps:
            bucket = subjects
        elif label in object_deps:
            bucket = objects
        else:
            continue
        bucket[tok.lemma_.lower()] = tok
    return (subjects, verbs, objects)

In [14]:
def build_graph(doc):
    """
    build a NetworkX graph of the dependency tree
    input: spacy Doc
    output: networkx DiGraph with one (head lemma -> child lemma) edge per distinct pair;
    lemmas are lower-cased and tokens tagged SPACE are left out
    """
    ignored = ['SPACE']
    lemma_pairs = {
        (head.lemma_.lower(), child.lemma_.lower())
        for head in doc if head.pos_ not in ignored
        for child in head.children if child.pos_ not in ignored
    }
    return nx.DiGraph(list(lemma_pairs))

In [12]:
def get_edges(doc):
    """
    return list of edges
    each edge is a (head, child) pair of dicts with the keys "token" (lower-cased lemma),
    "dep" and "pos"; tokens tagged SPACE, PUNCT or SYM are skipped on either side
    """
    skipped = ('SPACE', 'PUNCT', 'SYM')

    def describe(tok):
        return {"token": tok.lemma_.lower(), "dep": tok.dep_, "pos": tok.pos_}

    return [
        (describe(head), describe(child))
        for head in doc if head.pos_ not in skipped
        for child in head.children if child.pos_ not in skipped
    ]

In [40]:
def get_summary(doc, subjects, n = 5):
    """
    pick summary sentences from a document
    input: doc with .sents, collection of subject lemmas, number of top sentences n
    output: list of sentences
    - a sentence scores the fraction of its tokens whose lower-cased lemma is a subject
      or a negating / doubting / hedging word; sentences of 4 tokens or fewer score 0
    - only sentences after the first one with a positive score are candidates
    - result = earliest candidate, followed by the n best-scoring candidates in document
      order (so the earliest candidate appears twice when it is also among the best)
    - if there is no candidate at all, the first sentence of the document is returned
    """
    def score_sentence(sent):
        # not very robust right now
        hits = 0
        length = 0
        for tok in sent:
            length += 1
            lemma = tok.lemma_.lower()
            if (lemma in subjects or lemma in negating_words
                    or lemma in doubting_words or lemma in hedging_words):
                hits += 1
        return hits/length if length > 4 else 0

    sentences = list(doc.sents)
    candidates = []
    for position, sent in enumerate(sentences):
        value = score_sentence(sent)
        if value > 0 and position > 0:
            candidates.append([position, sent, value])
    if not candidates:
        return [sentences[0]]
    # stable sort: equally scored sentences keep their document order
    best = sorted(candidates, key = lambda c: c[2], reverse = True)[:n]
    best.sort(key = lambda c: c[0])
    return [candidates[0][1]] + [c[1] for c in best]

In [13]:
def get_shortest_path_to_negating(graph, subjects):
    """
    get the shortest path from each subject to any negating, hedging or doubting word
    input: networkx graph, iterable of subject nodes
    returns: dictionary with subject as key, and 3-element list of path lengths
    [negating, hedging, doubting]; a length is the number of nodes on the path
    (len of the list returned by nx.shortest_path)
    - an entry is None when the subject has no path to any word of that group
    - if a subject does not exist in graph, the value will be [None, None, None]
    """
    def closest(source, targets):
        # Shortest node count from `source` to any target present in the graph, or None.
        best = None
        for word in targets:
            if word not in graph:
                continue
            try:
                path = nx.shortest_path(graph, source = source, target = word)
            except nx.NetworkXNoPath:
                # Both nodes exist, so "no path" is the only expected failure; anything
                # else (including KeyboardInterrupt) is a real error and must propagate.
                continue
            if best is None or len(path) < best:
                best = len(path)
        return best

    results = {}
    for s in subjects:
        if graph.has_node(s):
            results[s] = [
                closest(s, negating_words),
                closest(s, hedging_words),
                closest(s, doubting_words),
            ]
        else:
            results[s] = [None, None, None]
    return results

In [16]:
def root_distance(graph, root):
    """
    as implemented in the Emergent paper - return the shortest distance between the given root and any
    doubting or hedging words in the graph, or None if no such path exists
    input: networkx graph, root node (or None)
    returns: number of nodes on the shortest path (len of the list returned by
    nx.shortest_path), or None when root is None, is not a node of the graph, or
    reaches none of the words
    """
    # A root that is not a node can never have a path; bail out instead of
    # relying on an exception being swallowed.
    if root is None or root not in graph:
        return None
    min_dist = None
    for word in sus_words:
        if word not in graph:
            continue
        try:
            path = nx.shortest_path(graph, source = root, target = word)
        except nx.NetworkXNoPath:
            # Only "no path" is expected here; other errors must propagate.
            continue
        if min_dist is None or len(path) < min_dist:
            min_dist = len(path)
    return min_dist

In [17]:
def get_neg_ancestors(doc):
    """
    get the syntactic context of every negating, doubting and hedging word
    input: spacy Doc
    returns: 6-tuple (negating context, doubting context, hedging context,
    # negating words, # doubting words, # hedging words)
    - the context of a word is the set of lower-cased lemmas of its ancestors plus the
      other children of its head, keeping only tokens longer than 2 characters
    - a token is counted in the first group it matches (negating, then doubting, then hedging)
    """
    def context(token):
        related = {anc.lemma_.lower() for anc in token.ancestors if len(anc) > 2}
        related.update(
            child.lemma_.lower()
            for child in token.head.children
            if child.text != token.text and len(child) > 2
        )
        return related

    word_groups = (negating_words, doubting_words, hedging_words)
    contexts = [set(), set(), set()]
    counts = [0, 0, 0]
    for token in doc:
        lemma = token.lemma_.lower()
        for group, words in enumerate(word_groups):
            if lemma in words:
                # Accumulate into the group's OWN set. The hedging branch used to start
                # from the doubting set, mixing the two and dropping earlier hedging context.
                contexts[group] |= context(token)
                counts[group] += 1
                break
    return tuple(contexts + counts)

In [18]:
import math
def build_idf(body_data):
    """
    build an inverse-document-frequency table over the article bodies
    input: sequence of (body id, text) rows
    output: dict mapping lower-cased lemma -> log(#bodies / #bodies containing the lemma);
    stop words and PUNCT / NUM / SYM / SPACE / PART tokens are ignored
    special entry _avg is log(#bodies / mean document frequency)
    prints a progress line every 100 bodies
    """
    n_docs = len(body_data)
    doc_freq = {}
    for position, row in enumerate(body_data):
        if position % 100 == 0:
            print("Processed "+str(position))
        _, article = tuple(row)
        parsed = nlp(preprocess(article))
        # a set, so each lemma counts once per body
        lemmas = {
            tok.lemma_.lower()
            for tok in parsed
            if not tok.is_stop and tok.pos_ not in ['PUNCT','NUM','SYM','SPACE','PART']
        }
        for lemma in lemmas:
            doc_freq[lemma] = doc_freq.get(lemma, 0) + 1
    mean_freq = float(sum(doc_freq.values())) / len(doc_freq)
    idf = {lemma: math.log(n_docs/count) for lemma, count in doc_freq.items()}
    idf["_avg"] = math.log(n_docs/mean_freq)
    return idf

In [19]:
# Row-wise views of the two frames: one numpy row per list element.
stance_data = list(train_stances.values)
body_data = list(train_bodies.values)

In [20]:
# Parses every article body with spaCy; build_idf prints progress every 100 bodies.
idf = build_idf(body_data)

Processed 0
Processed 100
Processed 200
Processed 300
Processed 400
Processed 500
Processed 600
Processed 700
Processed 800
Processed 900
Processed 1000
Processed 1100
Processed 1200
Processed 1300
Processed 1400
Processed 1500
Processed 1600


In [69]:
import json

# Persist the IDF table (including its "_avg" entry) as JSON.
with open('saved_data/idf.json', 'w') as fp:
    json.dump(idf, fp)

In [21]:
def process_sentence(sentence):
    """
    extract the lexical features of one sentence
    input: spacy processed sentence (iterable of tokens with a .text attribute)
    output: dict with
    - raw: the sentence text
    - tokens: lower-cased lemmas of all kept tokens (stop words and
      PUNCT / NUM / SYM / SPACE / PART tokens are dropped)
    - bigrams: consecutive kept tokens joined with a space
    - nouns / verbs / adjectives / adverbs: the kept lemmas with that part of speech
    - svo: [subject lemmas, verb lemmas, object lemmas] from get_svos
    """
    svo = get_svos(sentence)

    dropped_pos = ['PUNCT', 'NUM', 'SYM','SPACE','PART']
    # lemmas grouped by part of speech; other tags only end up in `lemmas`
    by_pos = {"NOUN": [], "VERB": [], "ADJ": [], "ADV": []}
    lemmas = []
    for tok in sentence:
        if tok.is_stop or tok.pos_ in dropped_pos:
            continue
        lemma = tok.lemma_.lower()
        bucket = by_pos.get(tok.pos_)
        if bucket is not None:
            bucket.append(lemma)
        lemmas.append(lemma)

    pairs = list(nltk.bigrams(lemmas))
    joined_pairs = [pair[0]+' '+pair[1] for pair in pairs]

    return {
        "raw": sentence.text,
        "tokens": lemmas,
        "bigrams": joined_pairs,
        "nouns": by_pos["NOUN"],
        "verbs": by_pos["VERB"],
        "adjectives": by_pos["ADJ"],
        "adverbs": by_pos["ADV"],
        "svo": [list(part) for part in svo]
    }

In [22]:
def score_sentence_idf(sent, idf):
    """
    average IDF weight per token of a sentence
    - tokens are looked up by lower-cased lemma; tokens missing from idf add nothing
      but still count towards the length
    - sentences of 4 tokens or fewer score 0
    """
    # not very robust right now
    lemmas = [tok.lemma_.lower() for tok in sent]
    if len(lemmas) <= 4:
        return 0
    total = sum(idf[lemma] for lemma in lemmas if lemma in idf)
    return total/len(lemmas)

def process_body(body, idf):
    """
    extract the lexical features of one article body
    input: spacy processed document, idf table from build_idf (must contain "_avg")
    output: dict with
    - raw: the document text
    - tokens / nouns / verbs: lower-cased lemmas of the kept tokens (stop words and
      PUNCT / NUM / SYM / SPACE / PART tokens are dropped)
    - bigrams: consecutive kept tokens joined with a space
    - first_sentence: process_sentence() of the first sentence
    - significant_sentence: process_sentence() of the sentence with the highest
      score_sentence_idf() score (the earliest one on ties)
    - vocabulary: distinct kept tokens
    - common_tokens / common_nouns / common_verbs: top 5 by tf-idf, where lemmas
      missing from idf use idf["_avg"]
    - common_bigrams: the 10 most frequent bigrams, as "a b" strings
    """
    sentences = [s for s in body.sents]
    if len(sentences) == 0:
        sentences = [body]

    # first sentence of article
    first_sentence_data = process_sentence(sentences[0])

    nouns = []
    verbs = []
    tokens = []
    for token in body:
        if token.is_stop or token.pos_ in ['PUNCT', 'NUM', 'SYM','SPACE','PART']:
            continue
        lemma = token.lemma_.lower()
        if token.pos_ == "NOUN":
            nouns.append(lemma)
        elif token.pos_ == "VERB":
            verbs.append(lemma)
        tokens.append(lemma)

    bigram_str = [x[0]+' '+x[1] for x in nltk.bigrams(tokens)]

    doc_len = len(tokens)
    avg_idf = idf["_avg"]

    def top_tfidf(words, k = 5):
        # k words with the highest (term frequency * idf); doc_len is only used
        # when `words` is non-empty, in which case it is > 0.
        weights = {
            word: (count/doc_len) * (idf[word] if word in idf else avg_idf)
            for word, count in Counter(words).items()
        }
        return sorted(weights, key=weights.get, reverse=True)[:k]

    common_nouns = top_tfidf(nouns)
    common_verbs = top_tfidf(verbs)
    common_tokens = top_tfidf(tokens)

    # no idf for bigrams increase "common" count to 10
    # Count the joined strings, not the tuples: headline bigrams are "a b" strings,
    # so tuple entries here could never match them (and break after a JSON round-trip).
    common_bigrams = [pair for pair, _ in Counter(bigram_str).most_common(10)]

    # max() returns the earliest sentence among equal scores, like the stable
    # descending sort it replaces.
    most_significant = max(sentences, key=lambda sent: score_sentence_idf(sent, idf))
    most_significant_sentence_data = process_sentence(most_significant)

    return {
        "raw" : body.text,
        "tokens": tokens,
        "bigrams": bigram_str,
        "nouns": nouns,
        "verbs": verbs,
        "first_sentence": first_sentence_data,
        "significant_sentence": most_significant_sentence_data,
        "vocabulary": list(set(tokens)),
        "common_tokens": common_tokens,
        "common_nouns": common_nouns,
        "common_verbs": common_verbs,
        "common_bigrams": common_bigrams,
    }

In [25]:
# Feature caches: headline text -> process_sentence() result, body ID -> process_body() result.
headline_info = {}
body_info = {}
start = time.time()
for headline in range(len(stance_data)):
    if headline % 2500 == 0:
        print("Processed "+str(headline))
    h, b_id, s = tuple(stance_data[headline])
    # The same headline text can occur in several stance rows; parse it only once.
    if h not in headline_info:
        nlp_h = nlp(preprocess(h))
        headline_processed = process_sentence(nlp_h)
        headline_info[h] = headline_processed
print("Done!")
for body in range(len(body_data)):
    if body % 100 == 0:
        print("Processed "+str(body))
    b_id, txt = tuple(body_data[body])
    # Two passes: resolve coreferences with the coref pipeline, then re-parse the
    # lower-cased resolved text with the plain pipeline.
    # NOTE(review): build_idf() parses the un-resolved, original-case text, so lemmas
    # here may not line up exactly with the idf keys — confirm this is acceptable.
    nlp_a = coref(preprocess(txt))
    nlp_b = nlp(nlp_a._.coref_resolved.lower())
    body_processed = process_body(nlp_b, idf)
    body_info[b_id] = body_processed
print("Done!")
end = time.time()
# Total wall-clock seconds for both loops.
print(int(end-start))

Processed 0
Processed 2500
Processed 5000
Processed 7500
Processed 10000
Processed 12500
Processed 15000
Processed 17500
Processed 20000
Processed 22500
Processed 25000
Processed 27500
Processed 30000
Processed 32500
Processed 35000
Processed 37500
Processed 40000
Processed 42500
Processed 45000
Processed 47500
Done!
Processed 0
Processed 100
Processed 200
Processed 300
Processed 400
Processed 500
Processed 600
Processed 700
Processed 800
Processed 900
Processed 1000
Processed 1100
Processed 1200
Processed 1300
Processed 1400
Processed 1500
Processed 1600
Done!
673


In [71]:
# Persist the headline features as JSON.
with open('saved_data/relevance_headline_info.json', 'w') as fp:
    json.dump(headline_info, fp)
    
# Re-key the body features by str(body ID) before dumping them.
# Note that this loop also rewrites body_info[k]['vocabulary'] in place.
json_body_info = {}
for k in body_info:
    body_info[k]['vocabulary'] = list(body_info[k]['vocabulary'])
    json_body_info[str(k)] = body_info[k]
with open('saved_data/relevance_body_info.json', 'w') as fp:
    json.dump(json_body_info, fp)

In [37]:
# Spot-check the processed record for body ID 0.
print(body_info[0])

{'raw': "a small meteorite crashed into a wooded area in nicaragua's capital of managua overnight, nicaragua said sunday. residents reported hearing a mysterious boom that left a 16-foot deep crater near the city's airport, the associated press reports. \n\ngovernment spokeswoman rosario murillo said a committee formed by nicaragua to study the event determined a committee formed by the government to study the event was a  relatively small  meteorite that  appears to have come off an asteroid that was passing close to earth.  house-sized asteroid 2014 rc, which measured 60 feet in diameter, skimmed earth this weekend, abc news reports. \ngovernment spokeswoman rosario murillo said nicaragua will ask international experts to help local scientists in understanding what happened.\n\nthe crater left by a  relatively small  meteorite that  appears to have come off an asteroid that was passing close to earth had a radius of 39 feet and a depth of 16 feet,  said humberto saballos, a volcanolo

In [38]:
def get_features(stance_df):
    """
    compute the get_feats() feature dict for every row of a stance frame
    input: DataFrame whose rows are (headline, body id, stance)
    output: (features, actual) - list of feature dicts and list of stance labels, in row order
    prints the elapsed time in whole seconds
    """
    started = time.time()
    rows = [tuple(row) for row in stance_df.values]
    features = [get_feats(headline, body_id) for headline, body_id, _ in rows]
    actual = [stance for _, _, stance in rows]
    print(int(time.time() - started))
    return features, actual

In [59]:
def bow_cos_similarity(a, b):
    """
    cosine similarity of two word lists, treated as binary bag-of-words vectors
    over their joint vocabulary
    returns -1 when either list is empty
    """
    if len(a) == 0 or len(b) == 0:
        return -1
    words_a, words_b = set(a), set(b)
    vocabulary = list(words_a.union(words_b))
    vec_a = [int(word in words_a) for word in vocabulary]
    vec_b = [int(word in words_b) for word in vocabulary]
    return 1 - distance.cosine(vec_a, vec_b)

def get_feats(headline, body_id):
    """
    build the relevance features for one (headline, body) pair
    input: headline text (key of headline_info), body id (key of body_info)
    output: dict of 26 features, in a fixed key order
    - shared_*: number of distinct headline items also found in the body's
      "common" lists, its first sentence (_fst) or its most significant sentence (_sig)
    - cos_*: bow_cos_similarity between the headline's and the sentence's items
      (no verbs because there are relatively few per sentence)
    - svo_cos_sim_*: bow_cos_similarity over subject + verb + object lemmas
    - svo_s/v/o_*: number of shared subject / verb / object lemmas
    """
    head = headline_info[headline]
    body = body_info[body_id]
    first = body['first_sentence']
    sig = body['significant_sentence']

    def overlap(left, right):
        return len(set(left).intersection(set(right)))

    def flatten(svo):
        return svo[0]+svo[1]+svo[2]

    head_svo = head['svo']
    first_svo = first['svo']
    sig_svo = sig['svo']
    head_svo_words = flatten(head_svo)

    # Insertion order matters: callers turn .values() into the feature vector.
    return {
        'shared_nouns': overlap(head['nouns'], body['common_nouns']),
        'shared_verbs': overlap(head['verbs'], body['common_verbs']),
        'shared_bigrams': overlap(head['bigrams'], body['common_bigrams']),
        'shared_tokens': overlap(head['tokens'], body['common_tokens']),

        'shared_nouns_fst': overlap(head['nouns'], first['nouns']),
        'shared_verbs_fst': overlap(head['verbs'], first['verbs']),
        'shared_bigrams_fst': overlap(head['bigrams'], first['bigrams']),
        'shared_tokens_fst': overlap(head['tokens'], first['tokens']),

        'shared_nouns_sig': overlap(head['nouns'], sig['nouns']),
        'shared_verbs_sig': overlap(head['verbs'], sig['verbs']),
        'shared_bigrams_sig': overlap(head['bigrams'], sig['bigrams']),
        'shared_tokens_sig': overlap(head['tokens'], sig['tokens']),

        'cos_nouns_sig': bow_cos_similarity(head['nouns'], sig['nouns']),
        'cos_bigrams_sig': bow_cos_similarity(head['bigrams'], sig['bigrams']),
        'cos_tokens_sig': bow_cos_similarity(head['tokens'], sig['tokens']),

        'cos_nouns_fst': bow_cos_similarity(head['nouns'], first['nouns']),
        'cos_bigrams_fst': bow_cos_similarity(head['bigrams'], first['bigrams']),
        'cos_tokens_fst': bow_cos_similarity(head['tokens'], first['tokens']),

        'svo_cos_sim_fst' : bow_cos_similarity(flatten(first_svo), head_svo_words),
        'svo_cos_sim_sig' : bow_cos_similarity(flatten(sig_svo), head_svo_words),

        'svo_s_fst': overlap(first_svo[0], head_svo[0]),
        'svo_v_fst': overlap(first_svo[1], head_svo[1]),
        'svo_o_fst': overlap(first_svo[2], head_svo[2]),

        'svo_s_sig': overlap(sig_svo[0], head_svo[0]),
        'svo_v_sig': overlap(sig_svo[1], head_svo[1]),
        'svo_o_sig': overlap(sig_svo[2], head_svo[2]),
    }

In [56]:
# stance_info is the (features, labels) pair returned by get_features.
stance_info = get_features(train_stances)
# Index the feature dicts by (headline, body ID) so the later cells can look up
# the rows of any subset; a repeated pair keeps the features of its last row.
stance_dict = {}
for idx, d in enumerate(list(train_stances.values)):
    h, b, s = d
    stance_dict[(h, b)] = stance_info[0][idx]

21


In [54]:
# Eyeball the first 100 feature vectors next to their stance labels.
for i in range(100):
    print(list(stance_info[0][i].values()), stance_info[1][i])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] unrelated
[1, 1, 0, 3, 2, 1, 1, 5, 0, 0, 0, 1, 0.0, 0.0, 0.09449111825230683, 0.3086066999241839, 0.08247860988423228, 0.3857583749052298, 0.30304576336566325, 0.13363062095621214] agree
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] unrelated
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] unrelated
[1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0.20412414523193156, 0.0, 0.10846522890932808, 0.0, 0.0, 0.0, 0.0, 0.16666666666666663] disagree
[1, 0, 0, 5, 1, 0, 2, 8, 0, 0, 0, 0, 0.0, 0.0, 0.0, 0.33333333333333326, 0.15384615384615385, 0.5714285714285714, 0.5017452060042545, 0.0] agree
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0.0, 0.0, -1, 0.0, 0.0, 0.0, 0.0] unrelated
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] unrelated
[0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.07692307692307687, 0.14433756729740643, 0.0] ag

In [31]:
def label_to_int(labels):
    """
    encode stance labels as integers: "agree" -> 1, "discuss" -> 0, anything else -> -1
    """
    encoded = []
    for label in labels:
        if label == "agree":
            encoded.append(1)
        elif label == "discuss":
            encoded.append(0)
        else:
            encoded.append(-1)
    return encoded

In [177]:
import json
# Write the intermediate results to saved_data/ as JSON (idf.json is written by an
# earlier cell as well, to the same path).
# NOTE(review): body_info is dumped here with its original keys, whereas the
# relevance_body_info.json dump converts them with str(k) first — confirm the raw
# keys are JSON-serialisable.
with open('saved_data/headline_info.json', 'w') as fp:
    json.dump(headline_info, fp)
with open('saved_data/body_info.json', 'w') as fp:
    json.dump(body_info, fp)
with open('saved_data/idf.json', 'w') as fp:
    json.dump(idf, fp)
with open('saved_data/stance_info.json', 'w') as fp:
    json.dump(stance_info, fp)

In [172]:
# Compare the IDF weights of three sample lemmas.
idf['say'], idf['report'], idf['spider']

(0.29983724851076954, 0.5771482666970632, 4.096128684015602)

# Simple Model

In [67]:
# Split by article body (see train_test_split), so no body feeds both sets.
stances_tr, stances_val = train_test_split(train_bodies, train_stances)

# Binary relevance task: "unrelated" stays as is, every other stance is relabelled "discuss".
training_data = [[],[]]
for h,b,s in list(stances_tr.values):
    training_data[0].append(list(stance_dict[(h,b)].values()))
    training_data[1].append(s if s == "unrelated" else "discuss")

testing_data = [[],[]]
for h,b,s in list(stances_val.values):
    testing_data[0].append(list(stance_dict[(h,b)].values()))
    testing_data[1].append(s if s == "unrelated" else "discuss")

# Majority-class baseline: the share of the most frequent label in each set.
c1, c2 = Counter(training_data[1]), Counter(testing_data[1])
baseline_tr = max(c1.values())/sum(c1.values())
baseline_val = max(c2.values())/sum(c2.values())
print(c1, c2)
print("Training Baseline {0:.2f}% Testing Baseline {1:.2f}%".format(baseline_tr * 100, baseline_val * 100))

# No random_state is passed, so the fitted forest differs from run to run.
model = RandomForestClassifier(n_estimators = 500, min_samples_split = 10, min_samples_leaf = 5, max_depth = 8)
# model = LogisticRegression(max_iter = 200)
# model = SVC()
# model = GradientBoostingClassifier(n_estimators = 500, subsample = 0.1, learning_rate = 0.01, random_state=0)

model.fit(training_data[0], training_data[1])
tr_acc = model.score(training_data[0], training_data[1])
print('{0:.2f}% training accuracy'.format(tr_acc*100))

val_acc = model.score(testing_data[0], testing_data[1])
print('{0:.2f}% validation accuracy'.format(val_acc*100))
# Accuracy gain over the majority-class baseline, in percentage points.
print("Baseline comparison: TR {0:.2f}% VAL {1:.2f}%".format((tr_acc-baseline_tr)*100,(val_acc-baseline_val)*100))

actual = testing_data[1]
predicted = model.predict(testing_data[0])
sc.report_score(actual, predicted)
print("F1 Score")
print(f1_score(actual, predicted, average = None))
print("Avg Precision Score")
print(precision_score(actual, predicted, average = None))
# Computed but neither displayed nor plotted in this cell.
matrix = confusion_matrix(actual,predicted)

Counter({'unrelated': 29275, 'discuss': 10734}) Counter({'unrelated': 7270, 'discuss': 2693})
Training Baseline 73.17% Testing Baseline 72.97%
94.90% training accuracy
94.62% validation accuracy
Baseline comparison: TR 21.73% VAL 21.65%
-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |     0     |     0     |     0     |     0     |
-------------------------------------------------------------
| disagree  |     0     |     0     |     0     |     0     |
-------------------------------------------------------------
|  discuss  |     0     |     0     |   2360    |    333    |
-------------------------------------------------------------
| unrelated |     0     |     0     |    203    |   7067    |
-------------------------------------------------------------
Score: 4126.75 out of 4510.5	(91.49207404944019%)
F1 Score
[0.89802131 0.96346285

# Save Final Model

In [73]:
# Final model: fit on every stance row (no validation hold-out).
stances_tr = train_stances

# Same binary relabelling as in the validation cell: "unrelated" vs. everything else ("discuss").
training_data = [[],[]]
for h,b,s in list(stances_tr.values):
    training_data[0].append(list(stance_dict[(h,b)].values()))
    training_data[1].append(s if s == "unrelated" else "discuss")

# NOTE(review): max_depth is 10 here but 8 in the validation cell, so the reported
# validation scores were measured on a different configuration — confirm this is intended.
model = RandomForestClassifier(n_estimators = 500, min_samples_split = 10, min_samples_leaf = 5, max_depth = 10)

model.fit(training_data[0], training_data[1])

# `load` is imported but not used in this cell.
from joblib import dump, load
dump(model, 'saved_models/relevance_detection_trained.joblib')

['saved_models/relevance_detection_trained.joblib']