In [1]:
import numpy as np
import pandas as pd
import spacy
import networkx as nx
nlp = spacy.load('en_core_web_sm')
import score as sc
import time

In [2]:
# Load the headline/stance table and keep only the rows whose stance is
# something other than "unrelated".
train_stances = (
    pd.read_csv("fn_data/train_stances.csv")
    .loc[lambda frame: frame["Stance"] != "unrelated"]
)
print(train_stances.shape)
train_stances.head(10)

(13427, 3)


Unnamed: 0,Headline,Body ID,Stance
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree
4,Spider burrowed through tourist's stomach and ...,1923,disagree
5,'Nasa Confirms Earth Will Experience 6 Days of...,154,agree
8,Banksy 'Arrested & Real Identity Revealed' Is ...,1739,agree
10,Gateway Pundit,2327,discuss
11,Woman detained in Lebanon is not al-Baghdadi's...,1468,agree
14,"Soon Marijuana May Lead to Ticket, Not Arrest,...",47,discuss
16,Boko Haram Denies Nigeria Cease-Fire Claim,2463,discuss
17,"No, Robert Plant Didn’t Rip Up an $800 Million...",295,agree
19,ISIL Beheads American Photojournalist in Iraq,608,discuss


In [3]:
# Load the article bodies; the parsing cell further down unpacks each row as
# (body id, article text) and caches the parsed result under the body id.
train_bodies = pd.read_csv("fn_data/train_bodies.csv")
print(train_bodies.shape)
train_bodies.head()

(1683, 2)


Unnamed: 0,Body ID,articleBody
0,0,A small meteorite crashed into a wooded area i...
1,4,Last week we hinted at what was to come as Ebo...
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...
3,6,"Posting photos of a gun-toting child online, I..."
4,7,At least 25 suspected Boko Haram insurgents we...


In [4]:
def preprocess(text):
    """Normalise raw text before it is handed to the parser.

    Every apostrophe immediately followed by a space is replaced (together
    with that space) by a single space, then every apostrophe immediately
    preceded by a space is treated the same way, and finally the text is
    lowercased. Apostrophes inside a word, or at the very start or end of
    the string, are left as they are.

    Args:
        text: the string to normalise.

    Returns:
        The normalised, lowercased string.
    """
    return text.replace("' ", ' ').replace(" '", ' ').lower()

In [5]:
def get_headline_subj(doc):
    """Group the noun chunks of ``doc`` by the text of their root token.

    Args:
        doc: an object exposing ``noun_chunks``; each chunk must provide
            ``.text`` and ``.root.text`` (as a parsed spaCy document does).

    Returns:
        dict mapping each root-token text to a set that contains the root
        text itself plus the full text of every noun chunk with that root.
    """
    grouped = {}
    for phrase in doc.noun_chunks:
        head = phrase.root.text
        # Seed the set with the bare root word the first time it is seen.
        grouped.setdefault(head, {head}).add(phrase.text)
    return grouped
#         print(chunk.text, "|",chunk.root.text, "|",chunk.root.dep_,"|",
#               chunk.root.head.text

In [6]:
def build_graph(doc):
    """Build an undirected graph joining each token to its ``children``.

    Tokens whose length is 1 or less are ignored on either end of an edge.
    Nodes are the lowercased token texts (``lower_``), so repeated
    occurrences of the same word share a single node.

    Args:
        doc: iterable of tokens exposing ``lower_`` and ``children``
            (a parsed spaCy document in this notebook).

    Returns:
        networkx.Graph built from the de-duplicated (parent, child) pairs.
    """
    word_pairs = {
        (parent.lower_, kid.lower_)
        for parent in doc
        if len(parent) > 1
        for kid in parent.children
        if len(kid) > 1
    }
    return nx.Graph(list(word_pairs))

In [7]:
# Explicit negation tokens.
neg_words = {"n't", "not"}
# Words that cast doubt on, or refute, a claim.
doubt_words = {
    'fake', 'fraud', 'hoax', 'false', 'deny', 'denies', 'despite',
    'nope', 'doubt', 'bogus', 'debunk', 'prank', 'retract', 'scam',
}
# Every word treated as a negating/refuting marker in the graph search.
nr_words = neg_words | doubt_words

In [231]:
#print(len([p for p in paths if len(set(p).intersection(nr_words)) > 0]))

In [8]:
# graph = build_graph(nlp(b_n))
# paths = []
# # simple_paths = nx.all_simple_paths(graph, source='hoax', target="banksy",cutoff=5)
# # for p in simple_paths:
# #     paths.append(p)
# # print(len(paths))
# # print(paths)

In [9]:
def get_shortest_path_to_negating(graph, subjects):
    """Measure how close each subject is to a negation/doubt word in ``graph``.

    Args:
        graph: networkx graph whose nodes are lowercased words.
        subjects: iterable of subject words. Subjects that are not nodes of
            ``graph``, or that are themselves in ``nr_words``, are skipped
            and do not appear in the result.

    Returns:
        dict mapping each retained subject to a three-item list
        ``[length, path, n_paths]``:
          * ``length`` - ``len()`` of the shortest node path from the subject
            to any reachable word in ``nr_words`` (so it counts nodes, not
            edges), or ``None`` when no such word is reachable;
          * ``path`` - that shortest path as a list of nodes, or ``None``;
          * ``n_paths`` - total number of simple paths found with
            ``cutoff=5`` from the subject to all reachable ``nr_words``.
    """
    results = {}
    # Which negation/doubt words occur in this graph does not depend on the
    # subject, so work it out once instead of once per subject.
    targets = [word for word in nr_words if word in graph]
    for s in subjects:
        if not graph.has_node(s) or s in nr_words:
            continue
        best_len, best_path, n_paths = None, None, 0
        for word in targets:
            try:
                path = nx.shortest_path(graph, source=s, target=word)
            except nx.NetworkXNoPath:
                # Subject and word sit in different components. Only this
                # expected case is skipped; a bare `except` here would also
                # swallow KeyboardInterrupt and genuine bugs.
                continue
            if best_len is None or len(path) < best_len:
                best_len, best_path = len(path), path
            # Count lazily rather than materialising every path in a list.
            n_paths += sum(
                1 for _ in nx.all_simple_paths(graph, source=s, target=word, cutoff=5)
            )
        results[s] = [best_len, best_path, n_paths]
    return results

In [None]:
# Parse every article body and every distinct headline once, caching the
# parsed document together with its word graph for the scoring cell.
headline_info = {}  # preprocessed headline text -> (parsed doc, graph)
body_info = {}      # body id -> (parsed doc, graph)
start = time.time()
stance_data = list(train_stances.values)
body_data = list(train_bodies.values)

for count, (b_id, txt) in enumerate(body_data):
    if count % 100 == 0:
        print("Processed "+str(count))
    nlp_b = nlp(preprocess(txt))
    body_info[b_id] = (nlp_b, build_graph(nlp_b))
print("Done!")

for count, (h, b_id, s) in enumerate(stance_data):
    if count % 2500 == 0:
        print("Processed "+str(count))
    h = preprocess(h)
    # The same headline can appear in several stance rows; parsing it again
    # would only overwrite the cache entry with an equivalent result.
    if h in headline_info:
        continue
    nlp_h = nlp(h)
    headline_info[h] = (nlp_h, build_graph(nlp_h))
print("Done!")

end = time.time()
print(int(end-start))

Processed 0
Processed 100
Processed 200
Processed 300
Processed 400
Processed 500
Processed 600
Processed 700
Processed 800
Processed 900
Processed 1000
Processed 1100


In [249]:
# Predict a stance for every (headline, body) pair by comparing how close the
# headline's subjects sit to negation/doubt words in the headline graph versus
# the body graph.
MAX_NEG_PATH_LEN = 8  # a one-sided negation only counts if its shortest path has fewer nodes than this

start = time.time()
data = list(train_stances.values)
predicted = []
actual = []
for item in data:
    h, b, s = tuple(item)
    # headline_info is keyed by the preprocessed (lowercased) headline, so the
    # raw headline must be normalised the same way before the lookup;
    # indexing with the raw text raises KeyError.
    nlp_h, headline_graph = headline_info[preprocess(h)]
    nlp_b, body_graph = body_info[b]
    subj = get_headline_subj(nlp_h)

    neg_h = get_shortest_path_to_negating(headline_graph, subj)
    neg_b = get_shortest_path_to_negating(body_graph, subj)

    shared_subjects = set(neg_h).intersection(set(neg_b))
    score = 0 # positive is agree, negative is disagree
    for sub in shared_subjects:
        nh, nb = neg_h[sub], neg_b[sub]
        # one doc mentions the subject negatively, the other does not
        if nh[0] is None and nb[0] is not None and nb[0] < MAX_NEG_PATH_LEN:
            score -= 1
        elif nh[0] is not None and nb[0] is None and nh[0] < MAX_NEG_PATH_LEN:
            score -= 1
        # both docs mention the subject negatively
        elif nh[0] is not None and nb[0] is not None:
            score += 1
        # neither doc mentions it negatively -> no contribution (discuss)
    if score < 0:
        predicted.append("disagree")
    elif score > 0:
        predicted.append("agree")
    else:
        predicted.append("discuss")
    actual.append(s)
end = time.time()
print(int(end-start))

5460


In [252]:
# Print the confusion matrix and overall score for the predictions using the
# project's `score` module.
# NOTE(review): presumably the rows of the printed table are the actual labels
# and the columns the predicted ones -- confirm against score.py.
sc.report_score(actual, predicted)

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    671    |   1863    |   1144    |     0     |
-------------------------------------------------------------
| disagree  |    84     |    620    |    136    |     0     |
-------------------------------------------------------------
|  discuss  |    494    |   6006    |   2409    |     0     |
-------------------------------------------------------------
| unrelated |     0     |     0     |     0     |     0     |
-------------------------------------------------------------
Score: 6131.75 out of 13427.0	(45.667312132270794%)


45.667312132270794