In [10]:
import pandas as pd
import query_elasticsearch as es

# Load the three CSV tables this notebook works on; each path variable is kept
# next to the frame that is read from it.
re_step1_results = "/home/squirrel/ccg-new/projects/perspective/data/database_output/re-step1/webapp_restep1results.csv"
re_step1_df = pd.read_csv(re_step1_results)

claim = "/home/squirrel/ccg-new/projects/perspective/data/database_output/re-step1/webapp_claim.csv"
claim_df = pd.read_csv(claim)

# Perspective rows with a missing value in any column are discarded right after loading.
perspective = "/home/squirrel/ccg-new/projects/perspective/data/database_output/re-step1/webapp_perspective.csv"
perspective_df = pd.read_csv(perspective).dropna()

re_step1_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27507 entries, 0 to 27506
Data columns (total 12 columns):
id                        27507 non-null int64
claim_id                  27507 non-null int64
perspective_id            27507 non-null int64
vote_support              27507 non-null int64
vote_leaning_support      27507 non-null int64
vote_leaning_undermine    27507 non-null int64
vote_undermine            27507 non-null int64
vote_not_valid            27507 non-null int64
p_i_5                     27507 non-null float64
p_i_3                     27507 non-null float64
label_3                   27507 non-null object
label_5                   27507 non-null object
dtypes: float64(2), int64(8), object(2)
memory usage: 2.5+ MB


In [11]:
# Print the column summary of the perspective table, then of the claim table.
for _table in (perspective_df, claim_df):
    _table.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20797 entries, 0 to 20797
Data columns (total 7 columns):
id                       20797 non-null int64
source                   20797 non-null object
title                    20797 non-null object
pilot1_high_agreement    20797 non-null int64
similar_persps           20797 non-null object
more_than_two_tokens     20797 non-null int64
pilot1_have_stance       20797 non-null int64
dtypes: int64(4), object(3)
memory usage: 1.3+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1006 entries, 0 to 1005
Data columns (total 7 columns):
id                          1006 non-null int64
source                      1006 non-null object
title                       1006 non-null object
assignment_counts           1006 non-null int64
finished_counts             1006 non-null int64
evidence_assign_counts      1006 non-null int64
evidence_finished_counts    1006 non-null int64
dtypes: int64(5), object(2)
memory usage: 55.1+ KB


In [12]:
# Filter out low quality ones and not-valid ones:
# keep a row only if its `p_i_3` is strictly above `th` AND its `label_3` is "S" or "U".
th = 0.5
_above_threshold = re_step1_df["p_i_3"] > th
_label_is_s_or_u = re_step1_df.label_3.isin(["S", "U"])
re_step1_df = re_step1_df[_above_threshold & _label_is_s_or_u]
re_step1_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5782 entries, 10 to 23602
Data columns (total 12 columns):
id                        5782 non-null int64
claim_id                  5782 non-null int64
perspective_id            5782 non-null int64
vote_support              5782 non-null int64
vote_leaning_support      5782 non-null int64
vote_leaning_undermine    5782 non-null int64
vote_undermine            5782 non-null int64
vote_not_valid            5782 non-null int64
p_i_5                     5782 non-null float64
p_i_3                     5782 non-null float64
label_3                   5782 non-null object
label_5                   5782 non-null object
dtypes: float64(2), int64(8), object(2)
memory usage: 587.2+ KB


In [20]:
# Make claim + persp data, for index
def get_claim(claim_id):
    """Return the title of the claim whose ``id`` equals ``claim_id``.

    The lookup runs against the module-level ``claim_df``. When several rows
    match, the first one is used. When none match, a message is printed and
    ``None`` is returned.
    """
    matches = claim_df[claim_df.id == claim_id]
    if len(matches) == 0:
        print("Claim not found! cid = {}".format(claim_id))
        return None
    return matches.iloc[0].title

def get_perspective(perspective_id):
    """Return the title of the perspective whose ``id`` equals ``perspective_id``.

    The lookup runs against the module-level ``perspective_df`` and uses the
    first matching row. ``None`` is returned (after printing a message) when
    no row matches, or when the matching row's ``pilot1_have_stance`` is falsy.
    """
    matches = perspective_df[perspective_df.id == perspective_id]
    if len(matches) == 0:
        print("Perspective not found! pid = {}".format(perspective_id))
        return None

    first_match = matches.iloc[0]
    if not first_match.pilot1_have_stance:
        print("Perspective not valid! pid = {}".format(perspective_id))
        return None
    return first_match.title
    
def concat_claim_persp(claim, persp):
    """Join a claim title and a perspective title into one string.

    If ``claim`` already ends with "." the two parts are separated by a single
    space; otherwise ". " is inserted between them.

    Args:
        claim: claim title (a ``str``).
        persp: perspective title (expected to be a ``str``).

    Returns:
        The concatenated string, or ``None`` when ``persp`` cannot be
        concatenated to a string (the offending pair is printed).
    """
    separator = " " if claim.endswith(".") else ". "
    try:
        return claim + separator + persp
    except TypeError:
        # Handle a non-string perspective the same way regardless of how the
        # claim ends: report the pair and signal failure explicitly.
        print(claim, persp)
        return None

# Collect one record per (claim, perspective) pair for which both titles resolve.
data = []
for _, pair in re_step1_df.iterrows():
    cid = pair.claim_id
    pid = pair.perspective_id
    claim_title = get_claim(cid)
    persp_title = get_perspective(pid)

    # Pairs with a missing (or empty) title on either side are skipped.
    if not (persp_title and claim_title):
        continue

    record = {
        "claim_id" : int(cid),
        "perspective_id" : int(pid),
        "concat_title" : concat_claim_persp(claim_title, persp_title),
    }
    data.append(record)

# save data, for later usage
import json

out_path = "/home/squirrel/ccg-new/projects/perspective/data/pilot12_evidence_verification/stance_claim_persp.json"
with open(out_path, 'w') as fout:
    json.dump(data, fout)

In [5]:
# For each claim + perspective pair in results, we index them in lucene (elastic search)

from elasticsearch import Elasticsearch
# NOTE: this rebinds `es`, which the first cell bound to the `query_elasticsearch` module.
es = Elasticsearch(['http://bronte.cs.illinois.edu'],port=8080)

# Create indices
index_name = "re_step1_claim_persp_high_quality"

# Only create the index when it is missing, so that re-running this cell does
# not fail on an already existing index. To rebuild it from scratch, delete it
# first with: es.indices.delete(index_name)
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name)

# The document id is "<claim_id>_<perspective_id>", so indexing the same pair
# again overwrites the earlier document instead of duplicating it.
for doc in data:
    cpid = str(doc["claim_id"]) + "_" + str(doc["perspective_id"])
    es.index(index=index_name, doc_type='text', id=cpid, body=doc)



In [None]:
from elasticsearch import TransportError
# For each evidence, retrieve the list of perspectives 
def get_top_re_step1_perspectives(text, num_cands=20):
    """Search the claim+perspective index for documents matching ``text``.

    Runs a ``match`` query on the ``concat_title`` field of the
    "re_step1_claim_persp_high_quality" index through the module-level ``es``
    client, asking for at most ``num_cands`` hits.

    Returns:
        A list of ``(concat_title, claim_id, perspective_id, score)`` tuples,
        in the order the hits are returned by the search.
    """
    match_query = {"query": {"match": {"concat_title": text}}}
    response = es.search(
        index="re_step1_claim_persp_high_quality",
        doc_type="text",
        body=match_query,
        size=num_cands,
    )
    return [
        (
            hit['_source']["concat_title"],
            hit['_source']["claim_id"],
            hit['_source']["perspective_id"],
            hit['_score'],
        )
        for hit in response['hits']['hits']
    ]

evidence_path = "/home/squirrel/ccg-new/projects/perspective/data/database_output/re-step1/webapp_evidence.csv"

edf = pd.read_csv(evidence_path)

# For every evidence row, retrieve candidate claim+perspective pairs and keep
# them together with the evidence id and text.
result = []
for idx, row in edf.iterrows():
    print("Querying evidence: {}".format(row.id))
    try:
        output = get_top_re_step1_perspectives(row.content)
    except TransportError as err:
        # Best effort: a failed query still yields an (empty) entry, but the
        # failure is reported so that an unreachable server or a rejected
        # request does not silently look like "no candidates found".
        print("Query failed for evidence {}: {}".format(row.id, err))
        output = []

    result.append({
        "evidence_id": row.id,
        "evidence_text": row.content,
        "candidates": output
    })

out_path = "/home/squirrel/ccg-new/projects/perspective/data/pilot12_evidence_verification/perspective_candidates.json"
with open(out_path, 'w') as fout:
    json.dump(result, fout)

In [16]:
# Index evidences
from elasticsearch import Elasticsearch
es = Elasticsearch(['http://bronte.cs.illinois.edu'],port=8080, timeout=60)

# Create indices
index_name = "evidences"

# Only create the index when it is missing, so that re-running this cell does
# not fail on an already existing index. To rebuild it from scratch, delete it
# first with: es.indices.delete(index_name)
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name)

evidence_path = "/home/squirrel/ccg-new/projects/perspective/data/database_output/re-step1/webapp_evidence.csv"
edf = pd.read_csv(evidence_path)
# Rows with a missing value in any column are not indexed.
edf = edf.dropna()
edf.info()

# The evidence id doubles as the document id, so re-indexing overwrites the
# earlier document instead of duplicating it.
for idx, row in edf.iterrows():
    if row.content:
        doc = {
            "id" : row.id,
            "content" : row.content
        }
        es.index(index=index_name, doc_type='text', id=row.id, body=doc)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8092 entries, 0 to 8121
Data columns (total 3 columns):
id         8092 non-null int64
source     8092 non-null object
content    8092 non-null object
dtypes: int64(1), object(2)
memory usage: 252.9+ KB


In [21]:
from elasticsearch import Elasticsearch

# Elasticsearch client used by the evidence search below (created with timeout=30).
es = Elasticsearch(['http://bronte.cs.illinois.edu'],port=8080, timeout=30)
# JSON list of claim + perspective records; this is the same path the
# "save data, for later usage" step writes `data` to.
cp_path = "/home/squirrel/ccg-new/projects/perspective/data/pilot12_evidence_verification/stance_claim_persp.json"

def get_top_evidences(text, num_cands=50):
    """Search the "evidences" index for documents matching ``text``.

    Runs a ``match`` query on the ``content`` field through the module-level
    ``es`` client, asking for at most ``num_cands`` hits.

    Returns:
        A list of ``(content, id, score)`` tuples, in the order the hits are
        returned by the search.
    """
    match_query = {"query": {"match": {"content": text}}}
    response = es.search(index="evidences", doc_type="text", body=match_query, size=num_cands)
    return [
        (hit['_source']["content"], hit['_source']["id"], hit['_score'])
        for hit in response['hits']['hits']
    ]

with open(cp_path) as fin:
    cp_data = json.load(fin)

# Evidence
# For each claim + perspective, find evidence candidates and record them per
# evidence id: eid -> [(claim_id, perspective_id, length-normalized score), ...]
persp_candidates = {}

for cp in cp_data:
    cid = cp["claim_id"]
    pid = cp["perspective_id"]
    query_text = cp["concat_title"]

    hits = get_top_evidences(query_text)
    # The token count depends only on the query, so compute it once per query
    # rather than once per hit. Each score is divided by the number of
    # space-separated tokens in the query text.
    token_len = len(query_text.split(" "))

    for evidence, eid, score in hits:
        persp_candidates.setdefault(eid, []).append((cid, pid, score/token_len))



In [22]:
# Sort each evidence's candidate list by its normalized score, best first.
for eid, ranked in persp_candidates.items():
    persp_candidates[eid] = sorted(ranked, key=lambda candidate: candidate[2], reverse=True)

out_path = "/home/squirrel/ccg-new/projects/perspective/data/pilot12_evidence_verification/reverse_persp_candidates.json"

# Note: json.dump writes non-string dictionary keys (e.g. int evidence ids) as strings.
with open(out_path, 'w') as fout:
    json.dump(persp_candidates, fout)

In [24]:
from collections import OrderedDict

# Ids of the perspectives whose `pilot1_have_stance` flag equals 1.
valid_pids = perspective_df[(perspective_df.pilot1_have_stance == 1)].id.unique()

er = "/home/squirrel/ccg-new/projects/perspective/data/database_output/re-step1/webapp_evidencerelation.csv"

er_df = pd.read_csv(er)

# evidence_id -> perspective_id of the FIRST relation row for that evidence.
# Built once instead of scanning `er_df` for every evidence.
gold_pid_by_eid = (
    er_df.drop_duplicates(subset="evidence_id")
    .set_index("evidence_id")["perspective_id"]
    .to_dict()
)

k = [1, 5, 20, 50]
correct = {_k: 0 for _k in k}

total = 0

for eid, ranked in persp_candidates.items():
    gold_pid = gold_pid_by_eid.get(eid)
    if gold_pid is None:
        # No relation row for this evidence: nothing to evaluate against.
        continue

    if gold_pid in valid_pids:
        total += 1
        # Keep each perspective id once, at the position of its first occurrence.
        p_cands = list(OrderedDict.fromkeys(t[1] for t in ranked))

        for _k in k:
            if gold_pid in p_cands[:_k]:
                correct[_k] += 1

for _k in correct:
    if total:
        print("Top {} accuracy: {}".format(_k, correct[_k]/total))
    else:
        # Avoid a ZeroDivisionError when no evidence could be evaluated.
        print("Top {} accuracy: n/a (no evaluable evidence)".format(_k))

Top 1 accuracy: 0.31815882650480526
Top 5 accuracy: 0.650227617602428
Top 20 accuracy: 0.8606474456246839
Top 50 accuracy: 0.9001011633788568


In [17]:
in_path = "/home/squirrel/ccg-new/projects/perspective/data/pilot12_evidence_verification/perspective_candidates.json"

# NOTE(review): if this file was produced with the default num_cands=20 of
# get_top_re_step1_perspectives, no evidence has more than 20 candidates and
# the top-50 figure cannot differ from the top-20 one.
# NOTE(review): unlike the reverse-candidate evaluation, repeated perspective
# ids are not de-duplicated here, so the two sets of numbers may not be
# directly comparable. Confirm whether that is intended.
k = [1, 5, 20, 50]
correct = {_k: 0 for _k in k}

# The file is only needed for loading; the evaluation runs on the parsed list.
with open(in_path) as fin:
    ev2pp = json.load(fin)

total = 0
for ev in ev2pp:
    eid = ev["evidence_id"]
    q = er_df[er_df.evidence_id == eid]
    if len(q) == 0:
        # No relation row for this evidence: skip it instead of failing on .iloc[0].
        continue

    # The perspective of the first relation row is taken as the gold answer.
    gold_pid = q.iloc[0].perspective_id

    if gold_pid in valid_pids:
        total += 1
        # Each candidate is (concat_title, claim_id, perspective_id, score).
        cands = [p[2] for p in ev["candidates"]]

        for _k in k:
            if gold_pid in cands[:_k]:
                correct[_k] += 1

for _k in correct:
    if total:
        print("Top {} accuracy: {}".format(_k, correct[_k]/total))
    else:
        # Avoid a ZeroDivisionError when no evidence could be evaluated.
        print("Top {} accuracy: n/a (no evaluable evidence)".format(_k))

Top 1 accuracy: 0.3212669683257919
Top 5 accuracy: 0.5321769733534439
Top 20 accuracy: 0.6759678230266466
Top 50 accuracy: 0.6759678230266466
