In [20]:
# First, use heuristics to get google perspectives that are not valid but are complete sentences.
import pandas as pd

perspective_db_path = "../data/database_output/re-step1/webapp_perspective.csv"

# Load the perspective table and discard every row that has a missing field.
pdf = pd.read_csv(perspective_db_path).dropna()

# Restrict to google-sourced perspectives that are flagged as high agreement
# and whose stance flag is 0 (i.e. judged not to be a valid perspective).
pdf = pdf.loc[
    lambda frame: (frame.pilot1_high_agreement == 1)
    & (frame.pilot1_have_stance == 0)
    & (frame.source == 'google')
]
pdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4675 entries, 8137 to 20797
Data columns (total 7 columns):
id                       4675 non-null int64
source                   4675 non-null object
title                    4675 non-null object
pilot1_high_agreement    4675 non-null int64
similar_persps           4675 non-null object
more_than_two_tokens     4675 non-null int64
pilot1_have_stance       4675 non-null int64
dtypes: int64(4), object(3)
memory usage: 292.2+ KB


In [26]:
# Produce ids of the google perspectives that (1) high agreement (2) Non-valid perspective
from nltk import word_tokenize
import json

def complete_sentence_candidate(text, token_length=5):
    """Heuristically decide whether ``text`` looks like a complete sentence.

    A candidate must (1) start with an uppercase character, (2) end with a
    ``.`` token according to ``word_tokenize`` and (3) contain strictly more
    than ``token_length`` tokens (the final period counts as a token).

    :param text: string to check; trailing whitespace is ignored
    :param token_length: token count that must be strictly exceeded
    :return: True if all three conditions hold, False otherwise
    """
    text = text.rstrip()
    # Empty / whitespace-only input used to raise IndexError on text[0].
    # It cannot be a sentence, so reject it (and any non-capitalised text)
    # before paying for tokenization.
    if not text or not text[0].isupper():
        return False

    toks = word_tokenize(text)
    return bool(toks) and toks[-1] == '.' and len(toks) > token_length

# Flag every title that passes the complete-sentence heuristic.
# `assign` builds a new frame instead of writing into the boolean-filtered
# slice row by row (which can trigger pandas' SettingWithCopyWarning), and a
# single `apply` replaces the explicit `iterrows` / `.at` loop.
pdf = pdf.assign(
    legit_sentence=pdf.title.apply(complete_sentence_candidate).astype(bool)
)

legit_sent_pdf = pdf[pdf.legit_sentence]
legit_sent_pdf.info()

# Persist the ids of the perspectives that passed the heuristic.
index = legit_sent_pdf.id.unique().tolist()
indices_out_path = "../data/pilot17_making_the_dataset/indices/non_valid_high_agreement_google_persps.json"
with open(indices_out_path, 'w') as fout:
    json.dump(index, fout)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2427 entries, 8148 to 20792
Data columns (total 8 columns):
id                       2427 non-null int64
source                   2427 non-null object
title                    2427 non-null object
pilot1_high_agreement    2427 non-null int64
similar_persps           2427 non-null object
more_than_two_tokens     2427 non-null int64
pilot1_have_stance       2427 non-null int64
legit_sentence           2427 non-null bool
dtypes: bool(1), int64(4), object(3)
memory usage: 154.1+ KB


In [69]:
# 2. Produce ids of perspectives that (1) support or undermine (2) high agreement (aka > 0.5) (3) Original
# (4) belong to a claim that has more than MIN_PERSPS_PER_CLAIM (= 10) such perspectives.
# (The previous header said ">= 15", which did not match the "> 10" filter actually applied.)
HIGH_AGREEMENT_MIN = 0.5     # p_i_3 must be strictly greater than this
MIN_PERSPS_PER_CLAIM = 10    # a claim must have strictly more perspectives than this
# NOTE(review): presumably perspective ids below this bound are the original
# (non-google) perspectives -- TODO confirm against the perspective table.
ORIGINAL_PID_UPPER_BOUND = 8136

perspective_db_path = "../data/database_output/re-step1/webapp_perspective.csv"
pdf = pd.read_csv(perspective_db_path).dropna()

# (1) support or undermine (2) high agreement (aka > 0.5) (3) Original
pdf = pdf[(pdf.pilot1_high_agreement == 1) & (pdf.pilot1_have_stance == 1) & (pdf.source != "google")]
pdf.info()

# Same criteria but in different table, for sanity check
# (compare the two `.info()` entry counts by eye).
re_step1_path = "../data/database_output/re-step1/webapp_restep1results.csv"
step1_df = pd.read_csv(re_step1_path)

step1_df = step1_df[
    (step1_df.p_i_3 > HIGH_AGREEMENT_MIN)
    & (step1_df.label_3.isin(["S", "U"]))
    & (step1_df.perspective_id < ORIGINAL_PID_UPPER_BOUND)
]
step1_df.info()

# Count how many (claim, perspective) rows each claim has, then keep only
# the claims whose count exceeds the threshold.
claim_persp_count_df = step1_df[['claim_id', 'perspective_id']].groupby(['claim_id']).count()
claim_persp_count_df = claim_persp_count_df[claim_persp_count_df.perspective_id > MIN_PERSPS_PER_CLAIM]
claim_ids = claim_persp_count_df.index.tolist()

# All perspective ids attached to those claims.
many_p_df = step1_df.loc[step1_df["claim_id"].isin(claim_ids)]
many_p_df.info()
index = many_p_df.perspective_id.unique().tolist()
indices_out_path = "../data/pilot17_making_the_dataset/indices/perspective/claim_with_more_than_10_persp_perspectives.json"

with open(indices_out_path, 'w') as fout:
    json.dump(index, fout)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3985 entries, 0 to 8133
Data columns (total 7 columns):
id                       3985 non-null int64
source                   3985 non-null object
title                    3985 non-null object
pilot1_high_agreement    3985 non-null int64
similar_persps           3985 non-null object
more_than_two_tokens     3985 non-null int64
pilot1_have_stance       3985 non-null int64
dtypes: int64(4), object(3)
memory usage: 249.1+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3985 entries, 19372 to 23602
Data columns (total 12 columns):
id                        3985 non-null int64
claim_id                  3985 non-null int64
perspective_id            3985 non-null int64
vote_support              3985 non-null int64
vote_leaning_support      3985 non-null int64
vote_leaning_undermine    3985 non-null int64
vote_undermine            3985 non-null int64
vote_not_valid            3985 non-null int64
p_i_5                     3985 non-null floa

In [71]:
# Step 3, produce high agreement + support/undermine + complete sentence google perspectives
perspective_db_path = "../data/database_output/re-step1/webapp_perspective.csv"

pdf = pd.read_csv(perspective_db_path).dropna()

# Only keep the google ones that (1) high agreement (2) have a stance, i.e. ARE valid perspectives
# (the previous comment said "Non-valid", a leftover from the earlier cell; the filter selects have_stance == 1)
pdf = pdf[(pdf.pilot1_high_agreement == 1) & (pdf.pilot1_have_stance == 1) & (pdf.source == 'google')]
pdf.info()

# Flag titles that pass the complete-sentence heuristic. `assign` returns a
# new frame instead of writing row by row into the filtered slice (which can
# trigger pandas' SettingWithCopyWarning).
pdf = pdf.assign(
    legit_sentence=pdf.title.apply(complete_sentence_candidate).astype(bool)
)

legit_sent_pdf = pdf[pdf.legit_sentence]
legit_sent_pdf.info()

# Persist the ids of the perspectives that passed the heuristic.
index = legit_sent_pdf.id.unique().tolist()
indices_out_path = "../data/pilot17_making_the_dataset/indices/valid_high_agreement_google_persps.json"
with open(indices_out_path, 'w') as fout:
    json.dump(index, fout)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1652 entries, 8140 to 20738
Data columns (total 7 columns):
id                       1652 non-null int64
source                   1652 non-null object
title                    1652 non-null object
pilot1_high_agreement    1652 non-null int64
similar_persps           1652 non-null object
more_than_two_tokens     1652 non-null int64
pilot1_have_stance       1652 non-null int64
dtypes: int64(4), object(3)
memory usage: 103.2+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 954 entries, 8140 to 20738
Data columns (total 8 columns):
id                       954 non-null int64
source                   954 non-null object
title                    954 non-null object
pilot1_high_agreement    954 non-null int64
similar_persps           954 non-null object
more_than_two_tokens     954 non-null int64
pilot1_have_stance       954 non-null int64
legit_sentence           954 non-null bool
dtypes: bool(1), int64(4), object(3)
memory usage: 60.6+ 

In [72]:
# Gather the perspective ids written by step 2 and step 3 and select the
# matching rows of the perspective table; these are the rows to be indexed.
perspective_db_path = "../data/database_output/re-step1/webapp_perspective.csv"
google_valid_stance = "../data/pilot17_making_the_dataset/indices/valid_high_agreement_google_persps.json"
potential_equivalent_original = "../data/pilot17_making_the_dataset/indices/perspective/claim_with_more_than_10_persp_perspectives.json"

pdf = pd.read_csv(perspective_db_path).dropna()

# Step 3 output: ids of the google perspectives.
with open(google_valid_stance) as fin:
    gvs_pids = json.load(fin)

# Step 2 output: ids of the original perspectives.
with open(potential_equivalent_original) as fin:
    peo_pids = json.load(fin)

# Union of both id lists, de-duplicated.
to_be_indexed_pids = set(gvs_pids).union(peo_pids)

pdf = pdf.loc[pdf.id.isin(to_be_indexed_pids)]
pdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1488 entries, 4161 to 20738
Data columns (total 7 columns):
id                       1488 non-null int64
source                   1488 non-null object
title                    1488 non-null object
pilot1_high_agreement    1488 non-null int64
similar_persps           1488 non-null object
more_than_two_tokens     1488 non-null int64
pilot1_have_stance       1488 non-null int64
dtypes: int64(4), object(3)
memory usage: 93.0+ KB


In [76]:
# Now index results from step 2 and step 3 in lucene
from elasticsearch import Elasticsearch

es = Elasticsearch(['http://bronte.cs.illinois.edu'],port=8080, timeout=30)

# Name of the index that holds the perspective titles.
index_name = "potentially_equivalent_perspectives"

# Create the index only when it is missing, so that re-running this cell does
# not fail because the index already exists.
# To rebuild from scratch, run `es.indices.delete(index=index_name)` first.
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name)

# Index one document per perspective, keyed by the perspective id so that
# re-indexing the same row overwrites rather than duplicates it.
for idx, row in pdf.iterrows():
    if row.title:
        doc = {
            "id" : row.id,
            "title" : row.title
        }
        es.index(index=index_name, doc_type='text', id=row.id, body=doc)

In [None]:
# For those in the original "potentially equivalent" perspectives, retrieve the results from index created in previous cell
def get_top_potential_equivalent_perspectives(text, num_cands=50):
    """Search the "potentially_equivalent_perspectives" index for ``text``.

    Runs a ``match`` query on the ``title`` field through the module-level
    ``es`` client.

    :param text: query text matched against the indexed titles
    :param num_cands: value passed as ``size`` to the search request
    :return: list of ``(title, id, score)`` tuples, one per returned hit,
        in the order the hits are returned
    """
    def _to_candidate(hit):
        # Pull the stored fields and the relevance score out of one hit.
        source = hit['_source']
        pid, score = source["id"], hit['_score']
        return (source["title"], pid, score)

    query = {"query": {"match": {"title": text}}}
    res = es.search(index="potentially_equivalent_perspectives", doc_type="text", body=query, size=num_cands)
    return [_to_candidate(hit) for hit in res['hits']['hits']]


# Rows of the current perspective frame whose id is in the step 2 id list.
origin_pdf = pdf.loc[pdf.id.isin(peo_pids)]
origin_pdf.info()

# Claim <-> perspective links, restricted to high-agreement (p_i_3 > 0.5)
# support / undermine labels; used to look up which claims a perspective
# is attached to.
re_step1_path = "../data/database_output/re-step1/webapp_restep1results.csv"
step1_df = pd.read_csv(re_step1_path)
step1_df = step1_df[step1_df.label_3.isin(["S", "U"]) & (step1_df.p_i_3 > 0.5)]

data = []

# For every perspective, retrieve search candidates and keep only those that
# are attached to at least one of the same claims.
for _, row in origin_pdf.iterrows():
    pid = row.id

    # Claims this perspective is linked to, then every perspective linked
    # to any of those claims.
    associated_claims = step1_df.loc[step1_df.perspective_id == pid, "claim_id"].unique()
    peer_perspective_pids = set(
        step1_df.loc[step1_df.claim_id.isin(associated_claims), "perspective_id"].unique()
    )

    cands = [
        cand_pid
        for _title, cand_pid, _score in get_top_potential_equivalent_perspectives(row.title)
        if cand_pid in peer_perspective_pids
    ]
    data.append({"perspective_id": pid, "candidates": cands})

out_path = "../data/pilot17_making_the_dataset/persps_of_claims_with_gt10_persps.json"
with open(out_path, 'w') as fout:
    json.dump(data, fout)
    