In [7]:
import pandas as pd

# The experiments are finished: load the collected paraphrases and the perspective table
result_table = "../data/pilot16_paraphrase/webapp_perspectiveparaphrase.csv"
persps = "../data/database_output/re-step1/webapp_perspective.csv"

rdf, pdf = (pd.read_csv(csv_path) for csv_path in (result_table, persps))

# Print a summary (columns, dtypes, non-null counts) of each frame, results first
for frame in (rdf, pdf):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2709 entries, 0 to 2708
Data columns (total 5 columns):
id                2709 non-null int64
perspective_id    2709 non-null int64
user_generated    2709 non-null object
session_ids       2709 non-null object
hints             2709 non-null object
dtypes: int64(2), object(3)
memory usage: 105.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20798 entries, 0 to 20797
Data columns (total 7 columns):
id                       20798 non-null int64
source                   20798 non-null object
title                    20797 non-null object
pilot1_high_agreement    20798 non-null int64
similar_persps           20798 non-null object
more_than_two_tokens     20798 non-null int64
pilot1_have_stance       20798 non-null int64
dtypes: int64(4), object(3)
memory usage: 1.1+ MB


In [9]:
# Pair every user-written paraphrase with the title of its gold perspective
# so the two can be inspected side by side.
has_paraphrase = rdf.user_generated != "[]"  # "[]" marks rows without any paraphrase
rdf = rdf.loc[has_paraphrase]

merged = rdf[["perspective_id", "user_generated"]].merge(
    pdf[["id", "title"]],
    left_on="perspective_id",
    right_on="id",
)
merged.info()

# Export the merged view for manual inspection
result_table = "../data/pilot16_paraphrase/result.csv"
merged.to_csv(result_table, index=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2604 entries, 0 to 2603
Data columns (total 4 columns):
perspective_id    2604 non-null int64
user_generated    2604 non-null object
id                2604 non-null int64
title             2604 non-null object
dtypes: int64(2), object(2)
memory usage: 101.7+ KB


In [19]:
# Now we have asked AMT workers to verify the results, let's see what they think!
import pandas as pd
import json

equivalence_table = "../data/pilot16_paraphrase/webapp_equivalenceannotation.csv"
persp_table = "../data/pilot16_paraphrase/webapp_perspective.csv"

# A bug in the annotation interface produced many duplicate annotations,
# so identical rows are collapsed right after selecting the relevant columns.
edf = (
    pd.read_csv(equivalence_table)
    .loc[:, ["session_id", "author", "perspective_id", "user_choice"]]
    .drop_duplicates()
)
pdf = pd.read_csv(persp_table)

# Running vote tally.
#   key:   (orig_persp_id, para_persp_id)
#   value: (yes_count, no_count)
annotation_count = {}


def update_annotation_count(gold_pid, para_pid, yes_count=0, no_count=0):
    """Add votes to the tally of one (gold, paraphrase) perspective pair.

    The module-level ``annotation_count`` dict is updated in place; a pair
    seen for the first time starts from the votes passed in.

    Args:
        gold_pid: id of the original perspective.
        para_pid: id of the candidate paraphrase perspective.
        yes_count: number of "is a paraphrase" votes to add.
        no_count: number of "is not a paraphrase" votes to add.
    """
    pair = (gold_pid, para_pid)
    if pair in annotation_count:
        prev_yes, prev_no = annotation_count[pair]
        yes_count = prev_yes + yes_count
        no_count = prev_no + no_count
    annotation_count[pair] = (yes_count, no_count)
    

# Build the perspective-id -> similar_persps lookup once, instead of scanning all
# of `pdf` with a boolean mask for every annotation row. Keeping only the first
# row per id matches the earlier "take the first match" behaviour.
similar_by_pid = pdf.drop_duplicates(subset="id").set_index("id")["similar_persps"]

for row in edf.itertuples(index=False):
    pid = row.perspective_id

    # Ids the annotator selected as paraphrases of `pid`
    user_choice = [int(x) for x in json.loads(row.user_choice)]
    # Every candidate id stored for `pid` (raises KeyError if `pid` is not in `pdf`).
    # Cast to int exactly like `user_choice`, so the membership test below and the
    # keys of `annotation_count` stay consistent even if the JSON holds the ids as strings.
    alles = [int(x) for x in json.loads(similar_by_pid.loc[pid])]

    for correct_para_pid in user_choice:
        update_annotation_count(pid, correct_para_pid, yes_count=1)

    # Candidates that were not selected count as "no" votes
    chosen = set(user_choice)
    for wrong_para_pid in alles:
        if wrong_para_pid not in chosen:
            update_annotation_count(pid, wrong_para_pid, no_count=1)

# Persist the aggregated paraphrase votes, one (perspective, paraphrase) pair per line
out_path = "../data/pilot16_paraphrase/paraphrase_counts.csv"

with open(out_path, 'w') as fout:
    fout.write('perspective,paraphrase_perspective,vote_yes,vote_no\n')
    for (gold_pid, para_pid), (vote_yes, vote_no) in annotation_count.items():
        fout.write('{},{},{},{}\n'.format(gold_pid, para_pid, vote_yes, vote_no))
    

In [25]:
# Annotate the aggregated votes with the titles of both perspectives and of the claim
claim_table = "../data/pilot16_paraphrase/webapp_claim.csv"
re_step1 = "../data/pilot16_paraphrase/webapp_restep1results.csv"

cdf = pd.read_csv(claim_table)
rdf = pd.read_csv(re_step1)
# Keep only step-1 results labelled "S" or "U" whose p_i_3 exceeds 0.5
rdf = rdf[(rdf.label_3.isin(["S", "U"])) & (rdf.p_i_3 > 0.5)]

result_df = pd.read_csv(out_path)


def _first_value_by_key(frame, key, value):
    """Return a Series mapping each distinct `key` to the `value` of its first row in `frame`."""
    return frame.drop_duplicates(subset=key).set_index(key)[value]


def _lookup(ids, table, what):
    """Translate the Series `ids` through the lookup Series `table`.

    Raises:
        KeyError: if some id has no entry in `table`. The ids are listed in the
            message, so a gap in the data fails loudly rather than silently
            producing NaN.
    """
    unknown = ids[~ids.isin(table.index)].unique()
    if len(unknown) > 0:
        raise KeyError("No {} found for id(s): {}".format(what, list(unknown)))
    return ids.map(table)


# Build each lookup once; the first matching row wins, as with a "take the first hit" scan.
title_by_persp = _first_value_by_key(pdf, "id", "title")
claim_by_persp = _first_value_by_key(rdf, "perspective_id", "claim_id")
title_by_claim = _first_value_by_key(cdf, "id", "title")

claim_ids = _lookup(result_df["perspective"], claim_by_persp, "claim")

# Columns are assigned in this order so the written CSV keeps its layout:
# ..., p_title, para_title, claim_title, claim_id
result_df["p_title"] = _lookup(result_df["perspective"], title_by_persp, "perspective title")
result_df["para_title"] = _lookup(result_df["paraphrase_perspective"], title_by_persp, "perspective title")
result_df["claim_title"] = _lookup(claim_ids, title_by_claim, "claim title")
result_df["claim_id"] = claim_ids

# Overwrite the counts file with the annotated version
result_df.to_csv(out_path, index=False)