In [36]:
import pandas as pd

result_path = "../data/pilot14_step3_pilot/webapp_evidencerelation.csv"

_df = pd.read_csv(result_path)

# Keep the rows whose comment is 'pilot', excluding anything authored by
# 'Sihao' or 'Daniel'. The unfiltered frame stays available as `_df`.
is_pilot = _df.comment == 'pilot'
by_named_author = _df.author.isin(['Sihao', 'Daniel'])
df = _df[is_pilot & ~by_named_author]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2374 entries, 8122 to 11295
Data columns (total 6 columns):
id                2374 non-null int64
author            2374 non-null object
perspective_id    2374 non-null int64
evidence_id       2374 non-null int64
comment           2374 non-null object
anno              2374 non-null object
dtypes: int64(3), object(3)
memory usage: 129.8+ KB


In [37]:
# Aggregate turker results

arr_al = []
_data = []

evidences = df.evidence_id.unique()

# For every (evidence, perspective) pair present in `df`, tally how many
# annotations are "S" and how many are "N". Other annotation values are
# not counted.
for evidence_key in evidences:
    evidence_rows = df[df.evidence_id == evidence_key]

    for perspective_key in evidence_rows.perspective_id.unique():
        pair_rows = evidence_rows[evidence_rows.perspective_id == perspective_key]
        tally = pair_rows.anno.value_counts()

        # A label nobody chose is absent from the tally index, hence the 0 default.
        _data.append([evidence_key, perspective_key, tally.get("S", 0), tally.get("N", 0)])


# Write one CSV line per pair, preceded by a header row.
result_path = "../data/pilot14_step3_pilot/annotation_counts.csv"
with open(result_path, 'w') as fout:
    fout.write("evidence,perspective,sup,nsup\n")
    fout.writelines(",".join(map(str, record)) + '\n' for record in _data)

In [13]:
# Merge annotations from Sihao and Daniel

# Only the two join keys and the label are needed from each author's rows.
keys_and_label = ["perspective_id", "evidence_id", "anno"]

# Rename each author's label column so the two stay distinguishable after
# the merge (the row index is converted to strings by the same rename call).
sihao_df = (
    _df.loc[_df.author == 'Sihao', keys_and_label]
    .rename(index=str, columns={"anno": "sihao_label"})
)
daniel_df = (
    _df.loc[_df.author == 'Daniel', keys_and_label]
    .rename(index=str, columns={"anno": "daniel_label"})
)

# Default (inner) merge: only pairs labelled by both authors are kept.
merged_df = sihao_df.merge(daniel_df, on=['evidence_id', 'perspective_id'])
merged_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 400 entries, 0 to 399
Data columns (total 4 columns):
perspective_id    400 non-null int64
evidence_id       400 non-null int64
sihao_label       400 non-null object
daniel_label      400 non-null object
dtypes: int64(2), object(2)
memory usage: 15.6+ KB


In [16]:
# Add perspective and evidence content for adjudication

# NOTE(review): these are absolute paths on one machine, while the rest of the
# notebook uses paths relative to "../data". Consider moving them under a
# shared, configurable data directory.
perspective = "/home/squirrel/ccg-new/projects/perspective/data/database_output/re-step1/webapp_perspective.csv"
evidence = "/home/squirrel/ccg-new/projects/perspective/data/database_output/re-step1/webapp_evidence.csv"

pdf = pd.read_csv(perspective)
edf = pd.read_csv(evidence)

# Build id -> text lookups once instead of scanning both tables for every row.
# drop_duplicates keeps the first row per id, i.e. the row a "first match"
# lookup would pick.
evidence_content = edf.drop_duplicates(subset="id").set_index("id")["content"]
perspective_title = pdf.drop_duplicates(subset="id").set_index("id")["title"]

# Fail loudly, and before touching merged_df, when an annotated id has no
# matching row in the lookup tables.
unknown_evidence = merged_df.loc[~merged_df["evidence_id"].isin(evidence_content.index), "evidence_id"]
unknown_perspective = merged_df.loc[~merged_df["perspective_id"].isin(perspective_title.index), "perspective_id"]
if len(unknown_evidence) or len(unknown_perspective):
    raise KeyError(
        "ids missing from lookup tables: evidence={}, perspective={}".format(
            sorted(unknown_evidence.unique()), sorted(unknown_perspective.unique())
        )
    )

merged_df["evidence"] = merged_df["evidence_id"].map(evidence_content)
merged_df["perspective"] = merged_df["perspective_id"].map(perspective_title)

merged_df.info()

result_path = "../data/pilot14_step3_pilot/our_annotations.csv"

# The row index is written as the first CSV column (to_csv default).
merged_df.to_csv(result_path)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 400 entries, 0 to 399
Data columns (total 6 columns):
perspective_id    400 non-null int64
evidence_id       400 non-null int64
sihao_label       400 non-null object
daniel_label      400 non-null object
evidence          400 non-null object
perspective       400 non-null object
dtypes: int64(2), object(4)
memory usage: 41.9+ KB


In [46]:
# Join our adjudicated labels onto the per-pair crowd agreement statistics.
#
# The adjudication path used to be assigned under one spelling and read under
# another, which raises NameError on a fresh kernel; a single, correctly
# spelled name is used for both now.
adjudication = "../data/pilot14_step3_pilot/adjudication.csv"
agreement = "../data/pilot14_step3_pilot/agreement_enhanced.csv"

# Only the two join keys and the adjudicated label are needed from this file.
adj_df = pd.read_csv(adjudication)[["perspective_id", "evidence_id", "Adjudicated"]]
adj_df.info()

agr_df = pd.read_csv(agreement)
agr_df.info()

# The key columns are named differently in the two files, so both sides are
# spelled out. pd.merge defaults to an inner join, so only pairs present in
# both files survive.
adj_agr_df = pd.merge(agr_df, adj_df, left_on=['evidence', 'perspective'], right_on=['evidence_id', 'perspective_id'])
adj_agr_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404 entries, 0 to 403
Data columns (total 3 columns):
perspective_id    400 non-null float64
evidence_id       400 non-null float64
Adjudicated       400 non-null object
dtypes: float64(2), object(1)
memory usage: 9.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 5 columns):
evidence       720 non-null int64
perspective    720 non-null int64
sup            720 non-null int64
nsup           720 non-null int64
p_i            720 non-null float64
dtypes: float64(1), int64(4)
memory usage: 28.2 KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 400 entries, 0 to 399
Data columns (total 8 columns):
evidence          400 non-null int64
perspective       400 non-null int64
sup               400 non-null int64
nsup              400 non-null int64
p_i               400 non-null float64
perspective_id    400 non-null float64
evidence_id       400 non-null float64
Adjudicated       400 non-null o

In [53]:
# Measure accuracy of ALL evidence/perspective pairs with respect to our adjudicated annotations

# Majority-vote label per pair: "S" when sup > nsup, "D" on a tie, "N" otherwise.
crowd_label = (
    pd.Series("N", index=adj_agr_df.index)
    .mask(adj_agr_df["sup"] == adj_agr_df["nsup"], "D")
    .mask(adj_agr_df["sup"] > adj_agr_df["nsup"], "S")
)

total = len(adj_agr_df.index)
correct = int((crowd_label == adj_agr_df["Adjudicated"]).sum())

# An empty frame would otherwise raise ZeroDivisionError; report NaN instead.
accuracy = correct / total if total else float("nan")

print('Accuracy (All perspectives) = {}'.format(accuracy))
print('Count = {} out of {}'.format(total, total))

Accuracy (All perspectives) = 0.765
Count = 400 out of 400


In [52]:
# Measure accuracy of ABSOLUTE AGREEMENT (p_i = 1) evidence/perspective pairs with respect to our adjudicated annotations

hq_df = adj_agr_df[adj_agr_df["p_i"] == 1]

# Majority-vote label per pair: "S" when sup > nsup, "D" on a tie, "N" otherwise.
crowd_label = (
    pd.Series("N", index=hq_df.index)
    .mask(hq_df["sup"] == hq_df["nsup"], "D")
    .mask(hq_df["sup"] > hq_df["nsup"], "S")
)

selected = len(hq_df.index)
correct = int((crowd_label == hq_df["Adjudicated"]).sum())

# If no pair passes the filter, report NaN rather than raising ZeroDivisionError.
accuracy = correct / selected if selected else float("nan")

print('Accuracy (p_i = 1 only) = {}'.format(accuracy))
print('Count = {} out of {}'.format(selected, len(adj_agr_df.index)))

Accuracy (p_i = 1 only) = 0.9271523178807947
Count = 151 out of 400


In [56]:
# Measure accuracy of HIGH AGREEMENT (p_i >= 0.5) evidence/perspective pairs with respect to our adjudicated annotations

hq_df = adj_agr_df[adj_agr_df["p_i"] >= 0.5]

# Majority-vote label per pair: "S" when sup > nsup, "D" on a tie, "N" otherwise.
crowd_label = (
    pd.Series("N", index=hq_df.index)
    .mask(hq_df["sup"] == hq_df["nsup"], "D")
    .mask(hq_df["sup"] > hq_df["nsup"], "S")
)

selected = len(hq_df.index)
correct = int((crowd_label == hq_df["Adjudicated"]).sum())

# If no pair passes the filter, report NaN rather than raising ZeroDivisionError.
accuracy = correct / selected if selected else float("nan")

print('Accuracy (p_i >= 0.5) = {}'.format(accuracy))
print('Count = {} out of {}'.format(selected, len(adj_agr_df.index)))

Accuracy (p_i >= 0.5) = 0.8992537313432836
Count = 268 out of 400


In [35]:
# Prepare annotations

# Majority-vote label per row: start everything at "N", then overwrite ties
# with "D" and rows where sup exceeds nsup with "S".
has_tie = agr_df["sup"] == agr_df["nsup"]
sup_wins = agr_df["sup"] > agr_df["nsup"]

agr_df["label"] = "N"
agr_df.loc[has_tie, "label"] = "D"
agr_df.loc[sup_wins, "label"] = "S"

agr_df.info()

# Persist the labelled table without the row index.
out_path = "../data/pilot14_step3_pilot/agreement_w_label.csv"
agr_df.to_csv(out_path, index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 6 columns):
evidence       720 non-null int64
perspective    720 non-null int64
sup            720 non-null int64
nsup           720 non-null int64
p_i            720 non-null float64
label          720 non-null object
dtypes: float64(1), int64(4), object(1)
memory usage: 33.8+ KB
