In [1]:
import pandas as pd
import json

# Goals of this notebook:
#   1. build a tf-idf representation for every evidence and every perspective
#   2. record the top-K most similar evidences for each perspective

# Locations of the CSV inputs.
pilot1_result = "/home/squirrel/ccg-new/projects/perspective/data/pilot1_persp_verification/persp_iaa_w_source.csv"
evidence_data = "/home/squirrel/ccg-new/projects/perspective/data/database_output/webapp_evidence.csv"
perspective_data = "/home/squirrel/ccg-new/projects/perspective/data/database_output/webapp_perspective.csv"
persp_evi_gold_annotation = "/home/squirrel/ccg-new/projects/perspective/data/database_output/webapp_evidencerelation.csv"

# Read the four tables: pilot1 results, perspectives, evidences and the gold
# perspective/evidence annotations.
pilot1_df, p_df, evi_df, p_evi_anno_df = (
    pd.read_csv(csv_path)
    for csv_path in (pilot1_result, perspective_data, evidence_data, persp_evi_gold_annotation)
)

# Print the column/dtype summary of each table.
for loaded_frame in (pilot1_df, evi_df, p_df, p_evi_anno_df):
    loaded_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4743 entries, 0 to 4742
Data columns (total 13 columns):
claim                   4743 non-null int64
perspective             4743 non-null int64
sup                     4743 non-null int64
und                     4743 non-null int64
ns                      4743 non-null int64
suporting (scaled)      4743 non-null float64
undermining (scaled)    4743 non-null float64
not sure (scaled)       4743 non-null float64
P_i                     4743 non-null float64
Total                   4743 non-null int64
id                      4743 non-null int64
source                  4743 non-null object
title                   4743 non-null object
dtypes: float64(4), int64(7), object(2)
memory usage: 481.8+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8122 entries, 0 to 8121
Data columns (total 3 columns):
id         8122 non-null int64
source     8122 non-null object
content    8122 non-null object
dtypes: int64(1), object(2)
memory usage: 190.

In [3]:
# Export the ids of the "high quality" perspectives found in pilot1, so that
# this information can be added to the database.

# Distinct perspective ids, converted to a plain list so json can serialise it.
high_qual_pid = pilot1_df.perspective.unique().tolist()

out_path = "/home/squirrel/ccg-new/projects/perspective/data/pilot1_persp_verification/high_qual_persp_id_list.json"
with open(out_path, 'w') as fout:
    json.dump(high_qual_pid, fout)

# Kept as the cell's last expression so the notebook displays the count;
# a bare expression in the middle of a cell is evaluated and thrown away.
len(high_qual_pid)

In [2]:
# Set up the tf-idf vectorizer used by the rest of the notebook.
from similarity.tfidf import Tfidf

# Path handed to Tfidf (presumably a pickled idf table -- confirm in similarity.tfidf).
nyt_idf_path = "/home/squirrel/ccg-new/data/nyt_idf/nyt_idf.pkl"
tfidf = Tfidf(nyt_idf_path)

In [3]:
# Smoke test for the vectorizer: compare two sentences that differ by one word.
text1 = "My name is Rick."
text1_a = "My name is also Rick."

t1, t1_a = tfidf.vectorize(text1), tfidf.vectorize(text1_a)

# Cosine similarity of the pair (the method name is spelled `cos_similiarity`
# in the project API), followed by the term -> weight mapping of the first text.
rick_similarity = t1.cos_similiarity(t1_a)
print(rick_similarity)
print(t1.dict)

0.9862324934756503
{'my': 0.1450864797467333, 'name': 0.18125692009584993, 'is': 0.027692536533637697, 'rick': 0.372001689166361, '.': 0.0014797220355288764}


In [4]:
# Attach a JSON-serialised tf-idf vector to every row of the perspective table.
def _title_to_tfidf_json(title):
    """Return the tf-idf term weights of ``title`` as a JSON string."""
    # str() also turns a missing title (NaN) into the literal text "nan",
    # which is then vectorized like any other title.
    return json.dumps(tfidf.vectorize(str(title)).dict)


p_df["tfidf"] = p_df["title"].apply(_title_to_tfidf_json)
p_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8135 entries, 0 to 8134
Data columns (total 5 columns):
id                       8135 non-null int64
source                   8135 non-null object
title                    8134 non-null object
pilot1_high_agreement    8135 non-null int64
tfidf                    8135 non-null object
dtypes: int64(2), object(3)
memory usage: 317.9+ KB


In [5]:
# Attach a JSON-serialised tf-idf vector to every row of the evidence table.
# Each content value is coerced to str before vectorizing; the resulting
# term -> weight dict is stored as a JSON string, one per row, in row order.
evi_df["tfidf"] = [
    json.dumps(tfidf.vectorize(str(content)).dict)
    for content in evi_df["content"]
]

In [16]:
# Persist both tables (now carrying the "tfidf" column) as CSV files.
persp_save_path = '/home/squirrel/ccg-new/projects/perspective/data/pilot2_tfidf/perspective_tfidf.csv'
evi_save_path = '/home/squirrel/ccg-new/projects/perspective/data/pilot2_tfidf/evidence_tfidf.csv'

# index=False: the positional RangeIndex is not written out as a column.
for table_to_save, destination in ((p_df, persp_save_path), (evi_df, evi_save_path)):
    table_to_save.to_csv(destination, index=False)

In [None]:
from similarity.tfidf import SparseUnigramDoc
import csv
import sys

# The evidence_candidates column holds one very long JSON string per row, so
# lift the csv module's per-field size limit.
csv.field_size_limit(sys.maxsize)

sim_out_path = "/home/squirrel/ccg-new/projects/perspective/data/pilot2_tfidf/persp_tfidf_sim.csv"

# For each perspective, rank every evidence from the most similar to the least.
# First rebuild the sparse tf-idf document of every evidence once, up front.
# itertuples avoids building a Series per row the way iterrows does.
evidences_tfidf = [
    (evi.id, SparseUnigramDoc(json.loads(evi.tfidf)))
    for evi in evi_df.itertuples(index=False)
]

# newline='' is required by the csv module for files it writes; without it the
# '\r\n' row terminator is translated again on platforms that rewrite '\n'.
with open(sim_out_path, 'w', newline='') as fout:
    writer = csv.DictWriter(fout, fieldnames=['perspective', 'evidence_candidates'])
    writer.writeheader()
    for persp in p_df.itertuples(index=False):

        # Progress indicator: one perspective id per processed row.
        print(persp.id, end=' ')
        p_doc = SparseUnigramDoc(json.loads(persp.tfidf))

        # (evidence id, cosine similarity to this perspective) for every evidence.
        evi_sim = [(evi_id, doc.cos_similiarity(p_doc)) for evi_id, doc in evidences_tfidf]

        # Most similar first; the sort is stable, so ties keep evidence-table order.
        evi_sim.sort(key=lambda t: t[1], reverse=True)

        writer.writerow({'perspective': persp.id, 'evidence_candidates': json.dumps(evi_sim)})

In [46]:
# Verify the tf-idf ranking against the gold annotations from the website:
# top-1, top-5 and top-10 accuracy.

# Evaluation knobs. The defaults keep what this cell did before they were named:
# stop after 10 evaluated perspectives and print every top-10 miss.
max_evaluated = 10    # set to None to evaluate every eligible perspective
show_misses = True    # set to False to silence the per-miss diagnostics

top1_correct = 0
top5_correct = 0
top10_correct = 0
total_count = 0

# Lookups built once instead of filtering whole DataFrames for every row.
# keep='first' reproduces the earlier `.iloc[0]` choice: only the first gold
# evidence listed for a perspective is scored.
# NOTE(review): a perspective with several gold evidences is judged on the
# first one only -- confirm that this is the intended metric.
gold_evidence = (
    p_evi_anno_df.drop_duplicates('perspective_id', keep='first')
    .set_index('perspective_id')['evidence_id']
)
high_agreement = (
    p_df.drop_duplicates('id', keep='first')
    .set_index('id')['pilot1_high_agreement']
)

# newline='' is how the csv module expects its input files to be opened.
with open(sim_out_path, 'r', newline='') as fin:
    reader = csv.DictReader(fin)
    for row in reader:
        persp_id = int(row['perspective'])
        have_evidence = persp_id in gold_evidence.index
        is_high_agreement = high_agreement.loc[persp_id]

        # Only perspectives with a gold evidence and high pilot1 agreement count.
        if not (have_evidence and is_high_agreement):
            continue

        evi_id = gold_evidence.loc[persp_id]

        # Candidate evidence ids in rank order (most similar first).
        cands = json.loads(row['evidence_candidates'])
        ranked_ids = [tpl[0] for tpl in cands]
        top5 = ranked_ids[:5]
        top10 = ranked_ids[:10]

        # Guarded so an empty candidate list counts as a miss instead of raising.
        if ranked_ids and evi_id == ranked_ids[0]:
            top1_correct += 1

        if evi_id in top5:
            top5_correct += 1

        if evi_id in top10:
            top10_correct += 1

        total_count += 1

        if show_misses and evi_id not in top10:
            print("Persp id: {}".format(persp_id))
            print("Persp Title: {}".format(p_df.loc[p_df.id == persp_id].iloc[0].title))
            # Iterate the ranked list (not a set) so evidences print in rank order.
            evidence_text = [(eid, evi_df.loc[evi_df.id == eid].iloc[0].content) for eid in top5]
            print("Top 5 evidences: {}".format(evidence_text))

        if max_evaluated is not None and total_count >= max_evaluated:
            break

Persp id: 5
Persp Title: Allowing children to perform pushes them to grow up too soon
Top 5 evidences: [(5289, "A child who is constantly expected to practise and perform well isn't being given chance to be a child - to play, be light-hearted and innocent, do things just for fun and think in imaginative ways rather than learning practical skills all the time. They will grow up an overly serious adult."), (5292, "Just because a mother gives birth to a child, and the parents financially support the child does not mean that they would try to control all aspects of the child's life. It should be remembered that while it the responsibility of the parents to care for their children, it does not mean that they treat their children like subordinates who must do as the boss (in this case the parents) say. However, it is true that children are in turn responsible towards their parents and should be grateful to their parents for all they have done for the sake of the child, it does not mean that 

In [42]:
# Report how many perspectives were evaluated and the top-k accuracies.
print('Total Count: {}'.format(total_count))
if total_count:
    for k, n_correct in ((1, top1_correct), (5, top5_correct), (10, top10_correct)):
        print('Top {} accuracy: {}'.format(k, n_correct / total_count))
else:
    # Nothing was evaluated, so the ratios are undefined; avoid ZeroDivisionError.
    print('No perspectives were evaluated; accuracy is undefined.')

Total Count: 4736
Top 1 accuracy: 0.19552364864864866
Top 5 accuracy: 0.43517736486486486
Top 10 accuracy: 0.5399070945945946
