In [1]:
import pandas as pd

# Goals of this notebook:
#   1. Generate a tfidf representation for every evidence and every perspective.
#   2. Record the top-K most similar evidences for each perspective.

# Step one: read each CSV we need into its own DataFrame (path first, then the load).
pilot1_result = "/home/squirrel/ccg-new/projects/perspective/data/pilot1_persp_verification/persp_iaa_w_source.csv"
pilot1_df = pd.read_csv(pilot1_result)

perspective_data = "/home/squirrel/ccg-new/projects/perspective/data/database_output/webapp_perspective.csv"
p_df = pd.read_csv(perspective_data)

evidence_data = "/home/squirrel/ccg-new/projects/perspective/data/database_output/webapp_evidence.csv"
evi_df = pd.read_csv(evidence_data)

persp_evi_gold_annotation = "/home/squirrel/ccg-new/projects/perspective/data/database_output/webapp_evidencerelation.csv"
p_evi_anno_df = pd.read_csv(persp_evi_gold_annotation)

# Print the column / non-null summary of every table that was just loaded.
for frame in (pilot1_df, evi_df, p_df, p_evi_anno_df):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4743 entries, 0 to 4742
Data columns (total 13 columns):
claim                   4743 non-null int64
perspective             4743 non-null int64
sup                     4743 non-null int64
und                     4743 non-null int64
ns                      4743 non-null int64
suporting (scaled)      4743 non-null float64
undermining (scaled)    4743 non-null float64
not sure (scaled)       4743 non-null float64
P_i                     4743 non-null float64
Total                   4743 non-null int64
id                      4743 non-null int64
source                  4743 non-null object
title                   4743 non-null object
dtypes: float64(4), int64(7), object(2)
memory usage: 481.8+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8122 entries, 0 to 8121
Data columns (total 3 columns):
id         8122 non-null int64
source     8122 non-null object
content    8122 non-null object
dtypes: int64(1), object(2)
memory usage: 190.

In [3]:
# Output the list of ids of "high quality" perspectives we get in pilot1, so that we can add this information in the database
import json

# Distinct perspective ids that appear in the pilot1 table, as a plain Python list so json.dump can serialize it.
high_qual_pid = pilot1_df.perspective.unique().tolist()

# A bare expression is only echoed when it is the LAST statement of a cell;
# this one sits in the middle, so print the count explicitly to actually see it.
print(len(high_qual_pid))

# Persist the id list as a JSON array.
out_path = "/home/squirrel/ccg-new/projects/perspective/data/pilot1_persp_verification/high_qual_persp_id_list.json"
with open(out_path, 'w') as fout:
    json.dump(high_qual_pid, fout)

In [4]:
# Build the tfidf vectorizer from the precomputed NYT idf file.
from similarity.tfidf import Tfidf

# NOTE(review): the .pkl extension suggests a pickle file -- confirm it comes from a trusted source.
nyt_idf_path = "/home/squirrel/ccg-new/data/nyt_idf/nyt_idf.pkl"
tfidf = Tfidf(nyt_idf_path)

In [6]:
# Smoke test: vectorize two near-identical sentences and compare them.
text1, text1_a = "My name is Rick.", "My name is also Rick."

t1, t1_a = tfidf.vectorize(text1), tfidf.vectorize(text1_a)

# Cosine similarity between the two vectors, then the token -> weight mapping of the first one.
print(t1.cos_similiarity(t1_a))
print(t1.dict)

0.9882445694476926
{'My': 0.2521814326295094, 'name': 0.18242214176255825, 'is': 0.027953020782761663, 'Rick': 0.37698177000070504, '.': 0.0014797220355288764}


In [14]:
# Attach a JSON-serialized tfidf vector to every row of the perspective table.
# str() is applied to each title first, so a missing title (NaN) is vectorized as the text "nan".
p_df["tfidf"] = [json.dumps(tfidf.vectorize(str(title)).dict) for title in p_df["title"]]
p_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8135 entries, 0 to 8134
Data columns (total 5 columns):
id                       8135 non-null int64
source                   8135 non-null object
title                    8134 non-null object
pilot1_high_agreement    8135 non-null int64
tfidf                    8135 non-null object
dtypes: int64(2), object(3)
memory usage: 317.9+ KB


In [15]:
# Attach a JSON-serialized tfidf vector to every row of the evidence table.
# Each content value goes through str() before being vectorized.
evi_df["tfidf"] = [json.dumps(tfidf.vectorize(str(content)).dict) for content in evi_df["content"]]

In [16]:
# Write both augmented tables (now carrying the "tfidf" column) back to CSV, without the index column.

# Perspective table
persp_save_path = '/home/squirrel/ccg-new/projects/perspective/data/pilot2_tfidf/perspective_tfidf.csv'
p_df.to_csv(persp_save_path, index=False)

# Evidence table
evi_save_path = '/home/squirrel/ccg-new/projects/perspective/data/pilot2_tfidf/evidence_tfidf.csv'
evi_df.to_csv(evi_save_path, index=False)

In [19]:
from similarity.tfidf import SparseUnigramDoc

# Rebuild one SparseUnigramDoc per evidence from its JSON-encoded "tfidf" column,
# keeping it paired with the evidence id.
evidences_tfidf = [
    (evi_row.id, SparseUnigramDoc(json.loads(evi_row.tfidf)))
    for _, evi_row in evi_df.iterrows()
]

for idx, row in p_df.iterrows():
    print(row.id)
    p_doc = SparseUnigramDoc(json.loads(row.tfidf))

    # (evidence id, cosine similarity to this perspective) for every evidence
    evi_sim = [(evi_id, evi_doc.cos_similiarity(p_doc)) for evi_id, evi_doc in evidences_tfidf]

    # Rank in place, most similar evidence first
    evi_sim.sort(key=lambda pair: pair[1], reverse=True)

    # Show the ten best matches
    print(evi_sim[:10])
    # Stop after the first perspective: this cell only previews the ranking
    break

1
[(1, 0.5397837778532381), (936, 0.4753651741564662), (6426, 0.46372959852768175), (5564, 0.45391315532521476), (7379, 0.44959867056180164), (3422, 0.4432187087277606), (3399, 0.43855461807250334), (7317, 0.4329749585634605), (931, 0.4328383103247568), (949, 0.42592905715582635)]
