# Prep ClaimReview Data for KnowledgeStream

### Imports

In [1]:
import os
import pickle
import pandas as pd
import json
import config
from utils.file import directory_check

### Function for Setting Truth Score

In [2]:
def get_truth_score(relations, uid):
    """Return a binary truth label for the claim identified by ``uid``.

    ``relations`` is scanned in order and the first entry whose ``'UID'``
    value contains ``uid`` (an ``in`` containment test, not equality) decides
    the result.

    Returns:
        1 if that entry's ``'verdict_relation'`` equals ``'true'``,
        0 for any other verdict, and ``None`` when no entry matches.
    """
    matched = next((entry for entry in relations if uid in entry['UID']), None)
    if matched is None:
        return None
    return 1 if matched['verdict_relation'] == 'true' else 0

### Read in Relations from original JSON

In [3]:
# Parse the relation records straight from the cr.json file handle.
snippets = config.JSON_DIR + '/cr.json'
with open(snippets, 'r') as json_file:
    relations = json.load(json_file)

### Mapping between project relation labels and DBpedia labels

In [4]:
# Project relation label -> DBpedia ontology predicate, listed as
# (label, predicate) pairs.
db_relation_map = dict((
    ('pob', 'dbo:birthPlace'),
    ('education', 'dbo:education'),
    ('institution', 'dbo:almaMater'),
    ('pod', 'dbo:deathPlace'),
))

### Read in dictionary of IDs for DBpedia nodes from KnowledgeStream

In [5]:
# Both ID lookup files sit directly under the directory named by config.KS_KG.
db_nodes_file = f'{config.KS_KG}/nodes.txt'
db_relations_file = f'{config.KS_KG}/relations.txt'

In [6]:
def _load_id_map(path):
    """Read a space-separated ``<id> <label>`` file into a ``{label: id}`` dict.

    Each non-blank line is split on single spaces. The second field, with its
    newline removed, becomes the key and the first field becomes the value.
    IDs are kept as the strings read from the file (no integer conversion).

    Raises:
        IndexError: if a non-blank line has fewer than two fields.
    """
    id_map = {}
    with open(path, 'r') as f:
        # Stream the handle line by line instead of calling readlines(), so
        # the whole file is never held in memory as a list of lines.
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a stray empty line at the end)
                # instead of failing on the missing second field.
                continue
            fields = line.split(' ')
            id_map[fields[1].replace('\n', '')] = fields[0]
    return id_map


# DBpedia label -> KnowledgeStream ID, for nodes and for relations.
node_dict = _load_id_map(db_nodes_file)
relation_dict = _load_id_map(db_relations_file)

### Read in ClaimReview Claims and Terminal Nodes

In [7]:
# Location of the pickled terminal-node table.
nodes_file = config.SP_NODES

# NOTE: read_pickle can execute arbitrary code while unpickling; only load
# files produced by this project.
df = pd.read_pickle(config.CLAIM_DF)   # ClaimReview claims
nodes_df = pd.read_pickle(nodes_file)  # looked up by UID ('sub'/'obj' columns) in the next cell

### Assign subject and object nodes

In [8]:
def _node_for(uid, column):
    """Return the ``column`` entry of the ``nodes_df`` row labelled ``uid``."""
    return nodes_df.loc[uid, column]


# Look up each claim's subject ('sub') and object ('obj') node by its UID.
df['subject'] = df['UID'].apply(_node_for, args=('sub',))
df['object'] = df['UID'].apply(_node_for, args=('obj',))

### Build Dataframe to be KnowledgeStream Compatible

In [9]:
# Ensure that correct relation was predicted, and both subject and object nodes are mapped to dbpedia
df = df.loc[(df['Successful?'] == 'Yes') & (df['subject'].str.contains('dbpedia')) & (df['object'].str.contains('dbpedia'))]
df.reset_index().drop(columns=['ID'])

# Map node/edge URIs to dbr: strings
df['subject'] = df['subject'].map(lambda x: 'dbr:' + x.split('/')[-1])
df['object'] = df['object'].map(lambda x: 'dbr:' + x.split('/')[-1])
df['predicate'] = df['Relation'].map(lambda x: db_relation_map[x])

# Assign Truth Values
df['true?'] = df['UID'].map(lambda x: get_truth_score(relations,x))

# Find DBpedia Ids
df['sid'] = df['subject'].map(lambda x: node_dict[x])
df['oid'] = df['object'].map(lambda x: node_dict[x])
df['pid'] = df['predicate'].map(lambda x: relation_dict[x])

# Drop extra columns and reorganize
df = df.drop(columns=['Relation', 'UID', 'Subject', 'Object', 'Maj_Vote', 'Short_Path', 'Prediction', 'Successful?'])
df = df.reindex(['sid', 'subject', 'pid', 'predicate', 'oid', 'object', 'true?'],axis = 1)
df

ID,sid,subject,pid,predicate,oid,object,true?
3,752593,dbr:Barack_Obama,65,dbo:birthPlace,5287545,dbr:United_States,1
4,1944127,dbr:Fred_Trump,65,dbo:birthPlace,2060365,dbr:Germany,0
7,4995074,dbr:Tej_Pratap_Yadav,194,dbo:education,1537320,dbr:Doctorate,0
10,3542399,dbr:Michelle_Obama,194,dbo:education,2832574,dbr:Juris_Doctor,0
14,2441220,dbr:Imee_Marcos,27,dbo:almaMater,4196756,dbr:Princeton_University,0
16,4020519,dbr:Patrick_Murphy_(Florida_politician),27,dbo:almaMater,5294835,dbr:University_of_Miami,1
17,5391105,dbr:Vladimir_Putin,27,dbo:almaMater,4067676,dbr:Peoples'_Friendship_University_of_Russia,0
19,2441220,dbr:Imee_Marcos,27,dbo:almaMater,5296119,dbr:University_of_the_Philippines_College_of_Law,0
21,3334254,dbr:Manohar_Parrikar,167,dbo:deathPlace,2109862,dbr:Goa,0
23,3733234,dbr:Nasser_Al-Kharafi,167,dbo:deathPlace,3024768,dbr:Kuwait,0


### Write out results to CSV for KnowledgeStream

In [10]:
# Destination folder and file for the KnowledgeStream-formatted claims.
output_dir = './knowledgestream/datasets/claimreview'
output = f'{output_dir}/claims.csv'

# directory_check is a project helper; presumably it ensures output_dir exists
# before writing (confirm in utils.file).
directory_check(output_dir)

# Write the header row but omit the DataFrame index.
df.to_csv(output, header=True, index=False)