In [22]:
from rdflib import Graph, Literal, URIRef, XSD
from rdflib.namespace import RDF, FOAF, SDO, Namespace
import json
import io
from tqdm import tqdm, trange
import re
import requests
import hashlib

In [25]:
# schema.org vocabulary (http scheme) used for every class and property added to the graph.
# NOTE(review): rdflib's bundled `SDO` namespace is imported above but not used in the
# visible cells; `SO` is the namespace actually used — confirm this is intentional.
SO = Namespace("http://schema.org/")
# Base IRIs prepended to the "Wikidata:..." / "DBpedia:..." type tags of annotated entities.
# NOTE(review): Wikidata's concept IRIs normally live under http://www.wikidata.org/entity/ —
# confirm that the /wiki/ form is what consumers of this graph expect.
WIKI_prefix = "http://www.wikidata.org/wiki/"
DB_prefix = "http://dbpedia.org/ontology/"

# Base IRI under which all locally minted resources (reviews, claims, organizations,
# ratings, entities) are created.
prefix = "http://data.cimple.eu/"

# The single graph that the following cells populate and finally serialize.
g = Graph()

# Load data

In [6]:
# Root folder holding the ClaimReview dump and every derived file of this notebook.
directory = '/data/peskine/ClaimReviews2023/'

# Read the claim reviews inside a context manager so the file handle is closed
# deterministically (json.load(io.open(...)) leaves it open), and decode explicitly as
# UTF-8 instead of relying on the platform's default encoding.
with open(directory+'2023_05_02/claim_reviews.json', encoding='utf-8') as claim_reviews_file:
    claim_reviews = json.load(claim_reviews_file)

# Preview only the first records: displaying the whole list floods the notebook output.
claim_reviews[:2]

[{'claim_text': ['Gun attack at Amsterdam mall'],
  'label': 'not_credible',
  'review_url': 'https://factcheck.afp.com/video-shows-drill-staged-anti-terror-police-nuremberg-germany',
  'fact_checker': {'name': 'AFP fact checking',
   'country': 'France',
   'language': 'French',
   'website': 'https://www.afp.com/en',
   'ifcn_url': 'https://ifcncodeofprinciples.poynter.org/profile/afp-fact-checking',
   'avatar': 'https://ifcncodeofprinciples.poynter.org/storage/logos/afp-fact-checking_logo.png?v=1612201098',
   'domain': 'afp.com'},
  'appearances': ['https://www.facebook.com/pakistanisonline1/videos/537849886776338?v=537849886776338'],
  'reviews': [{'label': 'not_credible',
    'original_label': 'False',
    'review_rating': {'@type': 'Rating',
     'ratingValue': '1',
     'bestRating': '5',
     'worstRating': '1',
     'alternateName': 'False'},
    'retrieved_by': None,
    'date_published': '2019-11-20'}]},
 {'claim_text': ['Elazığ Tabip Odası’nın depremde hayatını kaybedenle

# Extract Entities

In [14]:
def normalize_text(text):
    """Clean a claim text before it is annotated and stored in the graph.

    Steps, in order:
      1. decode the HTML entity ``&amp;`` to ``&``;
      2. turn non-breaking spaces into regular spaces;
      3. drop every token starting with ``http`` (URLs);
      4. collapse runs of whitespace and strip both ends.

    Args:
        text: Raw claim text (str).

    Returns:
        The normalized text; an empty string when nothing is left.
    """
    text = text.replace('&amp;', '&')
    # A non-breaking space separates words. Deleting it glued the neighbouring words
    # together ("foo\xa0bar" -> "foobar"), and could merge a URL with the word that
    # follows it so that the URL regex removed both. Map it to a plain space instead;
    # the final split/join collapses any duplicates.
    text = text.replace('\xa0', ' ')
    text = re.sub(r'http\S+', '', text)
    text = " ".join(text.split())
    return text
    

In [19]:
# First claim text of every review, in the same order as `claim_reviews`.
TEXT = [cr['claim_text'][0] for cr in claim_reviews]
API_URL = "https://api.dbpedia-spotlight.org/en/annotate"

# One annotation result per claim, index-aligned with `claim_reviews`.
# Claims that are empty after normalization get an empty list as placeholder.
entities_dbpedia = []
# A single Session reuses the HTTP connection across the whole loop instead of
# opening a new one for every claim.
with requests.Session() as session:
    session.headers.update({'accept': 'application/json'})
    for s in tqdm(TEXT):
        s = normalize_text(s)
        # normalize_text() strips and collapses whitespace, so "blank" can only mean ''.
        if s:
            # Without a timeout a stalled connection would block the loop forever.
            response = session.post(API_URL, data={'text': s}, timeout=(10, 60))
            # Fail with an explicit HTTP error rather than an obscure JSON decoding
            # error when the service answers with an error page (e.g. rate limiting).
            response.raise_for_status()
            entities_dbpedia.append(response.json())
        else:
            entities_dbpedia.append([])

  0%|                                         | 39/140411 [00:05<5:38:04,  6.92it/s]


KeyboardInterrupt: 

In [37]:
# Save the annotation results next to the input data so they can be reloaded later
# without querying the annotation service again.
with open(directory + 'entities.json', mode='w') as entities_file:
    json.dump(entities_dbpedia, entities_file)

# Populate graph

In [20]:
def uri_generator(identifier):
    """Return the SHA-224 hex digest of ``identifier``.

    The digest is used as the local part of the URIs minted for graph resources,
    so equal identifiers always map to the same URI.

    Args:
        identifier: String to hash (encoded with str.encode's default encoding).

    Returns:
        The hexadecimal digest as a str.
    """
    digest = hashlib.sha224()
    digest.update(str.encode(identifier))
    return str(digest.hexdigest())

In [34]:
# Reload the saved annotation results (one entry per claim review).
with open(directory + 'entities.json', mode='r') as entities_file:
    entities_dbpedia = json.load(entities_file)

In [35]:
# Turn every claim review into triples: the ClaimReview itself, its fact-checking
# organization, the reviewed Claim, and the entities annotated in the claim text.
for idx in trange(0, len(claim_reviews)):
    review = claim_reviews[idx]

    # ClaimReview node: its URI is derived from the review's position in the list.
    review_node = URIRef(prefix + 'claim_reviews/' + uri_generator('claim_reviews' + str(idx)))
    g.add((review_node, RDF.type, SO.ClaimReview))

    # Fact-checking organization: its URI is derived from the organization name, so
    # reviews by the same fact checker share one node.
    org_name = review['fact_checker']['name']
    org_website = review['fact_checker']['website']
    org_node = URIRef(prefix + 'organization/' + uri_generator('organization' + str(org_name)))

    g.add((org_node, RDF.type, SO.Organization))
    g.add((org_node, SO.name, Literal(org_name)))
    g.add((org_node, SO.url, URIRef(org_website)))
    g.add((review_node, SO.author, org_node))

    # Only the first entry of 'reviews' contributes the publication date and the label.
    published = review['reviews'][0]['date_published']
    g.add((review_node, SO.datePublished, Literal(published, datatype=XSD.date)))

    # Spaces are removed from the review URL before it is used as a URIRef.
    review_url = review['review_url'].replace(' ', '')
    g.add((review_node, SO.url, URIRef(review_url)))

    g.add((review_node, SO.inLanguage, Literal(review['fact_checker']['language'])))

    # Rating nodes are addressed directly by the label string (no hashing).
    rating_node = URIRef(prefix + 'rating/' + review['reviews'][0]['label'])
    g.add((review_node, SO.reviewRating, rating_node))

    # Claim node: one per review, again keyed by the position in the list.
    raw_claim = review['claim_text'][0]
    claim_node = URIRef(prefix + 'claims/' + uri_generator('claims' + str(idx)))

    g.add((claim_node, RDF.type, SO.Claim))
    g.add((review_node, SO.itemReviewed, claim_node))
    g.add((claim_node, SO.text, Literal(normalize_text(raw_claim))))

    # Entries without a 'Resources' key (including the empty-list placeholders)
    # contribute no entity triples.
    annotation = entities_dbpedia[idx]
    if 'Resources' not in annotation:
        continue

    for resource in annotation['Resources']:
        resource_uri = resource['@URI']
        # Presumably drops the 28-character "http://dbpedia.org/resource/" prefix to
        # get a readable name — confirm that every '@URI' starts with it.
        resource_label = resource['@URI'][28:].replace('_', ' ')
        type_tags = resource['@types'].split(',')

        mention_node = URIRef(prefix + 'entity/' + uri_generator('entity' + str(resource_uri)))

        g.add((mention_node, RDF.type, SO.Thing))
        for tag in type_tags:
            # Only tags mentioning Wikidata or DBpedia become rdf:type triples; the
            # part after the first ':' is appended to the matching base IRI.
            for marker, type_base in (("Wikidata", WIKI_prefix), ("DBpedia", DB_prefix)):
                if marker in tag:
                    g.add((mention_node, RDF.type, URIRef(type_base + tag.split(':')[1])))

        g.add((mention_node, SO.url, URIRef(resource_uri)))
        g.add((mention_node, SO.name, Literal(resource_label)))
        g.add((claim_node, SO.mentions, mention_node))
    

100%|█████████████████████████████████████| 140411/140411 [01:39<00:00, 1409.55it/s]


In [28]:
# Collect each fact-checking organization once, in order of first appearance, keeping
# the website seen with that first appearance. The two lists are index-aligned.
website_by_org_name = {}
for review in claim_reviews:
    checker_name = review['fact_checker']['name']
    checker_website = review['fact_checker']['website']
    # setdefault keeps the first website recorded for a name and ignores later ones.
    website_by_org_name.setdefault(checker_name, checker_website)

all_organizations_names = list(website_by_org_name.keys())
all_organizations_websites = list(website_by_org_name.values())

In [29]:
# Read the label mapping inside a context manager so the file handle is closed
# (json.load(io.open(...)) leaves it open), decoding explicitly as UTF-8.
with open(directory+'2023_05_02/claim_labels_mapping.json', encoding='utf-8') as labels_file:
    labels_mapping = json.load(labels_file)

# For every mapping entry: create the original-rating node, the normalized rating node,
# link the former to the latter, and attach the organizations that use the original label.
for label in tqdm(labels_mapping):
    original_label = label['original_label']
    # Original ratings are keyed by the hash of the label text.
    original_rating_node = URIRef(
        prefix + 'original_rating/' + uri_generator('original_rating' + original_label)
    )

    g.add((original_rating_node, RDF.type, SO.Rating))
    g.add((original_rating_node, SO.ratingValue, Literal(original_label)))
    g.add((original_rating_node, SO.name, Literal(original_label.replace('_', ' '))))

    coinform_label = label['coinform_label']
    # Normalized ratings are addressed directly by the label string (no hashing).
    rating_node = URIRef(prefix + 'rating/' + coinform_label)

    g.add((rating_node, RDF.type, SO.Rating))
    g.add((rating_node, SO.ratingValue, Literal(coinform_label)))
    g.add((rating_node, SO.name, Literal(coinform_label.replace('_', ' '))))

    g.add((original_rating_node, SO.sameAs, rating_node))

    for d in label['domains'].split(','):
        d = d.strip()
        if not d:
            # An empty domain is a substring of every website and would attribute the
            # rating to whichever organization happens to be last in the list.
            continue

        # Same rule as before: a website matches when it contains the domain, and the
        # last matching website wins.
        matching_websites = [w for w in all_organizations_websites if d in w]
        if not matching_websites:
            # Previously surfaced as an opaque "'' is not in list" ValueError.
            raise ValueError(
                "No organization website contains domain %r (original label %r)"
                % (d, original_label)
            )
        corresponding_org_website = matching_websites[-1]
        corresponding_org_name = all_organizations_names[
            all_organizations_websites.index(corresponding_org_website)
        ]

        # Must mint the same URI as the cell that created the organization nodes.
        author_node = URIRef(
            prefix + 'organization/' + uri_generator('organization' + str(corresponding_org_name))
        )
        g.add((original_rating_node, SO.author, author_node))
        

100%|███████████████████████████████████████| 15810/15810 [00:04<00:00, 3407.77it/s]


In [36]:
# Sanity check: number of triples currently held in the graph.
len(g)

1744901

# Serialize

In [None]:
# Write the graph to disk. The format is stated explicitly so that the content always
# matches the .ttl extension: the default serialization format depends on the rdflib
# version (older releases default to RDF/XML).
g.serialize(destination=directory+"/claimreview-kg.ttl", format="turtle")