# Generate the event and article knowledge graph

In [3]:
import rdflib
from rdflib import Graph, Literal, RDF, URIRef, Namespace
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
import ast
from resources import *
import re

rnews = Namespace("http://iptc.org/std/rNews/2011-10-07#")
nif = Namespace("http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#")
faro = Namespace("https://purl.org/faro/")
sem = Namespace("http://semanticweb.cs.vu.nl/2009/11/sem/")
owl = Namespace("http://www.w3.org/2002/07/owl#")
rnews = Namespace("http://iptc.org/std/rNews/2011-10-07#")
schema = Namespace("http://schema.org/")

faro_classes = {'cause': faro.causes, 'enable': faro.enables, 'intend': faro.intends_to_cause, 'prevent': faro.prevents} #dict of faro definitions
sem_props = {'http://www.wikidata.org/prop/direct/P710': sem.hasActor,
             'http://www.wikidata.org/prop/direct/P664': sem.hasActor,
             'http://www.wikidata.org/prop/direct/P112': sem.hasActor,
             'http://www.wikidata.org/prop/direct/P17': sem.hasPlace,
             'http://www.wikidata.org/prop/direct/P276': sem.hasPlace,
             'http://www.wikidata.org/prop/direct/P625': sem.hasPlace,
             'http://www.wikidata.org/prop/direct/P131': sem.hasPlace,
             'http://www.wikidata.org/prop/direct/P30': sem.hasPlace,
             'http://www.wikidata.org/prop/direct/P585': sem.hasTime,
             'http://www.wikidata.org/prop/direct/P580': sem.hasBeginTimeStamp,
             'http://www.wikidata.org/prop/direct/P582': sem.hasEndTimeStamp,
             'http://www.wikidata.org/prop/direct/P571': sem.hasTime,
             'http://www.wikidata.org/prop/direct/P576': sem.hasTime,
             'http://www.wikidata.org/prop/direct/P577': sem.hasTimeStamp,
             'http://www.w3.org/2000/01/rdf-schema#label': 'what'}

sem_classes = {sem.hasActor: sem.Actor,
               sem.hasPlace: sem.Place,
               sem.hasTime: sem.Time,
               sem.hasBeginTimeStamp: sem.Time,
               sem.hasEndTimeStamp: sem.Time,
               sem.hasTimeStamp: sem.Time,
               'what': sem.Event}

In [4]:
def clean_text(text):
    '''
    This is used to remove the html codes from the text
    :param text: The text to clean
    :return: Cleaned text
    '''

    # Strip the last part of the text
    index_of_last_occurence = text.rfind('</p><p>')
    if index_of_last_occurence != -1:
        text = text[:index_of_last_occurence]

    text = re.sub(r"<.*?>", " ", text) # Strip all the special characters in the text

    text = text.strip() #Remove the whitespace at the beginning, due to deletion

    return text

In [5]:
data = pd.read_csv('Data/ASRAEL_data_full.csv')
data['Text'] = data['Text'].apply(clean_text)

In [6]:
#This converts the data into the right format, by removing uneccessary tokens in text and converting the wikidata link to text
graph = Graph()

event_mapping = {} #here the wikidata event urls and their names are saved
failed_events= [] #Events that can't be found e.g. owl:sameAs need to be removed

sparql = SPARQLWrapper(
    "https://query.wikidata.org/sparql"
)
sparql.setReturnFormat(JSON)

for event in data['Event'].unique().tolist():
    event_ = f"wd:{event.split('/')[-1]}"

    sparql.setQuery("""
    SELECT (?p as ?wiki_prop) (?o as ?result)
    WHERE {{{

        %s ?p ?temp.
      ?temp rdfs:label ?o.
      FILTER (lang(?o) = 'en') }
      }

      UNION

      {
       SELECT *
       WHERE{
        %s ?p ?o.
         FILTER(lang(?o) = 'en' || lang(?o)='') }}
     } """ % (event_, event_))

    try:
        result = sparql.queryAndConvert()
        #event_name = ret['results']['bindings'][0]['item']['value']

    except:
        print(f"Something went wrong when converting event: {event}")

    try:
        event_data = pd.json_normalize(result["results"]["bindings"])[['wiki_prop.value', 'result.value', 'result.datatype']]
        event_data = event_data.rename(columns={"wiki_prop.value": "property", "result.value": "value", "result.datatype": "datatype"})
        event_data = event_data.loc[event_data['property'].isin(sem_props.keys())].reset_index(drop=True) #Only keep the 4W attributes
        event_data['property'] = event_data['property'].replace(sem_props)
        event_name = event_data.loc[event_data['property'] == 'what']['value'].values[0] #This needs to be saved to map the wikidata urls to events
        event_data = event_data[event_data.property != 'what'] #This row needs to be deleted for the loop
        event_mapping[event] = event_name
    except:
        print(f"Error when searching for event: {event}")
        failed_events.append(event)
        continue

    #event_uri = node_creation('', event_name, base_add='/event') #Generate the URI for the event
    graph.add((URIRef(event), RDF.type, sem.Event)) #Create the event
    graph.add((URIRef(event), RDF.value, Literal(event_name)))

    for index, row in event_data.iterrows():
        uri = node_creation('', row['value'], base_add='') #Generate the URI for the property
        graph.add((uri, RDF.type, sem_classes[row['property']])) #Create the node for the property, and lookup its class
        if pd.isna(row['datatype']) == False: #It has a declared datatype
            graph.add((uri, RDF.value, Literal(row['value'], datatype=row['datatype'])))
        else:
            graph.add((uri, RDF.value, Literal(row['value']))) #Add the value of the relation to the graph
        graph.add((URIRef(event), row['property'], uri)) #Connect the event to the property

graph.serialize('Data/test/graphs/Event_graph_all.ttl', format='turtle')

data= data[~data['Event'].isin(failed_events)] #remove the rows for which the event was not found
#data['Event'] = data['Event'].map(event_mapping)
#Check if this still is oke, it adds a list of events to the column event
data = data.groupby(['URI','Identifier','Location', 'Time', 'Text']).agg({'Event': lambda x: list(x)}).reset_index(drop=False)
data.to_csv('Data/test/ASRAEL_data_full_converted.csv')


Error when searching for event: http://www.wikidata.org/entity/Q100919128
Error when searching for event: http://www.wikidata.org/entity/Q113945893
Error when searching for event: http://www.wikidata.org/entity/Q105597606


In [9]:
# Perform predictions again since the sentence number is needed for event resolution
import pandas as pd
from rebel_finetuning_faro import make_predictions
from nltk import tokenize
from tqdm import tqdm

data = pd.read_csv('Data/test/ASRAEL_data_full_converted.csv')
total_uri = []
total_identifier = []
total_location = []
total_time = []
total_event = []
total_sentence_num = []
total_sentences = []
total_subject = []
total_relation = []
total_object = []

for index, row in tqdm(data.iterrows(), total=data.shape[0]):
    sentences, predictions = make_predictions(tokenize.sent_tokenize(row['Text']), 'rebel_finetuned.pth')
    for i, (sentence, prediction) in enumerate(zip(sentences, predictions)):

        total_uri.append(row['URI'])
        total_identifier.append(row['Identifier'])
        total_location.append(row['Location'])
        total_time.append(row['Time'])
        total_event.append(row['Event'])
        total_sentence_num.append(i)
        total_sentences.append(sentence)
        total_subject.append(prediction[0])
        total_relation.append(prediction[1])
        total_object.append(prediction[2])
    break


new_data = pd.DataFrame({'URI': total_uri, 'Identifier': total_identifier, 'Location': total_location, 'Time': total_time, 'Sentence_num': total_sentence_num, 'Sentence': total_sentences, 'Subject': total_subject, 'Relation': total_relation, 'Object': total_object, 'Event': total_event})

new_data = new_data[~((new_data['Sentence'].str.len() < 25) & new_data['Sentence'].str.contains('/'))].reset_index(drop=True)

new_data.to_csv('Data/test/final_data_with_predictions.csv')

  0%|          | 0/1122 [00:11<?, ?it/s]


In [2]:
from tqdm import tqdm
from nltk import tokenize

graph = Graph()
graph.parse('Data/graphs/Event_graph_all.ttl')
#data = pd.read_csv('Data/ASRAEL_data_full_converted.csv')
data = pd.read_csv('Data/final_data_with_predictions.csv')

data_with_predictions = True #Set this to True if the cell above was executed

for index, row in tqdm(data.iterrows(), total=data.shape[0]):

    if not row.isnull().values.any(): #If nan is present skip the row

        graph.add((URIRef(row['URI']), RDF.type, rnews.Article)) #Add the URI as article

        identifier_uri = node_creation('', row['Identifier'], base_add='/identifier')
        graph.add((URIRef(row['URI']), rnews.identifier, URIRef(identifier_uri))) #Link the PublicID to the article
        graph.add((URIRef(identifier_uri), RDF.value, Literal(row['Identifier'])))

        location_uri = node_creation('', row['Location'], base_add='')
        graph.add((URIRef(row['URI']), schema.contentLocation, URIRef(location_uri)))
        graph.add((URIRef(location_uri), RDF.value, Literal(row['Location'])))

        time_uri = node_creation('', row['Time'], base_add='')
        graph.add((URIRef(row['URI']), schema.contentReferenceTime, URIRef(time_uri)))
        graph.add((URIRef(time_uri), RDF.value, Literal(row['Time'])))

        if data_with_predictions == False: #Make the predictions

            from rebel_finetuning_faro import make_predictions

            sentences, predictions = make_predictions(tokenize.sent_tokenize(row['Text']), 'rebel_finetuned.pth')
            for sentence, prediction in zip(sentences, predictions):

                if prediction[1] in faro_classes:
                    sentence_uri = node_creation('', sentence, base_add='/sentence') #Generate the URI for the sentence
                    graph.add((URIRef(row['URI']), nif.sentence, sentence_uri)) #Link the article to the sentence
                    graph.add((sentence_uri, RDF.type, nif.Sentence)) #Make the sentence URI of class 'Sentence'
                    graph.add((sentence_uri, RDF.value, Literal(sentence))) #Set the value of the URI equal to the sentence

                    subject_uri = node_creation('', prediction[0] + str(sentence_uri), base_add='/subject') #Generate the URI for the subject, for now add the uri of sentence to make it unique
                    graph.add((sentence_uri, faro.Relata, subject_uri)) #Add the subject to the sentence
                    graph.add((subject_uri, RDF.value, Literal(prediction[0]))) #Set the value of the subject URI equal to the subject

                    object_uri = node_creation('', prediction[2] + str(sentence_uri), base_add='/object') #Generate the URI for the object, for now add the uri of sentence to make it unique
                    graph.add((sentence_uri, faro.Relata, object_uri)) #Add the object to the sentence
                    graph.add((object_uri, RDF.value, Literal(prediction[2]))) #Set the value of the subject URI equal to the object

                    graph.add((subject_uri, faro_classes[prediction[1]], object_uri)) #Add relation betwee NERs
                else:
                    continue
        else: # The data already contains the predictions
            sentence = row['Sentence']
            prediction = (row['Subject'], row['Relation'], row['Object'])

            if prediction[1] in faro_classes:
                sentence_uri = node_creation('', sentence, base_add='/sentence') #Generate the URI for the sentence
                graph.add((URIRef(row['URI']), nif.sentence, sentence_uri)) #Link the article to the sentence
                graph.add((sentence_uri, RDF.type, nif.Sentence)) #Make the sentence URI of class 'Sentence'
                graph.add((sentence_uri, RDF.value, Literal(sentence))) #Set the value of the URI equal to the sentence

                subject_uri = node_creation('', prediction[0] + str(sentence_uri), base_add='/subject') #Generate the URI for the subject, for now add the uri of sentence to make it unique
                #graph.add((sentence_uri, faro.Relata, subject_uri)) #Add the subject to the sentence
                graph.add((sentence_uri, nif.word, subject_uri)) #Add the subject to the sentence
                graph.add((subject_uri, RDF.type, faro.Relata)) #Make it of class 'Relata'
                graph.add((subject_uri, RDF.value, Literal(prediction[0]))) #Set the value of the subject URI equal to the subject

                object_uri = node_creation('', prediction[2] + str(sentence_uri), base_add='/object') #Generate the URI for the object, for now add the uri of sentence to make it unique
                #graph.add((sentence_uri, faro.Relata, object_uri)) #Add the object to the sentence
                graph.add((sentence_uri, nif.word, object_uri)) #Add the object to the sentence
                graph.add((object_uri, RDF.type, faro.Relata)) #Make it of class 'Relata'
                graph.add((object_uri, RDF.value, Literal(prediction[2]))) #Set the value of the subject URI equal to the object

                graph.add((subject_uri, faro_classes[prediction[1]], object_uri)) #Add relation betwee NERs
            else:
                continue

        for event in ast.literal_eval(row['Event']): #Link the article to the corresponding event
            #event_uri = node_creation('', event_name, base_add='/event')
            graph.add((URIRef(row['URI']), schema.about, URIRef(event)))

graph.serialize('Data/graphs/updated_ontology/event_article_graph_complete.ttl', format='turtle')

100%|██████████| 19578/19578 [00:15<00:00, 1241.90it/s]


<Graph identifier=N98e1b3f780e649c59921af1aa1ddbe50 (<class 'rdflib.graph.Graph'>)>

In [None]:
#Used for drawing the graph
import networkx as nx
from rdflib.extras.external_graph_libs import rdflib_to_networkx_multidigraph

nx_graph = rdflib_to_networkx_multidigraph(graph)
pos = nx.spring_layout(nx_graph, scale=2)

edge_labels = nx.get_edge_attributes(nx_graph, 'r')
nx.draw_networkx_edge_labels(nx_graph, pos, edge_labels=edge_labels)
nx.draw(nx_graph, with_labels=True)

# Merge same entities
### Entity coreference resolution

In [1]:
# This code will load the clusters from a text file, and merge the nodes in the cluster together
import ast
from resources import node_creation
import pandas as pd
from rdflib import Graph, Literal, RDF, URIRef, Namespace
import os
from tqdm import tqdm

base_path = 'Data/cluster_data/output_all/'
cluster_dirs = os.listdir(base_path)
cluster_docs = [base_path + dir +'/event_clusters.txt' for dir in cluster_dirs if os.path.isdir(base_path + dir)]

In [2]:
graph = Graph()
graph.parse('Data/graphs/updated_ontology/event_article_graph_complete.ttl')
data = pd.read_csv('Data/final_data_with_predictions.csv') #Load the original dataset
owl = Namespace("http://www.w3.org/2002/07/owl#")


for doc in tqdm(cluster_docs):

    with open(doc) as f: #Open the generated cluster file
        cluster_doc = f.readlines()

    for cluster in cluster_doc:

        if cluster.startswith('['):
            cluster = ast.literal_eval(cluster)

            if len(cluster) != 1:
                prev_mention_uri = None

                for i, mention in enumerate(cluster):
                    mention = mention.split('_')

                    sentence = data[(data['URI'] == mention[1]) & (data['Sentence_num'] == int(mention[2]))]['Sentence'].values[0]
                    sentence_uri = node_creation('', sentence, base_add='/sentence') #Generate the URI for the sentence


                    #determine subject or object
                    if data[(data['URI'] == mention[1]) & (data['Sentence_num'] == int(mention[2]))]['Subject'].values[0] == mention[0]:
                        mention_uri = node_creation('', mention[0] + str(sentence_uri), base_add='/subject')

                    else:
                        mention_uri = node_creation('', mention[0] + str(sentence_uri), base_add='/object')

                    #print(mention_uri)

                    if i != 0:
                        graph.add((prev_mention_uri, owl.sameAs, mention_uri))


                    prev_mention_uri = mention_uri
graph.serialize('Data/graphs/updated_ontology/event_article_graph_complete_merged.ttl', format='turtle')

100%|██████████| 86/86 [01:40<00:00,  1.16s/it]


<Graph identifier=N11d3e619a4e4447cbeb1971e81fc8b40 (<class 'rdflib.graph.Graph'>)>

# Select most relevant information from the graph
### Lookup the values of the selected nodes

In [10]:
import pandas as pd
from rdflib.plugins.sparql import prepareQuery
from rdflib import Graph, URIRef
from resources import uri_validator, mapping_dict

In [2]:
# Lookup the values of the selected nodes

selected_nodes = pd.read_csv('Data/subgraph/10-subgraph.csv', index_col=0)
graph = Graph()
graph.parse("Data/graphs/event_article_graph_complete_merged.ttl")

subj_values = []
obj_values = []

subj_query = prepareQuery("""
    SELECT ?subj_value Where{

    ?subject rdf:value ?subj_value.

    }
""", initNs={"rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#"})

obj_query = prepareQuery("""
    SELECT ?obj_value Where{

    ?object rdf:value ?obj_value.

    }
""", initNs={"rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#"})

for index, row in selected_nodes.iterrows():


    subj_node = URIRef(row['subject'])
    obj_node = URIRef(row['object'])

    subj_qres = graph.query(subj_query, initBindings={"subject": subj_node})
    obj_qres = graph.query(obj_query, initBindings={"object": obj_node})

    if len(subj_qres) != 0:
        for row_result in subj_qres:

            subj_values.append(row_result[0].value)
            break

    elif uri_validator(row['subject']) == False:
        subj_values.append(row['subject'])

    else:
        subj_values.append(None)


    if len(obj_qres) != 0:
        for row_result in obj_qres:

            obj_values.append(row_result[0].value)
            break

    elif uri_validator(row['object']) == False:
        obj_values.append(row['object'])

    else:
        obj_values.append(None)

selected_nodes['subject_values'] = subj_values
selected_nodes['object_values'] = obj_values

"2021-02-11T00:00:00+00:00"^^<http://www.w3.org/2001/XMLSchema#dateTime> does not look like a valid URI, trying to serialize this will break.
"2021-02-11T00:00:00+00:00"^^<http://www.w3.org/2001/XMLSchema#dateTime> does not look like a valid URI, trying to serialize this will break.
"2021-02-11T00:00:00+00:00"^^<http://www.w3.org/2001/XMLSchema#dateTime> does not look like a valid URI, trying to serialize this will break.
"2021-02-14T00:00:00+00:00"^^<http://www.w3.org/2001/XMLSchema#dateTime> does not look like a valid URI, trying to serialize this will break.
"2021-02-14T00:00:00+00:00"^^<http://www.w3.org/2001/XMLSchema#dateTime> does not look like a valid URI, trying to serialize this will break.
"2021-02-01T00:00:00+00:00"^^<http://www.w3.org/2001/XMLSchema#dateTime> does not look like a valid URI, trying to serialize this will break.
"2021-02-01T00:00:00+00:00"^^<http://www.w3.org/2001/XMLSchema#dateTime> does not look like a valid URI, trying to serialize this will break.
"2021-

### Generating the correct format for JointGT

In [52]:
import json

def gen_jointgt_input_format(data, output_file, subj_col= 'subject_values', rel_col= 'predicate', obj_col= 'object_values', sent_col = None, single_event = True):
    mapping_ids = mapping_dict(data[subj_col].to_list()) #Generate unique ID's per word

    full_data = []
    kbs = {}
    for index, row in data.iterrows():
        subj_id = f"W{mapping_ids[row[subj_col]]}"
        relation = row[rel_col].split('/')[-1]
        kbs[subj_id] = [row[subj_col], row[subj_col], [[relation, str(row[obj_col])]]]

        if single_event == False:
            json_dict = {"id": index,
                         "kbs": kbs,
                         "text": [row[sent_col]]}
            full_data.append(json_dict)
            kbs = {}


    if single_event:
        full_data = {"id": 1,
                     "kbs": kbs,
                     "text": ["test"]}

    with open(output_file, "w") as json_out:

        json.dump(full_data, json_out, indent = 2)

In [53]:
#Now convert rebel relationship data to JointGT input format

relation_data = pd.read_csv('Data/rebel_v2/data/new_split/train.csv', index_col=0)
gen_jointgt_input_format(relation_data, 'relation_dataset_jointgt.json', 'trigger1', 'label', 'trigger2', 'sentence', single_event= False)