# Generate the event and article knowledge graph

In [32]:
import rdflib
from rdflib import Graph, Literal, RDF, URIRef, Namespace
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
import ast
from resources import *
import re

rnews = Namespace("http://iptc.org/std/rNews/2011-10-07#")
nif = Namespace("http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#")
faro = Namespace("https://purl.org/faro/")
sem = Namespace("http://semanticweb.cs.vu.nl/2009/11/sem/")
owl = Namespace("http://www.w3.org/2002/07/owl#")
rnews = Namespace("http://iptc.org/std/rNews/2011-10-07#")
schema = Namespace("http://schema.org/")

faro_classes = {'cause': faro.causes, 'enable': faro.enables, 'intend': faro.intends_to_cause, 'prevent': faro.prevents} #dict of faro definitions
sem_props = {'http://www.wikidata.org/prop/direct/P710': sem.hasActor,
             'http://www.wikidata.org/prop/direct/P664': sem.hasActor,
             'http://www.wikidata.org/prop/direct/P112': sem.hasActor,
             'http://www.wikidata.org/prop/direct/P17': sem.hasPlace,
             'http://www.wikidata.org/prop/direct/P276': sem.hasPlace,
             'http://www.wikidata.org/prop/direct/P625': sem.hasPlace,
             'http://www.wikidata.org/prop/direct/P131': sem.hasPlace,
             'http://www.wikidata.org/prop/direct/P30': sem.hasPlace,
             'http://www.wikidata.org/prop/direct/P585': sem.hasTime,
             'http://www.wikidata.org/prop/direct/P580': sem.hasBeginTimeStamp,
             'http://www.wikidata.org/prop/direct/P582': sem.hasEndTimeStamp,
             'http://www.wikidata.org/prop/direct/P571': sem.hasTime,
             'http://www.wikidata.org/prop/direct/P576': sem.hasTime,
             'http://www.wikidata.org/prop/direct/P577': sem.hasTimeStamp,
             'http://www.w3.org/2000/01/rdf-schema#label': 'what'}

sem_classes = {sem.hasActor: sem.Actor,
               sem.hasPlace: sem.Place,
               sem.hasTime: sem.Time,
               sem.hasBeginTimeStamp: sem.Time,
               sem.hasEndTimeStamp: sem.Time,
               sem.hasTimeStamp: sem.Time,
               'what': sem.Event}


In [33]:
from resources import clean_text
data = pd.read_csv('Data/ASRAEL_data_full.csv')
data['Text'] = data['Text'].apply(clean_text)

In [34]:
#This converts the data into the right format, by removing uneccessary tokens in text and converting the wikidata link to text
from resources import uri_validator

graph = Graph()
place_set = set()
time_set = set()
actor_set = set()


event_mapping = {} #here the wikidata event urls and their names are saved
failed_events= [] #Events that can't be found e.g. owl:sameAs need to be removed

sparql = SPARQLWrapper(
    "https://query.wikidata.org/sparql"
)
sparql.setReturnFormat(JSON)

for event in data['Event'].unique().tolist():
    event_ = f"wd:{event.split('/')[-1]}"

    sparql.setQuery("""
    SELECT (?p as ?wiki_prop) (?o as ?result)
    WHERE {{{

        %s ?p ?temp.
      ?temp rdfs:label ?o.
      FILTER (lang(?o) = 'en') }
      }

      UNION

      {
       SELECT *
       WHERE{
        %s ?p ?o.
         FILTER(lang(?o) = 'en' || lang(?o)='') }}
     } """ % (event_, event_))

    try:
        result = sparql.queryAndConvert()
        #event_name = ret['results']['bindings'][0]['item']['value']

    except:
        print(f"Something went wrong when converting event: {event}")

    try:
        event_data = pd.json_normalize(result["results"]["bindings"])[['wiki_prop.value', 'result.value', 'result.datatype']]
        event_data = event_data.rename(columns={"wiki_prop.value": "property", "result.value": "value", "result.datatype": "datatype"})
        event_data = event_data.loc[event_data['property'].isin(sem_props.keys())].reset_index(drop=True) #Only keep the 4W attributes
        event_data['property'] = event_data['property'].replace(sem_props)
        event_name = event_data.loc[event_data['property'] == 'what']['value'].values[0] #This needs to be saved to map the wikidata urls to events
        event_data = event_data[event_data.property != 'what'] #This row needs to be deleted for the loop
        event_mapping[event] = event_name
    except:
        print(f"Error when searching for event: {event}")
        failed_events.append(event)
        continue

    #event_uri = node_creation('', event_name, base_add='/event') #Generate the URI for the event
    graph.add((URIRef(event), RDF.type, sem.Event)) #Create the event
    graph.add((URIRef(event), RDF.value, Literal(event_name)))

    for index, row in event_data.iterrows():
        uri = node_creation('', row['value'], base_add='') #Generate the URI for the property
        if uri_validator(uri) == False:
            print(f"Found issue, generated link is not an uri:\n{uri}")
        graph.add((uri, RDF.type, sem_classes[row['property']])) #Create the node for the property, and lookup its class
        if pd.isna(row['datatype']) == False: #It has a declared datatype
            graph.add((uri, RDF.value, Literal(row['value'], datatype=row['datatype'])))
        else:
            graph.add((uri, RDF.value, Literal(row['value']))) #Add the value of the relation to the graph
        graph.add((URIRef(event), row['property'], uri)) #Connect the event to the property

        if row['property'] == sem.hasActor:
            actor_set.add(event)
        elif row['property'] == sem.hasPlace:
            place_set.add(event)
        elif (row['property'] == sem.hasTime or row['property'] == sem.hasBeginTimeStamp or row['property'] == sem.hasEndTimeStamp ):
            time_set.add(event)
        else:
            print(f"property not supported: {row['property']}")


#graph.serialize('Data/graphs/final_generated/Event_graph_all.ttl', format='turtle')

data= data[~data['Event'].isin(failed_events)] #remove the rows for which the event was not found
#data['Event'] = data['Event'].map(event_mapping)
#Check if this still is oke, it adds a list of events to the column event
data = data.groupby(['URI','Identifier','Location', 'Time', 'Text']).agg({'Event': lambda x: list(x)}).reset_index(drop=False)
#data.to_csv('Data/dataset_final_generated/ASRAEL_data_full_converted.csv')

Error when searching for event: http://www.wikidata.org/entity/Q100919128
Error when searching for event: http://www.wikidata.org/entity/Q113945893
Error when searching for event: http://www.wikidata.org/entity/Q105597606


In [9]:
# Perform predictions, execute cell below to combine predictions and generate the graph at once.
import pandas as pd
from rebel_finetuning_faro import make_predictions
from nltk import tokenize
from tqdm import tqdm

data = pd.read_csv('Data/dataset_final_generated/ASRAEL_data_full_converted.csv')
total_uri = []
total_identifier = []
total_location = []
total_time = []
total_event = []
total_sentence_num = []
total_sentences = []
total_subject = []
total_relation = []
total_object = []

for index, row in tqdm(data.iterrows(), total=data.shape[0]):
    sentences, predictions = make_predictions(tokenize.sent_tokenize(row['Text']), 'rebel_finetuned.pth')
    for i, (sentence, prediction) in enumerate(zip(sentences, predictions)):

        total_uri.append(row['URI'])
        total_identifier.append(row['Identifier'])
        total_location.append(row['Location'])
        total_time.append(row['Time'])
        total_event.append(row['Event'])
        total_sentence_num.append(i)
        total_sentences.append(sentence)
        total_subject.append(prediction[0])
        total_relation.append(prediction[1])
        total_object.append(prediction[2])
    break


new_data = pd.DataFrame({'URI': total_uri, 'Identifier': total_identifier, 'Location': total_location, 'Time': total_time, 'Sentence_num': total_sentence_num, 'Sentence': total_sentences, 'Subject': total_subject, 'Relation': total_relation, 'Object': total_object, 'Event': total_event})

new_data = new_data[~((new_data['Sentence'].str.len() < 25) & new_data['Sentence'].str.contains('/'))].reset_index(drop=True)

new_data.to_csv('Data/final_generated/final_data_with_predictions.csv')

  0%|          | 0/1122 [00:11<?, ?it/s]


In [63]:
from tqdm import tqdm
from nltk import tokenize

graph = Graph()
graph.parse('Data/graphs/final_generated/Event_graph_all.ttl')
data = pd.read_csv('Data/final_data_with_predictions.csv')

data_with_predictions = True #Set this to True if the cell above was executed

for index, row in tqdm(data.iterrows(), total=data.shape[0]):

    if not row.isnull().values.any(): #If nan is present skip the row

        graph.add((URIRef(row['URI']), RDF.type, rnews.Article)) #Add the URI as article

        identifier_uri = node_creation('', row['Identifier'], base_add='/identifier')
        graph.add((URIRef(row['URI']), rnews.identifier, URIRef(identifier_uri))) #Link the PublicID to the article
        graph.add((URIRef(identifier_uri), RDF.value, Literal(row['Identifier'])))

        location_uri = node_creation('', row['Location'], base_add='')
        graph.add((URIRef(row['URI']), schema.contentLocation, URIRef(location_uri)))
        graph.add((URIRef(location_uri), RDF.value, Literal(row['Location'])))
        graph.add((URIRef(location_uri), RDF.type, schema.Place))

        time_uri = node_creation('', row['Time'], base_add='')
        graph.add((URIRef(row['URI']), schema.contentReferenceTime, URIRef(time_uri)))
        graph.add((URIRef(time_uri), RDF.value, Literal(row['Time'])))
        graph.add((URIRef(time_uri), RDF.type, schema.Time))

        if data_with_predictions == False: #Make the predictions

            from rebel_finetuning_faro import make_predictions

            sentences, predictions = make_predictions(tokenize.sent_tokenize(row['Text']), 'rebel_finetuned.pth')
            for sentence, prediction in zip(sentences, predictions):

                if prediction[1] in faro_classes:
                    sentence_uri = node_creation('', sentence, base_add='/sentence') #Generate the URI for the sentence
                    graph.add((URIRef(row['URI']), nif.sentence, sentence_uri)) #Link the article to the sentence
                    graph.add((sentence_uri, RDF.type, nif.Sentence)) #Make the sentence URI of class 'Sentence'
                    graph.add((sentence_uri, RDF.value, Literal(sentence))) #Set the value of the URI equal to the sentence

                    subject_uri = node_creation('', prediction[0] + str(sentence_uri), base_add='/subject') #Generate the URI for the subject, for now add the uri of sentence to make it unique
                    graph.add((sentence_uri, faro.Relata, subject_uri)) #Add the subject to the sentence
                    graph.add((subject_uri, RDF.value, Literal(prediction[0]))) #Set the value of the subject URI equal to the subject

                    object_uri = node_creation('', prediction[2] + str(sentence_uri), base_add='/object') #Generate the URI for the object, for now add the uri of sentence to make it unique
                    graph.add((sentence_uri, faro.Relata, object_uri)) #Add the object to the sentence
                    graph.add((object_uri, RDF.value, Literal(prediction[2]))) #Set the value of the subject URI equal to the object

                    graph.add((subject_uri, faro_classes[prediction[1]], object_uri)) #Add relation betwee NERs
                else:
                    continue
        else: # The data already contains the predictions
            sentence = row['Sentence']
            prediction = (row['Subject'], row['Relation'], row['Object'])

            if prediction[1] in faro_classes:
                sentence_uri = node_creation('', sentence, base_add='/sentence') #Generate the URI for the sentence
                graph.add((URIRef(row['URI']), nif.sentence, sentence_uri)) #Link the article to the sentence
                graph.add((sentence_uri, RDF.type, nif.Sentence)) #Make the sentence URI of class 'Sentence'
                graph.add((sentence_uri, RDF.value, Literal(sentence))) #Set the value of the URI equal to the sentence

                subject_uri = node_creation('', prediction[0] + str(sentence_uri), base_add='/subject') #Generate the URI for the subject, for now add the uri of sentence to make it unique
                #graph.add((sentence_uri, faro.Relata, subject_uri)) #Add the subject to the sentence
                graph.add((sentence_uri, nif.word, subject_uri)) #Add the subject to the sentence
                graph.add((subject_uri, RDF.type, faro.Relata)) #Make it of class 'Relata'
                graph.add((subject_uri, RDF.value, Literal(prediction[0]))) #Set the value of the subject URI equal to the subject

                object_uri = node_creation('', prediction[2] + str(sentence_uri), base_add='/object') #Generate the URI for the object, for now add the uri of sentence to make it unique
                #graph.add((sentence_uri, faro.Relata, object_uri)) #Add the object to the sentence
                graph.add((sentence_uri, nif.word, object_uri)) #Add the object to the sentence
                graph.add((object_uri, RDF.type, faro.Relata)) #Make it of class 'Relata'
                graph.add((object_uri, RDF.value, Literal(prediction[2]))) #Set the value of the subject URI equal to the object

                graph.add((subject_uri, faro_classes[prediction[1]], object_uri)) #Add relation between NERs
            else:
                continue

        for event in ast.literal_eval(row['Event']): #Link the article to the corresponding event
            #event_uri = node_creation('', event_name, base_add='/event')
            graph.add((URIRef(row['URI']), schema.about, URIRef(event)))
            graph.add((URIRef(event), schema.subjectOf, URIRef(row['URI'])))

graph.serialize('Data/graphs/final_generated/eag_complete.ttl', format='turtle')

100%|██████████| 19578/19578 [00:25<00:00, 754.16it/s] 


<Graph identifier=N6eaa17f7cbe443be835e69bcc063e464 (<class 'rdflib.graph.Graph'>)>

In [None]:
#Used for drawing the graph
import networkx as nx
from rdflib.extras.external_graph_libs import rdflib_to_networkx_multidigraph

nx_graph = rdflib_to_networkx_multidigraph(graph)
pos = nx.spring_layout(nx_graph, scale=2)

edge_labels = nx.get_edge_attributes(nx_graph, 'r')
nx.draw_networkx_edge_labels(nx_graph, pos, edge_labels=edge_labels)
nx.draw(nx_graph, with_labels=True)

# Merge same entities
### Entity coreference resolution

In [49]:
# This code will load the clusters from a text file, and merge the nodes in the cluster together
import ast
from resources import node_creation
import pandas as pd
from rdflib import Graph, Literal, RDF, URIRef, Namespace
import os
from tqdm import tqdm
from SPARQLWrapper import SPARQLWrapper, JSON

base_path = 'Data/cluster_data/output_all/'
cluster_dirs = os.listdir(base_path)
cluster_docs = [base_path + dir +'/event_clusters.txt' for dir in cluster_dirs if os.path.isdir(base_path + dir)]

In [54]:
graph = Graph()
graph.parse('Data/graphs/final_generated/eag_complete.ttl')
data = pd.read_csv('Data/final_data_with_predictions.csv') #Load the original dataset
owl = Namespace("http://www.w3.org/2002/07/owl#")

failed_mentions = 0
double_match = 0

query = """
SELECT DISTINCT ?mention
WHERE {
    ?mention a faro:Relata ;
        rdf:value ?value
}"""

qres = graph.query(query, initNs={"faro": "https://purl.org/faro/", "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#"})
all_mentions = [mention['mention'] for mention in qres.bindings]

for doc in tqdm(cluster_docs):

    with open(doc) as f: #Open the generated cluster file
        cluster_doc = f.readlines()

    for cluster in cluster_doc:

        if cluster.startswith('['):
            cluster = ast.literal_eval(cluster)

            if len(cluster) != 1:
                prev_mention_uri = None

                for i, mention in enumerate(cluster):
                    is_subject = False
                    is_object = False
                    mention_uri = None

                    mention = mention.split('_')

                    instance = data[(data['URI'] == mention[1]) & (data['Sentence_num'] == int(mention[2]))]
                    sentence = instance['Sentence'].values[0]
                    sentence_uri = node_creation('', sentence, base_add='/sentence') #Generate the URI for the sentence

                    #determine subject or object


                    if any(part_mention in instance['Subject'].values[0] for part_mention in mention[0].split()):
                        subject_mention_uri = node_creation('',instance['Subject'].values[0]  + str(sentence_uri), base_add='/subject')
                        if subject_mention_uri in all_mentions: #Double check it
                            is_subject = True


                    if any(part_mention in instance['Object'].values[0] for part_mention in mention[0].split()):
                        object_mention_uri = node_creation('',instance['Object'].values[0] + str(sentence_uri), base_add='/object')
                        if object_mention_uri in all_mentions: #Double check again
                            is_object = True


                    """
                    if is_subject and is_object: #Look up in the graph, to see which is correct
                        if subject_mention_uri in all_mentions:
                            is_object = False
                        elif object_mention_uri in all_mentions:
                            is_subject = False
                    """

                    if not (is_subject or is_object): #The mention could not be found
                        #print(mention[0])
                        failed_mentions +=1
                        continue

                    mention_uri = subject_mention_uri if is_subject else object_mention_uri

                    if prev_mention_uri != None:
                        if is_subject:
                            graph.add((prev_mention_uri, owl.sameAs, subject_mention_uri))
                        else:
                            graph.add((prev_mention_uri, owl.sameAs, object_mention_uri))

                    prev_mention_uri = mention_uri


print(f"Number of failed mentions: {failed_mentions}\nNumber of double matches: {double_match}")
graph.serialize('Data/graphs/final_generated/eag_complete_merged.ttl', format='turtle')

100%|██████████| 86/86 [03:39<00:00,  2.56s/it]


Number of failed mentions: 1863
Number of double matches: 0


<Graph identifier=Nc930a7b4451b4f59b3729ad19eb3628d (<class 'rdflib.graph.Graph'>)>

# Select most relevant information from the graph

# Query the graph: extract triples without using graph search algorithm
## Extract: time, place, actor, contentLocation, contentReferenceTime

In [1]:
from collections import defaultdict
import pandas as pd
from rdflib import Graph, URIRef
from owlrl import DeductiveClosure
from rdflib.term import Variable
from owlrl import OWLRL_Semantics #This is needed to allow owl reasoning over the sameAs links
import random

namespaces = {"faro": "https://purl.org/faro/",
              "sem": "http://semanticweb.cs.vu.nl/2009/11/sem/",
              "rnews": "http://iptc.org/std/rNews/2011-10-07#",
              "nif": "http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#",
              "owl": "http://www.w3.org/2002/07/owl#",
              "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
              "schema": "http://schema.org/"}

graph = Graph()
graph.parse("Data/graphs/final_generated/eag_complete_merged.ttl")

DeductiveClosure(OWLRL_Semantics).expand(graph)

<Graph identifier=N9b06690c3d16492aac5f0cc58fc80f98 (<class 'rdflib.graph.Graph'>)>

In [23]:
#Extract 4W's from a event
event_uri = "http://www.wikidata.org/entity/Q104705419"
query = """
SELECT ?event_name ?Time ?place ?actor ?beginTime ?endTime ?timeStamp where {
	?event a  sem:Event;
	    rdf:value ?event_name
    OPTIONAL{?event sem:hasTime ?time_uri.
            ?time_uri rdf:value ?Time}.
    OPTIONAL{?event sem:hasPlace ?place_uri.
            ?place_uri rdf:value ?place}.
    OPTIONAL{?event sem:hasActor ?actor_uri.
            ?actor_uri rdf:value ?actor}.
    OPTIONAL{?event sem:hasBeginTimeStamp ?beginTime_uri.
            ?beginTime_uri rdf:value ?beginTime}.
    OPTIONAL{?event sem:hasEndTimeStamp ?endTime_uri.
            ?endTime_uri rdf:value ?endTime}.
    OPTIONAL{?event sem:hasTimeStamp ?time_uri.
            ?time_uri rdf:value ?time}.
}"""

event = URIRef(event_uri)

qres = graph.query(query, initNs= namespaces, initBindings={"event": event})
four_W = {}

for row in qres.bindings:
    for key in row.keys():
        if key not in four_W.keys():
            four_W[key] = [row[key]]
        else:
            if row[key] not in four_W[key]:
                four_W[key].append(row[key])

In [24]:
query = """
SELECT (COUNT(?i) as ?num_input) WHERE {
    ?i ?p2 ?uri
} GROUP BY ?uri
"""

def def_value():
    return 0

four_W_scores = defaultdict(def_value)
for values in four_W.values():
    for value in values:
        qres = graph.query(query, initNs= namespaces, initBindings={"uri": value})
        four_W_scores[value] = int(qres.bindings[0]['num_input'])

In [25]:
for key, value in four_W.items():
    print(f"{key} - {value}")

event_name - [rdflib.term.Literal('2021 storming of the United States Capitol')]
Time - [rdflib.term.Literal('2021-01-06T00:00:00+00:00', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#dateTime'))]
place - [rdflib.term.Literal('Point(-77.009166666 38.889722222)', datatype=rdflib.term.URIRef('http://www.opengis.net/ont/geosparql#wktLiteral')), rdflib.term.Literal('United States of America'), rdflib.term.Literal('United States Capitol'), rdflib.term.Literal('Washington, D.C.')]
actor - [rdflib.term.Literal('Proud Boys')]


In [26]:
selected_nodes = {"mentions": []}

if ('beginTime' in str(four_W.keys()) and 'endTime' in str(four_W.keys())):#Favour the more descriptive dates
    if (four_W[Variable('beginTime')] != four_W[Variable('endTime')]): #If they are the same, it doesn't add any information
        try:
            del four_W[Variable('Time')]
        except:
            print("Key already removed")

    elif 'Time' in str(four_W.keys()):
        del four_W[Variable('beginTime')]
        del four_W[Variable('endTime')]


for key in four_W:
    most_relevant_node = None
    for node in four_W[key]:
        if four_W_scores[node] > four_W_scores[most_relevant_node]:
            most_relevant_node = node
    selected_nodes[str(key)] = str(most_relevant_node) if not 'Time' in str(key) else str(most_relevant_node).split('T')[0] #For now only take the date into account

In [27]:
#First, extract all the mentions from the sentences
query = """
PREFIX faro: <https://purl.org/faro/>
PREFIX sem: <http://semanticweb.cs.vu.nl/2009/11/sem/>
PREFIX rnews: <http://iptc.org/std/rNews/2011-10-07#>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
prefix schema: <http://schema.org/>
select distinct ?mention where {
	?event a sem:Event;
         schema:subjectOf ?article.
    ?article nif:sentence ?sentence.
    ?sentence nif:word ?mention.
    ?mention owl:sameAs ?o .

}"""
event = URIRef(event_uri)
qres = graph.query(query, initNs= namespaces, initBindings={"event": event})
mentions = [mention[0] for mention in qres] #Save all mentions of this event

In [28]:
#Now, find the biggest cluster
query = """
SELECT ?nodes where {
	?mention a faro:Relata;
	    owl:sameAs+ ?nodes
}
"""
clusters = {} #the key will be one of the mentions in the set
for mention in mentions:
    mention = URIRef(mention)
    qres = graph.query(query, initNs= namespaces, initBindings={"mention": mention})
    mention_cluster = set()
    for result in qres:
        mention_cluster.add(result[0])
    if not any(key in mention_cluster for key in clusters.keys()) and len(mention_cluster)!=1: #Only keep unique clusters, keep as key one of the elements in the cluster
        clusters[mention] = mention_cluster
clusters= dict(sorted(clusters.items(), key=lambda x:len(x[1]), reverse=True))

In [29]:
#Find the text values for the clusters
SELECT_MAX_MENTIONS = 3 #Select only the top x mentions

subject_query = """
SELECT ?subject_value ?predicate ?object_value where {
	?mention a faro:Relata;
	    rdf:value ?subject_value;
	    ?predicate ?object.
	?object rdf:value ?object_value
}"""

object_query = """
SELECT ?subject_value ?predicate ?object_value where {
	?mention a faro:Relata;
	    rdf:value ?object_value.
	?subject ?predicate ?mention;
	    rdf:value ?subject_value
}"""
for i, cluster in enumerate(clusters.values()):
    mention = random.choice(list(cluster)) #Only pick one mention per cluster
    if 'subject' in str(mention):
        qres = graph.query(subject_query, initNs= namespaces, initBindings={"mention": mention})
    else:
        qres = graph.query(object_query, initNs= namespaces, initBindings={"mention": mention})

    for instance in qres.bindings:
        if 'faro' in str(instance['predicate']):
            selected_nodes['mentions'].append([str(instance['subject_value']), instance['predicate'].split('/')[-1], str(instance['object_value'])])
            break
    if i+1 == SELECT_MAX_MENTIONS:
        break

In [30]:
selected_nodes

{'mentions': [['addressed', 'causes', 'stormed'],
  ['died', 'causes', 'shot'],
  ['measures', 'enables', 'removing']],
 'event_name': '2021 storming of the United States Capitol',
 'Time': '2021-01-06',
 'place': 'United States of America',
 'actor': 'Proud Boys'}

In [10]:
from resources import gen_mapping_dict
mapping_dict, _ = gen_mapping_dict('Data/jointGT/events/events_combined.json')

Processed 5 instances


# Generating the correct format for JointGT
## Convert the selected triples into the JointGT format

In [4]:
import sys, importlib
importlib.reload(sys.modules['resources'])

<module 'resources' from 'C:\\Users\\mike-\\Documents\\VU\\Eurecom\\KG_mapping\\resources.py'>

In [31]:
from resources import convert_selected_triples_to_jointgt
import os
output_path = os.path.join("Data/jointGT/events/", selected_nodes['event_name'].replace(" ", "_"))
convert_selected_triples_to_jointgt(selected_nodes, output_path + '.json', mapping_dict)

Processed 6 instances


In [None]:
#Optional: merge different events together
from resources import combine_jointgt_events
import os

filepaths = [os.path.join("Data/jointGT/events", file) for file in os.listdir("Data/jointGT/events")]
combine_jointgt_events(filepaths, output_file= "Data/jointGT/events/events_combined.json")

In [3]:
#Now convert rebel relationship data to JointGT input format
from resources import gen_jointgt_input_format, gen_mapping_dict

encoding_dict, total_instances = gen_mapping_dict('..\\JointGT\\JointGT_data\\data\\webnlg\\train.json', '..\\JointGT\\JointGT_data\\data\\webnlg\\val.json', '..\\JointGT\\JointGT_data\\data\\webnlg\\test.json')

relation_data_train = pd.read_csv('Data/rebel_v2/data/new_split/train.csv', index_col=0)
relation_data_val = pd.read_csv('Data/rebel_v2/data/new_split/val.csv', index_col=0)
relation_data_test = pd.read_csv('Data/rebel_v2/data/new_split/test.csv', index_col=0)

total_instances += gen_jointgt_input_format(relation_data_train, 'Data/jointGT/faro/relation_dataset_jointgt_train.json', encoding_dict, 'trigger1', 'label', 'trigger2', 'sentence', single_event= False, start_id=total_instances)
total_instances += gen_jointgt_input_format(relation_data_val, 'Data/jointGT/faro/relation_dataset_jointgt_val.json', encoding_dict, 'trigger1', 'label', 'trigger2', 'sentence', single_event= False, start_id= total_instances)
total_instances += gen_jointgt_input_format(relation_data_test, 'Data/jointGT/faro/relation_dataset_jointgt_test.json', encoding_dict, 'trigger1', 'label', 'trigger2', 'sentence', single_event= False, start_id= total_instances)

Processed 1800 instances
Processed 201 instances
Processed 95 instances


In [5]:
#Next step: Combine both the original WebNLG data with the newly generated one
import random
def combine_datasets(dataset1, dataset2, output_file):
    dataset1 = json.load(open(dataset1))
    dataset2 = json.load(open(dataset2))

    combined = dataset1+dataset2
    random.shuffle(combined)

    with open(output_file, "w") as json_out:
        json.dump(combined, json_out, indent = 2)

In [6]:
combine_datasets('Data\\jointGT\\faro\\relation_dataset_jointgt_train.json', '..\\JointGT\\JointGT_data\\data\\webnlg\\train.json', 'Data\\jointGT\\combined\\train.json')
combine_datasets('Data\\jointGT\\faro\\relation_dataset_jointgt_val.json', '..\\JointGT\\JointGT_data\\data\\webnlg\\val.json', 'Data\\jointGT\\combined\\val.json')
combine_datasets('Data\\jointGT\\faro\\relation_dataset_jointgt_test.json', '..\\JointGT\\JointGT_data\\data\\webnlg\\test.json', 'Data\\jointGT\\combined\\test.json')

In [6]:
## Take the retrieved triples, and generate an input format for JointGT
#For now just create the triples by hand
#Changed 1 "pandemic" to "Covid-19 pandemic"
import pandas as pd
from resources import gen_jointgt_input_format_multiple

#Note: The tuple ("western sanctions", "prevent", "scientific and technological progress of Iran"), ("enemies' maximum pressure", "prevent", "scientific and technological progress of Iran"), is not really prevent, just kept it in for showing the result on this relation.

tuples_data = (
            ("Russia", "launch", "satellite", "None", 0), ("launch", "location", "Kazakhstan", "None", 0), ("launch", "time", "Tuesday", "None", 0), ("satellite", "enable", "military surveillance", "An Iranian satellite launched by Russia blasted off from Kazakhstan Tuesday and reached orbit amid controversy that Moscow might use it to boost its surveillance of military targets in Ukraine.", 0),

               ("Russia", "cause", "invasion of Ukraine", "None", 1), ("invasion of Ukraine", "cause", "western sanctions", "None", 1), ("western sanctions", "cause", "Russia's international isolation", "None", 1), ("Russia", "intend", "find new clients", "As Russia's international isolation grows following Western sanctions over its invasion of Ukraine, the Kremlin is seeking to pivot towards the Middle East, Asia and Africa and find new clients for its embattled space programme.", 1),

               ("Russian-Iranian bilateral cooperation", "enable", "implementation of new and even larger projects", "Speaking at the Moscow-controlled Baikonur Cosmodrome in the Kazakh steppe, Russian space chief Yury Borisov hailed 'an important milestone in Russian-Iranian bilateral cooperation, opening the way to the implementation of new and even larger projects'.", 2),

               ("Iran Telecommunication", "minister", "Issa Zarepour", "None", 3), ("Issa Zarepour", "attended", "launch of the Khayyam satellite", "None", 3), ("satellite", "enable", "a turning point for the start of a new interaction", "Iran's Telecommunications Minister Issa Zarepour, who also attended the launch of the Khayyam satellite, called the event 'historic' and 'a turning point for the start of a new interaction in the field of space between our two countries'.", 3),

               ("western sanctions", "not prevent", "scientific and technological progress of Iran", "None", 4), ("enemies' maximum pressure", "not prevent", "scientific and technological progress of Iran", "Nasser Kanani, the Iranian foreign ministry spokesman, said on Twitter that 'the brilliant path of scientific and technological progress of the Islamic republic of Iran continues despite sanctions and the enemies' maximum pressure'.", 4),

            ("Iran", "maintained ties", "Moscow", "None", 5), ("Iran", "not criticized", "Ukraine invasion", "None", 5), ("Khayyam", "enable", "spy", "Iran, which has maintained ties with Moscow and refrained from criticism of the Ukraine invasion, has sought to deflect suspicions that Moscow could use Khayyam to spy on Ukraine.", 5),

            ("Washington", "respond", "launch", "None", 6), ("Russia's growing cooperation with Iran", "cause", "profound threat", "Responding to the launch, Washington said Russia's growing cooperation with Iran should be viewed as a 'profound threat'.", 6),


               ("satellite", "sends", "information", "None", 7), ("encrypted algorithm", "prevent", "third countries accessing information", "None", 7), ("satellite", "has", "encrypted algorithm", "'No third country is able to access the information' sent by the satellite due to its 'encrypted algorithm', it said.", 7),

               ("Iran", "intend", "salvage 2015 deal", "None", 8), ("2015 deal", "prevent", "Iran's nuclear ambitions", "Iran is currently negotiating with world powers, including Moscow, to salvage a 2015 deal aimed at reining in Tehran's nuclear ambitions.", 8),

               ("satellite", "contains", "ballistic missle technologies", "None", 9), ("ballistic missile technologies", "enable", "delivery of nuclear warhead", "Western governments worry that satellite launch systems incorporate technologies interchangeable with those used in ballistic missiles capable of delivering a nuclear warhead, something Iran has always denied wanting to build.", 9),

               ("Iran", "launch", "first military satellite", "None", 10), ("first military satellite", "orbit", "April 2020", "None", 10), ("first military satellite", "cause", "sharp rebuke from the United States", "Iran successfully put its first military satellite into orbit in April 2020, drawing a sharp rebuke from the United States.", 10),

            ("Borisov", "replaced", "Dmitry Rogozin", "None", 11), ("Borisov", "acknowledged", "difficult situation", "None", 11), ("tensions with the West", "cause", "difficult situation ", "Borisov, who last month replaced bombastic nationalist Dmitry Rogozin as head of the Russian space agency, had acknowledged that the national space industry is in a 'difficult situation' amid tensions with the West.", 11)
)


created_dataset = pd.DataFrame(tuples_data, columns =['subject_values', 'predicate', 'object_values', "sentence", 'instance'])

total_instances = gen_jointgt_input_format_multiple(created_dataset, 'Data/jointGT/manual_generated_for_test/article4.json', sent_col="sentence")

Processed 32 instances


In [6]:
#This generates the input format for the JointGT model from all the 4W events in the graph.
#A list with events of interest can also be passed
from event_selection_jointgt import bulk_generate

bulk_generate("Data/graphs/final_generated/eag_complete_merged.ttl", "test")

Processed 5 instances
Processed 5 instances
Processed 7 instances
Processed 7 instances
Processed 6 instances
Processed 6 instances
Processed 6 instances
Processed 7 instances
Processed 6 instances
Processed 7 instances
Processed 7 instances
Processed 6 instances
Processed 7 instances
Done and saved
Processed 13 events
Saved at: test\event_combined.json
