# Generate the event and article knowledge graph
For AFP 2021

In [1]:
from tqdm.notebook import tqdm
import rdflib
from rdflib import Graph, Literal, RDF, XSD, URIRef, Namespace
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
import ast
import re
from utils import clean_text
from os import path


# RDF vocabularies used when building the graphs.
nif = Namespace("http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#")
faro = Namespace("https://purl.org/faro/")
sem = Namespace("http://semanticweb.cs.vu.nl/2009/11/sem/")
owl = Namespace("http://www.w3.org/2002/07/owl#")
rnews = Namespace("http://iptc.org/std/rNews/2011-10-07#")
schema = Namespace("http://schema.org/")

# Relation label (as found in the 'rel' column) -> FARO predicate.
faro_classes = {
    'cause': faro.causes,
    'enable': faro.enables,
    'intend': faro.intends_to_cause,
    'prevent': faro.prevents,
}

# Wikidata direct-property IRI -> SEM predicate. The rdfs:label entry is mapped
# to the marker string 'what', which identifies the event's own label.
sem_props = {
    f'http://www.wikidata.org/prop/direct/{pid}': predicate
    for pid, predicate in (
        ('P710', sem.hasActor),
        ('P664', sem.hasActor),
        ('P112', sem.hasActor),
        ('P17', sem.hasPlace),
        ('P276', sem.hasPlace),
        ('P625', sem.hasPlace),
        ('P131', sem.hasPlace),
        ('P30', sem.hasPlace),
        ('P585', sem.hasTime),
        ('P580', sem.hasBeginTimeStamp),
        ('P582', sem.hasEndTimeStamp),
        ('P571', sem.hasTime),
        ('P576', sem.hasTime),
        ('P577', sem.hasTimeStamp),
    )
}
sem_props['http://www.w3.org/2000/01/rdf-schema#label'] = 'what'

# SEM predicate (or the 'what' marker) -> class of the node it points to.
sem_classes = {sem.hasActor: sem.Actor, sem.hasPlace: sem.Place}
sem_classes.update(dict.fromkeys(
    (sem.hasTime, sem.hasBeginTimeStamp, sem.hasEndTimeStamp, sem.hasTimeStamp),
    sem.Time,
))
sem_classes['what'] = sem.Event


In [3]:
# Directory the input/intermediate CSV files are read from and written to.
DATA_ROOT = '../data'
# Directory the serialized Turtle graphs are written to.
OUT_ROOT = '../dump/afp/'

In [3]:
# Load the raw AFP 2021 table from DATA_ROOT and preview the first rows.
data = pd.read_csv(path.join(DATA_ROOT, 'afp2021_raw.csv'))
data.head()

Unnamed: 0,file_name,name_label,news_identifier,public_identifier,dateline,headline,country,city,keywords,genre,body_content,body_length,prediction
0,afp.com-20210606T170343Z-TX-PAR-WXO51.xml,Tennis-FRA-Open-Azarenka-sexism,TX-PAR-WXO51,urn:newsml:afp.com:20210606T170343Z:TX-PAR-WXO...,"Paris, June 6, 2021 (AFP) -",Azarenka says French Open lacks true gender eq...,FRA,Paris,"Tennis, FRA, Open, Azarenka, sexism",,Victoria Azarenka criticised French Open organ...,518,lack of equality Victoria Azarenka criticise...
1,afp.com-20210606T170343Z-TX-PAR-WXO51.xml,Tennis-FRA-Open-Azarenka-sexism,TX-PAR-WXO51,urn:newsml:afp.com:20210606T170343Z:TX-PAR-WXO...,"Paris, June 6, 2021 (AFP) -",Azarenka says French Open lacks true gender eq...,FRA,Paris,"Tennis, FRA, Open, Azarenka, sexism",,The former world number one lost in the fourth...,518,2013 semi-finals 2013 point in time
2,afp.com-20210606T170343Z-TX-PAR-WXO51.xml,Tennis-FRA-Open-Azarenka-sexism,TX-PAR-WXO51,urn:newsml:afp.com:20210606T170343Z:TX-PAR-WXO...,"Paris, June 6, 2021 (AFP) -",Azarenka says French Open lacks true gender eq...,FRA,Paris,"Tennis, FRA, Open, Azarenka, sexism",,"Last year Azarenka fumed at being left to ""sit...",518,Covid-19 pandemic the 2020 tournament was de...
3,afp.com-20210606T170343Z-TX-PAR-WXO51.xml,Tennis-FRA-Open-Azarenka-sexism,TX-PAR-WXO51,urn:newsml:afp.com:20210606T170343Z:TX-PAR-WXO...,"Paris, June 6, 2021 (AFP) -",Azarenka says French Open lacks true gender eq...,FRA,Paris,"Tennis, FRA, Open, Azarenka, sexism",,Azarenka again took aim at officials following...,518,scheduling took aim cause
4,afp.com-20210606T170343Z-TX-PAR-WXO51.xml,Tennis-FRA-Open-Azarenka-sexism,TX-PAR-WXO51,urn:newsml:afp.com:20210606T170343Z:TX-PAR-WXO...,"Paris, June 6, 2021 (AFP) -",Azarenka says French Open lacks true gender eq...,FRA,Paris,"Tennis, FRA, Open, Azarenka, sexism",,Serena Williams won the first official night m...,518,curfew played cause


In [4]:
# Number of rows loaded, before any filtering.
len(data)

568147

In [5]:
# Keep only rows whose prediction contains at least three fields separated by a
# double space; shorter predictions cannot be split into e1 / e2 / rel later on.
# Non-string predictions (e.g. missing values) are dropped instead of raising.
delimeter = '  '  # (sic) spelling kept so any other cell using this name still works
mask = data['prediction'].apply(
    lambda x: isinstance(x, str) and len(x.split(delimeter)) > 2
)
# Explicit copy: later cells add columns to this frame, which would otherwise
# trigger SettingWithCopyWarning on a filtered slice.
data = data[mask].copy()
len(data)

566080

In [6]:
# Trim surrounding whitespace from each prediction, then split it on the
# double-space separator into subject (e1), object (e2) and relation (rel).
# `.str.strip()` replaces the former `.apply(strip)`, which referenced an
# undefined name and raised NameError.
data = data.assign(prediction=data['prediction'].str.strip())
data[['e1', 'e2', 'rel']] = data['prediction'].str.split(delimeter, n=2, expand=True, regex=False)
data.head()

Unnamed: 0,file_name,name_label,news_identifier,public_identifier,dateline,headline,country,city,keywords,genre,body_content,body_length,prediction,e1,e2,rel
0,afp.com-20210606T170343Z-TX-PAR-WXO51.xml,Tennis-FRA-Open-Azarenka-sexism,TX-PAR-WXO51,urn:newsml:afp.com:20210606T170343Z:TX-PAR-WXO...,"Paris, June 6, 2021 (AFP) -",Azarenka says French Open lacks true gender eq...,FRA,Paris,"Tennis, FRA, Open, Azarenka, sexism",,Victoria Azarenka criticised French Open organ...,518,lack of equality Victoria Azarenka criticise...,lack of equality,Victoria Azarenka criticised French Open organ...,cause
1,afp.com-20210606T170343Z-TX-PAR-WXO51.xml,Tennis-FRA-Open-Azarenka-sexism,TX-PAR-WXO51,urn:newsml:afp.com:20210606T170343Z:TX-PAR-WXO...,"Paris, June 6, 2021 (AFP) -",Azarenka says French Open lacks true gender eq...,FRA,Paris,"Tennis, FRA, Open, Azarenka, sexism",,The former world number one lost in the fourth...,518,2013 semi-finals 2013 point in time,2013 semi-finals,2013,point in time
2,afp.com-20210606T170343Z-TX-PAR-WXO51.xml,Tennis-FRA-Open-Azarenka-sexism,TX-PAR-WXO51,urn:newsml:afp.com:20210606T170343Z:TX-PAR-WXO...,"Paris, June 6, 2021 (AFP) -",Azarenka says French Open lacks true gender eq...,FRA,Paris,"Tennis, FRA, Open, Azarenka, sexism",,"Last year Azarenka fumed at being left to ""sit...",518,Covid-19 pandemic the 2020 tournament was de...,Covid-19 pandemic,the 2020 tournament was delayed to late September,cause
3,afp.com-20210606T170343Z-TX-PAR-WXO51.xml,Tennis-FRA-Open-Azarenka-sexism,TX-PAR-WXO51,urn:newsml:afp.com:20210606T170343Z:TX-PAR-WXO...,"Paris, June 6, 2021 (AFP) -",Azarenka says French Open lacks true gender eq...,FRA,Paris,"Tennis, FRA, Open, Azarenka, sexism",,Azarenka again took aim at officials following...,518,scheduling took aim cause,scheduling,took aim,cause
4,afp.com-20210606T170343Z-TX-PAR-WXO51.xml,Tennis-FRA-Open-Azarenka-sexism,TX-PAR-WXO51,urn:newsml:afp.com:20210606T170343Z:TX-PAR-WXO...,"Paris, June 6, 2021 (AFP) -",Azarenka says French Open lacks true gender eq...,FRA,Paris,"Tennis, FRA, Open, Azarenka, sexism",,Serena Williams won the first official night m...,518,curfew played cause,curfew,played,cause


In [7]:
# Keep only rows whose relation has a FARO predicate. The accepted labels are
# taken from faro_classes so this filter and the later predicate lookup cannot
# drift apart. Copy so that columns can be added to the result safely.
data = data[data['rel'].isin(list(faro_classes))].copy()
len(data)

538708

In [8]:
# Distinct values of the 'public_identifier' column (one per news item),
# followed by how many there are.
nids = data['public_identifier'].unique() # news identifiers
len(nids)

50713

In [9]:
# For each news item, ask the ASRAEL KG whether it is linked (owl:sameAs) to an event.
graph = Graph()
place_set = set()
time_set = set()
actor_set = set()


# public identifier -> event URI, or None when no event was found or the lookup failed
news_mapping = {}

sparql = SPARQLWrapper("https://asrael.eurecom.fr/sparql")
sparql.setReturnFormat(JSON)


for nid in tqdm(nids):
    # NOTE(review): the query relies on the endpoint predefining the rnews: and
    # owl: prefixes - confirm before pointing this at another endpoint.
    q = """
    SELECT DISTINCT * where {
    ?news rnews:identifier "%s" ;
          owl:sameAs ?event
    } LIMIT 1""" % nid
    sparql.setQuery(q)

    try:
        result = sparql.queryAndConvert()
        event_data = result["results"]["bindings"]
    except Exception as exc:
        # `Exception` rather than a bare except, so the long-running loop can
        # still be interrupted from the keyboard; the failure is recorded
        # explicitly as "no event" instead of leaving the key absent.
        print(f"Something went wrong when converting event: {nid}", repr(exc))
        news_mapping[nid] = None
        continue

    # The query is LIMIT 1, so at most one binding is returned.
    news_mapping[nid] = event_data[0]['event']['value'] if event_data else None

  0%|          | 0/50713 [00:00<?, ?it/s]

In [10]:
# Attach the event URI found for each article (None when there is none), then
# count the rows that ended up with an event.
data['event'] = data['public_identifier'].apply(lambda public_id: news_mapping.get(public_id))
has_event = data['event'].notnull()
len(data[has_event])

5109

In [11]:
# Build the event graph: for every event URI linked to an article, fetch its
# attributes from Wikidata, keep the ones listed in sem_props (actors, places,
# times and the label) and add them to `graph` as SEM triples.
from utils import uri_validator, node_creation

graph = Graph()
place_set = set()
time_set = set()
actor_set = set()


event_mapping = {} # Wikidata event URL -> event label
failed_events= [] # Events whose lookup failed (e.g. dangling owl:sameAs); handled by a later cell

sparql = SPARQLWrapper(
    "https://query.wikidata.org/sparql"
)
sparql.setReturnFormat(JSON)

# Columns expected after flattening the SPARQL bindings. 'result.datatype' only
# exists when at least one binding is a typed literal, hence the reindex below.
binding_columns = ['wiki_prop.value', 'result.value', 'result.datatype']
# Every SEM predicate that sem_classes maps to sem.Time.
time_properties = (sem.hasTime, sem.hasBeginTimeStamp, sem.hasEndTimeStamp, sem.hasTimeStamp)

for event in tqdm(data['event'].unique().tolist()):
    if not isinstance(event, str):  # None / NaN: article without an event
        continue
    event_ = f"wd:{event.split('/')[-1]}"

    sparql.setQuery("""
    SELECT (?p as ?wiki_prop) (?o as ?result)
    WHERE {{{

        %s ?p ?temp.
      ?temp rdfs:label ?o.
      FILTER (lang(?o) = 'en') }
      }

      UNION

      {
       SELECT *
       WHERE{
        %s ?p ?o.
         FILTER(lang(?o) = 'en' || lang(?o)='') }}
     } """ % (event_, event_))

    try:
        result = sparql.queryAndConvert()
    except Exception as exc:
        # Stop here: carrying on would reuse `result` from the previous
        # iteration and attach another event's attributes to this one.
        print(f"Something went wrong when converting event: {event}", repr(exc))
        failed_events.append(event)
        continue

    try:
        event_data = pd.json_normalize(result["results"]["bindings"]).reindex(columns=binding_columns)
        event_data = event_data.rename(columns={"wiki_prop.value": "property", "result.value": "value", "result.datatype": "datatype"})
        event_data = event_data.loc[event_data['property'].isin(sem_props.keys())].reset_index(drop=True) #Only keep the 4W attributes
        event_data['property'] = event_data['property'].replace(sem_props)
        # The label row is needed to map the Wikidata URL to a name; an event
        # without one raises IndexError here and is recorded as failed.
        event_name = event_data.loc[event_data['property'] == 'what', 'value'].values[0]
        event_data = event_data[event_data['property'] != 'what'] #The label row must not be processed as an attribute
        event_mapping[event] = event_name
    except Exception:
        print(f"Error when searching for event: {event}")
        failed_events.append(event)
        continue

    #event_uri = node_creation('', event_name, base_add='/event') #Generate the URI for the event
    graph.add((URIRef(event), RDF.type, sem.Event)) #Create the event
    graph.add((URIRef(event), RDF.value, Literal(event_name)))

    for index, row in event_data.iterrows():
        uri = node_creation('', row['value'], base_add='') #Generate the URI for the property
        if uri_validator(uri) == False:
            print(f"Found issue, generated link is not an uri:\n{uri}")
        graph.add((uri, RDF.type, sem_classes[row['property']])) #Create the node for the property, and look up its class
        if pd.notna(row['datatype']): #It has a declared datatype
            graph.add((uri, RDF.value, Literal(row['value'], datatype=row['datatype'])))
        else:
            graph.add((uri, RDF.value, Literal(row['value']))) #Add the value of the relation to the graph
        graph.add((URIRef(event), row['property'], uri)) #Connect the event to the property

        # Remember which events have at least one actor / place / time.
        if row['property'] == sem.hasActor:
            actor_set.add(event)
        elif row['property'] == sem.hasPlace:
            place_set.add(event)
        elif row['property'] in time_properties:
            time_set.add(event)
        else:
            print(f"property not supported: {row['property']}")

  0%|          | 0/29 [00:00<?, ?it/s]

Error when searching for event: http://www.wikidata.org/entity/Q100919128


In [15]:
# Write the event graph to disk as Turtle.
event_ttl_path = path.join(OUT_ROOT, 'event_data.ttl')
graph.serialize(event_ttl_path, format='turtle')

# Blank out every column of the rows whose event could not be resolved.
# NOTE(review): this nulls the whole row, not just 'event' - confirm that is intended.
failed_rows = data['event'].isin(failed_events)
data[failed_rows] = None

In [17]:
import time
import re

def to_date(string_date):
    """Extract the date from a dateline and return it as 'YYYY-MM-DD'.

    The dateline is expected to look like ``"Paris, June 6, 2021 (AFP) -"``:
    an optional ``"<place>, "`` prefix, the date, then ``(AFP)`` or ``(BSW)``
    followed by ``" -"``. Month names are accepted abbreviated (``Jun``) or
    spelled out (``July``).

    Args:
        string_date: the dateline text; any non-string value is treated as missing.

    Returns:
        The ISO formatted date string, or None when the input is not a string,
        does not look like a dateline, or holds a date that cannot be parsed
        (in that last case the offending text is printed).
    """
    if not isinstance(string_date, str):
        return None

    regex = r"(?:.+, )?(.+ \d+) \((AFP|BSW)\) -"
    match = re.search(regex, string_date)
    if match is None:
        # Previously this left `clean_date` unbound and raised from the except handler.
        return None

    clean_date = match.group(1).replace('June', 'Jun').replace('April', 'Apr').replace('2 oct 2021', 'Oct 2, 2021')

    # Try the abbreviated month form first (original behaviour), then the full name.
    for date_format in ('%b %d, %Y', '%B %d, %Y'):
        try:
            parsed = time.strptime(clean_date, date_format)
        except ValueError:
            continue
        return time.strftime('%Y-%m-%d', parsed)

    print(clean_date)
    return None


# Derive the 'date' column by running to_date over each dateline, then preview the frame.
data['date'] = data['dateline'].apply(to_date)
data.head()

Unnamed: 0,file_name,name_label,news_identifier,public_identifier,dateline,headline,country,city,keywords,genre,body_content,body_length,prediction,e1,e2,rel,event,date
0,afp.com-20210606T170343Z-TX-PAR-WXO51.xml,Tennis-FRA-Open-Azarenka-sexism,TX-PAR-WXO51,urn:newsml:afp.com:20210606T170343Z:TX-PAR-WXO...,"Paris, June 6, 2021 (AFP) -",Azarenka says French Open lacks true gender eq...,FRA,Paris,"Tennis, FRA, Open, Azarenka, sexism",,Victoria Azarenka criticised French Open organ...,518.0,lack of equality Victoria Azarenka criticise...,lack of equality,Victoria Azarenka criticised French Open organ...,cause,,2021-06-06
2,afp.com-20210606T170343Z-TX-PAR-WXO51.xml,Tennis-FRA-Open-Azarenka-sexism,TX-PAR-WXO51,urn:newsml:afp.com:20210606T170343Z:TX-PAR-WXO...,"Paris, June 6, 2021 (AFP) -",Azarenka says French Open lacks true gender eq...,FRA,Paris,"Tennis, FRA, Open, Azarenka, sexism",,"Last year Azarenka fumed at being left to ""sit...",518.0,Covid-19 pandemic the 2020 tournament was de...,Covid-19 pandemic,the 2020 tournament was delayed to late September,cause,,2021-06-06
3,afp.com-20210606T170343Z-TX-PAR-WXO51.xml,Tennis-FRA-Open-Azarenka-sexism,TX-PAR-WXO51,urn:newsml:afp.com:20210606T170343Z:TX-PAR-WXO...,"Paris, June 6, 2021 (AFP) -",Azarenka says French Open lacks true gender eq...,FRA,Paris,"Tennis, FRA, Open, Azarenka, sexism",,Azarenka again took aim at officials following...,518.0,scheduling took aim cause,scheduling,took aim,cause,,2021-06-06
4,afp.com-20210606T170343Z-TX-PAR-WXO51.xml,Tennis-FRA-Open-Azarenka-sexism,TX-PAR-WXO51,urn:newsml:afp.com:20210606T170343Z:TX-PAR-WXO...,"Paris, June 6, 2021 (AFP) -",Azarenka says French Open lacks true gender eq...,FRA,Paris,"Tennis, FRA, Open, Azarenka, sexism",,Serena Williams won the first official night m...,518.0,curfew played cause,curfew,played,cause,,2021-06-06
5,afp.com-20210606T170343Z-TX-PAR-WXO51.xml,Tennis-FRA-Open-Azarenka-sexism,TX-PAR-WXO51,urn:newsml:afp.com:20210606T170343Z:TX-PAR-WXO...,"Paris, June 6, 2021 (AFP) -",Azarenka says French Open lacks true gender eq...,FRA,Paris,"Tennis, FRA, Open, Azarenka, sexism",,The French Tennis Federation's three-year part...,518.0,partnership reserved enable,partnership,reserved,enable,,2021-06-06


In [18]:
# Export the cleaned table; only the listed columns are written, in this order.
export_columns = [
    'file_name', 'name_label', 'news_identifier', 'public_identifier',
    'date', 'headline', 'country', 'city', 'keywords', 'genre',
    'body_content', 'body_length', 'e1', 'e2', 'rel', 'event',
]
data.to_csv(path.join(DATA_ROOT, 'afp2021.csv'), columns=export_columns)

In [33]:
import os


def split_csv(filehandler, delimiter=',', row_limit=10000,
    output_name_template='output_%s.csv', output_path='.', keep_headers=True):
    """
    Split a CSV stream into several numbered CSV files.

    Pieces are numbered from 1. Each piece holds at most `row_limit` data rows
    (the header row is not counted). The first piece is always created, even
    when the input is empty.

    Arguments:

        `filehandler`: An open text file (or any iterable of lines) holding the CSV.
            Real files should be opened with ``newline=''``.
        `delimiter`: Field separator used for both reading and writing.
        `row_limit`: The number of rows you want in each output file. 10,000 by default.
        `output_name_template`: A %s-style template for the numbered output files.
        `output_path`: Where to stick the output files.
        `keep_headers`: Whether or not to repeat the first input row at the top
            of each output file.

    Example usage:

        >> with open('input.csv', 'r', newline='') as f:
        >>     split_csv(f, row_limit=1000, output_path='chunks')

    """
    # Local import: nothing else in the notebook brings `csv` into scope.
    import csv

    reader = csv.reader(filehandler, delimiter=delimiter)
    # None both when headers are not wanted and when the input is empty.
    headers = next(reader, None) if keep_headers else None

    def _open_piece(piece_number):
        """Open output file number `piece_number` and write the header to it."""
        piece_path = os.path.join(output_path, output_name_template % piece_number)
        # newline='' leaves line endings to the csv writer (no doubled '\r' on Windows).
        handle = open(piece_path, 'w', newline='')
        writer = csv.writer(handle, delimiter=delimiter)
        if headers is not None:
            writer.writerow(headers)
        return handle, writer

    current_piece = 1
    current_handle, current_out_writer = _open_piece(current_piece)
    try:
        for i, row in enumerate(reader):
            if i >= row_limit * current_piece:
                # Current piece is full: close it so its content is flushed, then roll over.
                current_handle.close()
                current_piece += 1
                current_handle, current_out_writer = _open_piece(current_piece)
            current_out_writer.writerow(row)
    finally:
        current_handle.close()

In [43]:
# Split the exported CSV into pieces of at most 180000 data rows each.
# newline='' is what the csv module expects of its input file: without it,
# line breaks inside quoted fields may be altered while reading.
with open(path.join(DATA_ROOT, 'afp2021.csv'), 'r', newline='') as f:
    split_csv(f, output_name_template=f'{DATA_ROOT}/afp2021_%s.csv', row_limit=180000)

In [44]:
import uuid
# Base IRI under which the article URIs are minted.
BASE = 'http://kflow.eurecom.fr/'

In [89]:
# Build the article graph: one rnews:Article per row, with its identifier,
# location, date, the sentence it contains, the two related spans (FARO relata)
# and, when known, the event it is about.
# NOTE(review): node_creation comes from utils; its return type is not visible
# here. Its results are wrapped in URIRef only where the original code did so.
graph = Graph()
for index, row in tqdm(data.iterrows(), total=data.shape[0]):
    # Rows blanked out earlier (unresolved events) have no file name: skip them.
    # isinstance also covers NaN, which `is None` would let through.
    if not isinstance(row['file_name'], str):
        continue

    # Deterministic article URI derived from the file name.
    article_URI = f"{BASE}news/{uuid.uuid5(uuid.NAMESPACE_DNS, row['file_name'])}"
    article = URIRef(article_URI)

    graph.add((article, RDF.type, rnews.Article))

    identifier_uri = node_creation('', row['public_identifier'], base_add='/identifier')
    graph.add((article, rnews.identifier, URIRef(identifier_uri))) #Link the PublicID to the article
    graph.add((URIRef(identifier_uri), RDF.value, Literal(row['public_identifier'])))

    if isinstance(row['city'], str):
        location_uri = node_creation('', row['city'], base_add='')
        location = URIRef(location_uri)
        graph.add((article, schema.contentLocation, location))
        graph.add((location, RDF.value, Literal(row['city'])))
        graph.add((location, RDF.type, schema.Place))

    if isinstance(row['date'], str):
        time_uri = node_creation('', row['date'], base_add='')
        time_ent = URIRef(time_uri)
        graph.add(( article, schema.contentReferenceTime, time_ent ))
        graph.add((time_ent, RDF.value, Literal(row['date'], datatype=XSD.date) ))
        graph.add((time_ent, RDF.type, schema.Time))

    sentence = row['body_content']
    e1 = row['e1'].strip()
    e2 = row['e2'].strip()
    rel = row['rel'].strip()

    sentence_uri = node_creation('', sentence, base_add='/sentence') #Generate the URI for the sentence
    graph.add((article, nif.sentence, sentence_uri)) #Link the article to the sentence
    graph.add((sentence_uri, RDF.type, nif.Sentence)) #Make the sentence URI of class 'Sentence'
    graph.add((sentence_uri, RDF.value, Literal(sentence))) #Set the value of the URI equal to the sentence

    subject_uri = node_creation('', e1 + str(sentence_uri), base_add='/subject') #Generate the URI for the subject, for now add the uri of sentence to make it unique
    #graph.add((sentence_uri, faro.Relata, subject_uri)) #Add the subject to the sentence
    graph.add((sentence_uri, nif.word, subject_uri)) #Add the subject to the sentence
    graph.add((subject_uri, RDF.type, faro.Relata)) #Make it of class 'Relata'
    graph.add((subject_uri, RDF.value, Literal(e1))) #Set the value of the subject URI equal to the subject

    object_uri = node_creation('', e2 + str(sentence_uri), base_add='/object') #Generate the URI for the object, for now add the uri of sentence to make it unique
    #graph.add((sentence_uri, faro.Relata, object_uri)) #Add the object to the sentence
    graph.add((sentence_uri, nif.word, object_uri)) #Add the object to the sentence
    graph.add((object_uri, RDF.type, faro.Relata)) #Make it of class 'Relata'
    graph.add((object_uri, RDF.value, Literal(e2))) #Set the value of the object URI equal to the object

    graph.add((subject_uri, faro_classes[rel], object_uri)) #Add relation between NERs

    # Link the article to ITS OWN event. The previous code used the leftover
    # loop variable `event` from an earlier cell, i.e. the same event for every row.
    if isinstance(row['event'], str):
        event_uri = URIRef(row['event'])
        graph.add((article, schema.about, event_uri))
        graph.add((event_uri, schema.subjectOf, article))

  0%|          | 0/538708 [00:00<?, ?it/s]

In [90]:
# Register readable prefixes (in this order) so the Turtle output uses them,
# then write the article graph to disk.
prefix_bindings = (
    ('nif', nif),
    ('faro', faro),
    ('sem', sem),
    ('owl', owl),
    ('rnews', rnews),
    ('schema', schema),
    ('xsd', XSD),
    ('kflow', 'http://kflow.eurecom.fr/'),
    ('kflow-s', 'http://kflow.eurecom.fr/sentence/'),
    ('kflow-n', 'http://kflow.eurecom.fr/news/'),
    ('kflow-i', 'http://kflow.eurecom.fr/identifier/'),
)
for prefix, namespace in prefix_bindings:
    graph.bind(prefix, namespace)
graph.serialize(path.join(OUT_ROOT,'afp2021.ttl'), format='turtle')

<Graph identifier=N862df1e49c8646e19dc04596acab5962 (<class 'rdflib.graph.Graph'>)>

In [14]:
# Read the serialized article graph back as a list of lines and report its size in MiB.
OUT_ROOT = '../dump/afp/'
from os import path
import math
filepath = path.join(OUT_ROOT,'afp2021.ttl')
# %d template for the numbered part files written further down.
outpath = filepath.replace('.ttl', '_%d.ttl')

# NOTE(review): the file is opened with the platform default encoding - confirm
# that this matches how the Turtle file was written.
with open(filepath, 'r') as file:
    body = file.readlines()

filesize = path.getsize(filepath)
filesize_mega = filesize / 1024 / 1024
filesize_mega

338.5618190765381

In [15]:
# Size budget per part, in the same unit as filesize_mega.
# NOTE(review): presumably an upload limit of the target system - confirm.
input_limit = 90
# Aim at 90% of the budget to leave some headroom.
limit = input_limit * 0.90
# Number of parts needed for the whole file to fit under `limit` each.
n_parts = math.ceil(filesize_mega / limit)
n_parts

5

In [16]:
# Separate the leading '@prefix' lines (repeated in every part) from the content.
prefixes = []
content_start = len(body)  # stays here when the file is empty or holds only prefixes
for i, x in enumerate(body):
    if not x.startswith('@prefix'):
        content_start = i
        break
    prefixes.append(x)
content = body[content_start:]

# Cut the content into buckets of roughly `span` lines, always cutting on a blank
# line so that the lines between two blank lines stay together.
buckets = []
span = math.floor(len(content) / n_parts)
while len(content) > span:
    # Preferred cut: the last blank line at or before `span` (index 0 excluded,
    # since cutting there would produce an empty bucket and never progress).
    splitting = span
    while splitting > 0 and content[splitting] != '\n':
        splitting -= 1
    if splitting == 0:
        # No usable blank line before `span`: take the next one after it, or the
        # end of the content, so the loop always advances.
        splitting = next(
            (j for j in range(span + 1, len(content)) if content[j] == '\n'),
            len(content),
        )
    buckets.append(content[0:splitting])
    content = content[splitting:]

# Whatever is left goes into the last bucket, or forms the only bucket when no
# cut was needed (e.g. n_parts == 1, which used to raise IndexError).
if buckets:
    buckets[-1] = buckets[-1] + content
elif content:
    buckets.append(content)

In [17]:
# Write each bucket to its own numbered file (numbering starts at 0), preceded
# by the shared '@prefix' header.
header = ''.join(prefixes)
for i, bucket in enumerate(buckets):
    with open(outpath % i, 'w') as f:
        f.write(header)
        f.writelines(bucket)