In [1]:
import lightrdf

# Folder paths; change them as necessary.
# Later cells build full paths by plain string concatenation of these values, so each one must keep its trailing "/".
HOME_FOLDER_PATH = "home/"  # Expected to house the data, input and output folders (shown just below)
DATA_FOLDER_PATH = "data/"  # The folder with the KG to be validated
INPUT_FOLDER_PATH = "input_output/"      # The folder housing the input files necessary for the KG's validation through external information
OUTPUT_FOLDER_PATH = "input_output/"     # The folder where this Jupyter Notebook's output is stored

In [2]:
# File name (or path below the data folder) of the KG to be validated.
kg_file_name = "yago-4.5.0.2-tiny/yago-tiny.ttl"
# The folder constants already end with "/", so the pieces are simply glued together.
data_file_path = f"{HOME_FOLDER_PATH}{DATA_FOLDER_PATH}{kg_file_name}"

# Document handle on the KG file; later cells query it through search_triples().
doc = lightrdf.RDFDocument(data_file_path)

In [3]:
# Extra Cell 1 (no need to run it, if running the main experiments is the primary intention)
parser = lightrdf.Parser()

# Prints the first triples of the KG file, up to the number given in "max_triples_to_print".
# (The previous version compared a counter starting at 1 against a hard-coded 20,
# which printed 19 triples; the same amount is kept here, but under an explicit name.)
max_triples_to_print = 19
for line_num, triple in enumerate(parser.parse(data_file_path, base_iri=None), start=1):
    print(triple)
    if line_num >= max_triples_to_print:
        break

('<http://yago-knowledge.org/schema#>', '<http://www.w3.org/2000/01/rdf-schema#label>', '"Manual YAGO 4.5 shapes declaration"@en')
('<http://schema.org/CreativeWork>', '<http://www.w3.org/2000/01/rdf-schema#subClassOf>', '<http://schema.org/Thing>')
('<http://schema.org/CreativeWork>', '<http://www.w3.org/ns/shacl#property>', '<http://yago-knowledge.org/schema#CreativeWork_property_27>')
('<http://schema.org/CreativeWork>', '<http://www.w3.org/ns/shacl#property>', '<http://yago-knowledge.org/schema#CreativeWork_property_38>')
('<http://schema.org/CreativeWork>', '<http://www.w3.org/ns/shacl#property>', '<http://yago-knowledge.org/schema#CreativeWork_property_20>')
('<http://schema.org/CreativeWork>', '<http://www.w3.org/ns/shacl#property>', '<http://yago-knowledge.org/schema#CreativeWork_property_28>')
('<http://schema.org/CreativeWork>', '<http://www.w3.org/ns/shacl#property>', '<http://yago-knowledge.org/schema#CreativeWork_property_19>')
('<http://schema.org/CreativeWork>', '<http:/

In [4]:
# Extra Cell 2 (no need to run it, if running the main experiments is the primary intention)
# Counts all triples, the unique subject/object terms ("entities") and the unique predicates ("relationships").
# Given its memory-intensive nature, there is no need to rerun it once the results have been recorded.
line_num = 0          # running count of triples
entity_set = set()    # every distinct subject and object seen
relation_set = set()  # every distinct predicate seen
# The pattern (None, None, None) is used here to visit all triples of the document.
for triple in doc.search_triples(None, None, None):
    line_num += 1
    entity_set.add(triple[0])
    relation_set.add(triple[1])
    # Objects are added unconditionally, so literal objects are counted among the "entities" as well.
    entity_set.add(triple[2])

print(line_num)  # The total amount of triples found
# len() works directly on a set; copying millions of entries into a list first only wastes memory.
print(len(entity_set))  # The number of unique entities found
print(len(relation_set))  # The number of unique relationships found
# Rebind both names to empty sets so the large sets can be reclaimed before the rest of the experiment runs.
entity_set = set()
relation_set = set()

23260619
12454422
122


In [5]:
# Normal flow continues from here
from lightrdf import Regex

In [6]:
# Find the temporal triples. In YAGO Tiny, these are the <s,p,o> triples whose o is a dateTime literal.
# The minimum date is tracked as well (an earlier version of this cell subtracted it from every date).
import re
from datetime import datetime, timedelta

doc = lightrdf.RDFDocument(data_file_path)

# Pre-filter handed to lightrdf: quoted literals that start with a four-digit year (no leading zero),
# contain a "T" and end in a zone designator ("Z" or a +/- offset).
regex_var =  Regex("\"[1-9][0-9]{3}-.+T[^.]+(Z|[+-].+)\"")

# Patterns are compiled once, outside the loop. The datatype pattern is a raw string: written as a
# normal string literal, "\^", "\<", "\/" ... are invalid escape sequences and make Python emit a warning.
datetime_suffix_pattern = re.compile(r"\^\^\<http:\/\/www\.w3\.org\/2001\/XMLSchema\#dateTime\>")
lowercase_letter_pattern = re.compile("[a-z]")

# In this dict, the s is the key while a list is the value, with this list containing the sublists with [p, o] temporal values
temporal_kg_data_dict = {}
# Maps each temporal predicate to the number of temporal triples that use it
temporal_categs_dict = {}

destination_file_name = "temporal_triples_file.txt"  # The name of the file to be created housing the temporal triples.

temporal_triple_export_file_name = HOME_FOLDER_PATH + OUTPUT_FOLDER_PATH + destination_file_name
# One "s\tp\to" string per temporal triple; collected here so that it can be written to the file above afterwards
temporal_triple_strings_list = []

line_num = 0     # Counts every regex match, including matches that turn out not to be temporal
min_date = None  # Earliest date seen so far
# The regex restricts the search to "date"-looking objects.
for triple in doc.search_triples(None, None, regex_var):
    line_num += 1
    # Leave only the date itself: strip the ^^<...XMLSchema#dateTime> datatype suffix and every double quote.
    date_text = datetime_suffix_pattern.sub("", triple[2]).replace("\"", "")
    # Any remaining lowercase letter means additional text is present, so the triple is not temporal.
    # (The "T" separator and the "Z" designator are uppercase and therefore do not trigger this.)
    if lowercase_letter_pattern.search(date_text):
        continue

    # Keep the original triple so that it can be printed to a file afterwards
    temporal_triple_strings_list.append(str(triple[0])+"\t"+str(triple[1])+"\t"+str(triple[2]))
    # NOTE: a trailing "Z" is only accepted by datetime.fromisoformat() from Python 3.11 onwards.
    triple_date = datetime.fromisoformat(date_text)
    if min_date is None or triple_date < min_date:
        min_date = triple_date
    # Add the temporal triple to the dict, s as the key, [p, o] as the value
    temporal_kg_data_dict.setdefault(triple[0], []).append([triple[1], triple_date])
    temporal_categs_dict[triple[1]] = temporal_categs_dict.get(triple[1], 0) + 1

print(len(temporal_triple_strings_list))  # Number of temporal triples
dict_list = list(temporal_kg_data_dict.items())
print(len(dict_list))  # Number of distinct subjects having at least one temporal triple

print(min_date)
# A disabled experiment used to follow here: subtracting min_date from every stored date.
# It is not necessary for the rest of the notebook. For reference, "date - min_date" yields a
# datetime.timedelta, which has no strftime() method.

# List the first 10 subjects together with their temporal [p, o] pairs
for subject_entry in dict_list[:10]:
    print(subject_entry)

print("-------------")
# List the number of temporal triples found, aggregated according to their predicate/relation
dict_list = list(temporal_categs_dict.items())
print(len(dict_list))
for predicate_entry in dict_list:
    print(predicate_entry)

  triple_2_re = re.sub("\^\^\<http:\/\/www\.w3\.org\/2001\/XMLSchema\#dateTime\>", "", triple[2])


300556
228774
1000-01-01 00:00:00+00:00
('<http://yago-knowledge.org/resource/Augusto_Pinochet>', [['<http://schema.org/birthDate>', datetime.datetime(1915, 11, 25, 0, 0, tzinfo=datetime.timezone.utc)], ['<http://schema.org/deathDate>', datetime.datetime(2006, 12, 10, 0, 0, tzinfo=datetime.timezone.utc)]])
('<http://yago-knowledge.org/resource/Andrei_Tarkovsky>', [['<http://schema.org/birthDate>', datetime.datetime(1932, 4, 4, 0, 0, tzinfo=datetime.timezone.utc)], ['<http://schema.org/deathDate>', datetime.datetime(1986, 12, 29, 0, 0, tzinfo=datetime.timezone.utc)]])
('<http://yago-knowledge.org/resource/Angola>', [['<http://schema.org/dateCreated>', datetime.datetime(1992, 8, 25, 0, 0, tzinfo=datetime.timezone.utc)]])
('<http://yago-knowledge.org/resource/Andrei_Sakharov>', [['<http://schema.org/birthDate>', datetime.datetime(1921, 5, 21, 0, 0, tzinfo=datetime.timezone.utc)], ['<http://schema.org/deathDate>', datetime.datetime(1989, 12, 14, 0, 0, tzinfo=datetime.timezone.utc)]])
('<ht

In [7]:
# Finds the temporal relations and their different variants.
# Also remembers the order of appearance of the different temporal relation variants, for eventual usage in frequent pattern mining.
# Does not have duplicate relations, but cases like DateTypeA < DateTypeB and DateTypeB > DateTypeA are not considered the same:
# which of the two is recorded depends on the order in which a subject's [p, o] pairs were stored.
dict_list = list(temporal_kg_data_dict.items())
print(len(dict_list))

# Stores the name of the temporal relation as "temporal_predicate_1\t(in)equality\ttemporal_predicate_2"
temporal_relation_names_list = []
# Stores how many times each element from the "temporal_relation_names_list" just above has appeared
temporal_relation_counters = {}
# A list of lists, each containing the subject as its first element, and as the rest the indexes of the temporal relations
# that the triples having this subject satisfy. The indexes refer to "temporal_relation_names_list", in order of appearance.
# This could be useful for extensions with regards to association rule mining.
s_and_temporal_true_index_list = []
# The counters of the temporal relations in the form {"temporal_predicate_1\ttemporal_predicate_2": {"(in)equality": counter_number}}
temporal_key_value_relations_counters = {}

for subject, po_pairs in dict_list:
    # Continue only if the current subject has two or more temporal [p, o] pairs, according to what has been found above
    if len(po_pairs) < 2:
        continue
    # Holds the current subject followed by the indexes of the temporal relations found for it
    current_s_and_temporal_list = [subject]
    # Run through every pair of positions (trgt_idx < othr_idx); othr_idx walks backwards, exactly as before,
    # so relations are discovered (and therefore numbered) in the same order.
    for trgt_idx in range(len(po_pairs)):
        trgt_predicate, trgt_date = po_pairs[trgt_idx]
        for othr_idx in range(len(po_pairs) - 1, trgt_idx, -1):
            othr_predicate, othr_date = po_pairs[othr_idx]
            # Determine how the first date compares to the second one
            if trgt_date < othr_date:
                comparison = "<"
            elif trgt_date > othr_date:
                comparison = ">"
            elif trgt_date == othr_date:
                comparison = "=="
            else:
                continue

            relation_name = trgt_predicate + "\t" + comparison + "\t" + othr_predicate
            pair_key = trgt_predicate + "\t" + othr_predicate
            # First sighting of this relation variant: register it everywhere with a zero counter.
            # (The counters dict holds exactly the names of the list, so it serves as the membership test.)
            if relation_name not in temporal_relation_counters:
                temporal_relation_names_list.append(relation_name)
                temporal_relation_counters[relation_name] = 0
                temporal_key_value_relations_counters.setdefault(pair_key, {})[comparison] = 0
            # Append the index of the relation found to be True for this subject and count the occurrence
            current_s_and_temporal_list.append(temporal_relation_names_list.index(relation_name))
            temporal_relation_counters[relation_name] += 1
            temporal_key_value_relations_counters[pair_key][comparison] += 1
    # Keep the subject only if at least one temporal relation was found for it
    if len(current_s_and_temporal_list) > 1:
        s_and_temporal_true_index_list.append(current_s_and_temporal_list)

# Counts the amount of temporal relations found, from the subjects that have at least two temporal [p, o] pairs
total_temporal_relations = sum(temporal_relation_counters.values())

# Shows the index of each temporal relation, the temporal relation, the times it appeared and
# the support of the relation, compared to all the temporal relations found.
for item_idx, relation_name in enumerate(temporal_relation_names_list):
    relation_count = temporal_relation_counters[relation_name]
    print(item_idx, relation_name, relation_count, round((float(relation_count)/total_temporal_relations), 6))

# Preview of at most 10 subjects; slicing avoids an IndexError when fewer than 10 subjects qualified
for s_and_indexes in s_and_temporal_true_index_list[:10]:
    print(s_and_indexes)

print("-----")

print(temporal_key_value_relations_counters)

228774
0 <http://schema.org/birthDate>	<	<http://schema.org/deathDate> 46685 0.650372
1 <http://schema.org/deathDate>	>	<http://schema.org/birthDate> 20876 0.290825
2 <http://schema.org/dateCreated>	==	<http://schema.org/dateCreated> 215 0.002995
3 <http://schema.org/dateCreated>	<	<http://schema.org/dissolutionDate> 2357 0.032836
4 <http://schema.org/dissolutionDate>	>	<http://schema.org/dateCreated> 427 0.005949
5 <http://schema.org/birthDate>	>	<http://schema.org/deathDate> 42 0.000585
6 <http://schema.org/deathDate>	<	<http://schema.org/birthDate> 46 0.000641
7 <http://schema.org/dateCreated>	==	<http://schema.org/dissolutionDate> 31 0.000432
8 <http://schema.org/endDate>	>	<http://schema.org/startDate> 17 0.000237
9 <http://schema.org/startDate>	<	<http://schema.org/endDate> 965 0.013443
10 <http://schema.org/birthDate>	==	<http://schema.org/deathDate> 42 0.000585
11 <http://schema.org/dateCreated>	>	<http://schema.org/dissolutionDate> 3 4.2e-05
12 <http://schema.org/startDate>	==

In [8]:
print(temporal_key_value_relations_counters)
# Stores all the possible interval cases encountered before, as ["temporal_predicate_1", "temporal_predicate_2"].
# Each key of the counters dict has the form "temporal_predicate_1\ttemporal_predicate_2"; splitting it on the
# tab (\t) character gives the two predicates. The keys are visited once, in insertion order, instead of
# rebuilding the full key list for every single element.
interval_cases_list = [pair_key.split('\t') for pair_key in temporal_key_value_relations_counters]

print(interval_cases_list)

{'<http://schema.org/birthDate>\t<http://schema.org/deathDate>': {'<': 46685, '>': 42, '==': 42}, '<http://schema.org/deathDate>\t<http://schema.org/birthDate>': {'>': 20876, '<': 46, '==': 14}, '<http://schema.org/dateCreated>\t<http://schema.org/dateCreated>': {'==': 215}, '<http://schema.org/dateCreated>\t<http://schema.org/dissolutionDate>': {'<': 2357, '==': 31, '>': 3}, '<http://schema.org/dissolutionDate>\t<http://schema.org/dateCreated>': {'>': 427, '<': 2, '==': 1}, '<http://schema.org/endDate>\t<http://schema.org/startDate>': {'>': 17}, '<http://schema.org/startDate>\t<http://schema.org/endDate>': {'<': 965, '==': 58, '>': 1}}
[['<http://schema.org/birthDate>', '<http://schema.org/deathDate>'], ['<http://schema.org/deathDate>', '<http://schema.org/birthDate>'], ['<http://schema.org/dateCreated>', '<http://schema.org/dateCreated>'], ['<http://schema.org/dateCreated>', '<http://schema.org/dissolutionDate>'], ['<http://schema.org/dissolutionDate>', '<http://schema.org/dateCreate

In [9]:
# Definition of the temporal intervals
import pandas as pd

# A dictionary where the key is a subject and, as values, there are additional key-value pairs, where
# the inner key is the kind of relation (e.g. birthDate\tdeathDate) and the inner value the interval.
temporal_intervals_dict = {}

# The dates are passed to pandas as text in this format (any time zone information is not part of it)
timestamp_format = '%Y-%m-%d %H:%M:%S'

# Find all the cases where intervals can be created and define them
for key_subject, values_po_list in temporal_kg_data_dict.items():
    # For the current subject, its [p, o] pairs as p: o (the predicate is the key, the date the value).
    # If a predicate occurs more than once for the subject, the last date stored for it wins.
    temporal_po_dict = {}
    for po_pair_list in values_po_list:
        temporal_po_dict[po_pair_list[0]] = po_pair_list[1]
    # Continue only if the current subject has at least two distinct predicates, so that intervals can be created
    if len(temporal_po_dict) < 2:
        continue
    # The subject gets an entry even when none of the interval cases applies (it then stays empty)
    temporal_intervals_dict[key_subject] = {}
    subject_predicates = set(temporal_po_dict)
    # An interval case applies when the subject has all of the case's predicates
    if not any(subject_predicates.issuperset(interval_case) for interval_case in interval_cases_list):
        continue
    # The earliest and latest date do not depend on which case matched, so they are computed a single time
    # instead of once per matching case (every matching case used to write the very same entry).
    # On ties, min()/max() return the first such pair in insertion order.
    # NOTE(review): the minimum/maximum are taken over ALL of the subject's predicates, not only over the
    # two predicates of the matching case; for subjects with three or more temporal predicates the interval
    # may therefore involve a predicate outside that case. Confirm that this is intended.
    min_key, earliest_date = min(temporal_po_dict.items(), key=lambda po_item: po_item[1])
    max_key, latest_date = max(temporal_po_dict.items(), key=lambda po_item: po_item[1])
    min_value = datetime.strftime(earliest_date, timestamp_format)
    max_value = datetime.strftime(latest_date, timestamp_format)
    temporal_intervals_dict[key_subject][str(min_key+"\t"+max_key)] = pd.Interval(pd.Timestamp(min_value), pd.Timestamp(max_value), closed="both")

# Preview of the first 10 subjects and their intervals
for line_num, (key_item, values_item) in enumerate(temporal_intervals_dict.items(), start=1):
    print(key_item, values_item)
    if line_num == 10:
        break

<http://yago-knowledge.org/resource/Augusto_Pinochet> {'<http://schema.org/birthDate>\t<http://schema.org/deathDate>': Interval(1915-11-25 00:00:00, 2006-12-10 00:00:00, closed='both')}
<http://yago-knowledge.org/resource/Andrei_Tarkovsky> {'<http://schema.org/birthDate>\t<http://schema.org/deathDate>': Interval(1932-04-04 00:00:00, 1986-12-29 00:00:00, closed='both')}
<http://yago-knowledge.org/resource/Andrei_Sakharov> {'<http://schema.org/birthDate>\t<http://schema.org/deathDate>': Interval(1921-05-21 00:00:00, 1989-12-14 00:00:00, closed='both')}
<http://yago-knowledge.org/resource/Alberich_of_Reims> {'<http://schema.org/birthDate>\t<http://schema.org/deathDate>': Interval(1085-01-01 00:00:00, 1141-01-01 00:00:00, closed='both')}
<http://yago-knowledge.org/resource/Anthony_Sweijs> {'<http://schema.org/birthDate>\t<http://schema.org/deathDate>': Interval(1852-07-18 00:00:00, 1937-09-30 00:00:00, closed='both')}
<http://yago-knowledge.org/resource/Andrea_del_Sarto> {'<http://schema.o

In [10]:
# Per predicate pair: print every comparison variant with its count and its support within that pair
for outer_key_item, variant_counts in temporal_key_value_relations_counters.items():
    total_relation_number = sum(variant_counts.values())
    for inner_key_item, inner_value_item in variant_counts.items():
        support = round(float(inner_value_item)/total_relation_number, 4)
        print(outer_key_item, inner_key_item, inner_value_item, support)
    print()

# Per predicate pair: print only the dominant variant, i.e. the one with the highest count
# (when counts tie, the variant that was recorded first is the one reported)
for outer_key_item, variant_counts in temporal_key_value_relations_counters.items():
    total_relation_number = sum(variant_counts.values())
    max_key_item, max_value_item = max(variant_counts.items(), key=lambda variant: variant[1])
    support = round(float(max_value_item)/total_relation_number, 4)
    print(outer_key_item, max_key_item, max_value_item, support)

<http://schema.org/birthDate>	<http://schema.org/deathDate> < 46685 0.9982
<http://schema.org/birthDate>	<http://schema.org/deathDate> > 42 0.0009
<http://schema.org/birthDate>	<http://schema.org/deathDate> == 42 0.0009

<http://schema.org/deathDate>	<http://schema.org/birthDate> > 20876 0.9971
<http://schema.org/deathDate>	<http://schema.org/birthDate> < 46 0.0022
<http://schema.org/deathDate>	<http://schema.org/birthDate> == 14 0.0007

<http://schema.org/dateCreated>	<http://schema.org/dateCreated> == 215 1.0

<http://schema.org/dateCreated>	<http://schema.org/dissolutionDate> < 2357 0.9858
<http://schema.org/dateCreated>	<http://schema.org/dissolutionDate> == 31 0.013
<http://schema.org/dateCreated>	<http://schema.org/dissolutionDate> > 3 0.0013

<http://schema.org/dissolutionDate>	<http://schema.org/dateCreated> > 427 0.993
<http://schema.org/dissolutionDate>	<http://schema.org/dateCreated> < 2 0.0047
<http://schema.org/dissolutionDate>	<http://schema.org/dateCreated> == 1 0.0023



In [11]:
def acceptable_equality_cases():
    """Load the predicate groups among which temporal equality is acceptable.

    Reads "acceptable_equality_cases.txt" from HOME_FOLDER_PATH + INPUT_FOLDER_PATH.
    Every non-blank line holds the predicates among which temporal equality is
    acceptable, separated by tabs. Blank lines are ignored.

    Returns:
        A list of lists. For every line read, the list of its tab-separated fields is
        added, immediately followed by a two-element list holding the line's first two
        fields in swapped order, so that a pair can be looked up in either direction.

    Raises:
        OSError: If the input file cannot be opened.
        IndexError: If a non-blank line holds fewer than two tab-separated fields.
    """
    equality_cases_list = []
    input_file_name = "acceptable_equality_cases.txt"
    # The "with" block closes the file even if a malformed line raises an exception
    with open(HOME_FOLDER_PATH + INPUT_FOLDER_PATH + input_file_name) as input_file:
        for raw_line in input_file:
            line_text = raw_line.rstrip("\n")  # Remove the line change character
            # Skip empty lines (e.g. a trailing one); they hold no predicates and would fail below
            if not line_text.strip():
                continue
            line_content = line_text.split('\t')  # Split the line's contents by tab
            equality_cases_list.append(line_content)
            equality_cases_list.append([line_content[1], line_content[0]])
    return equality_cases_list

In [12]:
def acceptable_interval_limits(input_file_path=None):
    """Load the validity-interval rules defined through external information.

    Each non-blank line of the input file holds tab-separated fields: the two
    predicates that define an interval, followed by its limits. The cell that
    applies these rules reads field 2 as the lower limit and field 3 as the upper
    limit, both expressed in days, with "-" marking a limit that is not applicable.
    The fields are returned as strings; no conversion or validation is done here.
    Blank lines are skipped.

    Args:
        input_file_path: Optional path of the file to read. When omitted, the file
            "acceptable_interval_rules.txt" inside HOME_FOLDER_PATH + INPUT_FOLDER_PATH is read.

    Returns:
        A list of lists, one inner list of string fields per rule.
    """
    if input_file_path is None:
        input_file_name = "acceptable_interval_rules.txt"
        input_file_path = HOME_FOLDER_PATH + INPUT_FOLDER_PATH + input_file_name

    # A list of lists where each internal list contains the pair of predicates making an interval,
    # as well as the lower and upper limits of the possible values for said interval.
    interval_limits_list = []
    # The context manager closes the file even when reading fails part-way
    with open(input_file_path, encoding="utf8") as input_file:
        for raw_line in input_file:
            stripped_line = raw_line.rstrip("\n")  # Remove the line change character
            if stripped_line.strip() == "":
                # A blank line would otherwise be stored as a meaningless [''] rule
                continue
            interval_limits_list.append(stripped_line.split('\t'))
    return interval_limits_list

In [None]:
# From the next cells, run whichever is desired, each time.
# The first cell only applies the fully-automatic internal validation method for finding inconsistent temporal triples,
# the second cell has the fully-automatic method, alongside the relaxation of what constitutes the dominant temporal relation, via external information,
# while the third combines the two methods of the second cell with temporal validity intervals defined through external information.

In [13]:
# This applies the automatic method for finding temporal inconsistencies within a TKG, which is reliant
# exclusively on the KG's internal data. External information for the temporal relations is not used.
automatically_produced_erroneous_patterns = set()

# Find the erroneous patterns: for every temporal relation (a "predicate_1\tpredicate_2" key), the variant
# ('<', '>' or '==') with the highest counter is considered dominant and every other variant is erroneous.
for outer_key_item, variant_counters in temporal_key_value_relations_counters.items():
    # On ties, max() returns the first maximal variant in the dictionary's iteration order
    max_key_item = max(variant_counters, key=variant_counters.get)
    outer_key_split = outer_key_item.split('\t')
    for inner_key_item in variant_counters:
        if inner_key_item != max_key_item:
            automatically_produced_erroneous_patterns.add(outer_key_split[0]+"\t"+inner_key_item+"\t"+outer_key_split[1])

print(automatically_produced_erroneous_patterns)

automatically_produced_erroneous_patterns = list(automatically_produced_erroneous_patterns)

# Stores the indexes (positions within temporal_relation_names_list) of the patterns found to be potentially inconsistent
indexes_of_automatically_produced_erroneous_patterns = [
    temporal_relation_names_list.index(erroneous_pattern)
    for erroneous_pattern in automatically_produced_erroneous_patterns
]

print(indexes_of_automatically_produced_erroneous_patterns)

# Stores the subjects (with subject referring to s from the <s,p,o> triples) that have at least one pair of inconsistent
# temporal triples, which is a pair of triples satisfying a temporal relationship variant found to be inconsistent.
# Each entry of s_and_temporal_true_index_list holds the subject at position 0, followed by pattern indexes.
erroneous_pattern_index_lookup = set(indexes_of_automatically_produced_erroneous_patterns)  # Built once, not per subject
automatically_erroneous_subject_list = [
    subject_and_indexes[0]
    for subject_and_indexes in s_and_temporal_true_index_list
    if not erroneous_pattern_index_lookup.isdisjoint(subject_and_indexes[1:])
]

destination_file_name = "automatically_inconsistent_triples_file.txt"
automatically_erroneous_triple_file_name = HOME_FOLDER_PATH + OUTPUT_FOLDER_PATH + destination_file_name

# Collects all the triples sharing an erroneous subject; a set gives constant-time membership tests,
# instead of scanning the whole subject list for every triple.
erroneous_subject_lookup = set(automatically_erroneous_subject_list)
automatically_erroneous_triple_file_contents = [
    triple_string
    for triple_string in temporal_triple_strings_list
    if triple_string.split('\t')[0] in erroneous_subject_lookup
]

# Prints the collected triples to file, one per line; the context manager closes the file even if writing fails
with open(automatically_erroneous_triple_file_name, "w+", encoding="utf8") as automatically_erroneous_triples_file:
    automatically_erroneous_triples_file.writelines(
        triple_string+"\n" for triple_string in automatically_erroneous_triple_file_contents
    )

{'<http://schema.org/deathDate>\t==\t<http://schema.org/birthDate>', '<http://schema.org/dateCreated>\t==\t<http://schema.org/dissolutionDate>', '<http://schema.org/dissolutionDate>\t==\t<http://schema.org/dateCreated>', '<http://schema.org/startDate>\t>\t<http://schema.org/endDate>', '<http://schema.org/birthDate>\t>\t<http://schema.org/deathDate>', '<http://schema.org/startDate>\t==\t<http://schema.org/endDate>', '<http://schema.org/dissolutionDate>\t<\t<http://schema.org/dateCreated>', '<http://schema.org/dateCreated>\t>\t<http://schema.org/dissolutionDate>', '<http://schema.org/birthDate>\t==\t<http://schema.org/deathDate>', '<http://schema.org/deathDate>\t<\t<http://schema.org/birthDate>'}
[14, 7, 16, 15, 5, 12, 13, 11, 10, 6]


In [14]:
# This firstly applies the automatic method for finding temporal inconsistencies within a TKG, reliant only on
# the KG's internal information, and then uses external information for defining which temporal relations should
# have the equality as a part of the dominant variant.
automatically_produced_erroneous_patterns = set()

# The list with the cases where, for temporal relations with an inequality dominant variant, the equality is also acceptable
equality_cases_list = acceptable_equality_cases()
print(equality_cases_list)

# Find the erroneous patterns: for every temporal relation (a "predicate_1\tpredicate_2" key), the variant
# ('<', '>' or '==') with the highest counter is considered dominant.
for outer_key_item, variant_counters in temporal_key_value_relations_counters.items():
    # On ties, max() returns the first maximal variant in the dictionary's iteration order
    max_key_item = max(variant_counters, key=variant_counters.get)
    outer_key_split = outer_key_item.split('\t')
    for inner_key_item in variant_counters:
        if inner_key_item == max_key_item:
            continue
        # A non-dominant '<' or '>' is always erroneous. A non-dominant '==' is erroneous only when
        # no externally defined equality case contains both predicates of the relation.
        equality_is_acceptable = inner_key_item == "==" and any(
            outer_key_split[0] in equality_list and outer_key_split[1] in equality_list
            for equality_list in equality_cases_list
        )
        if not equality_is_acceptable:
            automatically_produced_erroneous_patterns.add(outer_key_split[0]+"\t"+inner_key_item+"\t"+outer_key_split[1])

print(automatically_produced_erroneous_patterns)

automatically_produced_erroneous_patterns = list(automatically_produced_erroneous_patterns)

# Stores the indexes (positions within temporal_relation_names_list) of the patterns found to be potentially inconsistent
indexes_of_automatically_produced_erroneous_patterns = [
    temporal_relation_names_list.index(erroneous_pattern)
    for erroneous_pattern in automatically_produced_erroneous_patterns
]

print(indexes_of_automatically_produced_erroneous_patterns)

# Stores the subjects (with subject referring to s from the <s,p,o> triples) that have at least one pair of inconsistent
# temporal triples, which is a pair of triples satisfying a temporal relationship variant found to be inconsistent.
# Each entry of s_and_temporal_true_index_list holds the subject at position 0, followed by pattern indexes.
erroneous_pattern_index_lookup = set(indexes_of_automatically_produced_erroneous_patterns)  # Built once, not per subject
automatically_erroneous_subject_list = [
    subject_and_indexes[0]
    for subject_and_indexes in s_and_temporal_true_index_list
    if not erroneous_pattern_index_lookup.isdisjoint(subject_and_indexes[1:])
]

destination_file_name = "semi-automatically_inconsistent_triples_file.txt"
automatically_erroneous_triple_file_name = HOME_FOLDER_PATH + OUTPUT_FOLDER_PATH + destination_file_name

# Collects all the triples sharing an erroneous subject; a set gives constant-time membership tests,
# instead of scanning the whole subject list for every triple.
erroneous_subject_lookup = set(automatically_erroneous_subject_list)
automatically_erroneous_triple_file_contents = [
    triple_string
    for triple_string in temporal_triple_strings_list
    if triple_string.split('\t')[0] in erroneous_subject_lookup
]

# Prints the collected triples to file, one per line; the context manager closes the file even if writing fails
with open(automatically_erroneous_triple_file_name, "w+", encoding="utf8") as automatically_erroneous_triples_file:
    automatically_erroneous_triples_file.writelines(
        triple_string+"\n" for triple_string in automatically_erroneous_triple_file_contents
    )

[['<http://schema.org/startDate>', '<http://schema.org/endDate>'], ['<http://schema.org/endDate>', '<http://schema.org/startDate>']]
{'<http://schema.org/deathDate>\t==\t<http://schema.org/birthDate>', '<http://schema.org/dateCreated>\t==\t<http://schema.org/dissolutionDate>', '<http://schema.org/dissolutionDate>\t==\t<http://schema.org/dateCreated>', '<http://schema.org/startDate>\t>\t<http://schema.org/endDate>', '<http://schema.org/birthDate>\t>\t<http://schema.org/deathDate>', '<http://schema.org/dissolutionDate>\t<\t<http://schema.org/dateCreated>', '<http://schema.org/dateCreated>\t>\t<http://schema.org/dissolutionDate>', '<http://schema.org/birthDate>\t==\t<http://schema.org/deathDate>', '<http://schema.org/deathDate>\t<\t<http://schema.org/birthDate>'}
[14, 7, 16, 15, 5, 13, 11, 10, 6]


In [15]:
# This applies all the tests: the automatic version that relies on internal information and then the two external information
# methods; firstly the one that relaxes which temporal relation variant is the dominant, by also considering the equality, whenever
# this is specified, and then by also creating the externally specified validity intervals for specific temporal relations.
automatically_produced_erroneous_patterns = set()

# The list with the cases where, for temporal relations with an inequality dominant variant, the equality is also acceptable
equality_cases_list = acceptable_equality_cases()
print(equality_cases_list)

# Find the erroneous patterns: for every temporal relation (a "predicate_1\tpredicate_2" key), the variant
# ('<', '>' or '==') with the highest counter is considered dominant.
for outer_key_item, variant_counters in temporal_key_value_relations_counters.items():
    # On ties, max() returns the first maximal variant in the dictionary's iteration order
    max_key_item = max(variant_counters, key=variant_counters.get)
    outer_key_split = outer_key_item.split('\t')
    for inner_key_item in variant_counters:
        if inner_key_item == max_key_item:
            continue
        # A non-dominant '<' or '>' is always erroneous. A non-dominant '==' is erroneous only when
        # no externally defined equality case contains both predicates of the relation.
        equality_is_acceptable = inner_key_item == "==" and any(
            outer_key_split[0] in equality_list and outer_key_split[1] in equality_list
            for equality_list in equality_cases_list
        )
        if not equality_is_acceptable:
            automatically_produced_erroneous_patterns.add(outer_key_split[0]+"\t"+inner_key_item+"\t"+outer_key_split[1])

print(automatically_produced_erroneous_patterns)

automatically_produced_erroneous_patterns = list(automatically_produced_erroneous_patterns)

# Stores the indexes (positions within temporal_relation_names_list) of the patterns found to be potentially inconsistent
indexes_of_automatically_produced_erroneous_patterns = [
    temporal_relation_names_list.index(erroneous_pattern)
    for erroneous_pattern in automatically_produced_erroneous_patterns
]

print(indexes_of_automatically_produced_erroneous_patterns)

# Stores the subjects (with subject referring to s from the <s,p,o> triples) that have at least one pair of inconsistent
# temporal triples, which is a pair of triples satisfying a temporal relationship variant found to be inconsistent.
# Each entry of s_and_temporal_true_index_list holds the subject at position 0, followed by pattern indexes.
erroneous_pattern_index_lookup = set(indexes_of_automatically_produced_erroneous_patterns)  # Built once, not per subject
automatically_erroneous_subject_list = [
    subject_and_indexes[0]
    for subject_and_indexes in s_and_temporal_true_index_list
    if not erroneous_pattern_index_lookup.isdisjoint(subject_and_indexes[1:])
]

# A list of lists where each internal list contains the pair of predicates that define an interval,
# followed by the lower limit (position 2) and the upper limit (position 3) of the acceptable interval length, in days.
# Its creation is based on a file and each line of said file contains these four fields separated by a tab,
# with "-" standing in for a limit that is not applicable.
interval_limits_list = acceptable_interval_limits()

# Mirrors automatically_erroneous_subject_list for constant-time membership tests; both are kept in sync below
erroneous_subject_lookup = set(automatically_erroneous_subject_list)

# The key_subject is the subject from the <s,p,o> triples
for key_subject, values_dict in temporal_intervals_dict.items():
    # The values dict contains the temporal relations (in the form "temporal_predicate_1\ttemporal_predicate_2")
    # as keys and the corresponding intervals as values.
    for key_tempor_rel, value_interv in values_dict.items():
        # Split the "temporal_predicate_1\ttemporal_predicate_2" temporal relation into ["temporal_predicate_1", "temporal_predicate_2"]
        tempor_rel_split = key_tempor_rel.split("\t")
        for curr_interval_limit in interval_limits_list:
            # If the predicates defining an interval match the currently examined triple's interval,
            # then check whether the upper and/or lower bounds are violated.
            # NOTE: the comparison is made on sets, so the order of the two predicates is ignored
            # (the original author remarked that this should better NOT be a set).
            if set([tempor_rel_split[0], tempor_rel_split[1]]) != set(curr_interval_limit[:2]):
                continue
            if curr_interval_limit[2] != "-":
                lower_limit = timedelta(days=int(curr_interval_limit[2]))
                # Interval shorter than the lower limit: erroneous value, report each subject only once
                if value_interv.length < lower_limit and key_subject not in erroneous_subject_lookup:
                    erroneous_subject_lookup.add(key_subject)
                    automatically_erroneous_subject_list.append(key_subject)
                    print(key_subject, value_interv.length, lower_limit)
            if curr_interval_limit[3] != "-":
                upper_limit = timedelta(days=int(curr_interval_limit[3]))
                # Interval longer than the upper limit: erroneous value, report each subject only once
                if value_interv.length > upper_limit and key_subject not in erroneous_subject_lookup:
                    erroneous_subject_lookup.add(key_subject)
                    automatically_erroneous_subject_list.append(key_subject)
                    print(key_subject, value_interv.length, upper_limit)

destination_file_name = "full_semi-automatically_inconsistent_triples_file.txt"
automatically_erroneous_triple_file_name = HOME_FOLDER_PATH + OUTPUT_FOLDER_PATH + destination_file_name

# Collects all the triples sharing an erroneous subject, using the set for constant-time membership tests
automatically_erroneous_triple_file_contents = [
    triple_string
    for triple_string in temporal_triple_strings_list
    if triple_string.split('\t')[0] in erroneous_subject_lookup
]

# Prints the collected triples to file, one per line; the context manager closes the file even if writing fails
with open(automatically_erroneous_triple_file_name, "w+", encoding="utf8") as automatically_erroneous_triples_file:
    automatically_erroneous_triples_file.writelines(
        triple_string+"\n" for triple_string in automatically_erroneous_triple_file_contents
    )

[['<http://schema.org/startDate>', '<http://schema.org/endDate>'], ['<http://schema.org/endDate>', '<http://schema.org/startDate>']]
{'<http://schema.org/deathDate>\t==\t<http://schema.org/birthDate>', '<http://schema.org/dateCreated>\t==\t<http://schema.org/dissolutionDate>', '<http://schema.org/dissolutionDate>\t==\t<http://schema.org/dateCreated>', '<http://schema.org/startDate>\t>\t<http://schema.org/endDate>', '<http://schema.org/birthDate>\t>\t<http://schema.org/deathDate>', '<http://schema.org/dissolutionDate>\t<\t<http://schema.org/dateCreated>', '<http://schema.org/dateCreated>\t>\t<http://schema.org/dissolutionDate>', '<http://schema.org/birthDate>\t==\t<http://schema.org/deathDate>', '<http://schema.org/deathDate>\t<\t<http://schema.org/birthDate>'}
[14, 7, 16, 15, 5, 13, 11, 10, 6]
<http://yago-knowledge.org/resource/Alexios_Palaiologos__u0028_despot_u0029_> 73779 days 00:00:00 73050 days, 0:00:00
