In [28]:
import pandas as pd 
import numpy as np
import os
from subprocess import check_output
import shutil
import rdflib
from multiprocessing import Process, Manager, SimpleQueue
import multiprocessing

from rule import *
from amie import *
from experiment import *

In [2]:
# Directories holding the raw datasets. The cells below read "numericals.txt"
# and "train.txt" from them (and "test.txt" for FB15k).
root_source_FB = "./../../FB15k_mail/"
root_source_DB = "./../../DB15k_mail/"

# Path prefixes for generated files: a suffix such as "train.tsv" is appended
# directly to the prefix, e.g. root_data_FB + "train.tsv".
root_data_FB = "./../../Data_mail/FB_LTE_GT_"
root_data_DB = "./../../Data_mail/DB_LTE_GT_"

# Output file of the DB15K preparation cells. Those cells reference this name,
# but it was never assigned in the notebook (NameError on a fresh kernel).
# NOTE(review): value chosen by analogy with root_data_FB + "train.tsv" — confirm.
store_data_DB = root_data_DB + "train.tsv"

# FB15K 

### Prepare the data

Since we need a baseline for AMIE, none of the original values are modified.

In [3]:
# Collect the predicates listed in numericals.txt, one entry per line read.
numerical_predicate = set()

# The context manager closes the file even if reading fails part-way through.
with open(root_source_FB+"numericals.txt", "r") as data:
    for predicate in data:
        # Keep only the text before the first newline character.
        numerical_predicate.add(predicate.split("\n")[0])

In [4]:
# Quantile levels applied to every numerical predicate: 0.25, 0.50 and 0.75.
thresholds = np.arange(start=0.25, stop=1, step=0.25)
# Percentage labels derived from the levels: "-25", "-50", "-75".
thresholds_str = [f"-{int(level * 100)}" for level in thresholds]

In [5]:
def write_file(X, f, threshold, comparator):
    """Write the two threshold facts (LTE and GT) for one numerical triple.

    X          -- row-like object indexable by 'Subject', 'Predicate', 'Object'.
    f          -- writable text stream receiving the two lines.
    threshold  -- label appended to the predicate name (e.g. 0.25).
    comparator -- value the object is compared against.

    The last character of the predicate is dropped before the suffix is
    appended and a closing ">" is written after it. The object of each line is
    the outcome of the comparison rendered inside "<http:/...>".
    """
    subject_term = f"<http:{X['Subject']}>"
    predicate_stem = X['Predicate'][:-1]

    # First line: is the value lower than or equal to the comparator?
    f.write(f"{subject_term}\t{predicate_stem}_LTE_{threshold}>\t<http:/{X['Object'] <= comparator}>\n")
    # Second line: is the value strictly greater than the comparator?
    f.write(f"{subject_term}\t{predicate_stem}_GT_{threshold}>\t<http:/{X['Object'] > comparator}>\n")

In [6]:
# Split the FB15k training triples:
#   * triples whose predicate is numerical are kept in memory, grouped by
#     predicate, with their object converted to float;
#   * every other triple is written to train.tsv, each term wrapped in
#     "<http:...>" and followed by a tab.
dic_predicate = {}

# Both files are closed by the context manager, even if a line fails to parse.
with open(root_source_FB+"train.txt", "r") as data, open(root_data_FB+"train.tsv", "w") as f:
    for line in data:
        line_split = line.split("\n")[0].split("\t")
        if line_split[1] in numerical_predicate:
            line_split[2] = float(line_split[2])
            line_split = tuple(line_split)
            # One set of (subject, predicate, value) tuples per predicate.
            dic_predicate.setdefault(line_split[1], set()).add(line_split)
        else:
            for var in line_split:
                f.write('<http:'+var+'>\t')
            f.write("\n")

In [7]:
# Append the thresholded facts of every numerical predicate to train.tsv.
# The context manager guarantees the file is flushed and closed on error.
with open(root_data_FB+"train.tsv", "a") as f:
    for key in dic_predicate.keys():
        # One row per (subject, predicate, value) tuple collected for this predicate.
        tp_df = pd.DataFrame(list(dic_predicate[key]), columns=["Subject", "Predicate", "Object"])
        # Quantiles of the predicate's values, indexed by the threshold levels.
        tp_df_describe = tp_df["Object"].quantile(thresholds)
        for threshold in thresholds:
            # write_file emits the LTE and GT lines of each row for this threshold.
            tp_df.apply(write_file, args=(f, threshold, tp_df_describe[threshold]), axis=1)

### Launch Amie

In [8]:
# Run AMIE on the generated training file and capture its standard output.
# The command is passed as an argument list (no shell), so the interpolated
# path is never re-parsed by a shell and may contain spaces.
res = check_output(["java", "-jar", "./../amie3.jar", root_data_FB+"train.tsv"])

# parse_amie is not defined in this notebook (presumably it comes from the
# `amie` star import); it turns the raw output into the rule collection.
res_parsed = parse_amie(res)

In [9]:
# Number of rules returned by parse_amie.
len(res_parsed)

339508

In [12]:
# res_parsed

{?f <http:/people/person/languages> ?b & ?a <http:/people/person/spouse_s./people/marriage/spouse> ?f => ?a <http:/people/person/languages> ?b,
 ?b <http:/film/film/sequel> ?f & ?f <http:/film/film/written_by> ?a => ?a <http:/film/producer/films_executive_produced> ?b,
 ?a <http://rdf.freebase.com/ns/location.country.iso_numeric>_GT_0.25 ?b & ?a <http://rdf.freebase.com/ns/location.dated_location.date_founded>_GT_0.5 ?b => ?a <http://rdf.freebase.com/ns/topic_server.population_number>_GT_0.25 ?b,
 ?a <http://rdf.freebase.com/ns/location.geocode.longitude>_LTE_0.75 ?b & ?a <http://rdf.freebase.com/ns/location.location.gns_ufi>_LTE_0.75 ?b => ?a <http://rdf.freebase.com/ns/location.geocode.latitude>_GT_0.75 ?b,
 ?e <http:/location/statistical_region/places_imported_from./location/imports_and_exports/imported_from> ?a & ?e <http:/olympics/olympic_participating_country/athletes./olympics/olympic_athlete_affiliation/olympics> ?b => ?a <http:/olympics/olympic_participating_country/medals_won

### Clean rules

In [13]:
def _same_predicate_lte_rule(rule):
    """Return True for a one-hypothesis rule whose hypothesis and conclusion
    predicates share the same text before "LTE", the hypothesis predicate
    containing "LTE" exactly once."""
    if len(rule.hypotheses) != 1:
        return False
    body_parts = rule.hypotheses[0].predicate.split("LTE")
    head_prefix = rule.conclusion.predicate.split("LTE")[0]
    return body_parts[0] == head_prefix and len(body_parts) == 2


# Work on a copy so that res_parsed keeps every mined rule.
res_parsed_clean = res_parsed.copy()
for candidate in res_parsed:
    if _same_predicate_lte_rule(candidate):
        # Show each rule that gets dropped.
        print(candidate)
        res_parsed_clean.remove(candidate)

?a <http://rdf.freebase.com/ns/organization.organization.date_founded_LTE_0.75> ?b => ?a <http://rdf.freebase.com/ns/organization.organization.date_founded_LTE_0.25> ?b
?a <http://rdf.freebase.com/ns/people.person.weight_kg_LTE_0.75> ?b => ?a <http://rdf.freebase.com/ns/people.person.weight_kg_LTE_0.5> ?b
?a <http://rdf.freebase.com/ns/location.dated_location.date_founded_LTE_0.25> ?b => ?a <http://rdf.freebase.com/ns/location.dated_location.date_founded_LTE_0.75> ?b
?a <http://rdf.freebase.com/ns/tv.tv_program.air_date_of_first_episode_LTE_0.75> ?b => ?a <http://rdf.freebase.com/ns/tv.tv_program.air_date_of_first_episode_LTE_0.5> ?b
?a <http://rdf.freebase.com/ns/people.person.date_of_birth_LTE_0.75> ?b => ?a <http://rdf.freebase.com/ns/people.person.date_of_birth_LTE_0.5> ?b
?a <http://rdf.freebase.com/ns/people.person.weight_kg_LTE_0.5> ?b => ?a <http://rdf.freebase.com/ns/people.person.weight_kg_LTE_0.25> ?b
?a <http://rdf.freebase.com/ns/location.country.iso_numeric_LTE_0.75> ?b =

In [14]:
# Number of rules remaining in the cleaned copy.
len(res_parsed_clean)

339358

### Number of numericals in the rules

In [15]:
def predicate_is_numerical(atom):
    """Tell whether an atom is treated as numerical.

    True when the atom's predicate contains "LTE" or "GT"; otherwise the
    result is whether the atom's ``objectD`` string consists only of digits.
    """
    has_threshold_tag = any(tag in atom.predicate for tag in ("LTE", "GT"))
    return has_threshold_tag or atom.objectD.isdigit()

# Counters: rules with a numerical atom anywhere, in at least one hypothesis,
# and in the conclusion. A rule is counted at most once per counter.
rule_with_numerical = 0
rule_with_numerical_in_hyp = 0
rule_with_numerical_in_conc = 0

for rule in res_parsed_clean:
    hyp_is_numerical = any(predicate_is_numerical(hyp) for hyp in rule.hypotheses)
    conc_is_numerical = predicate_is_numerical(rule.conclusion)

    if hyp_is_numerical:
        rule_with_numerical_in_hyp += 1
    if conc_is_numerical:
        rule_with_numerical_in_conc += 1
    if hyp_is_numerical or conc_is_numerical:
        rule_with_numerical += 1

print("Rule with numerical : ", rule_with_numerical)
print("Rule without numerical : ", len(res_parsed_clean) - rule_with_numerical)
print("Rule with numerical in hypotheses : ", rule_with_numerical_in_hyp)
print("Rule with numerical in conclusion : ", rule_with_numerical_in_conc)

Rule with numerical :  299476
Rule without numerical :  39882
Rule with numerical in hypotheses :  299476
Rule with numerical in conclusion :  291858


### Evaluate the rules on the test set

In [16]:
# Copy train.tsv to train_rdflib.nt, terminating every line with " . ".
# Both handles are closed by the context manager even if the copy fails.
with open(root_data_FB+"train.tsv", "r") as data, open(root_data_FB+"train_rdflib.nt", "w") as f:
    for line in data:
        # Drop the newline, then append the statement terminator.
        f.write(line.split("\n")[0]+" . \n")

In [17]:
# Read the test triples; every tab-separated term is wrapped in "<http:...>"
# and each line is stored as a tuple in a set.
set_instances_to_predict = set()

# The context manager closes the file even if a line cannot be processed.
with open(root_source_FB+"test.txt", "r") as data:
    for line in data:
        line_splited = line.split("\n")[0].split("\t")
        set_instances_to_predict.add(tuple("<http:"+term+">" for term in line_splited))

In [18]:
# Number of distinct test tuples collected from test.txt.
print(len(set_instances_to_predict))

59250


In [19]:
# Index the rules by the predicate of their conclusion:
# predicate -> set of rules concluding on that predicate.
# NOTE(review): this iterates res_parsed, not res_parsed_clean, so the rules
# removed by the cleaning step are still used here — confirm this is intended.
rules_associated_to_query = {}

for rule in res_parsed:
    head_predicate = rule.conclusion.predicate
    rules_associated_to_query.setdefault(head_predicate, set()).add(rule)

In [22]:
# Load the generated train_rdflib.nt file into an in-memory rdflib graph.
g = rdflib.Graph()
# Last expression of the cell: the notebook displays whatever parse() returns.
g.parse(root_data_FB+"train_rdflib.nt")

<Graph identifier=Ndef2630c583042919a74340c2472a310 (<class 'rdflib.graph.Graph'>)>

In [23]:
def print_rules_nicely(rule):
    """Print only the predicates of a rule, as ``h1 & h2 =>conclusion``.

    Hypothesis predicates are joined with " & " and followed by one space;
    with no hypothesis, nothing precedes "=>".
    """
    body_predicates = [hyp.predicate for hyp in rule.hypotheses]
    lhs = " & ".join(body_predicates) + " " if body_predicates else ""
    print(lhs + "=>" + rule.conclusion.predicate)

In [29]:
%%time

def predict_instance(name, g, instances, prediction_per_instance_man, rules_associated_to_query, cpt, total_length, print_advancment):
    """Answer every instance with the rules indexed under its predicate.

    Parameters
    ----------
    name : identifier shown in the "Launched" / "Finished" messages.
    g : object whose ``query`` method accepts the string built by ``create_query``.
    instances : iterable of indexable items; item[1] selects the rules and
        item[2] is handed to ``create_query``.
    prediction_per_instance_man : mapping filled with
        ``instance -> {rule: set of str(row.a)}``; rules yielding no row are omitted.
    rules_associated_to_query : mapping from a predicate to the rules to try.
    cpt : object with a ``value`` attribute, incremented once per instance.
    total_length : total number of instances, used to scale the progress ratio.
    print_advancment : object with a ``value`` attribute holding the next
        progress ratio at which ``cpt`` is printed.

    A failing query is reported (query text and error) and skipped; only
    ``Exception`` subclasses are caught so that an interrupt still stops the worker.
    """
    print(f"Process n°{name} : Launched")

    for instance in instances:

        dict_tp = {}
        # An instance whose predicate has no rule simply gets an empty dict.
        for rule in rules_associated_to_query.get(instance[1], ()):
            query = None
            try:
                # Build the query once so the handler can report it without
                # calling create_query a second time.
                query = create_query(rule, instance[2])
                answers = {str(row.a) for row in g.query(query)}
            except Exception as error:
                print(query)
                print(f"Query failed: {error!r}")
                continue

            if answers:
                dict_tp[rule] = answers

        prediction_per_instance_man[instance] = dict_tp

        # NOTE: when cpt is shared between processes, this read-then-write is
        # not atomic, so the progress count is only approximate.
        cpt.value += 1
        if (cpt.value/total_length > print_advancment.value):
            print(cpt)
            print_advancment.value+=0.1

    print(f"Process n°{name} : Finished")

prediction_per_instance = {}

# Materialise the instances once; the list is sliced into one chunk per worker.
instances_list = list(set_instances_to_predict)
size_queue = len(instances_list)

with Manager() as manager:

    # Leave three cores free but always start at least one worker: on a
    # machine with three cores or fewer the subtraction alone gives a value
    # <= 0, no process is started and the result is silently empty.
    processes_to_create = max(1, multiprocessing.cpu_count()-3)
    processes = list()

    # Shared state: result mapping, processed-instance counter and the next
    # progress ratio to report.
    prediction_per_instance_man = manager.dict()
    cpt = manager.Value("cpt",0)
    print_advancment = manager.Value("print_advancment",0)

    for name in range(processes_to_create):
        # Contiguous, non-overlapping slices covering the whole list; integer
        # floor division yields the same bounds as int(np.floor(a / b)).
        start = name*len(instances_list)//processes_to_create
        stop = (name+1)*len(instances_list)//processes_to_create
        x = Process(target=predict_instance, args=(name, g, instances_list[start:stop],
                                                   prediction_per_instance_man, rules_associated_to_query, cpt, size_queue, print_advancment))
        processes.append(x)
        x.start()

    for process in processes:
        process.join()

    # Copy the proxy's content into a plain dict before the manager shuts down.
    prediction_per_instance = prediction_per_instance_man.copy()

Process n°0 : Launched
Process n°1 : Launched
Process n°2 : Launched
Process n°3 : Launched
Process n°4 : Launched
Process n°5 : Launched
Process n°6 : Launched
Process n°7 : Launched
Process n°8 : Launched
Process n°9 : Launched
Process n°10 : Launched
Process n°11 : Launched
Process n°12 : Launched
Value('cpt', 1)
Value('cpt', 2)
Value('cpt', 2001)
Value('cpt', 3001)
Value('cpt', 4001)
Value('cpt', 5001)
Value('cpt', 6001)
Value('cpt', 7001)
Value('cpt', 8000)
Process n°2 : Finished
Process n°4 : Finished
Value('cpt', 9000)
Process n°10 : Finished
Process n°6 : Finished
Process n°0 : Finished
Process n°9 : Finished
Process n°1 : Finished
Process n°11 : Finished
Process n°3 : Finished
Process n°8 : Finished
Process n°12 : Finished
Process n°5 : Finished
Process n°7 : Finished
CPU times: user 18.1 s, sys: 2h 59min 58s, total: 3h 16s
Wall time: 3h 21min 6s


In [30]:
# Number of instances that have an entry in the prediction mapping.
len(prediction_per_instance.keys())

10000

In [31]:
# For each instance, build a DataFrame indexed by rule with the rule's
# predicted set and its two confidence attributes.
df_prediction = {}

for prediction_instance, answers_by_rule in prediction_per_instance.items():
    df_rules = {
        rule: [set(answers), rule.stdConfidence, rule.pcaConfidence]
        for rule, answers in answers_by_rule.items()
    }
    df_prediction[prediction_instance] = pd.DataFrame.from_dict(
        df_rules,
        orient="index",
        columns=["Prediction", "Std Confidence", "Pca Confidence"],
    )

In [32]:
# Call hit_at with both strategies at the same cut-offs (2, 5, 10, 1000).
# hit_at, democracy and expert are not defined in this notebook; presumably
# they come from one of the star imports.
for banner, strategy in (("----- Democracy -----", democracy), ("----- Expert -----", expert)):
    print(banner)
    for cutoff in (2, 5, 10, 1000):
        hit_at(df_prediction, strategy, cutoff)

----- Democracy -----
0.2108
0.3278
0.4327
0.8505
----- Expert -----
0.1393
0.1789
0.2052
0.2845


In [38]:
%%time

def predict_instance(name, g, instances, prediction_per_instance_man, rules_associated_to_query, cpt, total_length, print_advancment):
    """Debug variant of the worker: also prints every query containing "LTE" or "GT".

    NOTE: this definition replaces any earlier ``predict_instance`` in the kernel.

    Parameters
    ----------
    name : identifier shown in the "Launched" / "Finished" messages.
    g : object whose ``query`` method accepts the string built by ``create_query``.
    instances : iterable of indexable items; item[1] selects the rules and
        item[2] is handed to ``create_query``.
    prediction_per_instance_man : mapping filled with
        ``instance -> {rule: set of str(row.a)}``; rules yielding no row are omitted.
    rules_associated_to_query : mapping from a predicate to the rules to try.
    cpt : object with a ``value`` attribute, incremented once per instance.
    total_length : total number of instances, used to scale the progress ratio.
    print_advancment : object with a ``value`` attribute holding the next
        progress ratio at which ``cpt`` is printed.

    A failing query is reported (query text and error) and skipped; only
    ``Exception`` subclasses are caught so that an interrupt still stops the worker.
    """
    print(f"Process n°{name} : Launched")

    for instance in instances:

        dict_tp = {}
        # An instance whose predicate has no rule simply gets an empty dict.
        for rule in rules_associated_to_query.get(instance[1], ()):
            query = None
            try:
                # Build the query once; it is reused for the call, for the
                # debug print and for error reporting.
                query = create_query(rule, instance[2])
                qres = g.query(query)
                if ("LTE" in query) or ("GT" in query):
                    print(query)

                answers = {str(row.a) for row in qres}
                if answers:
                    dict_tp[rule] = answers

            except Exception as error:
                print(query)
                print(f"Query failed: {error!r}")

        prediction_per_instance_man[instance] = dict_tp

        # NOTE: when cpt is shared between processes, this read-then-write is
        # not atomic, so the progress count is only approximate.
        cpt.value += 1
        if (cpt.value/total_length > print_advancment.value):
            print(cpt)
            print_advancment.value+=0.1

    print(f"Process n°{name} : Finished")

prediction_per_instance_test = {}

# Debug run: only the first 100 instances of the set, handled by one worker.
instances_list = list(set_instances_to_predict)[:100]
size_queue = len(instances_list)

with Manager() as manager:

    processes_to_create = 1  # multiprocessing.cpu_count()-3
    processes = []

    # Shared state handed to the worker(s).
    prediction_per_instance_man = manager.dict()
    cpt = manager.Value("cpt", 0)
    print_advancment = manager.Value("print_advancment", 0)

    for name in range(processes_to_create):
        # Bounds of the slice of instances given to this worker.
        lower = int(np.floor(name * len(instances_list) / processes_to_create))
        upper = int(np.floor((name + 1) * len(instances_list) / processes_to_create))
        worker_args = (name, g, instances_list[lower:upper], prediction_per_instance_man,
                       rules_associated_to_query, cpt, size_queue, print_advancment)
        worker = Process(target=predict_instance, args=worker_args)
        processes.append(worker)
        worker.start()

    for worker in processes:
        worker.join()

    # Snapshot the shared mapping before the manager is closed.
    prediction_per_instance_test = prediction_per_instance_man.copy()

Process n°0 : Launched
Value('cpt', 1)
Value('cpt', 11)
Value('cpt', 21)
Value('cpt', 31)
Value('cpt', 41)
Value('cpt', 51)
Value('cpt', 61)

    SELECT DISTINCT ?a
    WHERE {
<http:/m/0mwjk> <http://rdf.freebase.com/ns/location.geocode.latitude_LTE_0.75> ?f .
?a <http://rdf.freebase.com/ns/time.time_zone.dst_offset_from_utc_LTE_0.5> ?f .
}

    SELECT DISTINCT ?a
    WHERE {
<http:/m/0mwjk> <http://rdf.freebase.com/ns/location.country.iso_numeric_GT_0.25> ?f .
?a <http://rdf.freebase.com/ns/time.time_zone.dst_offset_from_utc_GT_0.25> ?f .
}

    SELECT DISTINCT ?a
    WHERE {
<http:/m/0mwjk> <http://rdf.freebase.com/ns/location.geocode.latitude_GT_0.75> ?f .
?a <http://rdf.freebase.com/ns/time.time_zone.offset_from_uct_GT_0.5> ?f .
}

    SELECT DISTINCT ?a
    WHERE {
?a <http://rdf.freebase.com/ns/time.time_zone.offset_from_uct_GT_0.5> ?f .
<http:/m/0mwjk> <http://rdf.freebase.com/ns/topic_server.population_number_GT_0.5> ?f .
}

    SELECT DISTINCT ?a
    WHERE {
<http:/m/0mwjk> <

# DB15K 

### Prepare the data

Since we need a baseline for AMIE, none of the original values are modified.

In [14]:
# Collect the DB15K predicates listed in numericals.txt, one entry per line read.
numerical_predicate = set()

# The context manager closes the file even if reading fails part-way through.
with open(root_source_DB+"numericals.txt", "r") as data:
    for predicate in data:
        # Keep only the text before the first newline character.
        numerical_predicate.add(predicate.split("\n")[0])

In [15]:
# Quantile levels applied to every numerical predicate: 0.25, 0.50 and 0.75.
thresholds = np.arange(start=0.25, stop=1, step=0.25)
# Percentage labels derived from the levels: "-25", "-50", "-75".
thresholds_str = [f"-{int(level * 100)}" for level in thresholds]

In [16]:
# Split the DB15K training triples:
#   * triples whose predicate is numerical are kept in memory, grouped by
#     predicate, with their object converted to float;
#   * every other line is copied unchanged to the output file.
dic_predicate = {}

# Both files are closed by the context manager, even if a line fails to parse.
with open(root_source_DB+"train.txt", "r") as data, open(store_data_DB, "w") as f:
    for line in data:
        stripped = line.split("\n")[0]
        # Lines are split on spaces first; tabs are tried when that yields a single token.
        line_split = stripped.split(" ")
        if len(line_split) == 1:
            line_split = stripped.split("\t")
        if line_split[1] in numerical_predicate:
            line_split[2] = float(line_split[2])
            line_split = tuple(line_split)
            # One set of (subject, predicate, value) tuples per predicate.
            dic_predicate.setdefault(line_split[1], set()).add(line_split)
        else:
            f.write(line)

In [17]:
# Append the thresholded facts of every numerical predicate to the DB15K file.
# The context manager guarantees the file is flushed and closed on error.
with open(store_data_DB, "a") as f:
    for key in dic_predicate.keys():
        # One row per (subject, predicate, value) tuple collected for this predicate.
        tp_df = pd.DataFrame(list(dic_predicate[key]), columns=["Subject", "Predicate", "Object"])
        # Quantiles of the predicate's values, indexed by the threshold levels.
        tp_df_describe = tp_df["Object"].quantile(thresholds)
        for threshold in thresholds:
            # write_file emits the LTE and GT lines of each row for this threshold.
            tp_df.apply(write_file, args=(f, threshold, tp_df_describe[threshold]), axis=1)

### Launch Amie

In [18]:
# Run AMIE on the generated DB15K file and capture its standard output.
# The command is passed as an argument list (no shell), so the interpolated
# path is never re-parsed by a shell and may contain spaces.
res = check_output(["java", "-jar", "./../amie3.jar", store_data_DB])

# parse_amie is not defined in this notebook (presumably it comes from the
# `amie` star import); it turns the raw output into the rule collection.
res_parsed = parse_amie(res)

In [19]:
# Number of rules returned by parse_amie.
len(res_parsed)

487704

In [20]:
# Display the parsed rules; the whole collection is rendered as the cell output.
res_parsed

{?a <http://dbpedia.org/ontology/populationTotalRanking>_GT_0.25 ?b & ?a <http://dbpedia.org/ontology/populationUrban>_GT_0.75 ?b => ?a <http://dbpedia.org/ontology/foundingDate>_LTE_0.75 ?b,
 ?a <http://dbpedia.org/ontology/foundingDate>_GT_0.5 ?b & ?a <http://dbpedia.org/ontology/populationUrban>_LTE_0.75 ?b => ?a <http://dbpedia.org/ontology/areaWater>_LTE_0.25 ?b,
 ?a <http://dbpedia.org/ontology/activeYearsStartYear>_GT_0.25 ?b => ?a <http://dbpedia.org/ontology/capacity>_LTE_0.5 ?b,
 ?a <http://dbpedia.org/ontology/areaCode>_LTE_0.5 ?b => ?a <http://dbpedia.org/ontology/populationDensity>_GT_0.5 ?b,
 ?a <http://dbpedia.org/ontology/birthYear>_GT_0.5 ?b & ?a <http://www.w3.org/2003/01/geo/wgs84_pos#long>_GT_0.75 ?b => ?a <http://dbpedia.org/ontology/utcOffset>_LTE_0.25 ?b,
 ?a <http://dbpedia.org/ontology/activeYearsStartYear>_LTE_0.5 ?b & ?a <http://dbpedia.org/ontology/birthDate>_LTE_0.25 ?b => ?a <http://dbpedia.org/ontology/numberOfSeasons>_LTE_0.25 ?b,
 ?a <http://dbpedia.org

### Clean rules

In [21]:
def _same_predicate_lte_rule(rule):
    """Return True for a one-hypothesis rule whose hypothesis and conclusion
    predicates share the same text before "LTE", the hypothesis predicate
    containing "LTE" exactly once."""
    if len(rule.hypotheses) != 1:
        return False
    body_parts = rule.hypotheses[0].predicate.split("LTE")
    head_prefix = rule.conclusion.predicate.split("LTE")[0]
    return body_parts[0] == head_prefix and len(body_parts) == 2


# Work on a copy so that res_parsed keeps every mined rule.
res_parsed_clean = res_parsed.copy()
for candidate in res_parsed:
    if _same_predicate_lte_rule(candidate):
        # Show each rule that gets dropped.
        print(candidate)
        res_parsed_clean.remove(candidate)

?a <http://dbpedia.org/ontology/utcOffset>_LTE_0.25 ?b => ?a <http://dbpedia.org/ontology/utcOffset>_LTE_0.75 ?b
?a <http://dbpedia.org/ontology/gross>_LTE_0.5 ?b => ?a <http://dbpedia.org/ontology/gross>_LTE_0.25 ?b
?a <http://dbpedia.org/ontology/gross>_LTE_0.75 ?b => ?a <http://dbpedia.org/ontology/gross>_LTE_0.25 ?b
?a <http://dbpedia.org/ontology/revenue>_LTE_0.25 ?b => ?a <http://dbpedia.org/ontology/revenue>_LTE_0.75 ?b
?a <http://dbpedia.org/ontology/maximumElevation>_LTE_0.5 ?b => ?a <http://dbpedia.org/ontology/maximumElevation>_LTE_0.75 ?b
?a <http://dbpedia.org/ontology/endowment>_LTE_0.75 ?b => ?a <http://dbpedia.org/ontology/endowment>_LTE_0.5 ?b
?a <http://dbpedia.org/ontology/numberOfEpisodes>_LTE_0.5 ?b => ?a <http://dbpedia.org/ontology/numberOfEpisodes>_LTE_0.75 ?b
?a <http://dbpedia.org/ontology/utcOffset>_LTE_0.5 ?b => ?a <http://dbpedia.org/ontology/utcOffset>_LTE_0.75 ?b
?a <http://dbpedia.org/ontology/birthYear>_LTE_0.75 ?b => ?a <http://dbpedia.org/ontology/bir

In [22]:
# Number of rules remaining in the cleaned copy.
len(res_parsed_clean)

487392

### Number of numericals in the rules

In [23]:
def predicate_is_numerical(atom):
    """Tell whether an atom is treated as numerical.

    True when the atom's predicate contains "LTE" or "GT"; otherwise the
    result is whether the atom's ``objectD`` string consists only of digits.
    """
    has_threshold_tag = any(tag in atom.predicate for tag in ("LTE", "GT"))
    return has_threshold_tag or atom.objectD.isdigit()

# Counters: rules with a numerical atom anywhere, in at least one hypothesis,
# and in the conclusion. A rule is counted at most once per counter.
rule_with_numerical = 0
rule_with_numerical_in_hyp = 0
rule_with_numerical_in_conc = 0

for rule in res_parsed_clean:
    hyp_is_numerical = any(predicate_is_numerical(hyp) for hyp in rule.hypotheses)
    conc_is_numerical = predicate_is_numerical(rule.conclusion)

    if hyp_is_numerical:
        rule_with_numerical_in_hyp += 1
    if conc_is_numerical:
        rule_with_numerical_in_conc += 1
    if hyp_is_numerical or conc_is_numerical:
        rule_with_numerical += 1

print("Rule with numerical : ", rule_with_numerical)
print("Rule without numerical : ", len(res_parsed_clean) - rule_with_numerical)
print("Rule with numerical in hypotheses : ", rule_with_numerical_in_hyp)
print("Rule with numerical in conclusion : ", rule_with_numerical_in_conc)

Rule with numerical :  487392
Rule without numerical :  0
Rule with numerical in hypotheses :  487392
Rule with numerical in conclusion :  487392
