In [1]:
import pandas as pd 
import numpy as np
import os
from subprocess import check_output
import shutil
import rdflib
from multiprocessing import Process, Manager, Queue
import multiprocessing

from rule import *
from amie import *
from experiment import *

In [2]:
# Directories holding the source dataset files; cells below append file names
# such as "train.txt" to these prefixes.
root_source_FB = "./../../FB15k_mail/"
root_source_DB = "./../../DB15k_mail/"

# Path prefixes for the generated baseline files. The FB prefix gets a file
# name appended (e.g. "train.tsv"); the DB prefix is opened directly as a file.
root_data_FB = "./../../Data_mail/FB_baseline_"
root_data_DB = "./../../Data_mail/DB_baseline_"

# FB15K 

### Prepare the data

Since this notebook builds a baseline for AMIE, the data is used as-is: no value is modified.

In [3]:
# Collect the numerical predicate names, one per line of numericals.txt.
# The `with` block closes the file even if reading fails part-way.
with open(root_source_FB+"numericals.txt", "r") as data:
    # Keep everything before the first newline, i.e. the line without its terminator.
    numerical_predicate = {predicate.split("\n")[0] for predicate in data}

In [4]:
# Rewrite the FB15k training triples as the tab-separated file given to AMIE.
# Every field of a non-numerical triple is wrapped as <http:...>; for a
# numerical predicate only the subject and the object are wrapped.
# Both files are closed by the `with` block even if a line fails to convert.
with open(root_source_FB+"train.txt", "r") as data, open(root_data_FB+"train.tsv", "w") as f:
    for line in data:
        line = line.split("\n")[0].split("\t")
        if line == [""]:
            # Blank line: there is no predicate to inspect, skip it instead of
            # failing with an IndexError on line[1].
            continue
        if line[1] not in numerical_predicate:
            for pos, var in enumerate(line):
                f.write('<http:'+var+'>')
                # A tab follows the first two fields only.
                if pos < 2:
                    f.write('\t')
            f.write("\n")
        else:
            f.write('<http:'+line[0]+'>\t'+line[1]+'\t'+'<http:'+line[2]+'>\n')

### Find which predicates to predict

We initially decided not to predict numerical predicates because, with our implementation of "LT", AMIE produces trivial rules such as LT-50 -> LT-25. However, one could argue that such rules can be cleaned up afterwards.
So we are going to predict the numerical predicates and then remove the rules of the form LT-50 -> LT-25.

In [5]:
# Run AMIE on the FB15k training file and capture its standard output (bytes).
# The command is passed as an argument list, without a shell, so a path that
# contains spaces or shell metacharacters is handed to java unchanged.
res = check_output(["java", "-jar", "./../amie3.jar", root_data_FB+"train.tsv"])

# NOTE(review): parse_amie is not defined in this notebook; presumably it comes
# from one of the star imports and turns the raw output into rule objects.
res_parsed = parse_amie(res)

In [6]:
# res

In [7]:
# Size of the parsed AMIE output (presumably one entry per mined rule).
len(res_parsed)

40094

### Number of numericals in the rules

In [8]:
def predicate_is_numerical(atom):
    """Tell whether `atom` is considered numerical.

    Returns True when the atom's predicate contains "LTE" or "GT"; otherwise
    returns whether its `objectD` string consists only of digits.
    """
    predicate = atom.predicate
    if "LTE" in predicate:
        return True
    if "GT" in predicate:
        return True
    # objectD is only inspected when the predicate itself gave no match.
    return atom.objectD.isdigit()

# Tally the rules that contain a numerical atom: in at least one hypothesis,
# in the conclusion, and in either of the two.
rule_with_numerical_in_hyp = 0
rule_with_numerical_in_conc = 0
rule_with_numerical = 0

for rule in res_parsed:
    # any() stops at the first numerical hypothesis, so a rule counts once.
    numerical_hyp = any(predicate_is_numerical(hyp) for hyp in rule.hypotheses)
    numerical_conc = predicate_is_numerical(rule.conclusion)

    if numerical_hyp:
        rule_with_numerical_in_hyp += 1
    if numerical_conc:
        rule_with_numerical_in_conc += 1
    if numerical_hyp or numerical_conc:
        rule_with_numerical += 1

print("Rule with numerical : ", rule_with_numerical)
print("Rule without numerical : ", len(res_parsed) - rule_with_numerical)
print("Rule with numerical in hypotheses : ", rule_with_numerical_in_hyp)
print("Rule with numerical in conclusion : ", rule_with_numerical_in_conc)

Rule with numerical :  0
Rule without numerical :  40094
Rule with numerical in hypotheses :  0
Rule with numerical in conclusion :  0


### Evaluate the rules on the test set

In [9]:
# Derive the file parsed by rdflib from the AMIE training file: each line is
# copied with " . " appended as statement terminator.
# Both files are closed by the `with` block even if the copy fails part-way.
with open(root_data_FB+"train.tsv", "r") as data, open(root_data_FB+"train_rdflib.nt", "w") as f:
    for line in data:
        f.write(line.split("\n")[0]+" . \n")

In [10]:
# Read the test triples into a set of tuples whose fields are wrapped as
# <http:...>, the same wrapping used when writing the training file.
set_instances_to_predict = set()

with open(root_source_FB+"test.txt", "r") as data:
    for line in data:
        line_splited = line.split("\n")[0].split("\t")
        if line_splited == [""]:
            # Blank line: skip it rather than adding a one-field tuple that
            # the prediction step could not index.
            continue
        set_instances_to_predict.add(tuple("<http:"+field+">" for field in line_splited))

In [11]:
# Number of distinct test instances (duplicates collapse in the set).
print(len(set_instances_to_predict))

59250


In [12]:
# Index the mined rules by the predicate of their conclusion, so that the
# rules able to answer a given predicate can be looked up directly.
rules_associated_to_query = {}

for rule in res_parsed:
    # setdefault creates the set the first time a predicate is met.
    rules_associated_to_query.setdefault(rule.conclusion.predicate, set()).add(rule)

In [13]:
# Load the training triples into an in-memory rdflib graph; later cells run
# their queries against it through g.query.
g = rdflib.Graph()
g.parse(root_data_FB+"train_rdflib.nt")

<Graph identifier=N7b4c6502183140f6bba6ae35abc71c4f (<class 'rdflib.graph.Graph'>)>

In [14]:
def print_rules_nicely(rule):
    """Print `rule` on one line, showing only the predicates of its atoms.

    Hypothesis predicates are joined with " & " and followed by a single
    space, then "=>" and the conclusion predicate, e.g. "p1 & p2 =>c".
    A rule without hypotheses is printed as "=>c".
    """
    body = [hyp.predicate for hyp in rule.hypotheses]
    # The trailing space after a non-empty body is part of the output format.
    left_side = " & ".join(body) + " " if body else ""
    print(left_side + "=>" + rule.conclusion.predicate)

In [None]:
%%time

def predict_instance(name, g, queue, prediction_per_instance_man, rules_associated_to_query, cpt, total_length, print_advancment, get_timeout=1.0):
    """Worker loop: take instances from `queue` and answer them with the rules.

    For each instance, every rule registered under `instance[1]` is turned
    into a query (via `create_query(rule, instance[2])`) and run on `g`. The
    `a` binding of each result row is collected as a string, and rules with at
    least one result are stored in a dict that is written to
    `prediction_per_instance_man[instance]` (an empty dict when no rule
    applies or matches).

    Parameters
    ----------
    name : identifier used in the log messages.
    g : object exposing `query(...)`, iterated to obtain result rows.
    queue : work queue exposing `get(timeout=...)`.
    prediction_per_instance_man : mapping receiving one entry per instance.
    rules_associated_to_query : mapping from `instance[1]` to an iterable of rules.
    cpt : object with a `value` attribute, incremented once per instance.
    total_length : total number of instances, used for the progress ratio.
    print_advancment : object with a `value` attribute holding the next
        progress threshold; raised by 0.1 each time progress is printed.
    get_timeout : seconds to wait for a new instance before considering the
        queue drained and returning.
    """
    # Imported here because the `queue` parameter shadows the stdlib module
    # name inside this function; only the exception class is needed.
    from queue import Empty

    print(f"Process n°{name} : Launched")

    while True:
        try:
            # A timed get replaces the `empty()` check followed by a blocking
            # `get()`: with several workers, another one can take the last
            # item between the two calls, leaving this worker blocked forever.
            instance = queue.get(timeout=get_timeout)
        except Empty:
            break

        dict_tp = {}

        for rule in rules_associated_to_query.get(instance[1], ()):
            query = None
            try:
                # NOTE(review): create_query is not defined in this notebook;
                # presumably it comes from one of the star imports.
                query = create_query(rule, instance[2])
                set_res = {str(row.a) for row in g.query(query)}
            except Exception as exc:
                # Report the failing query and move on to the next rule.
                print(f"Query failed ({exc!r}): {query}")
                continue

            if set_res:
                dict_tp[rule] = set_res

        prediction_per_instance_man[instance] = dict_tp

        # This read-then-write on a shared value is not atomic, so concurrent
        # workers may lose an increment; the counter only drives the progress
        # display below.
        cpt.value += 1
        done = cpt.value
        if (done/total_length > print_advancment.value):
            print(f"{done}/{total_length}")
            print_advancment.value+=0.1

    print(f"Process n°{name} : Finished")
    
# Fill the work queue with every test instance, then let a pool of worker
# processes drain it with predict_instance.
q = Queue()
prediction_per_instance = {}

for instance in set_instances_to_predict:
    q.put(instance)

# Taken from the source set rather than q.qsize(), which raises
# NotImplementedError on platforms such as macOS.
size_queue = len(set_instances_to_predict)

print("Queue finished")

with Manager() as manager:

    # Leave three cores free, but always start at least one worker: on a
    # machine with three cores or fewer the subtraction alone would start
    # none and the result would silently stay empty.
    processes_to_create = max(1, multiprocessing.cpu_count()-3)
    processes = list()

    prediction_per_instance_man = manager.dict()
    # Shared progress state: number of processed instances and next progress
    # threshold (a fraction, raised by 0.1 at a time by the workers).
    cpt = manager.Value("i", 0)
    print_advancment = manager.Value("d", 0)

    for name in range(processes_to_create):
        x = Process(target=predict_instance, args=(name, g, q, prediction_per_instance_man, rules_associated_to_query, cpt, size_queue, print_advancment))
        processes.append(x)
        x.start()

    for process in processes:
        process.join()

    # Copy into a plain dict before the manager shuts down with the `with` block.
    prediction_per_instance = prediction_per_instance_man.copy()

Queue finished
Process n°0 : Launched
Process n°1 : Launched
Process n°2 : Launched
Process n°3 : Launched
Process n°4 : Launched
Process n°5 : Launched
Process n°6 : Launched
Process n°7 : Launched
Process n°8 : Launched
Process n°9 : Launched
Process n°10 : Launched
Process n°11 : Launched
Process n°12 : Launched
Value('cpt', 1)
Value('cpt', 5926)Value('cpt', 5926)

Value('cpt', 17776)
Value('cpt', 23701)
Value('cpt', 29626)
Value('cpt', 35551)Value('cpt', 35551)

Value('cpt', 41476)
Value('cpt', 47400)


In [15]:
# %%time

# def predict_instance(name, g, instances, prediction_per_instance_man, rules_associated_to_query, cpt, total_length, print_advancment):
    
#     print(f"Process n°{name} : Launched")
    
#     for instance in instances:
        
#         dict_tp = {}
#         if instance[1] in rules_associated_to_query.keys():
#             for rule in rules_associated_to_query[instance[1]]:
#                 try:
#                     qres = g.query(create_query(rule, instance[2]))

#                     set_res = set()
#                     bool_res = False
#                     for row in qres:
#                         bool_res = True
#                         set_res.add(str(row.a))

#                     if bool_res:
#                         dict_tp[rule] = set_res
            
#                 except:
#                     print(create_query(rule, instance[2]))
                    
#             prediction_per_instance_man[instance] = dict_tp
            
#         else:
#             prediction_per_instance_man[instance] = {}
        
#         cpt.value += 1
#         if (cpt.value/total_length > print_advancment.value):
#             print(cpt)
#             print_advancment.value+=0.1
        
#     print(f"Process n°{name} : Finished")

# prediction_per_instance = {}

# list_instances_to_predict = list(set_instances_to_predict)[:1000]

# size_queue = len(list_instances_to_predict)


# with Manager() as manager:

#     processes_to_create = multiprocessing.cpu_count()-3
#     processes = list()

#     prediction_per_instance_man = manager.dict()
#     cpt = manager.Value("cpt",0)
#     print_advancment = manager.Value("print_advancment",0)
    
#     instances_list = list_instances_to_predict

#     for name in range(processes_to_create):
#         x = Process(target=predict_instance, args=(name, g, instances_list[int(np.floor(name*len(instances_list)/processes_to_create)): int(np.floor((name+1)*len(instances_list)/processes_to_create))], 
#                                                    prediction_per_instance_man, rules_associated_to_query, cpt, size_queue, print_advancment))
#         processes.append(x)
#         x.start()
        
#     for index, process in enumerate(processes):
#         process.join()

#     prediction_per_instance = prediction_per_instance_man.copy()

Process n°0 : Launched
Process n°1 : Launched
Process n°2 : Launched
Process n°3 : Launched
Process n°4 : Launched
Process n°5 : Launched
Process n°6 : Launched
Process n°7 : Launched
Process n°8 : Launched
Process n°9 : Launched
Process n°10 : Launched
Process n°11 : Launched
Process n°12 : Launched
Value('cpt', 1)
Value('cpt', 101)
Value('cpt', 201)
Value('cpt', 301)
Value('cpt', 401)
Value('cpt', 501)
Value('cpt', 601)
Process n°7 : Finished
Value('cpt', 701)
Process n°12 : Finished
Process n°10 : Finished
Value('cpt', 800)
Process n°1 : Finished
Process n°2 : Finished
Process n°11 : Finished
Process n°9 : Finished
Process n°4 : Finished
Value('cpt', 900)
Process n°0 : Finished
Process n°6 : Finished
Process n°5 : Finished
Process n°3 : Finished
Process n°8 : Finished
CPU times: user 1.62 s, sys: 34.2 s, total: 35.9 s
Wall time: 2min 43s


In [None]:
# Number of instances for which a prediction entry was recorded.
len(prediction_per_instance)

In [None]:
# Build one DataFrame per predicted instance: one row per rule that produced
# answers, holding the answer set and the rule's two confidence scores.
df_prediction = {}

for instance, answers_by_rule in prediction_per_instance.items():
    rows = {
        rule: [set(answers), rule.stdConfidence, rule.pcaConfidence]
        for rule, answers in answers_by_rule.items()
    }
    df_prediction[instance] = pd.DataFrame.from_dict(
        rows,
        orient="index",
        columns=["Prediction", "Std Confidence", "Pca Confidence"],
    )

In [None]:
# Call hit_at with cut-offs 1, 5, 10 and 1000, first with `democracy`, then
# with `expert`.
# NOTE(review): hit_at, democracy and expert are not defined in this notebook;
# presumably they come from one of the star imports — confirm.
print("----- Democracy -----")
hit_at(df_prediction, democracy, 1)
hit_at(df_prediction, democracy, 5)
hit_at(df_prediction, democracy, 10)
hit_at(df_prediction, democracy, 1000)

print("----- Expert -----")
hit_at(df_prediction, expert, 1)
hit_at(df_prediction, expert, 5)
hit_at(df_prediction, expert, 10)
hit_at(df_prediction, expert, 1000)

# DB15K 

### Prepare the data

Since this notebook builds a baseline for AMIE, the data is used as-is: no value is modified.

In [18]:
# Rewrite the space-separated DB15k training file as the tab-separated file
# given to AMIE. A tab follows the first two fields only; any further fields
# are written back-to-back, exactly as the field-by-field loop did.
# NOTE(review): the output path is the bare root_data_DB prefix, with no file
# name appended (unlike the FB section) — confirm this is intended.
# Both files are closed by the `with` block even if a line fails to convert.
with open(root_source_DB+"train.txt", "r") as data, open(root_data_DB, "w") as f:
    for line in data:
        fields = line.split("\n")[0].split(" ")
        if fields == [""]:
            # Blank line: skip it instead of emitting a row holding only a tab.
            continue
        for pos, field in enumerate(fields):
            f.write(field)
            if pos < 2:
                f.write("\t")
        f.write("\n")

### Find which predicates to predict

We initially decided not to predict numerical predicates because, with our implementation of "LT", AMIE produces trivial rules such as LT-50 -> LT-25. However, one could argue that such rules can be cleaned up afterwards.
So we are going to predict the numerical predicates and then remove the rules of the form LT-50 -> LT-25.

In [19]:
# Run AMIE on the DB15k training file and capture its standard output (bytes).
# The command is passed as an argument list, without a shell, so a path that
# contains spaces or shell metacharacters is handed to java unchanged.
res = check_output(["java", "-jar", "./../amie3.jar", root_data_DB])

# NOTE(review): parse_amie is not defined in this notebook; presumably it comes
# from one of the star imports and turns the raw output into rule objects.
res_parsed = parse_amie(res)

In [20]:
# Size of the parsed AMIE output (presumably one entry per mined rule).
len(res_parsed)

2693

In [21]:
# Display the raw bytes returned by the AMIE run, for inspection.
res

b'Assuming rdf:type as type relation\nLoading files... \n  Starting DB_baseline.tsv\n  Finished DB_baseline.tsv, still running: 0\nLoaded 120756 facts in 0.85 s using 183 MB\nUsing HeadCoverage as pruning metric with minimum threshold 0.01\nUsing recursivity limit 3\nLazy mining assistant that stops counting when the denominator gets too high\nNo minimum threshold on standard confidence\nFiltering on PCA confidence with minimum threshold 0.1\nConstants in the arguments of relations are disabled\nLossless (query refinement) heuristics enabled\nMRT calls: 0\nStarting the mining phase... Using 16 threads\nRule\tHead Coverage\tStd Confidence\tPCA Confidence\tPositive Examples\tBody size\tPCA Body size\tFunctional variable\n?a  <http://dbpedia.org/ontology/secondCommander>  ?b   => ?a  <http://dbpedia.org/ontology/commander>  ?b\t0.024590164\t0.6\t0.6\t3\t5\t5\t?b\n?b  <http://dbpedia.org/ontology/battle>  ?a   => ?a  <http://dbpedia.org/ontology/commander>  ?b\t0.131147541\t0.125984252\t0.

### Number of numericals in the rules

In [24]:
def predicate_is_numerical(atom):
    """Tell whether `atom` is considered numerical.

    Returns True when the atom's predicate contains "LTE" or "GT"; otherwise
    returns whether its `objectD` string consists only of digits.
    """
    predicate = atom.predicate
    if "LTE" in predicate:
        return True
    if "GT" in predicate:
        return True
    # objectD is only inspected when the predicate itself gave no match.
    return atom.objectD.isdigit()

# Tally the rules that contain a numerical atom: in at least one hypothesis,
# in the conclusion, and in either of the two.
rule_with_numerical_in_hyp = 0
rule_with_numerical_in_conc = 0
rule_with_numerical = 0

for rule in res_parsed:
    # any() stops at the first numerical hypothesis, so a rule counts once.
    numerical_hyp = any(predicate_is_numerical(hyp) for hyp in rule.hypotheses)
    numerical_conc = predicate_is_numerical(rule.conclusion)

    if numerical_hyp:
        rule_with_numerical_in_hyp += 1
    if numerical_conc:
        rule_with_numerical_in_conc += 1
    if numerical_hyp or numerical_conc:
        rule_with_numerical += 1

print("Rule with numerical : ", rule_with_numerical)
print("Rule without numerical : ", len(res_parsed) - rule_with_numerical)
print("Rule with numerical in hypotheses : ", rule_with_numerical_in_hyp)
print("Rule with numerical in conclusion : ", rule_with_numerical_in_conc)

Rule with numerical :  0
Rule without numerical :  2693
Rule with numerical in hypotheses :  0
Rule with numerical in conclusion :  0
