In [1]:
import pandas as pd 
import numpy as np
import os
from subprocess import check_output
import shutil

from rule import *
from amie import *

In [2]:
# Input directories of the two datasets; the cells below read
# "numericals.txt" and "train.txt" from each of them.
root_source_FB, root_source_DB = (
    "./../../FB15k_mail/",
    "./../../DB15k_mail/",
)

# Output TSV files that receive the rewritten triples and are passed to AMIE.
store_data_FB, store_data_DB = (
    "./../../Data_mail/FB_LTE_GT.tsv",
    "./../../Data_mail/DB_LTE_GT.tsv",
)

# FB15K 

### Prepare the data

Since this run serves as a baseline for AMIE, the original values are left unchanged.

In [3]:
# Collect the names of the numerical predicates, one per line of the file.
# The context manager closes the file even if reading raises.
with open(root_source_FB + "numericals.txt", "r") as data:
    # Drop the trailing newline of every line; the set removes duplicates.
    numerical_predicate = {predicate.split("\n")[0] for predicate in data}

In [4]:
# Quantile levels used to split the values of every numerical predicate.
thresholds = np.arange(0.25, 1, 0.25)  # -> 0.25, 0.50, 0.75
# Percentage labels derived from the levels: "-25", "-50", "-75".
thresholds_str = [f"-{int(level * 100)}" for level in thresholds]

In [5]:
def write_file(X, f, threshold, comparator):
    """Write the LTE and GT threshold triples of one row to ``f``.

    Parameters
    ----------
    X : mapping-like row providing the keys ``'Subject'``, ``'Predicate'``
        and ``'Object'``.
    f : writable text handle; it receives two tab-separated lines, each
        terminated by a newline.
    threshold : label embedded in the generated predicate names.
    comparator : value that ``X['Object']`` is compared against.

    The object of each written triple is the result of the comparison:
    ``X['Object'] <= comparator`` for the ``_LTE_`` predicate and
    ``X['Object'] > comparator`` for the ``_GT_`` predicate.
    """
    subject, predicate, value = X['Subject'], X['Predicate'], X['Object']
    comparisons = (("LTE", value <= comparator), ("GT", value > comparator))
    for tag, outcome in comparisons:
        f.write(f"{subject}\t{predicate}_{tag}_{threshold}\t{outcome}\n")

In [6]:
# Split the training triples: non-numerical triples are copied to the output
# file immediately, numerical ones are grouped per predicate for a later step.
dic_predicate = {}

# Context managers close both handles even if a line fails to parse.
with open(root_source_FB + "train.txt", "r") as data, open(store_data_FB, "w") as f:
    for line in data:
        record = line.split("\n")[0]
        if not record:
            # An empty line (e.g. at the end of the file) carries no triple.
            continue
        line_split = record.split("\t")
        if line_split[1] in numerical_predicate:
            line_split[2] = float(line_split[2])
            # Tuples are hashable, so identical triples collapse in the set.
            dic_predicate.setdefault(line_split[1], set()).add(tuple(line_split))
        else:
            # Always end the record with a newline: this file is reopened in
            # append mode afterwards, and a last input line lacking "\n" would
            # otherwise be merged with the first appended triple.
            f.write(line if line.endswith("\n") else line + "\n")

In [7]:
# For every numerical predicate, compute the quantiles of its values and append
# one LTE and one GT triple per row and per threshold to the output file.
# The context manager closes the file even if a computation raises.
with open(store_data_FB, "a") as f:
    for key in dic_predicate:
        # Sort the set of triples so the rows are written in a reproducible
        # order (set iteration order can differ from one run to the next).
        tp_df = pd.DataFrame(sorted(dic_predicate[key])).rename(
            columns={0: "Subject", 1: "Predicate", 2: "Object"}
        )
        # Series indexed by quantile level, one value per threshold.
        tp_df_describe = tp_df["Object"].quantile(thresholds)
        for threshold in thresholds:
            tp_df.apply(write_file, args=(f, threshold, tp_df_describe[threshold]), axis=1)

### Launch Amie

In [8]:
# Run AMIE on the prepared file and capture its standard output.
# The command is given as an argument list (no shell involved), so the data
# path reaches java verbatim and needs no quoting or escaping.
res = check_output(["java", "-jar", "./../amie3.jar", store_data_FB])

# Turn the captured output into the rule collection used by the cells below.
res_parsed = parse_amie(res)

In [9]:
# Number of rules obtained from parsing AMIE's output.
len(res_parsed)

339508

In [10]:
# Display the parsed rules (the full collection is large, so the output is long).
res_parsed

{?e /symbols/name_source/namesakes ?a & ?e <http://rdf.freebase.com/ns/people.deceased_person.date_of_death>_LTE_0.25 ?b => ?a <http://rdf.freebase.com/ns/location.geocode.latitude>_LTE_0.5 ?b,
 ?a /music/artist/origin ?f & ?f <http://rdf.freebase.com/ns/location.location.area>_GT_0.25 ?b => ?a <http://rdf.freebase.com/ns/base.popstra.sww_base.heat>_GT_0.5 ?b,
 ?b /award/award_winning_work/awards_won./award/award_honor/award_winner ?a & ?b /film/film/directed_by ?a => ?a /film/producer/films_executive_produced ?b,
 ?a /base/aareas/schema/administrative_area/administrative_parent ?f & ?f <http://rdf.freebase.com/ns/location.dated_location.date_founded>_GT_0.75 ?b => ?a <http://rdf.freebase.com/ns/topic_server.population_number>_GT_0.75 ?b,
 ?a /time/event/locations ?f & ?f <http://rdf.freebase.com/ns/location.location.area>_LTE_0.75 ?b => ?a <http://rdf.freebase.com/ns/time.event.end_date>_GT_0.5 ?b,
 ?e /people/person/spouse_s./people/marriage/spouse ?a & ?e <http://rdf.freebase.com/ns

### Clean rules

In [11]:
# Drop single-hypothesis rules that relate two LTE thresholds of the same
# predicate. Removal happens on a copy, so the collection being iterated over
# is never modified.
# NOTE(review): only "LTE" pairs are filtered here; the analogous "GT" pairs
# are kept - confirm this is intended.
res_parsed_clean = res_parsed.copy()
for rule in res_parsed:
    if len(rule.hypotheses) != 1:
        continue
    hyp_parts = rule.hypotheses[0].predicate.split("LTE")
    conc_prefix = rule.conclusion.predicate.split("LTE")[0]
    # Same text before the "LTE" marker, and exactly one marker in the hypothesis.
    if hyp_parts[0] == conc_prefix and len(hyp_parts) == 2:
        print(rule)
        res_parsed_clean.remove(rule)

?a <http://rdf.freebase.com/ns/location.location.area>_LTE_0.25 ?b => ?a <http://rdf.freebase.com/ns/location.location.area>_LTE_0.5 ?b
?a <http://rdf.freebase.com/ns/tv.tv_program.number_of_episodes>_LTE_0.5 ?b => ?a <http://rdf.freebase.com/ns/tv.tv_program.number_of_episodes>_LTE_0.25 ?b
?a <http://rdf.freebase.com/ns/time.event.end_date>_LTE_0.5 ?b => ?a <http://rdf.freebase.com/ns/time.event.end_date>_LTE_0.25 ?b
?a <http://rdf.freebase.com/ns/time.event.start_date>_LTE_0.25 ?b => ?a <http://rdf.freebase.com/ns/time.event.start_date>_LTE_0.75 ?b
?a <http://rdf.freebase.com/ns/award.award_category.date_established>_LTE_0.25 ?b => ?a <http://rdf.freebase.com/ns/award.award_category.date_established>_LTE_0.75 ?b
?a <http://rdf.freebase.com/ns/organization.organization.date_founded>_LTE_0.25 ?b => ?a <http://rdf.freebase.com/ns/organization.organization.date_founded>_LTE_0.5 ?b
?a <http://rdf.freebase.com/ns/tv.tv_program.air_date_of_final_episode>_LTE_0.5 ?b => ?a <http://rdf.freebas

In [12]:
# Number of rules remaining after the cleaning step.
len(res_parsed_clean)

339358

### Number of numericals in the rules

In [13]:
def predicate_is_numerical(atom):
    """Tell whether ``atom`` refers to a numerical fact.

    An atom counts as numerical when its predicate carries one of the
    ``_LTE_`` / ``_GT_`` markers that ``write_file`` inserts between the
    original predicate and the threshold, or when its object is a string made
    only of digits.

    The markers are matched together with their surrounding underscores, so a
    predicate whose own name merely contains the letters "GT" or "LTE" is not
    misclassified as numerical.

    Parameters
    ----------
    atom : object exposing the string attributes ``predicate`` and ``objectD``.

    Returns
    -------
    bool
    """
    predicate = atom.predicate
    return "_LTE_" in predicate or "_GT_" in predicate or atom.objectD.isdigit()

# Count, over the cleaned rules, how many contain a numerical atom in their
# hypotheses, in their conclusion, and in either of the two.
rule_with_numerical_in_hyp = 0
rule_with_numerical_in_conc = 0
rule_with_numerical = 0

for rule in res_parsed_clean:
    # any() stops at the first numerical hypothesis, so a rule is counted once.
    hyp_is_num = any(predicate_is_numerical(hyp) for hyp in rule.hypotheses)
    conc_is_num = predicate_is_numerical(rule.conclusion)
    if hyp_is_num:
        rule_with_numerical_in_hyp += 1
    if conc_is_num:
        rule_with_numerical_in_conc += 1
    if hyp_is_num or conc_is_num:
        rule_with_numerical += 1

summary = (
    ("Rule with numerical : ", rule_with_numerical),
    ("Rule without numerical : ", len(res_parsed_clean) - rule_with_numerical),
    ("Rule with numerical in hypotheses : ", rule_with_numerical_in_hyp),
    ("Rule with numerical in conclusion : ", rule_with_numerical_in_conc),
)
for label, count in summary:
    print(label, count)

Rule with numerical :  299476
Rule without numerical :  39882
Rule with numerical in hypotheses :  299476
Rule with numerical in conclusion :  291858


# DB15K 

### Prepare the data

Since this run serves as a baseline for AMIE, the original values are left unchanged.

In [14]:
# Collect the names of the numerical predicates, one per line of the file.
# The context manager closes the file even if reading raises.
with open(root_source_DB + "numericals.txt", "r") as data:
    # Drop the trailing newline of every line; the set removes duplicates.
    numerical_predicate = {predicate.split("\n")[0] for predicate in data}

In [15]:
# Quantile levels used to split the values of every numerical predicate.
thresholds = np.arange(0.25, 1, 0.25)  # -> 0.25, 0.50, 0.75
# Percentage labels derived from the levels: "-25", "-50", "-75".
thresholds_str = [f"-{int(level * 100)}" for level in thresholds]

In [16]:
# Split the training triples: non-numerical triples are copied to the output
# file immediately, numerical ones are grouped per predicate for a later step.
dic_predicate = {}

# Context managers close both handles even if a line fails to parse.
with open(root_source_DB + "train.txt", "r") as data, open(store_data_DB, "w") as f:
    for line in data:
        record = line.split("\n")[0]
        if not record:
            # An empty line (e.g. at the end of the file) carries no triple.
            continue
        line_split = record.split(" ")
        if len(line_split) == 1:
            # No space in the record: fall back to tab-separated fields.
            line_split = record.split("\t")
        if line_split[1] in numerical_predicate:
            line_split[2] = float(line_split[2])
            # Tuples are hashable, so identical triples collapse in the set.
            dic_predicate.setdefault(line_split[1], set()).add(tuple(line_split))
        else:
            # Always end the record with a newline: this file is reopened in
            # append mode afterwards, and a last input line lacking "\n" would
            # otherwise be merged with the first appended triple.
            f.write(line if line.endswith("\n") else line + "\n")

In [17]:
# For every numerical predicate, compute the quantiles of its values and append
# one LTE and one GT triple per row and per threshold to the output file.
# The context manager closes the file even if a computation raises.
with open(store_data_DB, "a") as f:
    for key in dic_predicate:
        # Sort the set of triples so the rows are written in a reproducible
        # order (set iteration order can differ from one run to the next).
        tp_df = pd.DataFrame(sorted(dic_predicate[key])).rename(
            columns={0: "Subject", 1: "Predicate", 2: "Object"}
        )
        # Series indexed by quantile level, one value per threshold.
        tp_df_describe = tp_df["Object"].quantile(thresholds)
        for threshold in thresholds:
            tp_df.apply(write_file, args=(f, threshold, tp_df_describe[threshold]), axis=1)

### Launch Amie

In [18]:
# Run AMIE on the prepared file and capture its standard output.
# The command is given as an argument list (no shell involved), so the data
# path reaches java verbatim and needs no quoting or escaping.
res = check_output(["java", "-jar", "./../amie3.jar", store_data_DB])

# Turn the captured output into the rule collection used by the cells below.
res_parsed = parse_amie(res)

In [19]:
# Number of rules obtained from parsing AMIE's output.
len(res_parsed)

487704

In [20]:
# Display the parsed rules (the full collection is large, so the output is long).
res_parsed

{?a <http://dbpedia.org/ontology/populationTotalRanking>_GT_0.25 ?b & ?a <http://dbpedia.org/ontology/populationUrban>_GT_0.75 ?b => ?a <http://dbpedia.org/ontology/foundingDate>_LTE_0.75 ?b,
 ?a <http://dbpedia.org/ontology/foundingDate>_GT_0.5 ?b & ?a <http://dbpedia.org/ontology/populationUrban>_LTE_0.75 ?b => ?a <http://dbpedia.org/ontology/areaWater>_LTE_0.25 ?b,
 ?a <http://dbpedia.org/ontology/activeYearsStartYear>_GT_0.25 ?b => ?a <http://dbpedia.org/ontology/capacity>_LTE_0.5 ?b,
 ?a <http://dbpedia.org/ontology/areaCode>_LTE_0.5 ?b => ?a <http://dbpedia.org/ontology/populationDensity>_GT_0.5 ?b,
 ?a <http://dbpedia.org/ontology/birthYear>_GT_0.5 ?b & ?a <http://www.w3.org/2003/01/geo/wgs84_pos#long>_GT_0.75 ?b => ?a <http://dbpedia.org/ontology/utcOffset>_LTE_0.25 ?b,
 ?a <http://dbpedia.org/ontology/activeYearsStartYear>_LTE_0.5 ?b & ?a <http://dbpedia.org/ontology/birthDate>_LTE_0.25 ?b => ?a <http://dbpedia.org/ontology/numberOfSeasons>_LTE_0.25 ?b,
 ?a <http://dbpedia.org

### Clean rules

In [21]:
# Drop single-hypothesis rules that relate two LTE thresholds of the same
# predicate. Removal happens on a copy, so the collection being iterated over
# is never modified.
# NOTE(review): only "LTE" pairs are filtered here; the analogous "GT" pairs
# are kept - confirm this is intended.
res_parsed_clean = res_parsed.copy()
for rule in res_parsed:
    if len(rule.hypotheses) != 1:
        continue
    hyp_parts = rule.hypotheses[0].predicate.split("LTE")
    conc_prefix = rule.conclusion.predicate.split("LTE")[0]
    # Same text before the "LTE" marker, and exactly one marker in the hypothesis.
    if hyp_parts[0] == conc_prefix and len(hyp_parts) == 2:
        print(rule)
        res_parsed_clean.remove(rule)

?a <http://dbpedia.org/ontology/utcOffset>_LTE_0.25 ?b => ?a <http://dbpedia.org/ontology/utcOffset>_LTE_0.75 ?b
?a <http://dbpedia.org/ontology/gross>_LTE_0.5 ?b => ?a <http://dbpedia.org/ontology/gross>_LTE_0.25 ?b
?a <http://dbpedia.org/ontology/gross>_LTE_0.75 ?b => ?a <http://dbpedia.org/ontology/gross>_LTE_0.25 ?b
?a <http://dbpedia.org/ontology/revenue>_LTE_0.25 ?b => ?a <http://dbpedia.org/ontology/revenue>_LTE_0.75 ?b
?a <http://dbpedia.org/ontology/maximumElevation>_LTE_0.5 ?b => ?a <http://dbpedia.org/ontology/maximumElevation>_LTE_0.75 ?b
?a <http://dbpedia.org/ontology/endowment>_LTE_0.75 ?b => ?a <http://dbpedia.org/ontology/endowment>_LTE_0.5 ?b
?a <http://dbpedia.org/ontology/numberOfEpisodes>_LTE_0.5 ?b => ?a <http://dbpedia.org/ontology/numberOfEpisodes>_LTE_0.75 ?b
?a <http://dbpedia.org/ontology/utcOffset>_LTE_0.5 ?b => ?a <http://dbpedia.org/ontology/utcOffset>_LTE_0.75 ?b
?a <http://dbpedia.org/ontology/birthYear>_LTE_0.75 ?b => ?a <http://dbpedia.org/ontology/bir

In [22]:
# Number of rules remaining after the cleaning step.
len(res_parsed_clean)

487392

### Number of numericals in the rules

In [23]:
def predicate_is_numerical(atom):
    """Tell whether ``atom`` refers to a numerical fact.

    An atom counts as numerical when its predicate carries one of the
    ``_LTE_`` / ``_GT_`` markers that ``write_file`` inserts between the
    original predicate and the threshold, or when its object is a string made
    only of digits.

    The markers are matched together with their surrounding underscores, so a
    predicate whose own name merely contains the letters "GT" or "LTE" is not
    misclassified as numerical.

    Parameters
    ----------
    atom : object exposing the string attributes ``predicate`` and ``objectD``.

    Returns
    -------
    bool
    """
    predicate = atom.predicate
    return "_LTE_" in predicate or "_GT_" in predicate or atom.objectD.isdigit()

# Count, over the cleaned rules, how many contain a numerical atom in their
# hypotheses, in their conclusion, and in either of the two.
rule_with_numerical_in_hyp = 0
rule_with_numerical_in_conc = 0
rule_with_numerical = 0

for rule in res_parsed_clean:
    # any() stops at the first numerical hypothesis, so a rule is counted once.
    hyp_is_num = any(predicate_is_numerical(hyp) for hyp in rule.hypotheses)
    conc_is_num = predicate_is_numerical(rule.conclusion)
    if hyp_is_num:
        rule_with_numerical_in_hyp += 1
    if conc_is_num:
        rule_with_numerical_in_conc += 1
    if hyp_is_num or conc_is_num:
        rule_with_numerical += 1

summary = (
    ("Rule with numerical : ", rule_with_numerical),
    ("Rule without numerical : ", len(res_parsed_clean) - rule_with_numerical),
    ("Rule with numerical in hypotheses : ", rule_with_numerical_in_hyp),
    ("Rule with numerical in conclusion : ", rule_with_numerical_in_conc),
)
for label, count in summary:
    print(label, count)

Rule with numerical :  487392
Rule without numerical :  0
Rule with numerical in hypotheses :  487392
Rule with numerical in conclusion :  487392
