In [1]:
import pandas as pd 
import numpy as np
import os
from subprocess import check_output
import shutil

from rule import *
from amie import *

In [2]:
# Source directories for the two datasets; the cells below read
# "numericals.txt" and "train.txt" from each of them.
root_source_FB = "./../../FB15k_mail/"
root_source_DB = "./../../DB15k_mail/"

# Output TSV files: written by the data-preparation cells and then passed to AMIE.
store_data_FB = "./../../Data_mail/FB_LTE.tsv"
store_data_DB = "./../../Data_mail/DB_LTE.tsv"

# FB15K 

### Prepare the data

Because this run serves as a baseline for AMIE, we do not modify any of the original values.

In [3]:
# Load the numerical predicate names for FB15k: numericals.txt holds one
# predicate per line. The context manager guarantees the handle is closed
# even if reading fails part-way through.
with open(root_source_FB + "numericals.txt", "r") as data:
    numerical_predicate = {predicate.rstrip("\n") for predicate in data}

In [4]:
# Quantile levels used to derive the "_LTE_" predicates.
thresholds = np.array([0.25, 0.50, 0.75])
# String form of each level as a percentage with a leading dash ("-25", "-50", "-75").
thresholds_str = [f"-{int(level * 100)}" for level in thresholds]

In [5]:
def write_file(X, f, threshold, comparator):
    """Write one thresholded fact for row ``X`` to the open file ``f``.

    The emitted line is tab-separated and newline-terminated:
    the row's ``Subject``, its ``Predicate`` suffixed with ``_LTE_<threshold>``,
    and the result of ``X['Object'] <= comparator``.

    Parameters
    ----------
    X : mapping-like row exposing ``'Subject'``, ``'Predicate'`` and ``'Object'``.
    f : writable text file object.
    threshold : value embedded in the predicate suffix.
    comparator : value the row's object is compared against.
    """
    fields = (
        f"{X['Subject']}",
        f"{X['Predicate']}_LTE_{threshold}",
        f"{X['Object'] <= comparator}",
    )
    f.write("\t".join(fields) + "\n")

In [6]:
# Split the FB15k training triples into two groups:
#   * triples whose predicate is numerical are collected per predicate in
#     `dic_predicate` (object converted to float) for later thresholding;
#   * every other triple is copied unchanged to the output file.
dic_predicate = {}

# Both handles are managed by `with`, so they are closed even if a line
# fails to parse (e.g. float() raises on an unexpected object value).
with open(root_source_FB + "train.txt", "r") as data, open(store_data_FB, "w") as f:
    for line in data:
        record = line.rstrip("\n")
        if not record.strip():
            # Blank lines carry no triple; indexing into them would raise IndexError.
            continue
        line_split = record.split("\t")
        if line_split[1] in numerical_predicate:
            line_split[2] = float(line_split[2])
            # Tuples are hashable, so duplicate triples collapse inside the set.
            dic_predicate.setdefault(line_split[1], set()).add(tuple(line_split))
        else:
            # Always terminate the line: the file is re-opened in append mode
            # afterwards, and a missing final newline would glue the last
            # triple to the first appended fact.
            f.write(record + "\n")

In [7]:
# Append the thresholded numerical facts for FB15k: for every numerical
# predicate, compute the quantiles of its object values and emit, for each
# triple and each quantile level, one "<predicate>_LTE_<level>" fact.
with open(store_data_FB, "a") as f:
    for key in dic_predicate.keys():
        # Sets have no stable iteration order; sorting makes the row order of
        # the generated file reproducible across runs.
        tp_df = pd.DataFrame(
            sorted(dic_predicate[key]),
            columns=["Subject", "Predicate", "Object"],
        )
        # Series indexed by quantile level, holding the cut-off value for each level.
        tp_df_describe = tp_df["Object"].quantile(thresholds)
        for threshold in thresholds:
            tp_df.apply(write_file, args=(f, threshold, tp_df_describe[threshold]), axis=1)

### Launch Amie

In [8]:
# Run AMIE on the prepared FB15k file and parse its output.
# The command is passed as an argument list rather than through a shell, so
# the data path is handed to java verbatim (no word splitting or shell
# interpretation of special characters).
res = check_output(["java", "-jar", "./../amie3.jar", store_data_FB])

res_parsed = parse_amie(res)

In [9]:
# Number of entries returned by parse_amie for the FB15k run.
len(res_parsed)

114199

In [10]:
# Display the parsed AMIE output for FB15k.
# NOTE(review): this echoes the entire collection; consider previewing only a
# handful of entries to keep the notebook output small.
res_parsed

{?e /film/film_location/featured_in_films ?a & ?e <http://rdf.freebase.com/ns/location.geocode.longitude>_LTE_0.75 ?b => ?a <http://rdf.freebase.com/ns/tv.tv_program.air_date_of_final_episode>_LTE_0.5 ?b,
 ?e /business/employer/employees./business/employment_tenure/person ?a & ?e <http://rdf.freebase.com/ns/sports.sports_team.founded>_LTE_0.5 ?b => ?a <http://rdf.freebase.com/ns/people.person.weight_kg>_LTE_0.25 ?b,
 ?a /olympics/olympic_participating_country/medals_won./olympics/olympic_medal_honor/medalist ?f & ?f <http://rdf.freebase.com/ns/people.person.weight_kg>_LTE_0.75 ?b => ?a <http://rdf.freebase.com/ns/location.country.iso_numeric>_LTE_0.75 ?b,
 ?a <http://rdf.freebase.com/ns/people.person.height_meters>_LTE_0.25 ?b & ?a <http://rdf.freebase.com/ns/people.person.weight_kg>_LTE_0.25 ?b => ?a <http://rdf.freebase.com/ns/people.person.date_of_birth>_LTE_0.75 ?b,
 ?a /location/administrative_division/first_level_division_of ?f & ?f <http://rdf.freebase.com/ns/location.dated_loca

### Clean rules

In [11]:
# Find the rules to discard: rules with a single hypothesis whose predicate
# contains "LTE" exactly once and shares the text before "LTE" with the
# conclusion predicate (same base predicate, possibly a different threshold).
redundant_rules = []
for rule in res_parsed:
    if len(rule.hypotheses) != 1:
        continue
    body_parts = rule.hypotheses[0].predicate.split("LTE")
    head_base = rule.conclusion.predicate.split("LTE")[0]
    if body_parts[0] == head_base and len(body_parts) == 2:
        redundant_rules.append(rule)

# Work on a copy so that `res_parsed` itself is left untouched.
res_parsed_clean = res_parsed.copy()
for rule in redundant_rules:
    print(rule)
    res_parsed_clean.remove(rule)

?a <http://rdf.freebase.com/ns/film.film.initial_release_date>_LTE_0.75 ?b => ?a <http://rdf.freebase.com/ns/film.film.initial_release_date>_LTE_0.25 ?b
?a <http://rdf.freebase.com/ns/location.geocode.longitude>_LTE_0.5 ?b => ?a <http://rdf.freebase.com/ns/location.geocode.longitude>_LTE_0.25 ?b
?a <http://rdf.freebase.com/ns/tv.tv_program.number_of_seasons>_LTE_0.75 ?b => ?a <http://rdf.freebase.com/ns/tv.tv_program.number_of_seasons>_LTE_0.25 ?b
?a <http://rdf.freebase.com/ns/base.popstra.sww_base.interest>_LTE_0.75 ?b => ?a <http://rdf.freebase.com/ns/base.popstra.sww_base.interest>_LTE_0.5 ?b
?a <http://rdf.freebase.com/ns/award.award_category.date_established>_LTE_0.5 ?b => ?a <http://rdf.freebase.com/ns/award.award_category.date_established>_LTE_0.75 ?b
?a <http://rdf.freebase.com/ns/tv.tv_program.air_date_of_first_episode>_LTE_0.25 ?b => ?a <http://rdf.freebase.com/ns/tv.tv_program.air_date_of_first_episode>_LTE_0.75 ?b
?a <http://rdf.freebase.com/ns/tv.tv_program.episode_runnin

In [12]:
# Number of entries left after removing the same-predicate rules for FB15k.
len(res_parsed_clean)

114049

# DB15K 

### Prepare the data

Because this run serves as a baseline for AMIE, we do not modify any of the original values.

In [13]:
# Load the numerical predicate names for DB15k: numericals.txt holds one
# predicate per line. The context manager guarantees the handle is closed
# even if reading fails part-way through.
with open(root_source_DB + "numericals.txt", "r") as data:
    numerical_predicate = {predicate.rstrip("\n") for predicate in data}

In [14]:
# Quantile levels used to derive the "_LTE_" predicates for DB15k.
thresholds = np.array([0.25, 0.50, 0.75])
# String form of each level as a percentage with a leading dash ("-25", "-50", "-75").
thresholds_str = [f"-{int(level * 100)}" for level in thresholds]

In [15]:
# Split the DB15k training triples into two groups:
#   * triples whose predicate is numerical are collected per predicate in
#     `dic_predicate` (object converted to float) for later thresholding;
#   * every other triple is copied unchanged to the output file.
dic_predicate = {}

# Both handles are managed by `with`, so they are closed even if a line
# fails to parse (e.g. float() raises on an unexpected object value).
with open(root_source_DB + "train.txt", "r") as data, open(store_data_DB, "w") as f:
    for line in data:
        record = line.rstrip("\n")
        if not record.strip():
            # Blank lines carry no triple; indexing into them would raise IndexError.
            continue
        # Fields are split on spaces first; a line that yields a single
        # field that way is re-split on tabs instead.
        line_split = record.split(" ")
        if len(line_split) == 1:
            line_split = record.split("\t")
        if line_split[1] in numerical_predicate:
            line_split[2] = float(line_split[2])
            # Tuples are hashable, so duplicate triples collapse inside the set.
            dic_predicate.setdefault(line_split[1], set()).add(tuple(line_split))
        else:
            # Always terminate the line: the file is re-opened in append mode
            # afterwards, and a missing final newline would glue the last
            # triple to the first appended fact.
            f.write(record + "\n")

In [16]:
# Append the thresholded numerical facts for DB15k: for every numerical
# predicate, compute the quantiles of its object values and emit, for each
# triple and each quantile level, one "<predicate>_LTE_<level>" fact.
with open(store_data_DB, "a") as f:
    for key in dic_predicate.keys():
        # Sets have no stable iteration order; sorting makes the row order of
        # the generated file reproducible across runs.
        tp_df = pd.DataFrame(
            sorted(dic_predicate[key]),
            columns=["Subject", "Predicate", "Object"],
        )
        # Series indexed by quantile level, holding the cut-off value for each level.
        tp_df_describe = tp_df["Object"].quantile(thresholds)
        for threshold in thresholds:
            tp_df.apply(write_file, args=(f, threshold, tp_df_describe[threshold]), axis=1)

### Launch Amie

In [17]:
# Run AMIE on the prepared DB15k file and parse its output.
# The command is passed as an argument list rather than through a shell, so
# the data path is handed to java verbatim (no word splitting or shell
# interpretation of special characters).
res = check_output(["java", "-jar", "./../amie3.jar", store_data_DB])

res_parsed = parse_amie(res)

In [18]:
# Number of entries returned by parse_amie for the DB15k run.
len(res_parsed)

80579

In [19]:
# Display the parsed AMIE output for DB15k.
# NOTE(review): this echoes the entire collection; consider previewing only a
# handful of entries to keep the notebook output small.
res_parsed

{?a <http://dbpedia.org/ontology/areaLand>_LTE_0.25 ?b & ?a <http://dbpedia.org/ontology/areaLand>_LTE_0.75 ?b => ?a <http://dbpedia.org/ontology/areaMetro>_LTE_0.5 ?b,
 ?a <http://dbpedia.org/ontology/areaLand>_LTE_0.5 ?b & ?a <http://dbpedia.org/ontology/populationAsOf>_LTE_0.25 ?b => ?a <http://www.w3.org/2003/01/geo/wgs84_pos#long>_LTE_0.25 ?b,
 ?a <http://dbpedia.org/ontology/deathDate>_LTE_0.25 ?b => ?a <http://dbpedia.org/ontology/numberOfEmployees>_LTE_0.25 ?b,
 ?a <http://dbpedia.org/ontology/areaUrban>_LTE_0.75 ?b & ?a <http://dbpedia.org/ontology/elevation>_LTE_0.25 ?b => ?a <http://dbpedia.org/ontology/populationAsOf>_LTE_0.75 ?b,
 ?a <http://dbpedia.org/ontology/maximumElevation>_LTE_0.5 ?b & ?a <http://dbpedia.org/ontology/populationMetro>_LTE_0.75 ?b => ?a <http://www.w3.org/2003/01/geo/wgs84_pos#lat>_LTE_0.75 ?b,
 ?a <http://dbpedia.org/ontology/areaLand>_LTE_0.25 ?b & ?a <http://dbpedia.org/ontology/areaLand>_LTE_0.75 ?b => ?a <http://dbpedia.org/ontology/maximumElevat

### Clean rules

In [20]:
# Find the rules to discard: rules with a single hypothesis whose predicate
# contains "LTE" exactly once and shares the text before "LTE" with the
# conclusion predicate (same base predicate, possibly a different threshold).
redundant_rules = []
for rule in res_parsed:
    if len(rule.hypotheses) != 1:
        continue
    body_parts = rule.hypotheses[0].predicate.split("LTE")
    head_base = rule.conclusion.predicate.split("LTE")[0]
    if body_parts[0] == head_base and len(body_parts) == 2:
        redundant_rules.append(rule)

# Work on a copy so that `res_parsed` itself is left untouched.
res_parsed_clean = res_parsed.copy()
for rule in redundant_rules:
    print(rule)
    res_parsed_clean.remove(rule)

?a <http://dbpedia.org/ontology/deathDate>_LTE_0.25 ?b => ?a <http://dbpedia.org/ontology/deathDate>_LTE_0.75 ?b
?a <http://dbpedia.org/ontology/height>_LTE_0.25 ?b => ?a <http://dbpedia.org/ontology/height>_LTE_0.5 ?b
?a <http://dbpedia.org/ontology/revenue>_LTE_0.5 ?b => ?a <http://dbpedia.org/ontology/revenue>_LTE_0.25 ?b
?a <http://dbpedia.org/ontology/foundingYear>_LTE_0.75 ?b => ?a <http://dbpedia.org/ontology/foundingYear>_LTE_0.25 ?b
?a <http://dbpedia.org/ontology/numberOfStudents>_LTE_0.75 ?b => ?a <http://dbpedia.org/ontology/numberOfStudents>_LTE_0.5 ?b
?a <http://dbpedia.org/ontology/areaTotal>_LTE_0.5 ?b => ?a <http://dbpedia.org/ontology/areaTotal>_LTE_0.75 ?b
?a <http://dbpedia.org/ontology/formationDate>_LTE_0.25 ?b => ?a <http://dbpedia.org/ontology/formationDate>_LTE_0.5 ?b
?a <http://dbpedia.org/ontology/completionDate>_LTE_0.75 ?b => ?a <http://dbpedia.org/ontology/completionDate>_LTE_0.5 ?b
?a <http://dbpedia.org/ontology/runtime>_LTE_0.25 ?b => ?a <http://dbpedia

In [21]:
# Number of entries left after removing the same-predicate rules for DB15k.
len(res_parsed_clean)

80267