In [1]:
pip install pslpython

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
# -*- coding: utf-8 -*-
"""
Created on Thu Aug  4 09:49:29 2022

@author: ewanhilton
"""

from classes.DatasetGenerator import generate_dataset_file
from classes.EntityConverter import EntityConverter
from classes.PSLFileBuilder import PSLFileBuilder
from pykeen.datasets import CoDExSmall

# When True, pre_main() also calls create_files(), which builds the map, obs,
# targets and truth files. Setting this to True is required to get all files
# needed for PSL, but is very costly.
CREATE_FILES = True

def pre_main():
    """Export the CoDEx-S splits as text files and optionally build the PSL inputs.

    The training, validation and testing triples are each passed to
    ``generate_dataset_file`` (as ``train.txt``, ``valid.txt`` and ``test.txt``,
    with ``'CoDEx'`` as the dataset name). When ``CREATE_FILES`` is true, the
    training and validation triples are additionally handed to
    ``create_files`` together with an ``EntityConverter`` for the dataset.

    Returns:
        None.
    """
    dataset = CoDExSmall()

    train_triples = dataset.training.mapped_triples.numpy()
    val_triples = dataset.validation.mapped_triples.numpy()
    test_triples = dataset.testing.mapped_triples.numpy()

    splits = (
        ('train.txt', train_triples),
        ('valid.txt', val_triples),
        ('test.txt', test_triples),
    )
    for filename, triples in splits:
        generate_dataset_file(filename, 'CoDEx', triples, dataset)

    if CREATE_FILES:
        # Reuse the dataset and arrays loaded above instead of loading the
        # dataset a second time.
        # NOTE(review): assumes generate_dataset_file does not modify the
        # triple arrays in place - confirm.
        entity_converter = EntityConverter(dataset)
        create_files(train_triples, val_triples, entity_converter)
    
    #Create files needed by PSL
def create_files(train_triples, val_triples, entity_converter):
    """Build the files needed by PSL.

    A single ``PSLFileBuilder`` is created from the arguments and its build
    steps are run in this order: map files, observation files, target files,
    truth files.

    Args:
        train_triples: training triples, passed through to ``PSLFileBuilder``.
        val_triples: validation triples, passed through to ``PSLFileBuilder``.
        entity_converter: converter passed through to ``PSLFileBuilder``.
    """
    builder = PSLFileBuilder(train_triples, val_triples, entity_converter)
    build_steps = (
        builder.build_map_files,
        builder.build_obs_files,
        builder.build_target_files,
        builder.build_truth_files,
    )
    for run_step in build_steps:
        run_step()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# -*- coding: utf-8 -*-
"""
Created on Thu Aug  4 08:04:34 2022

@author: ewanhilton
"""
import os

from pslpython.model import Model
from pslpython.partition import Partition
from pslpython.predicate import Predicate
from pslpython.rule import Rule
from pykeen.datasets import CoDExSmall
from classes.ANYBurlToPSLConverter import ANYBurlToPSLConverter
from classes.RuleImporter import RuleImporter
from classes.EntityConverter import EntityConverter
from classes.DatasetGenerator import encode_text
from tqdm import tqdm
from datetime import datetime

# Name given to the pslpython Model created in main().
MODEL_NAME = 'ANYBurl and PSL Model'

# Name of the data folder. Not referenced by the functions in this cell:
# add_data() spells out its 'data/...' paths directly.
DATA_DIR = os.path.join('data')

# Passed to Model.infer() as ``psl_config``.
ADDITIONAL_PSL_OPTIONS = {'log4j.threshold': 'INFO'}

# Passed to Model.infer() as ``additional_cli_options``.
# '--postgres' is currently left out.
ADDITIONAL_CLI_OPTIONS = []

# add_rules() stops once this many rules have been added to the model.
MAX_RULES = 1
# add_rules() only adds a rule whose leading weight is strictly greater than this.
ANYBURL_RULES_THRESHOLD = 0.6

def main():
    """Run the whole pipeline: import rules, build the model, infer, save.

    Steps, in order: import the rules with ``RuleImporter``; create the
    ``Model`` named ``MODEL_NAME``; wrap a ``CoDExSmall`` dataset in an
    ``EntityConverter``; add predicates and rules to the model; run inference;
    hand the results to ``write_results``.
    """
    rule_importer = RuleImporter()
    rule_importer.import_rules()

    psl_model = Model(MODEL_NAME)
    converter = EntityConverter(CoDExSmall())

    # Populate the model before running inference.
    add_predicates(psl_model, converter)
    add_rules(psl_model, rule_importer.rules)

    inferred = infer(psl_model, converter)
    write_results(inferred, psl_model)
 
def add_predicates(model,entity_converter):
    """Add one size-2 predicate to ``model`` for every relation name.

    The names come from ``entity_converter.relindex_to_name`` and are passed
    through ``encode_text`` before use. Every predicate is closed except the
    one for the relation named 'genre'.
    """
    print("Adding predicates...")
    relation_names = entity_converter.relindex_to_name
    for _, relation_name in tqdm(relation_names.items()):
        # Only 'genre' stays open.
        is_closed = relation_name != 'genre'
        model.add_predicate(
            Predicate(encode_text(relation_name), closed=is_closed, size=2)
        )

def add_rules(model, rules):
    """Add converted rules whose head predicate is 'genre' to ``model``.

    ``rules`` is converted with ``ANYBurlToPSLConverter``. For each converted
    rule string, the text between '->' and the following '(' (spaces removed)
    is the head predicate, and the text before the first ':' is parsed as the
    weight. A 'genre' rule is added when its weight is strictly greater than
    ``ANYBURL_RULES_THRESHOLD``. The function returns early once
    ``MAX_RULES`` rules have been added and another 'genre' rule is reached.
    """
    print("Adding rules...")
    converted_rules = ANYBurlToPSLConverter(rules).converted_rules
    added = 0
    for rule_text in tqdm(converted_rules):
        head_predicate = rule_text.split('->')[1].split('(')[0].replace(' ', '')
        if head_predicate != 'genre':
            continue
        if added >= MAX_RULES:
            print(f"Maximum number of rules added ({added} rules added)")
            return
        weight = float(rule_text.split(':')[0])
        if weight > ANYBURL_RULES_THRESHOLD:
            model.add_rule(Rule(rule_text))
            added += 1
    print(f"{added} rules added")
    
def add_data(model,entity_converter):
    """Attach the observation, target and truth files to each predicate.

    For every relation name in ``entity_converter.relindex_to_name`` the files
    ``data/obs/<name>_obs.txt``, ``data/targets/<name>_targets.txt`` and
    ``data/truth/<name>_truth.txt`` are considered in that order, where
    ``<name>`` is the ``encode_text`` form of the relation name. A file is
    added to its partition only when ``path_exists`` accepts it.
    """
    print("Adding data...")
    # Folder name (also the file suffix) paired with the partition it feeds.
    partitions = (
        ('obs', Partition.OBSERVATIONS),
        ('targets', Partition.TARGETS),
        ('truth', Partition.TRUTH),
    )
    for _, relation_name in tqdm(entity_converter.relindex_to_name.items()):
        for kind, partition in partitions:
            data_path = f'data/{kind}/{encode_text(relation_name)}_{kind}.txt'
            if not path_exists(data_path):
                # Skip files rejected by path_exists.
                continue
            predicate = model.get_predicate(encode_text(relation_name))
            predicate.add_data_file(partition, data_path)
   
def path_exists(path):
    """Return True only when ``path`` refers to a file with non-zero size.

    Despite the name, an existing but empty file yields False.

    Args:
        path: the location to check; it is formatted to a string before use.

    Returns:
        bool: True when ``os.path.getsize`` reports more than 0 bytes, False
        when the size is 0 or the size cannot be read.
    """
    try:
        return os.path.getsize(f"{path}") > 0
    except (OSError, ValueError):
        # OSError: missing or inaccessible path. ValueError: malformed path
        # such as one with an embedded null byte. Anything else (for example
        # KeyboardInterrupt) is deliberately allowed to propagate.
        return False

def infer(model,entity_converter):
    """Load the data files into ``model`` and run inference.

    Data is attached with ``add_data``, a start timestamp is printed, and
    ``model.infer`` is called with ``ADDITIONAL_CLI_OPTIONS`` and
    ``ADDITIONAL_PSL_OPTIONS``.

    Returns:
        Whatever ``model.infer`` returns.
    """
    add_data(model, entity_converter)
    started_at = get_date_time()
    print(f"Inference starting at {started_at}")
    inference_results = model.infer(
        additional_cli_options=ADDITIONAL_CLI_OPTIONS,
        psl_config=ADDITIONAL_PSL_OPTIONS,
    )
    return inference_results
     
def get_date_time(now=None):
    """Format a timestamp as ``HH:MM:SS on DD-Mon-YYYY``.

    The time and the date are taken from one ``datetime`` value, so they
    cannot disagree when the call happens right at midnight.

    Args:
        now: optional ``datetime`` to format; defaults to the current local
            time.

    Returns:
        str: for example ``'14:24:38 on 29-Aug-2022'``.
    """
    if now is None:
        now = datetime.now()
    return now.strftime('%H:%M:%S on %d-%b-%Y')

def write_results(results, model):
    """Write the inferred values of every open predicate to a text file.

    Files go to the ``inferred-predicates`` folder (created if missing), one
    ``<predicate name>.txt`` per predicate of ``model`` that is not closed.
    Each entry of ``results`` is written through its ``to_csv`` method as
    tab-separated text without header or index.

    Writing is best-effort: a predicate whose output cannot be produced is
    reported and skipped so the remaining predicates are still written.
    Non-``Exception`` interruptions such as KeyboardInterrupt propagate.

    Args:
        results: mapping indexed by predicate object; the values must offer
            ``to_csv`` (presumably pandas DataFrames - confirm against
            ``Model.infer``).
        model: the model whose predicates are iterated.
    """
    print(f"Inferenced completed at {get_date_time()}")
    out_dir = 'inferred-predicates'
    os.makedirs(out_dir, exist_ok=True)
    print("Writing predicates")
    for predicate in tqdm(model.get_predicates().values()):
        if predicate.closed():
            continue
        try:
            out_path = os.path.join(out_dir, f"{predicate.name()}.txt")
            results[predicate].to_csv(out_path, sep="\t", header=False, index=False)
        except Exception as error:
            # Keep going, but make the failure visible instead of hiding it.
            print(f"Skipping predicate {predicate!r}: {error!r}")

In [4]:
pre_main()

Generating CoDEx train.txt dataset file in destination: datasets/data/CoDEx/train.txt


100%|██████████| 32888/32888 [00:00<00:00, 526195.96it/s]

Generating CoDEx valid.txt dataset file in destination: datasets/data/CoDEx/valid.txt



100%|██████████| 1827/1827 [00:00<00:00, 490714.23it/s]


Generating CoDEx test.txt dataset file in destination: datasets/data/CoDEx/test.txt


100%|██████████| 1828/1828 [00:00<00:00, 497191.34it/s]


Building _map.txt files


100%|██████████| 42/42 [00:01<00:00, 37.10it/s]


Building _obs.txt files


100%|██████████| 42/42 [00:01<00:00, 36.95it/s]


Building _targets.txt files


100%|██████████| 42/42 [00:01<00:00, 33.91it/s]


Building _truth.txt files


100%|██████████| 42/42 [00:00<00:00, 646.23it/s]


In [None]:
main()

Adding predicates...


100%|██████████| 42/42 [00:00<00:00, 676.75it/s]

Adding rules...



  4%|▍         | 1125/27266 [00:00<00:00, 1301680.55it/s]


Maximum number of rules added (1 rules added)
Adding data...


100%|██████████| 42/42 [00:11<00:00,  3.61it/s]


Inference starting at 14:24:38 on 29-Aug-2022
