In [None]:
import os, subprocess
from tools import dataset_tools
import pandas as pd
import numpy as np
import config, models
import multiprocessing

# Extract features from Ĝ

In [None]:
# Experiment configuration: everything a reader might tune lives in this cell.
dataset_name = "NELL186"
embedding_model = models.TransE
model_timestamp = '1524632595'

# G-hat: number of nearest neighbors, and the file names derived from it
knn_k = 5
g_hat_fname_ids = 'positives2id_{}nn.tsv'.format(knn_k)
g_hat_fname_names = 'positives_{}nn.tsv'.format(knn_k)

# Negative sampling: negative-to-positive ratio and the bern/unif switch
neg_rate = 5
bern = True

# Feature extractors to use; valid entries are 'pra', 'onesided' and 'anyrel'
feature_extractors = ['pra', 'onesided', 'anyrel']

# GPU settings (the value should be a string)
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [None]:
# Input locations: the benchmark dataset and the results folder of the trained model
dataset_path = './benchmarks/{}/'.format(dataset_name)
import_path = './results/{}/{}/{}/'.format(dataset_name, embedding_model.__name__, model_timestamp)

# PRA working directories (relative and absolute forms of the same folder)
pra_explain_path = import_path + '/pra_explain/'
pra_explain_path_abs = os.path.abspath(pra_explain_path)
experiment_specs_path = pra_explain_path + '/experiment_specs/'

# The split name encodes k, the negative rate and the sampling distribution
distribution = 'unif' if not bern else 'bern'
split_name = 'g_hat_{}nn_{}negrate_{}'.format(knn_k, neg_rate, distribution)

# Absolute paths of the G-hat files (id-encoded and name-encoded)
g_hat_dir = import_path + '/g_hat/'
g_hat_path_ids = os.path.abspath(g_hat_dir + g_hat_fname_ids)
g_hat_path_names = os.path.abspath(g_hat_dir + g_hat_fname_names)

# ensure dirs exist
def ensure_dir(d):
    """Create directory `d` (including missing parents) if it does not exist yet.

    The directory is created directly instead of checking for existence first,
    so there is no window between the check and the creation in which another
    process could create it. Written without `exist_ok` so it also runs on
    Python 2.

    Args:
        d: path of the directory to create.

    Raises:
        OSError: if the directory cannot be created, or if `d` already exists
            but is not a directory.
    """
    import errno  # local import: only needed here
    try:
        os.makedirs(d)
    except OSError as err:
        # Only "already exists as a directory" is acceptable; re-raise anything else.
        if err.errno != errno.EEXIST or not os.path.isdir(d):
            raise
# Make sure the PRA output folders exist before anything is written to them
ensure_dir(pra_explain_path)
ensure_dir(experiment_specs_path)

# Short extractor names (as used in `feature_extractors`) -> PRA extractor names
feature_extractor_dict = {
    'pra': 'PraFeatureExtractor',
    'onesided': 'OneSidedFeatureExtractor',
    'anyrel': 'AnyRelFeatureExtractor'
}

# Spec name: the split name followed by one '_<extractor>' suffix per extractor
spec_g_hat_name = split_name + '__' + ''.join('_' + name for name in feature_extractors)

# Comma-separated, double-quoted extractor names
feat_list = ['"{}"'.format(feature_extractor_dict[name]) for name in feature_extractors]
feat_extractor_string = ','.join(feat_list)

## Generate/Read Negative Examples

In [None]:
# Location of the cached corrupted (negative) training examples
corrupted_dirpath = dataset_path + '/corrupted/'
corrupted_filename = 'train2id_{}_{}to1.txt'.format(distribution, neg_rate)
corrupted_filepath = corrupted_dirpath + corrupted_filename

In [None]:
triple_label_columns = ['head', 'tail', 'relation', 'label']

if os.path.exists(corrupted_filepath):
    # Reuse the corrupted set that was saved by an earlier run
    train2id = pd.read_csv(
        corrupted_filepath,
        names=triple_label_columns,
        sep=' ',
        skiprows=0
    )
    print('Corrupted file already exists: {}.'.format(corrupted_filepath))
else:
    # Make sure the `corrupted` folder exists
    if not os.path.exists(corrupted_dirpath):
        os.makedirs(corrupted_dirpath)
    # Generate the corrupted set and save it to disk in the `corrupted` folder
    corrupted = dataset_tools.generate_corrupted_training_examples(
        dataset_path,
        neg_proportion=neg_rate,
        bern=bern
    )
    train2id = pd.DataFrame(corrupted)
    train2id.to_csv(
        corrupted_filepath,
        columns=triple_label_columns,
        index=False,
        header=False,
        sep=' '
    )
    print('Created corrupted file: {}.'.format(corrupted_filepath))

## Read validation and test examples

In [None]:
if dataset_name != 'FB15k':
    # Positives and negatives live in separate space-separated files; the first
    # line of each file is skipped and the label is attached here.
    triple_columns = ['head', 'tail', 'relation']
    valid2id_pos = pd.read_csv(dataset_path + 'valid2id.txt', sep=' ', skiprows=1, names=triple_columns)
    valid2id_neg = pd.read_csv(dataset_path + 'valid2id_neg.txt', sep=' ', skiprows=1, names=triple_columns)
    test2id_pos = pd.read_csv(dataset_path + 'test2id.txt', sep=' ', skiprows=1, names=triple_columns)
    test2id_neg = pd.read_csv(dataset_path + 'test2id_neg.txt', sep=' ', skiprows=1, names=triple_columns)

    # +1 marks positive examples, -1 marks negative examples
    valid2id_pos['label'] = 1
    test2id_pos['label'] = 1
    valid2id_neg['label'] = -1
    test2id_neg['label'] = -1

    valid2id = pd.concat((valid2id_pos, valid2id_neg))
    test2id = pd.concat((test2id_pos, test2id_neg))
else:
    # FB15k: tab-separated files that already carry the label column
    labeled_columns = ['head', 'tail', 'relation', 'label']
    valid2id = pd.read_csv(dataset_path + 'valid_neg.txt', sep='\t', skiprows=0, names=labeled_columns)
    test2id = pd.read_csv(dataset_path + 'test_neg.txt', sep='\t', skiprows=0, names=labeled_columns)

In [None]:
# Preview the first rows of each fold
display(train2id.head(), valid2id.head(), test2id.head())

## Restore working model

In [None]:
# Read the tab-separated model info file from the model's results folder
model_info_path = '{}/model_info.tsv'.format(import_path)
model_info_df = pd.read_csv(model_info_path, sep='\t')

In [None]:
# transform model info into dict with only one "row"
model_info = model_info_df.to_dict()
for key,d in model_info.iteritems():
    model_info[key] = d[0]

In [None]:
# Rebuild the configuration the model was trained with and load its variables.
# The calls are kept in their original order.
con = config.Config()

# Note: `dataset_path` is re-derived here from the stored model info
dataset_path = "./benchmarks/{}/".format(model_info['dataset_name'])
con.set_in_path(dataset_path)

# Evaluate triple classification only
con.set_test_link_prediction(False)
con.set_test_triple_classification(True)

n_threads = multiprocessing.cpu_count()
con.set_work_threads(n_threads)

embedding_dim = int(model_info['k'])
con.set_dimension(embedding_dim)
con.score_norm = model_info['score_norm']

con.init()
con.set_model(embedding_model)

# loading model via tensor library
checkpoint_path = "{}tf_model/model.vec.tf".format(import_path)
con.import_variables(checkpoint_path)

In [None]:
# Show the model info with one field per row
model_info_df.T

## Predict and Update Data

In [None]:
# Replace the label of every fold with the model's classification, encoded as
# 1 for a prediction equal to 1 and -1 for anything else.
for fold in (train2id, valid2id, test2id):
    predicted = con.classify(fold['head'], fold['tail'], fold['relation'])
    fold['label'] = predicted
    fold['label'] = fold['label'].map(lambda pred: 1 if pred == 1 else -1)

## Decode from id to names

In [None]:
# Read the id-encoded G-hat triples (tab-separated: head, relation, tail)
g_hat_df = pd.read_csv(
    g_hat_path_ids,
    names=['head', 'relation', 'tail'],
    sep='\t',
    skiprows=0
)

In [None]:
# Lookup tables between names and ids for entities and relations
entity2id, id2entity = dataset_tools.read_name2id_file(dataset_path + 'entity2id.txt')
relation2id, id2relation = dataset_tools.read_name2id_file(dataset_path + 'relation2id.txt')

# Replace ids by names, in place, in every fold and in G-hat
for fold in (train2id, valid2id, test2id, g_hat_df):
    for entity_column in ('head', 'tail'):
        fold[entity_column] = fold[entity_column].map(id2entity)
    fold['relation'] = fold['relation'].map(id2relation)

# WARNING: at this stage we have transformed the dataframes,
#   and entities and relations are not represented by ids anymore

Save Ĝ, decoded from ids to names, to a file

In [None]:
# Write the name-decoded G-hat triples as a headerless, tab-separated file
g_hat_df.to_csv(
    g_hat_path_names,
    columns=['head', 'relation', 'tail'],
    sep='\t',
    header=False,
    index=False
)

## Setup PRA Experiment Specs

In [None]:
# Build the PRA experiment spec (JSON) for G-hat and write it to the
# experiment_specs folder.
# - Literal JSON braces are doubled ({{ }}) because the template goes through
#   str.format; named fields keep the substitutions readable.
# - The "feature extractors" list is filled from `feat_extractor_string`, so the
#   spec matches the extractors encoded in `spec_g_hat_name`. It used to be
#   hard-coded to "PraFeatureExtractor", which ignored the `feature_extractors`
#   setting.
spec_g_hat = """
{{
    "graph": {{
        "name": "g_hat_{knn_k}nn",
        "relation sets": [
            {{
                "is kb": false,
                "relation file": "{relation_file}"
            }}
        ]
    }},
    "split": "{split}",
    "operation": {{
        "type": "create matrices",
        "features": {{
            "type": "subgraphs",
            "path finder": {{
                "type": "BfsPathFinder",
                "number of steps": 2
            }},
            "feature extractors": [
                {feature_extractors}
            ],
            "feature size": -1
        }}
    }},
    "output": {{ "output matrices": true }}
}}

""".format(
    knn_k=knn_k,
    relation_file=g_hat_path_names,
    split=split_name,
    feature_extractors=feat_extractor_string
)
spec_g_hat_fpath = '{}/experiment_specs/{}.json'.format(pra_explain_path, spec_g_hat_name)
with open(spec_g_hat_fpath, 'w') as f:
    f.write(spec_g_hat)
# Parenthesized call form prints the same text on Python 2 and Python 3.
print("Spec file written: {}".format(spec_g_hat_fpath))

## Create Split

Generate the split (inside `./results/`) with random negative examples (Bernoulli or uniform sampling)

In [None]:
from tools import pra_setup

# Hand the three (name-decoded, model-labelled) folds to the split writer
folds_by_name = {'train': train2id, 'valid': valid2id, 'test': test2id}
splits_dirpath = import_path+'/pra_explain/splits'
pra_setup.create_split(folds_by_name, splits_dirpath=splits_dirpath, split_name=split_name)

## Run PRA (extract features for split)

In [None]:
%%bash -s "$pra_explain_path_abs" "$spec_g_hat_name"
# $1: absolute path of the pra_explain directory; $2: name of the experiment spec.
# PRA_DIR can be exported to point at another PRA checkout; it defaults to the
# location that was previously hard-coded here.
PRA_DIR="${PRA_DIR:-/home/arthurcgusmao/Projects/xkbc/algorithms/pra/}"
# `&&` (instead of `;`) keeps sbt from running in the wrong directory when the cd fails.
(cd "$PRA_DIR" && sbt "runMain edu.cmu.ml.rtw.pra.experiments.ExperimentRunner $1 $2")

In [None]:
# Report the absolute location of the extracted features
features_dir = os.path.abspath(pra_explain_path)
print("Features (paths) extracted and saved into:\n{}".format(features_dir))

# Debug

In [None]:
# !rm /home/arthurcgusmao/Projects/xkbc/algorithms/OpenKE/./results/FB13/TransE/1524490825//pra_explain//results/ -r