In [None]:
import os

# Move into the project root before the project-local imports below
# (presumably so that `tools` / `sfe_ar` resolve — TODO confirm).
# The location can be overridden with the OPENKE_DIR environment variable;
# the historical hard-coded path remains the default.
project_dir = os.environ.get('OPENKE_DIR', '/home/andrey/proj/OpenKE/')
try:
    os.chdir(project_dir)
    print('Current working dir:', os.getcwd())
except OSError as err:
    # Still best effort (stay in the current directory), but report the
    # failure instead of silently swallowing every exception.
    print('Could not change working dir to {!r}: {}'.format(project_dir, err))

import pandas as pd
import numpy as np
from tools.tools import get_dirs, write_to_pkl, load_file, restore_model
from tools.dataset_tools import Dataset
from tools.explainer import Explainer
from tqdm import tqdm

from sfe_ar.tools.helpers import generate_timestamp

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 200)
# Enable autoreload so edits to imported project modules are picked up
# without restarting the kernel. `run_line_magic` is the supported
# replacement for the deprecated `InteractiveShell.magic`.
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

## SFE Timestamps



In [None]:
# Run configuration: the first five names are passed positionally to
# Explainer(...) in the next cell; kv_model is passed to e.load_kv_model(...).
splits = 'g_2negrate_bern'
timestamp_emb = '1906141142'  # presumably identifies a saved embedding run — TODO confirm
timestamp_sfe = '2010150748'  # presumably identifies a saved SFE run — TODO confirm
dataset = 'FB15K237'
emb_model = 'TransE'
kv_model = 'fb15k237_Google_news_d300.model'

# Alternative configuration (NELL186 / Analogy), kept commented out.
# timestamp_sfe = '2010161223'
# dataset = 'NELL186'
# emb_model = 'Analogy'
# timestamp_emb = '1904121223'
# kv_model = 'word2vec/NELL186_Google_news_d300.model'

In [None]:
e = Explainer(dataset, 
                emb_model, 
                timestamp_emb, 
                timestamp_sfe,
                splits, 
                method='fast')

In [None]:
e.load_kv_model(kv_model)

In [None]:
# Hyper-parameter grid registered on the Explainer via set_param_grid_logit().
# Every key maps to the list of candidate values to try; only l1_ratio and
# alpha have more than one candidate here.
# NOTE(review): the keys look like scikit-learn SGDClassifier arguments —
# confirm against tools.explainer which estimator consumes this grid.
param_grid_logit = [{
            'l1_ratio': [.1, .5, .7, .9, .95, .99, 1],
            'alpha': [0.01, 0.001, 0.0001],
            'loss': ["log"],
            'penalty': ["elasticnet"],
            'max_iter': [100000],
            'tol': [1e-3],
            'class_weight': ["balanced"],
            'n_jobs': [10]
}]

e.set_param_grid_logit(param_grid_logit)

In [None]:
e.set_prune_dict(
    {
        'pru:prunning':'force',
        # 'pru:node_relv_in':False,
        'pru:top_pop': 0.2,
        # 'pru:top_avg_rel_sim': 0.2,
        'xke:evaluate_benchmarks':False
    }
)

e.train_test_logit()

In [None]:
e.logit_models['r0']['xke']

# Build X_test_pred Bench Test

In [None]:
model_folder = e.logit_results_folder + '2011192127/'
xke_model = e.load_from_pkl(model_folder + 'logit_models')
xke = xke_model['r82']
feature_names = np.array(xke['feature_names'])

In [None]:
X = xke['X_test'].todense()

In [None]:
np.array(X[xke['y_test_emb'] == 1].sum(axis=0))[0]

In [None]:
coefs = xke['xke'].coef_[0]
intercept = xke['xke'].intercept_[0]

In [None]:
# One row per feature (path): its logit coefficient and its column position
# (`idx`) in the feature matrix, sorted by coefficient (descending), with
# zero-coefficient features dropped.
features = pd.DataFrame(
    {'coefs': coefs, 'idx': np.arange(len(feature_names))},
    index=pd.Index(feature_names, name='path'),
).sort_values(by='coefs', ascending=False)
features = features[features['coefs'] != 0]

# sort_values() returns a new frame, so nothing is modified in place on a
# filtered slice of `features` (in-place sorting of such a slice can trigger
# pandas' chained-assignment warning).
pos_features = features[features['coefs'] > 0].sort_values(by='coefs', ascending=False)
print('pos_features has {} features.'.format(len(pos_features)))

neg_features = features[features['coefs'] < 0].sort_values(by='coefs', ascending=True)
print('neg_features has {} features.'.format(len(neg_features)))

In [None]:
# X was produced by `.todense()` above, which (assuming xke['X_test'] is a
# scipy sparse matrix) yields an np.matrix; np.matrix has no `.toarray()`.
# np.asarray turns the (1, n) row into an ndarray whose first row is the
# 1-D feature vector of the first test example.
x0 = np.asarray(X[0])[0]
x0

In [None]:
# Decision value for the first test example. X is the dense matrix built with
# `.todense()` (no `.toarray()` method), so convert the row with np.asarray.
xke['xke'].decision_function(np.asarray(X[0]))[0]

In [None]:
idx = np.arange(len(feature_names))

In [None]:
# Features that are NOT active (value != 1) for x0 but carry a positive
# coefficient. The coefficient vector is named `coefs` (defined above);
# `coef` was never defined.
mask = (x0 != 1) & (coefs > 0)

In [None]:
selected_features = feature_names[mask]
selected_features

In [None]:
selected_pos_features = features[features.index.isin(selected_features)]

In [None]:
f_features = list(selected_pos_features.index.values)
f_coefs = list(selected_pos_features.coefs.values)
f_idx = list(selected_pos_features.idx.values)

In [None]:
f_features

In [None]:
f_coefs

In [None]:
xke['xke'].intercept_[0]

In [None]:
# Sum of the non-zero coefficients of the features active (== 1) in x0.
# Uses `coefs`, the coefficient vector defined above (`coef` was undefined).
coefs[(x0 == 1) & (coefs != 0)].sum()

In [None]:
xke['xke'].decision_function(X[0])

In [None]:
# Sum of all positive coefficients plus the intercept.
# Uses `coefs`, the coefficient vector defined above (`coef` was undefined).
coefs[coefs > 0].sum() + intercept

In [None]:
# Sum of all negative coefficients plus the intercept.
# Uses `coefs`, the coefficient vector defined above (`coef` was undefined).
coefs[coefs < 0].sum() + intercept

## Explanations file bench test

In [None]:
_, _ , _, X_test, y_test, y_test_emb, _, _, _ , _ , feature_names2, test_triples = e.fast_load_data(e.rel_dict['r82'])

In [None]:
model_folder = e.logit_results_folder + '2011240652/'
xke_model = e.load_from_pkl(model_folder + 'logit_models')
xke = xke_model['r82']
feature_names = np.array(xke['feature_names'])
XKEe_X_test = xke['XKEe_X_test']
coefs = xke['xke'].coef_

In [None]:
explain_model = pd.read_csv(model_folder + 'r82_coefs.tsv', sep='\t', index_col=1)

In [None]:
from io import StringIO
from csv import writer

In [None]:
# Build a long-format explanation table: one row per (test triple, active
# feature). Rows are streamed through an in-memory CSV and parsed back with
# pandas, so column dtypes are whatever read_csv infers.
cols = ['triple_id', 'triple', 'label', 'emb_label', 'XKE_label', 'XKEe_label', 'sim_index', 'coef', 'g_hat', 'explanation']

output = StringIO()
csv_writer = writer(output)

emb_labels = xke['y_test_emb']
xke_labels = xke['xke'].predict(X_test.toarray())
xkee_labels = xke['xke'].predict(XKEe_X_test.todense())

# Densify both feature matrices ONCE. The previous version called .todense()
# on the full matrices inside the loop, i.e. once per test triple.
X_test_dense = X_test.todense()
XKEe_X_test_dense = XKEe_X_test.todense()
nonzero_coefs = coefs != 0


def write_explanation_rows(row_prefix, feature_mask, g_hat_flag):
    """Write one CSV row per feature selected by `feature_mask`.

    row_prefix   -- the per-triple columns (triple_id ... XKEe_label)
    feature_mask -- 1-D boolean mask over `feature_names`
    g_hat_flag   -- value stored in the 'g_hat' column (0 or 1)
    """
    for feature in feature_names[feature_mask]:
        csv_writer.writerow(row_prefix + [
            explain_model.loc[feature, 'avg_rel_sim'],
            explain_model.loc[feature, 'coef'],
            g_hat_flag,
            e.explain_path(feature),
        ])


for j, (triple_id, emb_label, xke_label, xkee_label) in enumerate(
        zip(test_triples, emb_labels, xke_labels, xkee_labels)):
    triple = triple_id.split('_')
    triple_descr = str(e.ent_dict[triple[0]]) + ' | ' + str(e.ent_dict[triple[1]])
    label = triple[2]
    row_prefix = [triple_id, triple_descr, label, emb_label, xke_label, xkee_label]

    # Features present in the original feature row that have a non-zero coefficient.
    mask = np.array((X_test_dense[j] != 0) & nonzero_coefs)[0]
    write_explanation_rows(row_prefix, mask, 0)

    # Features whose value is larger in XKEe_X_test than in X_test for this
    # row (and have a non-zero coefficient); flagged with g_hat = 1.
    new_mask = np.array((XKEe_X_test_dense[j] > X_test_dense[j]) & nonzero_coefs)[0]
    write_explanation_rows(row_prefix, new_mask, 1)

output.seek(0)
df = pd.read_csv(output, sep=',',names=cols)
print(f'DataFrame with {df.shape[0]} rows and {df.shape[1]} cols.')
df.head()

In [None]:
sigmoide(df[df['triple_id'] == test_triples[3]].sort_values(by='coef', ascending=False).coef.sum())

In [None]:
def sigmoide(x):
    return 1 / (1 + np.exp(-x))

In [None]:
sigmoide(8)

In [None]:
e.emb.test_step([193], [1771], [82])

## Bench Test

In [None]:
import os

# Move into the project root before the project-local imports below
# (presumably so that `tools` / `sfe_ar` resolve — TODO confirm).
# The location can be overridden with the OPENKE_DIR environment variable;
# the historical hard-coded path remains the default.
project_dir = os.environ.get('OPENKE_DIR', '/home/andrey/proj/OpenKE/')
try:
    os.chdir(project_dir)
    print('Current working dir:', os.getcwd())
except OSError as err:
    # Still best effort (stay in the current directory), but report the
    # failure instead of silently swallowing every exception.
    print('Could not change working dir to {!r}: {}'.format(project_dir, err))

import pandas as pd
import numpy as np
from tools.tools import get_dirs, write_to_pkl, load_file, restore_model
from tools.dataset_tools import Dataset
from tools.explainer import Explainer
from tqdm import tqdm

from sfe_ar.tools.helpers import generate_timestamp

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 200)
# Enable autoreload so edits to imported project modules are picked up
# without restarting the kernel. `run_line_magic` is the supported
# replacement for the deprecated `InteractiveShell.magic`.
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

# Tensorflow Experiments

import sys
import time
import tensorflow as tf
print(tf.__version__)

In [None]:
timestamp_emb = '1906141142_a'
timestamp_sfe = '2010150748'
dataset = 'FB15K237'
emb_model = 'TransE'

e = Explainer(dataset, emb_model, timestamp_emb, timestamp_sfe, method='fast')
e.load_true_sets()
e.build_graph()
e.load_kbe()

In [None]:
e.emb.enhanced_true_tails([72], 82, 0.2)

In [None]:
e.emb.classify(72, 82, 0.2)

In [None]:
e.emb.test_step([72], [44], [82])

In [None]:
np.nonzero(e.emb.get_true_tails_np([72], 82))

In [None]:
e.names_dict['e72']

In [None]:
e.emb.calculate_thresholds()

In [None]:
e.emb.relThresh[82]

In [None]:
nodes = [0, 1]
rel = 82
rel_thresh = e.emb.relThresh[82]

In [None]:
# e.emb.optimized_node_expansion(nodes, rel, rel_thresh)

In [None]:
e.emb.classify_classes

In [None]:
a = tf.constant([True, True, False], tf.bool)
b = tf.constant([True, False, False], tf.bool)

In [None]:
with tf.Session() as sess:

    print(sess.run(tf.math.logical_or(a, b)))

In [None]:
h = tf.constant(0, shape=(e.emb.entTotal,))
r = tf.constant(82, shape=(e.emb.entTotal,))
t = tf.range(start=0, limit=e.emb.entTotal, dtype=tf.int32)

In [None]:
d = tf.constant([3])
# The original call left `shape=` empty (a syntax error) and bound the result
# to `e`, clobbering the Explainer instance that later cells rely on.
# NOTE(review): the intended target shape is not recoverable from the
# notebook; a scalar shape is assumed here — adjust if something else was meant.
d_reshaped = tf.reshape(d, shape=())
with tf.Session() as sess:
    print(sess.run(d_reshaped))

In [None]:
# Benchmark: score entTotal * n (head=72, rel=82, tail=i) triples in a single
# session.run call and report the wall-clock time.
n = 1000

start = time.time()
n_triples = e.emb.entTotal * n
# Allocate the index vectors directly in NumPy instead of materialising
# multi-million-element Python lists first (same values, same integer dtype).
h = np.full(n_triples, 72)
r = np.full(n_triples, 82)
t = np.arange(n_triples)

with e.emb.graph.as_default():
    with e.emb.sess.as_default():

        feed_dict = {
            e.emb.trainModel.predict_h: h,
            e.emb.trainModel.predict_t: t,
            e.emb.trainModel.predict_r: r,
        }
        res = e.emb.sess.run(e.emb.trainModel.predict, feed_dict)

print(f'Elapsed time: {time.time() - start}')
print(f'res has len={len(res)}')

In [None]:
n = 300

start = time.time()
h = np.array(list(range(n)))
r = 82

with e.emb.graph.as_default():
    with e.emb.sess.as_default():

        feed_dict = {
            # h_e:h,
            # r_e:r
        }
        for i in h:
            e.emb.h = i
            res = e.emb.sess.run(e.emb.trainModel.predict, feed_dict)

print(f'Elapsed time: {time.time() - start}')
print(f'res has len={len(res)}')

In [None]:
res

In [None]:
sys.getsizeof(h)

In [None]:
res.shape

In [None]:
# Inputs for the queue experiment below: head ids 0..n-1, a fixed relation
# id and that relation's threshold.
n = 10000
hs = np.arange(n, dtype=np.int32)  # same values/dtype as np.array(list(range(n)), dtype=np.int32)
rs = 82
rel_thresh = e.emb.relThresh[rs]
hs.shape

In [None]:
# Queue-fed graph: the ids in `hs` are enqueued into a FIFO queue (capacity 10,
# scalar int32 elements) by a queue runner; each dequeued id is then expanded
# into index vectors of length entTotal.
q = tf.FIFOQueue(capacity=10, 
                dtypes=tf.int32,
                shapes=[])
enqueue_op = q.enqueue_many(hs)
qr = tf.train.QueueRunner(q, [enqueue_op] * 1)
tf.train.add_queue_runner(qr)

# One scalar id per dequeue, reshaped to a length-1 vector so it can be tiled.
node = q.dequeue()
node_to_expand = tf.reshape(node, shape=(1,))

# `t` enumerates every id in [0, entTotal).
t = tf.range(start=0, limit=e.emb.entTotal, dtype=tf.int32)

rel = tf.constant(rs, shape=(1,))

# Repeat the dequeued id and the relation id entTotal times each.
n_ents = tf.constant([e.emb.entTotal])
heads = tf.tile(node_to_expand, n_ents)
rels = tf.tile(rel, n_ents)

# NOTE(review): the message text talks about items left in the queue, but the
# tensors listed in `data=` are heads, rels, t and the shape of heads.
data = tf.Print([heads], data=[heads, rels, t, tf.shape(heads)], message='This is how many items are left in q: ')
# # h = tf.reshape(node_to_expand, shape=(e.emb.entTotal,))
# r = tf.constant(rs, shape=(e.emb.entTotal,), dtype=tf.int32)
# t = tf.range(start=0, limit=e.emb.entTotal, dtype=tf.int32)
# `fg` is the op fetched by the session loop in the next cell; it depends on `data`.
fg = data + 1

In [None]:
start = time.time()

with tf.Session() as sess:

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    for i in range(len(hs)):
        sess.run(fg)
    print("We're here!")

    coord.request_stop()
    coord.join(threads)

print(f'Elapsed time: {time.time() - start}')

In [None]:
# https://stackoverflow.com/questions/38856292/tensorflow-queue-feed-order

In [None]:
import numpy as np
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)

In [None]:

# Toy experiment: build three (n_ents, len(h)) int64 grids from a fed vector
# of ids `h`, a single id `r` and the range 0..n_ents-1, then add them.
h = np.array([1, 2, 3])
r = np.array([3])
n_ents = 10
thres = 0.17  # not used in this cell

t_h = tf.placeholder(tf.int64)
t_ents = tf.constant([n_ents])
t_heads = tf.constant([1, len(h)])

# t_hh: the fed vector repeated n_ents times, reshaped to n_ents rows of len(h).
t_hh = tf.reshape(tf.tile(t_h, t_ents), [n_ents, len(h)])
# t_rr: constant built from `r` with shape (n_ents, len(h)).
t_rr = tf.constant(r, tf.int64, shape=(n_ents, len(h)))
# t_tt: 0..n_ents-1 as a column, tiled len(h) times along the second axis.
t_tt = tf.tile(tf.reshape(tf.range(start=0, limit=n_ents, dtype=tf.int64), [n_ents, 1]), t_heads)

out = t_hh + t_rr + t_tt

with tf.Session() as sess:


    print(sess.run(out, feed_dict={t_h:h}))




In [None]:
h

In [None]:
len(h)

In [None]:
# Dense boolean matrix used to gauge memory cost (size checked in the next
# cell). `np.bool` was a deprecated alias of the builtin and is gone from
# recent NumPy releases; the builtin `bool` yields the same dtype.
a = np.zeros(shape=(14500, 14500), dtype=bool)

In [None]:
sys.getsizeof(a) / 1000000

In [None]:
np.count_nonzero(e.emb.build_rel_ghat(82, e.emb.relThresh[82]))

In [None]:
e.emb.entTotal

In [None]:
# All ids 0..entTotal-1; np.arange avoids building an intermediate Python list.
heads = np.arange(e.emb.entTotal)

In [None]:
start = time.time()

e.emb.enhanced_true_tails(heads, 82, e.emb.relThresh[82])

print(f'Finished process in {time.time() - start}s.')

In [None]:
from scipy import sparse

In [None]:
# Dense entTotal x entTotal boolean matrix (size checked in the next cell).
# Uses the builtin `bool`: the `np.bool` alias is deprecated and absent from
# recent NumPy releases.
n = np.zeros(shape=(e.emb.entTotal, e.emb.entTotal), dtype=bool)

In [None]:
sys.getsizeof(n)

In [None]:
x = e.emb.build_emb_rel_matrix(heads, 82, e.emb.relThresh[82])

In [None]:
sys.getsizeof(x)

In [None]:
len(np.nonzero(r[:, 44])[0].tolist())

In [None]:
sys.getsizeof(r)

In [None]:
e.ent_dict['e44']

In [None]:
start = time.time()
for _ in range(100):
    np.dot(x[44].todense(), x[44].T.todense())
print(f'Elapsed Time: {time.time() - start}s.')

In [None]:
start = time.time()
for _ in range(100):
    set(np.nonzero(x[44])[0].tolist()).isdisjoint(set(np.nonzero(x[44])[0].tolist()))
print(f'Elapsed Time: {time.time() - start}s.')

In [None]:
sys.getsizeof(x)

In [None]:
len(np.nonzero(x.T[44])[1].tolist())

In [None]:
0.3 * 237

In [None]:
np.dot(x[72].todense(), x[44].T.todense())

In [None]:
# Empty entTotal x entTotal boolean LIL matrix, built directly from its shape
# instead of first allocating a dense all-zeros array and converting it.
# Also avoids the deprecated `np.bool` alias.
empty_graph = sparse.lil_matrix((e.emb.entTotal, e.emb.entTotal), dtype=bool)

In [None]:
graph = dict()
for rel in range(237):
    graph[rel] = empty_graph.copy()

In [None]:
sys.getsizeof(graph)

In [None]:
rels = [82, 23, 0]

for rel in rels:
    graph[rel][:] = e.emb.build_emb_rel_matrix(heads, rel, e.emb.relThresh[rel])

In [None]:
graph[82]

In [None]:
for i in range(237):
    print(f'rel {i} : {e.emb.relThresh[i]}')

In [None]:
start = time.time()
x = graph[82]
y = graph[82].T

for _ in range(100):
    np.nonzero(x[44])[1].tolist()
print(f'Elapsed Time: {time.time() - start}s.')
len(np.nonzero(x[44])[1].tolist())

In [None]:
sys.getsizeof(graph[220])

In [None]:
np.nonzero(graph[82][44])[1].tolist()

In [None]:
matrix = np.random.randint(0, 10, size=(10, 10))
matrix[4] = 0
matrix[2, 2:5] = 0
matrix

In [None]:
# Boolean column selector for the toy matrix above: True at the positions
# listed in `heads`. Uses the builtin `bool` (the `np.bool` alias is
# deprecated and absent from recent NumPy releases).
mask = np.zeros(shape=(10), dtype=bool)
heads = [2, 4]
mask[heads] = True
mask

In [None]:
matrix[:, mask]

In [None]:
np.nonzero(matrix[:, mask].sum(axis=1))[0].tolist()

# PathFinder BenchTest

In [1]:
import os

# Move into the project root before the project-local imports below
# (presumably so that `tools` / `sfe_ar` resolve — TODO confirm).
# The location can be overridden with the OPENKE_DIR environment variable;
# the historical hard-coded path remains the default.
project_dir = os.environ.get('OPENKE_DIR', '/home/andrey/proj/OpenKE/')
try:
    os.chdir(project_dir)
    print('Current working dir:', os.getcwd())
except OSError as err:
    # Still best effort (stay in the current directory), but report the
    # failure instead of silently swallowing every exception.
    print('Could not change working dir to {!r}: {}'.format(project_dir, err))

import pandas as pd
import numpy as np

from collections import defaultdict

from tools.tools import get_dirs, write_to_pkl, load_file, restore_model
from tools.dataset_tools import Dataset
from tools.explainer import Explainer
from tqdm import tqdm

from sfe_ar.tools.helpers import generate_timestamp

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 200)
# Enable autoreload so edits to imported project modules are picked up
# without restarting the kernel. `run_line_magic` is the supported
# replacement for the deprecated `InteractiveShell.magic`.
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

# Tensorflow Experiments

import sys
import time
# import tensorflow as tf
# print(tf.__version__)

Current working dir: /media/andrey/2a3d8a6c-48b6-437b-9410-7c45ccb1c802/andrey/proj/OpenKE
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
dataset = 'FB15K237'
splits = 'g_2negrate_bern'
kv_model = 'fb15k237_Google_news_d300.model'

emb_model = 'TransE'
timestamp_emb = '1906141142'
timestamp_sfe = '2009262117'

param_grid_logit = [{
            'l1_ratio': [.1, .5, .7, .9, .95, .99, 1],
            'alpha': [0.01, 0.001, 0.0001],
            'loss': ["log"],
            'penalty': ["elasticnet"],
            'max_iter': [100000],
            'tol': [1e-3],
            'class_weight': ["balanced"],
            'n_jobs': [10]
}]
params = {'pru:prunning':'force', 'xke:evaluate_benchmarks':False, 'pru:top_avg_rel_sim': 0.1}


e = Explainer(dataset, 
                emb_model, 
                timestamp_emb, 
                timestamp_sfe,
                splits, 
                method='fast')

e.load_kv_model(kv_model)
e.set_param_grid_logit(param_grid_logit)
e.set_prune_dict(params)
e.build_graph()
e.load_g_hat()

Loaded FB15K237 Dataset with 14541 entities and 474 relations.


Loaded sfe_model_info!
Loaded emb_model_info for 1906141142 timestamp!
Loaded Keyed-Vectors Similarity Model.
Computed rel and ent similarity matrices.

Building FB15K237 graph... loading triples... Loading FB15K237 true facts...  Done!

Train set has 272115 triples
Test set has 20466 triples
Valid set has 17535 triples
Done!

Graph built with 579300 edges.
Loadding g_hat file... Done!


In [3]:
e.build_g_hat_dict()

100%|██████████| 237/237 [00:24<00:00,  9.86it/s]Done!



In [114]:
path = 'r48'
triple = 'e72_e2410_1_r82'

In [115]:
t1 = time.time()
print(e.build_emb_path(triple, path))
print(f'Finished in {time.time()-t1} seconds.')

True
Finished in 0.023296833038330078 seconds.


In [116]:
t1 = time.time()
print(e.path_builder(triple, path))
print(f'Finished in {time.time()-t1} seconds.')

True
Finished in 0.00027823448181152344 seconds.


In [30]:
{2, 5} & {2, 3}

{2}

In [113]:
e.graph['e72']['r48']

['e2410']