In [None]:
import os
# Print the current working directory of the kernel.
# NOTE(review): presumably a sanity check that the project-local imports below (config, tools.*) will resolve — confirm.
print(os.getcwd())

import sys
import time
import pickle

import config
from tools.tools import restore_model, get_dirs, write_to_pkl
from tools.dataset_tools import Dataset

import random
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd
import tensorflow as tf
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.model_selection import KFold
from scipy import sparse

# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
# `InteractiveShell.magic()` is deprecated; `run_line_magic(name, args)` is
# the supported way to invoke a line magic from code.
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

In [None]:
# --- Experiment selection ---------------------------------------------------
# Benchmark options: [FB13, FB15k, FB15k-237, NELL186, WN11, WN18, WN18RR]
bench_dataset = 'FB15K237'
model = 'TransE'
timestamp_emb = '1906141142'
splits = 'g_2negrate_bern'

# --- Folder layout (everything lives under the user's home directory) --------
home_folder = os.path.expanduser('~')
project_folder = home_folder + f'/proj/XKE_results/{bench_dataset}/'
emb_folder = project_folder + f'embeddings/{model}/{timestamp_emb}/'
splits_folder = project_folder + f'splits/{splits}/'
emb_results_folder = project_folder + f'emb_results/{model}_{timestamp_emb}_{splits}/'

# Project dataset helper for the selected benchmark.
d = Dataset(bench_dataset)

# Create the output folder the first time this configuration is used.
results_folder_exists = os.path.exists(emb_results_folder)
if not results_folder_exists:
    os.makedirs(emb_results_folder)
    print('Creating folder: {}.'.format(emb_results_folder))

# Per-relation split statistics, indexed by the first column of the file.
split_statistics_path = splits_folder + 'split_statistics.tsv'
split_statistics = pd.read_csv(split_statistics_path, sep='\t', index_col=0)

In [None]:
# Record the TensorFlow version this notebook ran with.
print(tf.__version__)

In [None]:
# Display the split statistics table read from split_statistics.tsv.
split_statistics

In [None]:
# Build the project configuration object and call restore_model on it with the embedding folder.
# NOTE(review): restore_model presumably loads the trained embedding stored in emb_folder; both
# `con` and the returned `embd` are used for scoring further down — confirm how they relate in tools.tools.
con = config.Config()
embd = restore_model(con, emb_folder)

In [None]:
# Smoke check of classify_triples on the config object with three hand-picked triples.
# NOTE(review): argument order is presumably (heads, tails, relations), matching the test_step calls below — confirm.
con.classify_triples([0, 2, 7], [1, 3, 5], [0, 1, 2])

In [None]:
# Same smoke check through the restored model object; the result can be compared with the cell above.
embd.classify_triples([0, 2, 7], [1, 3, 5], [0, 1, 2])
# should return [True, True, False]

In [None]:
# Raw output of test_step for the same three triples; below, this output is stored as `emb_score`
# and compared against a per-relation threshold.
con.test_step([0, 2, 7], [1, 3, 5], [0, 1, 2])

In [None]:
# Empty per-relation metrics table (one row per relation is added later).
# Column groups, in file order:
#   thresholds, accuracies, F1 scores, then confusion-matrix counts for
#   rel/train, rel/test, cv_rel/train and cv_rel/test.
# "rel" columns use the model's own relation threshold; "cv_rel" columns use
# the cross-validated threshold.
emb_overall_metrics = pd.DataFrame(
    columns=[
        # thresholds
        'emb:rel_threshold', 'emb:cv_rel_threshold',
        # accuracy
        'emb:rel_train_acc', 'emb:cv_rel_train_acc',
        'emb:rel_test_acc', 'emb:cv_rel_test_acc',
        # F1
        'emb:rel_train_f1', 'emb:cv_rel_train_f1',
        'emb:rel_test_f1', 'emb:cv_rel_test_f1',
        # confusion-matrix counts, model threshold
        'emb:rel_train_tp', 'emb:rel_train_fp', 'emb:rel_train_fn', 'emb:rel_train_tn',
        'emb:rel_test_tp', 'emb:rel_test_fp', 'emb:rel_test_fn', 'emb:rel_test_tn',
        # confusion-matrix counts, cross-validated threshold
        'emb:cv_rel_train_tp', 'emb:cv_rel_train_fp', 'emb:cv_rel_train_fn', 'emb:cv_rel_train_tn',
        'emb:cv_rel_test_tp', 'emb:cv_rel_test_fp', 'emb:cv_rel_test_fn', 'emb:cv_rel_test_tn',
    ]
)

In [None]:
def get_cv_threshold(df, n_folds=5, n_thresholds=500, seed=None):
    """Pick the score threshold that best separates positive from negative triples.

    The frame is first balanced by randomly discarding surplus negatives
    (``label == 0``) until there are as many negatives as positives
    (``label == 1``). ``n_thresholds`` evenly spaced candidates between the
    smallest and largest ``emb_score`` of the *unbalanced* frame are then
    evaluated; a triple is predicted positive when its score is strictly below
    the candidate. The candidate with the highest accuracy, averaged over the
    held-out parts of a shuffled K-fold partition of the balanced rows, is
    returned (the lowest candidate wins on ties).

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``emb_score`` and ``label``. It is NOT
        modified, and its index may contain duplicate labels (e.g. the result
        of ``pd.concat`` without ``ignore_index``).
    n_folds : int, default 5
        Number of folds. Fewer folds are used when there are fewer balanced
        rows than ``n_folds``.
    n_thresholds : int, default 500
        Number of candidate thresholds.
    seed : int or None, default None
        Seeds both the negative down-sampling and the fold shuffling. ``None``
        keeps the previous behaviour (global ``random`` / numpy state).

    Returns
    -------
    numpy.float64
        The selected threshold.

    Raises
    ------
    ValueError
        If no rows are left to evaluate.
    """
    # Private copy with a fresh 0..n-1 index: the caller's frame must stay
    # intact, and label-based `drop` is only safe when index labels are unique.
    balanced = df.reset_index(drop=True)

    # Candidate range is taken from all rows, before down-sampling.
    min_score = balanced['emb_score'].min()
    max_score = balanced['emb_score'].max()

    # Balance the classes. When positives already outnumber negatives there is
    # nothing to drop (a negative sample size would make random.sample raise).
    negative_rows = list(balanced.index[balanced['label'] == 0])
    n_positive = int((balanced['label'] == 1).sum())
    n_surplus = max(len(negative_rows) - n_positive, 0)
    sampler = random if seed is None else random.Random(seed)
    balanced = balanced.drop(index=sampler.sample(negative_rows, k=n_surplus))

    if balanced.empty:
        raise ValueError('Cannot select a threshold from an empty set of triples.')

    label = balanced['label'].values
    emb_score = balanced['emb_score'].values

    # KFold needs at least 2 splits and at least as many rows as splits.
    if len(balanced) >= 2:
        kf = KFold(n_splits=min(n_folds, len(balanced)), shuffle=True, random_state=seed)
        splits = [test_index for _, test_index in kf.split(balanced)]
    else:
        splits = [np.arange(len(balanced))]

    thresh_values = np.linspace(min_score, max_score, num=n_thresholds)

    # Accuracy of every candidate on every fold, one fold at a time. Each step
    # builds an (n_thresholds x fold_size) boolean matrix instead of calling
    # accuracy_score once per (candidate, fold) pair.
    acc_sum = np.zeros(len(thresh_values))
    for split in splits:
        predicted_positive = emb_score[split][np.newaxis, :] < thresh_values[:, np.newaxis]
        acc_sum += (predicted_positive == label[split][np.newaxis, :]).mean(axis=1)
    accs = acc_sum / len(splits)

    return thresh_values[np.argmax(accs)]


In [None]:

def _score_split(model, split_path, rel_thresh):
    """Load one split file and score its triples with `model.test_step`.

    Adds the columns `rel_thresh`, `emb_score` and `emb_pred` (1 when the
    score is strictly below `rel_thresh`, else 0) and returns the frame.
    """
    split_df = pd.read_csv(split_path, sep='\t')
    split_df['rel_thresh'] = rel_thresh
    split_df['emb_score'] = model.test_step(split_df.e1.values, split_df.e2.values, split_df.rel.values)
    split_df['emb_pred'] = (split_df['emb_score'] < rel_thresh).astype(int)
    return split_df


def _add_cv_prediction(split_df, cv_thresh):
    """Add `emb_cv_thresh` and `emb_cv_pred` (score < cv_thresh, as 0/1) to `split_df` in place."""
    split_df['emb_cv_thresh'] = cv_thresh
    split_df['emb_cv_pred'] = (split_df['emb_score'] < cv_thresh).astype(int)


def _record_metrics(metrics, rel_id, stem, y_true, y_pred):
    """Write accuracy, F1 and confusion-matrix counts into row `rel_id` of `metrics`.

    `stem` is the column prefix, e.g. 'emb:rel_train' or 'emb:cv_rel_test'.
    """
    metrics.loc[rel_id, stem + '_acc'] = accuracy_score(y_true, y_pred)
    metrics.loc[rel_id, stem + '_f1'] = f1_score(y_true, y_pred)
    # labels=[0, 1] forces a 2x2 matrix even when a split contains a single
    # class; otherwise ravel() yields one value and the 4-way split fails.
    counts = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    for suffix, count in zip(('_tn', '_fp', '_fn', '_tp'), counts):
        metrics.loc[rel_id, stem + suffix] = count


# For every relation: score the train/valid/test splits, derive a
# cross-validated threshold from train(+valid), store the scored splits and
# collect the metrics for both thresholds.
for rel_id in tqdm(split_statistics.index):

    rel_name = d.rel_dict[rel_id]
    rel_dir = splits_folder + rel_name + '/'

    emb_results_rel_folder = emb_results_folder + rel_name + '/'
    os.makedirs(emb_results_rel_folder, exist_ok=True)

    has_valid = split_statistics.loc[rel_id, 'valid'] > 0
    has_test = split_statistics.loc[rel_id, 'test'] > 0

    # Threshold shipped with the embedding; rel_id looks like 'r82'.
    thresh = con.relThresh[int(rel_id[1:])]

    # Reset so a missing split never shows the previous relation's data.
    valid_df, test_df = pd.DataFrame(), pd.DataFrame()

    # train subset
    train_df = _score_split(con, rel_dir + 'train.tsv', thresh)

    # valid subset
    if has_valid:
        valid_df = _score_split(con, rel_dir + 'valid.tsv', thresh)
        # ignore_index: both frames are indexed from 0, and the negative
        # down-sampling in get_cv_threshold drops rows by index label.
        train_valid_df = pd.concat([train_df, valid_df], ignore_index=True)
    else:
        print('Valid split for rel {} not found, skipping!'.format(rel_name))
        # Copy, so that down-sampling can never remove rows from train_df.
        train_valid_df = train_df.copy()

    cv_thresh = get_cv_threshold(train_valid_df)

    _add_cv_prediction(train_df, cv_thresh)
    train_df.to_csv(emb_results_rel_folder + 'train.tsv', sep='\t')

    if has_valid:
        _add_cv_prediction(valid_df, cv_thresh)
        valid_df.to_csv(emb_results_rel_folder + 'valid.tsv', sep='\t')

    # test subset
    if has_test:
        test_df = _score_split(con, rel_dir + 'test.tsv', thresh)
        _add_cv_prediction(test_df, cv_thresh)
        test_df.to_csv(emb_results_rel_folder + 'test.tsv', sep='\t')
    else:
        print('Test split for rel {} not found, skipping!'.format(rel_name))

    emb_overall_metrics.loc[rel_id, 'emb:rel_threshold'] = thresh
    emb_overall_metrics.loc[rel_id, 'emb:cv_rel_threshold'] = cv_thresh

    _record_metrics(emb_overall_metrics, rel_id, 'emb:rel_train', train_df.label.values, train_df.emb_pred.values)
    _record_metrics(emb_overall_metrics, rel_id, 'emb:cv_rel_train', train_df.label.values, train_df.emb_cv_pred.values)

    if has_test:
        _record_metrics(emb_overall_metrics, rel_id, 'emb:rel_test', test_df.label.values, test_df.emb_pred.values)
        _record_metrics(emb_overall_metrics, rel_id, 'emb:cv_rel_test', test_df.label.values, test_df.emb_cv_pred.values)

# Relations without a test split have no test metrics; store them as 0.
emb_overall_metrics = emb_overall_metrics.fillna(0)
emb_overall_metrics.to_csv(emb_results_folder + 'emb_metrics.tsv', sep='\t')

In [None]:
# Display the per-relation metrics table filled in by the loop above.
emb_overall_metrics

In [None]:
# Pooled train accuracy for the `emb:rel_*` predictions:
# (TP + TN) / (TP + TN + FP + FN), with counts summed over all relations.
rel_train_correct = emb_overall_metrics['emb:rel_train_tp'].sum() + emb_overall_metrics['emb:rel_train_tn'].sum()
rel_train_total = rel_train_correct + emb_overall_metrics['emb:rel_train_fp'].sum() + emb_overall_metrics['emb:rel_train_fn'].sum()
train_acc_emb = rel_train_correct / rel_train_total
train_acc_emb

In [None]:
# Pooled test accuracy for the `emb:rel_*` predictions:
# (TP + TN) / (TP + TN + FP + FN), with counts summed over all relations.
rel_test_correct = emb_overall_metrics['emb:rel_test_tp'].sum() + emb_overall_metrics['emb:rel_test_tn'].sum()
rel_test_total = rel_test_correct + emb_overall_metrics['emb:rel_test_fp'].sum() + emb_overall_metrics['emb:rel_test_fn'].sum()
test_acc_emb = rel_test_correct / rel_test_total
test_acc_emb

In [None]:
# Pooled train accuracy for the `emb:cv_rel_*` (cross-validated threshold) predictions:
# (TP + TN) / (TP + TN + FP + FN), with counts summed over all relations.
cv_train_correct = emb_overall_metrics['emb:cv_rel_train_tp'].sum() + emb_overall_metrics['emb:cv_rel_train_tn'].sum()
cv_train_total = cv_train_correct + emb_overall_metrics['emb:cv_rel_train_fp'].sum() + emb_overall_metrics['emb:cv_rel_train_fn'].sum()
train_acc_emb_cv = cv_train_correct / cv_train_total
train_acc_emb_cv

In [None]:
# Pooled test accuracy for the `emb:cv_rel_*` (cross-validated threshold) predictions:
# (TP + TN) / (TP + TN + FP + FN), with counts summed over all relations.
cv_test_correct = emb_overall_metrics['emb:cv_rel_test_tp'].sum() + emb_overall_metrics['emb:cv_rel_test_tn'].sum()
cv_test_total = cv_test_correct + emb_overall_metrics['emb:cv_rel_test_fp'].sum() + emb_overall_metrics['emb:cv_rel_test_fn'].sum()
test_acc_emb_cv = cv_test_correct / cv_test_total
test_acc_emb_cv

In [None]:
# Spot check: threshold held by the config object for relation index 82.
con.relThresh[82]

In [None]:
# Spot check: the value recorded for relation 'r82' in the metrics table; the loop above copies it
# from con.relThresh[82], so it can be compared with the previous cell.
emb_overall_metrics.loc['r82', 'emb:rel_threshold']

## Build g_hat for this embedding
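Strictly speaking, this is not g_hat: it is the set of all triples deemed positive by the embedding, considering all heads and tails that fall within the type constraints of the dataset.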

In [None]:
# Note: strictly speaking this is not g_hat. It is the set of all triples deemed positive by the embedding, considering all heads and tails that are within the type constraints of the dataset.

In [None]:
# Read back emb_metrics.tsv (the file written by the evaluation loop), indexed by relation id.
# NOTE(review): presumably reloaded so this section can run without repeating the evaluation — confirm.
emb_metrics = pd.read_csv(emb_results_folder + 'emb_metrics.tsv', sep='\t', index_col=0)

In [None]:
# Spot check: threshold stored in the reloaded table for relation 'r82'.
emb_metrics.loc['r82', 'emb:rel_threshold']

In [None]:
# Preview the first rows of the reloaded metrics table.
emb_metrics.head()

In [None]:
# Partition the relation ids of the dataset by their first character:
# ids starting with 'i' go to inverse_rels, every other id to direct_rels.
direct_rels, inverse_rels = [], []
for rel_key in d.rel_dict.keys():
    if rel_key[0] == 'i':
        inverse_rels.append(rel_key)
    else:
        direct_rels.append(rel_key)

In [None]:
# Type constraints from the dataset helper; below they are indexed as
# type_constraints[rel]['head_int'] and type_constraints[rel]['tail_int'].
type_constraints = d.build_type_constraints()

In [None]:
# Spot check: number of entries in the 'tail_int' constraint of relation 'r0'.
len(type_constraints['r0']['tail_int'])

In [None]:
# Build one matrix per direct relation with embd.build_emb_rel_matrix, from
# the relation's head/tail type constraints and its threshold.
g_hat = dict()
for rel in tqdm(direct_rels):

    # A threshold of exactly 0 in the metrics table triggers the fallback to
    # the cross-validated threshold.
    # NOTE(review): 0 presumably marks "no model threshold available" — confirm.
    threshold = emb_metrics.loc[rel, 'emb:rel_threshold']
    if threshold == 0:
        threshold = emb_metrics.loc[rel, 'emb:cv_rel_threshold']

    g_hat[rel] = embd.build_emb_rel_matrix(
        type_constraints[rel]['head_int'],
        type_constraints[rel]['tail_int'],
        int(rel[1:]),  # numeric relation index, e.g. 'r82' -> 82
        threshold,
    )

In [None]:
# Total of the per-relation matrix sums over all direct relations.
edges = sum(g_hat[rel].sum() for rel in direct_rels)

print(f'g_hat has {edges} edges!')

In [None]:
# Serialize the g_hat dictionary next to the other embedding results.
print('Storing g_hat to disk.')
g_hat_path = emb_results_folder + 'g_hat.pkl'
with open(g_hat_path, 'wb') as g_hat_file:
    pickle.dump(g_hat, g_hat_file, protocol=pickle.HIGHEST_PROTOCOL)