# How to run:

*   **Libraries**, to import all the needed libraries (if Colab shows an error, restart the runtime).
*   If you want to create the toy graph,
    * **Toy Graph (with two communities)**, to create the graph with NetworkX
*   Otherwise,
    * **Import Graph from dataset**, to import one of the PyKEEN public datasets (Nations, Kinships, UMLS, FB15k-237, WN18, etc.)
    * **Create and visualize the KG**, to create the graph (from the chosen dataset) with NetworkX. You can make it *undirected* or *directed* by commenting out the appropriate line of code.
*   If you already have a trained model,
    * **Load model**, to load a pre-trained model on the chosen dataset.
*   Otherwise,
    *   **Divide into three groups**, to separate the triplets into *Positive*, *Negative* and *Different relation* sets. The *Different relation* set contains the non-existing triplets built from two neighboring entities (the entities are linked, but not by that relation), whereas the *Negative* set contains the triplets built from two entities that have no relation between them at all. Run this task only if you want information about the distribution of scores for each set.
    *   **Train the embeddings**, to train the embeddings on the training set retrieved from the chosen dataset.
*   **Extract the embeddings**, to obtain the entity/relation embeddings from the newly trained model.
*   **Reliability**, to define the function used to compute the reliability score on a set of triplets in three different ways: *Normalized Score*, *Non Normalized Score*, *Sigmoid Score*.
*   **LP Classifier**, to define the function used to create a classifier with training/test sets composed by triplets in the selected subgraph (or also for the entire graph).
*   **Reliability/Accuracy of subgraph**, to call the functions mentioned above on subgraphs generated by a random walk on the original graph. Make sure to use the correct version (*Undirected* or *Directed*), based on the choice made in the initial part of the process.
*   **Reliability/Accuracy of the graph**, in this way both functions will compute their values with the given information on the entire graph.
*   **Compute scores**, to compute scores of every single triple in the three generated sets.
*   **Plot scores**, to plot distributions of scores.

The last part (evaluation) concerns ranking: it computes metrics from the ranking of the triplets between every pair of entities in the graph.

# Libraries

In [None]:
# Install packages if they are not already found
! python -c "import pykeen" || pip install git+https://github.com/pykeen/pykeen.git
! python -c "import networkx" || pip install networkx

In [None]:
from pkg_resources import require
require('pykeen')

# Import libraries
import os
from google.colab import drive

import matplotlib.pyplot as plt
import scipy.stats as st
import torch

import pykeen
from pykeen.pipeline import pipeline
from pykeen.datasets import get_dataset
from pykeen.triples import TriplesFactory
from pykeen.models.predict import *

import networkx as nx
import pandas as pd
import csv

import numpy as np
import seaborn as sns

import math
import random
import itertools
import statistics
import time

# Access data from Google Drive
# force_remount is a boolean flag: pass True, not the string 'True'
# (the string only worked because any non-empty string is truthy).
drive.mount('/content/drive', force_remount=True)
# Root folder on Drive under which the trained models are stored
drive_path = '/content/drive/My Drive/KG_Reliability_Models/'

# Change the model
my_model = 'TransE'

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

Mounted at /content/drive


In [None]:
pykeen.env()

# Toy Graph (with two communities)

In [None]:
dataset = "ToyGraph"

nodes = list(range(101))

relations = ['a', 'b']


def _build_community(rel):
    """Build one fully connected community whose edges all carry relation `rel`.

    Node 0 keeps the plain label '0' so that it is shared by every community;
    any other node n is labelled '<rel>-<n>'.

    A MultiGraph is used so that edge data is indexed per edge key
    (G[u][v][key]['label']), the same layout as the MultiGraph/MultiDiGraph
    built in the "Create and visualize the KG" cell.

    Returns the NetworkX graph and the list of (head, relation, tail) triples.
    """
    def label(n):
        return str(n) if n == 0 else rel + '-' + str(n)

    graph = nx.MultiGraph()
    graph.add_nodes_from(label(n) for n in nodes)

    edges = []
    # Every unordered pair (n1 < n2) is linked exactly once
    for n1, n2 in itertools.combinations(nodes, 2):
        graph.add_edge(label(n1), label(n2), label = rel)
        edges.append((label(n1), rel, label(n2)))

    return graph, edges


a, a_edges = _build_community('a')
b, b_edges = _build_community('b')

# The two communities only share node '0'
G = nx.compose(a, b)

c_edges = a_edges + b_edges

print(len(a_edges))

tf = TriplesFactory.from_labeled_triples(np.array(c_edges))

print(len(tf.triples))

# Split with the default ratio of TriplesFactory.split()
train_triples, test_triples = tf.split()

nx.draw_networkx(G, with_labels=True, node_size=500)
plt.show()

# Import Graph from dataset

In [None]:
# Get all the triples of the KG

dataset = "WN18" # Kinships, UMLS, FB15k-237, WN18

# Load the dataset once and reuse it for the three partitions
# (calling get_dataset() per partition loads the same data three times).
kg_dataset = get_dataset(dataset=dataset)

tf_train = kg_dataset.training.triples.tolist()
tf_test  = kg_dataset.testing.triples.tolist()
tf_valid = kg_dataset.validation.triples.tolist()

tf_all = tf_train + tf_test + tf_valid

# Distinct relations in order of first appearance (dict keys keep insertion order).
# To limit the number of relations to 5, slice the result: relations = relations[:5]
relations = list(dict.fromkeys(t[1] for t in tf_all))

print("Relations: ", relations)

tf = TriplesFactory.from_labeled_triples(np.array(tf_all))

print("Total triples: ", len(tf.triples))

# Split the input KG into Train and LP+
#train, LP_pos = tf.split([.5, .5])

# Split the KG into Train and Test for the training phase.
# No ratio is passed, so the default ratio of TriplesFactory.split() applies;
# pass e.g. tf.split([.7, .3]) if an explicit 70/30 split is wanted.
train_triples, test_triples = tf.split()

# Create and visualize the KG

In [None]:
# Labeled (subject, predicate, object) triples of the whole KG
ds = tf.triples

print(type(ds))

df = pd.DataFrame(ds, columns = ['subject', 'predicate', 'object'])

# Undirected alternative (multi-edges allowed): uncomment to use it
#G = nx.MultiGraph()

# Directed graph that can hold several edges between the same pair of nodes
G = nx.MultiDiGraph()

# One edge per triple; the predicate is stored in the 'label' edge attribute
for subj, pred, obj in df.values:
    G.add_edge(subj, obj, label = pred)

entities = list(G.nodes())

# Info and plot KG
'''
print(nx.info(G))
print(type(G))
print(G.nodes)
print('Total triples in the KG:', ds.shape)
print([val for (node, val) in G.degree()])

plt.figure(figsize=(12,12))
pos = nx.spring_layout(G)
nx.draw(G, with_labels=True, node_color='skyblue', edge_cmap=plt.cm.Blues, pos = pos)
plt.show()
'''

# Load model

In [None]:
# Location of the saved model: <drive_path><my_model>/<dataset>/trained_model.pkl
# Insert the name of a trained model
model_file = '/'.join([drive_path + my_model, dataset, 'trained_model.pkl'])

# NOTE: torch.load unpickles the file, so only load models you saved yourself
model = torch.load(model_file)

print(model)

# Divide into three groups
###### (If you want to get information about distributions)



In [None]:
# To obtain the running time of this task
start_time = time.time()

def nodes_connected(u, v):
    """Return True when `u` is one of the nodes yielded by G.neighbors(v)."""
    return u in G.neighbors(v)

entities  = list(G.nodes())
#relations = ['a', 'b'] # Only for the toy graph, comment if you use a dataset

# Add the heads of the training triples that are not graph nodes yet
# (a set makes the membership test O(1) instead of scanning the list)
known_entities = set(entities)
for t in train_triples.triples:
    if t[0] not in known_entities:
        known_entities.add(t[0])
        entities.append(t[0])

# Existing triples in the KG
t = tf.triples.tolist()

# Positive triples with fixed relations
pos_triples = t

# Hashable copy of the positives for fast membership tests
pos_lookup = {tuple(triple) for triple in pos_triples}

# Only the relation is changed in an existing triple between two nodes
diff_rel_triples = []

# Non existing triples in the KG (no triples between two nodes)
neg_triples = []

# Scores of all triples
score_pos       = []
score_diff_rel  = []
score_neg       = []

# Number of (head, relation, tail) candidates examined
c = 0

# Every unordered pair of entities, in the order of the `entities` list
for head, tail in itertools.combinations(entities, 2):
    # The pair is "connected" if an edge exists in either direction;
    # this does not depend on the relation, so it is computed once per pair
    connected = nodes_connected(head, tail) or nodes_connected(tail, head)

    for r in relations:
        if (head, r, tail) not in pos_lookup:
            # Each non-positive candidate goes into exactly one set
            if connected:
                diff_rel_triples.append([head, r, tail])
            else:
                neg_triples.append([head, r, tail])

        c += 1

all_triples = pos_triples + diff_rel_triples + neg_triples

tf_all_triples = TriplesFactory.from_labeled_triples(np.array(all_triples))

print("Positive: " + str(len(pos_triples)) + 
      ", Different relation: " + str(len(diff_rel_triples)) + 
      ", Negative: " +  str(len(neg_triples)) + "\n")

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
## GENERATE NEGATIVE SAMPLES (it is not necessary anymore) ##

'''

import numpy as np

entities  = []
relations = []

for t in train_triples.triples:
    if t[0] not in entities:
        entities.append(t[0])
    if t[1] not in relations:
        relations.append(t[1])

print(entities, relations)

all_triples     = []
#all_triples_rev = []

for r in range(len(relations)):
    for e in range(len(entities)):
        for x in range(e+1, len(entities)):
            all_triples.append([entities[e], relations[r], entities[x]])
        for y in range(len(entities)):
            all_triples_rev.append([entities[e], relations[r], entities[y]])

all_triples = list(itertools.combinations(itertools.chain(entities, relations, entities), 3))

print(len(all_triples))

positive_samples = (LP_pos.triples).tolist()
negative_samples = []

for t in all_triples:
    if t not in positive_samples:
        negative_samples.append(t)

LP_all = np.array(all_triples)

print(len(LP_all), len(LP_pos.triples), len(LP_neg))


#LP_all_rev  = np.array(all_triples_rev)

LP_neg = np.array(negative_samples)

'''

In [None]:
## SPLIT AND MERGE TRAINING/TEST SETS FOR LP TASK (it is not necessary anymore) ##

'''

LP_neg = np.array(diff_rel_triples + neg_triples)

trainLP_pos = tf.triples[:int(len(tf.triples)*0.7)]
testLP_pos = tf.triples[int(len(tf.triples)*0.7):]

trainLP_neg = LP_neg[:int(len(LP_neg)*0.7)]
testLP_neg  = LP_neg[int(len(LP_neg)*0.7):]

trainfLP_pos = TriplesFactory.from_labeled_triples(np.array(trainLP_pos)) 
trainfLP_neg = TriplesFactory.from_labeled_triples(np.array(trainLP_neg)) 

train_LP = trainLP_pos.tolist()
for l in trainLP_neg.tolist():
    train_LP.append(l)

trainf_LP = TriplesFactory.from_labeled_triples(np.array(train_LP)) 

test_LP = testLP_pos.tolist()

for l in testLP_neg.tolist():
    test_LP.append(l)

testf_LP = TriplesFactory.from_labeled_triples(np.array(test_LP))

print(len(train_LP), len(test_LP))

'''

# Train the embeddings

In [None]:
# Options forwarded to the negative sampler
sampler_options = dict(
    filtered=True,
)

# Training configuration
training_options = dict(
    num_epochs=200,
    use_tqdm_batch=False,
)

# Train `my_model` on the training split and evaluate it on the test split
pipeline_result = pipeline(
    training=train_triples,
    testing=test_triples,
    model=my_model,
    training_loop='sLCWA',
    negative_sampler='basic',
    negative_sampler_kwargs=sampler_options,
    training_kwargs=training_options,
    # Runtime configuration
    random_seed=1235,
    device='cpu',
)

# Results are stored under <drive_path><my_model>/<dataset>
output_directory = drive_path + my_model + '/' + dataset
pipeline_result.save_to_directory(output_directory)

model = pipeline_result.model

# Extract embeddings

In [None]:
# Entity embeddings: look up every entity id, then map label -> vector
entity_ids = torch.LongTensor(range(tf.num_entities))
e_emb = model.entity_embeddings
e_emb.cpu()
e_emb_numpy = e_emb(entity_ids).detach().numpy()
entity2embedding = {
    tf.entity_id_to_label[eid]: list(e_emb_numpy[eid])
    for eid in range(tf.num_entities)
}

# Relation embeddings: same procedure on the relation ids
relation_ids = torch.LongTensor(range(tf.num_relations))
r_emb = model.relation_embeddings
r_emb.cpu()
r_emb_numpy = r_emb(relation_ids).detach().numpy()
relation2embedding = {
    tf.relation_id_to_label[rid]: list(r_emb_numpy[rid])
    for rid in range(tf.num_relations)
}

# Reliability

In [None]:
# Function to reject outliers
def reject_outliers(data, m=2):
    """Return the values of `data` closer than `m` standard deviations to the mean.

    data: array-like of numbers (a list is accepted as well as a NumPy array).
    m:    width of the accepted band, in standard deviations.

    The result is always a flat NumPy array (boolean-mask indexing).
    When all values are identical the standard deviation is 0 and nothing
    can be an outlier, so every value is kept instead of returning an empty array.
    """
    values = np.asarray(data)
    spread = np.std(values)

    if spread == 0:
        # A strict "< 0" test would reject everything; keep all values instead
        return values[np.ones(values.shape, dtype=bool)]

    return values[np.abs(values - np.mean(values)) < m * spread]

# Compute the sigmoid on top the scores
def sigmoid(x, max_score, min_score):
    """Rescale `x` from [min_score, max_score] to [-6, 6] and apply the logistic function.

    x:          score to transform (a number or a size-1 array).
    max_score:  score mapped to +6.
    min_score:  score mapped to -6.

    Returns a float in [0, 1]. Scores outside [min_score, max_score] are not
    clipped: they simply land further out on the sigmoid.
    When max_score == min_score the rescaling is undefined, so the neutral
    value 0.5 is returned.
    """
    if max_score == min_score:
        return 0.5

    # Normalize x between -6 and 6
    # x' = (b - a) * (x - min(x)) / (max(x) - min(x)) + a
    x_norm = float(12 * (x - min_score) / (max_score - min_score) - 6)

    # Apply the sigmoid, always exponentiating a non-positive number so that
    # scores far below min_score cannot overflow math.exp
    if x_norm >= 0:
        return 1 / (1 + math.exp(-x_norm))

    z = math.exp(x_norm)
    return z / (1 + z)

# Check if two nodes are connected
def nodes_connected(u, v):
    """Return True when `u` is one of the nodes yielded by G.neighbors(v).

    Reads the module-level graph `G`. The test is one-directional: it only
    looks at the neighbors of `v`, so callers that need a symmetric check
    must also call nodes_connected(v, u).
    """
    return u in G.neighbors(v)

# Score a list of labeled triples in one batch
def _score_labeled_triples(triples):
    """Return the model score of every labeled triple as a flat float64 array.

    Labels are translated with the mappings of the global `tf`, so the ids
    match the ones the global `model` works with. An empty input gives an
    empty array.
    """
    if len(triples) == 0:
        return np.empty(0, dtype=np.float64)

    ids = torch.LongTensor([
        [tf.entity_to_id[h], tf.relation_to_id[r], tf.entity_to_id[t]]
        for h, r, t in triples
    ])
    return model.score_hrt(ids).detach().numpy().astype(np.float64).reshape(-1)


# Min-max normalization clipped to [0, 1]
def _min_max_clip(scores, min_score, max_score):
    """Map `scores` to [0, 1]; values beyond the bounds become 0 or 1.

    When both bounds coincide the ratio is undefined: scores equal to the
    bound get the neutral value 0.5.
    """
    span = max_score - min_score
    if span == 0:
        return np.where(scores > max_score, 1.0,
                        np.where(scores < min_score, 0.0, 0.5))
    return np.clip((scores - min_score) / span, 0.0, 1.0)


# Compute the reliability measure    
def reliability(G, pos_triples, diff_rel_triples, neg_triples):
    """Print the reliability of a set of triples and record it in the global lists.

    G:                graph the triples come from (not used by the computation,
                      kept so that existing calls keep working).
    pos_triples:      labeled triples that exist in the KG.
    diff_rel_triples: non-existing triples between two linked entities.
    neg_triples:      non-existing triples between two unlinked entities.

    A positive triple contributes its (normalized or sigmoid) score; a
    different-relation or negative triple contributes one minus its score.
    The averages are printed and appended to the global lists `rel_scores`,
    `rel_scores_norm`, `rel_scores_sigm` and `rel_scores_sigm_end`, which the
    calling cell must have created.
    NOTE(review): `rel_scores_sigm_end` is not initialized next to the other
    three lists in the cells that reset them - confirm it is defined before calling.

    Uses the globals `model` and `tf`. Returns None.
    Raises ValueError when the three lists are all empty.
    """

    # To obtain the running time of this task
    start_time = time.time()

    print("Positive: ", len(pos_triples))
    print("Negative: ", len(neg_triples))
    print("Different relation: " + str(len(diff_rel_triples)) + "\n")

    n_total = len(pos_triples) + len(diff_rel_triples) + len(neg_triples)
    if n_total == 0:
        raise ValueError("reliability() needs at least one triple")

    # Raw scores of each set (one batched model call per set)
    score_pos      = _score_labeled_triples(pos_triples)
    score_diff_rel = _score_labeled_triples(diff_rel_triples)
    score_neg      = _score_labeled_triples(neg_triples)

    all_scores = np.concatenate([score_pos, score_diff_rel, score_neg])

    max_score = all_scores.max()
    min_score = all_scores.min()

    print("Mean and std before: " + str(np.mean(all_scores)) + ", " + str(np.std(all_scores)))
    print(max_score, min_score)

    # The normalization bounds are taken after discarding the outliers
    new_scores = reject_outliers(all_scores)
    if len(new_scores) == 0:
        # Nothing survived the filter (e.g. all scores equal): keep every score
        new_scores = all_scores

    max_score = new_scores.max()
    min_score = new_scores.min()

    print("Mean and std after: " + str(np.mean(new_scores)) + ", " + str(np.std(new_scores)))
    print(max_score, min_score)

    # Normalize vectors
    # zi = (xi - min(x)) / (max(x) - min(x)), clipped to [0, 1], where
    # xi is the current value
    # x is the array of scores (outliers removed)
    norm_pos      = _min_max_clip(score_pos, min_score, max_score)
    norm_diff_rel = _min_max_clip(score_diff_rel, min_score, max_score)
    norm_neg      = _min_max_clip(score_neg, min_score, max_score)

    def sigmoid_all(scores):
        return np.array([sigmoid(s, max_score, min_score) for s in scores], dtype=np.float64)

    # Per-triple contributions: score for positives, 1 - score for the others
    rel_norm = [norm_pos, 1 - norm_diff_rel, 1 - norm_neg]
    rel_sigm = [sigmoid_all(score_pos), 1 - sigmoid_all(score_diff_rel), 1 - sigmoid_all(score_neg)]

    # Averages over all the triples
    sum_tot      = float(all_scores.sum()) / n_total
    sum_tot_norm = float(sum(part.sum() for part in rel_norm)) / n_total
    sum_tot_sigm = float(sum(part.sum() for part in rel_sigm)) / n_total

    # NOTE(review): the diff_rel contributions are left out of the sum but the
    # divisor still counts the diff_rel triples - confirm this is intended.
    sum_tot_norm_nodiff = float(rel_norm[0].sum() + rel_norm[2].sum()) / n_total

    # Averages "at the end": mean of the per-set means, over the non-empty sets
    set_means_norm = [float(part.mean()) for part in rel_norm if part.size]
    set_means_sigm = [float(part.mean()) for part in rel_sigm if part.size]

    sum_tot_norm_2 = sum(set_means_norm) / len(set_means_norm)
    sum_tot_sigm_2 = sum(set_means_sigm) / len(set_means_sigm)

    rel_scores.append(float("{:.3f}".format(sum_tot)))
    rel_scores_norm.append(float("{:.3f}".format(sum_tot_norm)))
    rel_scores_sigm.append(float("{:.3f}".format(sum_tot_sigm)))
    rel_scores_sigm_end.append(float("{:.3f}".format(sum_tot_sigm_2)))

    print("Reliability of subgraph              -> %.2f" % sum_tot)
    print("Normalized Reliability of subgraph   -> %.2f" % sum_tot_norm)
    print("Sigmoid Reliability of subgraph      -> %.2f" % sum_tot_sigm + "\n")

    print("Normalized Reliability of subgraph (without diff_rel)   -> %.2f" % sum_tot_norm_nodiff)
    print("Normalized Reliability (end)         -> %.2f" % sum_tot_norm_2)
    print("Sigmoid Reliability (end)            -> %.2f" % sum_tot_sigm_2 + "\n")


    print("--- %s seconds RELIABILITY ---\n" % (time.time() - start_time))


# Total reliability of the KG (TODO)
'''
def total_reliability(G, pos_triples, diff_rel_triples, neg_triples):

    score_pos       = []
    score_diff_rel  = []
    score_neg       = []

    print("Positive: ", len(pos_triples))
    print("Negative: ", len(diff_rel_triples))
    print("Different relation: " + str(len(neg_triples)) + "\n")

    tot_triples = pos_triples + diff_rel_triples + neg_triples

    sum_tot      = 0 # Sum of all scores
    sum_tot_norm = 0 # Sum of all normalized scores

    all_scores = pipeline_result.model.score_hrt(TriplesFactory.from_labeled_triples(np.array(tot_triples)).mapped_triples)
    max_score = max(all_scores.detach().numpy())[0]
    min_score = min(all_scores.detach().numpy())[0]

    if len(pos_triples) > 0:
        for i in range(0, len(pos_triples)):   
            score_triple = pipeline_result.model.score_hrt(torch.LongTensor([[tf.entity_to_id[pos_triples[i][0]], 
                                                                              tf.relation_to_id[pos_triples[i][1]], 
                                                                              tf.entity_to_id[pos_triples[i][2]]]]))
            score_np     = score_triple.detach().numpy()
            score        = (score_np - min_score) / (max_score - min_score)
            score_pos.append(score_np)
            sum_tot      += float(score_np)
            sum_tot_norm += float(score)
        
    if len(diff_rel_triples) > 0:
        for i in range(0, len(diff_rel_triples)):
            score_triple = pipeline_result.model.score_hrt(torch.LongTensor([[tf.entity_to_id[diff_rel_triples[i][0]], 
                                                                              tf.relation_to_id[diff_rel_triples[i][1]], 
                                                                              tf.entity_to_id[diff_rel_triples[i][2]]]]))
            score_np     = score_triple.detach().numpy()
            score        = (score_np - min_score) / (max_score - min_score)
            score_diff_rel.append(score_np)
            sum_tot      += 1 - float(score_np)
            sum_tot_norm += 1 - float(score)

    if len(neg_triples) > 0:
        for i in range(0, len(neg_triples)):
            score_triple = pipeline_result.model.score_hrt(torch.LongTensor([[tf.entity_to_id[neg_triples[i][0]], 
                                                                              tf.relation_to_id[neg_triples[i][1]], 
                                                                              tf.entity_to_id[neg_triples[i][2]]]]))
            score_np     = score_triple.detach().numpy()
            score        = (score_np - min_score) / (max_score - min_score)
            score_neg.append(score_np)
            sum_tot      += 1 - float(score_np)
            sum_tot_norm += 1 - float(score)

    df = pd.DataFrame({'pos': score_pos[:len(score_pos)],
                       'diff_rel': score_diff_rel[:len(score_pos)],
                       'neg': score_neg[:len(score_pos)]},
                      columns=['pos','diff_rel','neg'])

    df.to_csv('scores.csv', mode='a', index=False, header=False)

    sum_tot      = sum_tot / (len(pos_triples) + len(diff_rel_triples) + len(neg_triples))

    sum_tot_norm = sum_tot_norm / (len(pos_triples))

    print("Reliability of the graph -> %.2f" % sum_tot)
    print("Normalized Reliability of the graph -> %.2f" % sum_tot_norm + "\n")
'''

In [None]:
(20-np.mean([1,2,3,4,5]))/(5*np.std([1,2,3,4,5]))

2.4041630560342613

# LP Classifier

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
 
%matplotlib inline

from sklearn import datasets, metrics, model_selection, svm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.multioutput import ClassifierChain #multiclasse ?
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, ComplementNB
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, plot_confusion_matrix, accuracy_score
from scipy.stats import randint
from sklearn.model_selection import cross_val_score

def accuracy(G, pos_triples, diff_rel_triples, neg_triples):
    """Train link-prediction classifiers on triple embeddings and show their scores.

    Each triple becomes one row: head embedding + relation embedding + tail
    embedding (from the globals `entity2embedding` / `relation2embedding`),
    followed by the class label (1 = existing triple, 0 = non-existing).

    G:                graph the triples come from (not used by the computation,
                      kept so that existing calls keep working).
    pos_triples:      list of existing triples (label 1).
    diff_rel_triples: list of non-existing triples between linked entities (label 0).
    neg_triples:      list of non-existing triples between unlinked entities (label 0).

    The rows are split 67/33 into train/test, the first four classifiers of
    the catalogue are fitted, a ROC curve is plotted for each and a table with
    train/test accuracy and training time is displayed. Returns None.
    Raises ValueError when there is no row to train on.
    """

    def embed(triple, label):
        return [*entity2embedding[triple[0]],
                *relation2embedding[triple[1]],
                *entity2embedding[triple[2]],
                label]

    # NOTE(review): the positives are padded with the first len(neg_triples)
    # triples of the global `tf_all`, which are not restricted to the triples
    # passed in - confirm this balancing is intended.
    ds_pos = [embed(t, 1) for t in pos_triples + tf_all[:len(neg_triples)]]
    ds_neg = [embed(t, 0) for t in diff_rel_triples + neg_triples]

    # Keep at most as many negatives as positives
    ds = ds_pos + ds_neg[:len(ds_pos)]

    if not ds:
        raise ValueError("accuracy() needs at least one triple to build the dataset")

    data = np.array(ds)

    X = data[:, :-1]
    y = data[:, -1]

    # split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

    # Catalogue of classifiers; only the first `no_classifiers` entries are trained
    dict_classifiers = {
        # NOTE(review): SVC() uses its default kernel, not an explicitly linear one
        "Linear SVM": SVC(),
        "Logistic Regression": LogisticRegression(),
        "Nearest Neighbors": KNeighborsClassifier(),
        "Decision Tree": tree.DecisionTreeClassifier(),
        "Random Forest": RandomForestClassifier(n_estimators=1000,min_samples_leaf=2, min_samples_split=10,
                                                max_features = 'sqrt', max_depth=10,bootstrap=True),
        "Neural Net": MLPClassifier(alpha = 1),
        "Naive Bayes": GaussianNB(),
        "AdaBoost": AdaBoostClassifier(n_estimators=54,learning_rate=0.1),
        "QDA": QuadraticDiscriminantAnalysis(),
        "Gaussian Process": GaussianProcessClassifier(),
        #"Bagging":BaggingClassifier(),
        # An instance is required here: the bare class cannot be fitted
        "BernoulliNB": BernoulliNB(),
        # This entry wraps CalibratedClassifierCV, so it is labelled accordingly
        "Calibrated Classifier": CalibratedClassifierCV(),
        #"CategoricalNB":CategoricalNB(),
        #"Classifier chain":ClassifierChain(),
        "ComplementNB": ComplementNB(),
        #"Dummy classifier":DummyClassifier,
        "ExtraTree":tree.ExtraTreeClassifier(),
        #"ExtraTrees":tree.ExtraTreesClassifier(),
        "GaussianNB":GaussianNB()

    }

    def plot_roc(classifier, X_eval, Y_eval):
        # Newer scikit-learn releases expose RocCurveDisplay.from_estimator;
        # fall back to the older plot_roc_curve helper when it is not there
        roc_display = getattr(metrics, "RocCurveDisplay", None)
        if roc_display is not None and hasattr(roc_display, "from_estimator"):
            roc_display.from_estimator(classifier, X_eval, Y_eval)
        else:
            metrics.plot_roc_curve(classifier, X_eval, Y_eval)
        plt.show()

    def batch_classify(X_train, Y_train, X_test, Y_test, no_classifiers, verbose = True):
        """
        Fit the first `no_classifiers` entries of dict_classifiers on the train
        set and evaluate them on the test set.

        The trained models, their accuracies and training times are returned in
        a dictionary keyed by classifier name, which is easy to save as a whole
        with the pickle module.

        Usually the SVM, Random Forest and Gradient Boosting classifiers take
        quite some time to train, so it is best to try them on a smaller
        dataset first and decide whether to comment them out based on the test
        accuracy.
        """

        dict_models = {}
        for classifier_name, classifier in list(dict_classifiers.items())[:no_classifiers]:
            t_start = time.process_time()
            classifier.fit(X_train, Y_train)
            t_end = time.process_time()

            plot_roc(classifier, X_test, Y_test)

            t_diff = t_end - t_start
            train_score = classifier.score(X_train, Y_train)
            test_score = classifier.score(X_test, Y_test)

            dict_models[classifier_name] = {'model': classifier, 'train_score': train_score, 'test_score': test_score, 'train_time': t_diff}
            if verbose:
                print("trained {c} in {f:.2f} s".format(c=classifier_name, f=t_diff))

        return dict_models

    def display_dict_models(dict_models, sort_by='test_score'):
        """Display one row per trained classifier, best `sort_by` first."""
        cls = list(dict_models)

        # Build the table directly from the columns so that every column gets
        # a dtype matching its content (names as text, scores as numbers)
        df_ = pd.DataFrame({
            'classifier': cls,
            'train_score': [dict_models[key]['train_score'] for key in cls],
            'test_score': [dict_models[key]['test_score'] for key in cls],
            'train_time': [dict_models[key]['train_time'] for key in cls],
        }, columns = ['classifier', 'train_score', 'test_score', 'train_time'])

        display(df_.sort_values(by=sort_by, ascending=False))

    dict_models = batch_classify(X_train, y_train, X_test, y_test, no_classifiers = 4)
    display_dict_models(dict_models)


# Reliability/Accuracy of subgraphs

In [None]:
# TRIPLES FOR THE TRAINING SET OF THE LP CLASSIFIER #

# Number of nodes for each subgraph
k = 20

# Number of subgraphs
h = 10

# List of subgraphs
subgraphs = []

## RANDOM WALK ##

all_nodes = list(G.nodes())

# A walk cannot visit more distinct nodes than the graph contains
walk_length = min(k, len(all_nodes))

while len(subgraphs) < h:
    # Random starting node, which is also the first visited node
    node = random.choice(all_nodes)
    visited = [node]
    while len(visited) < walk_length:
        # Unvisited neighbors of the current node
        neighbors = list(set(G.neighbors(node)) - set(visited))
        if len(neighbors) == 0:
            # Dead end: jump to any node that has not been visited yet
            neighbors = list(set(G.nodes()) - set(visited))
        # Choose the new node randomly
        node = random.choice(neighbors)
        # Add the node to the subgraph
        visited.append(node)

    # Add the generated subgraph to the list (with optional density).
    # This must happen once per completed walk, outside the inner loop:
    # appending on every step would store the same list object repeatedly.
    #if nx.density(nx.subgraph(G, visited)) > 0.6 or nx.density(nx.subgraph(G, visited)) < 0.4:
    #    print(nx.density(nx.subgraph(G, visited)))
    #    print(visited)
    subgraphs.append(visited)

rel_scores      = []
rel_scores_norm = []
rel_scores_sigm = []
acc_scores      = []

perc_entities   = []

# Lists used to evaluate the accuracy on a subgraph
pos_subgraph_triples_sub      = []
diff_rel_subgraph_triples_sub = []
neg_subgraph_triples_sub      = []

# Lists to keep track of all the generated triples
pos_subgraph_triples_tot      = []
diff_rel_subgraph_triples_tot = []
neg_subgraph_triples_tot      = []

c = 0

for s in subgraphs:

    # Lists of the generated triples of the current subgraph
    pos_subgraph_triples      = []
    diff_rel_subgraph_triples = []
    neg_subgraph_triples      = []

    # Compute all the combinations of e-r-e
    for comb in itertools.combinations(s, 2):
        # Check if exists at least one relation between the current pair of entities
        # (get_edge_data(u, v) only sees the edges stored from u to v)
        if G.get_edge_data(comb[0], comb[1]) is not None:
            # Save all the relations between the current pair of entities
            edge_data = [value['label'] for key, value in G[comb[0]][comb[1]].items()]
            # Consider the remaining relations with the set difference
            diff_relations = set(relations) - set(edge_data)
            # Add all the 'positive' triples
            for r in edge_data:
                if [comb[0], r, comb[1]] not in pos_subgraph_triples:
                    pos_subgraph_triples.append([comb[0], r, comb[1]])
            # Add all the 'different relation' triples
            for r in diff_relations:
                diff_rel_subgraph_triples.append([comb[0], r, comb[1]])
        else:
            # If there are no relations between the current pair of entities
            # Add all the 'negative' triples
            for r in relations:
                neg_subgraph_triples.append([comb[0], r, comb[1]])

    # Accumulate the triples of this subgraph into the global lists
    pos_subgraph_triples_tot.extend(pos_subgraph_triples)
    diff_rel_subgraph_triples_tot.extend(diff_rel_subgraph_triples)
    neg_subgraph_triples_tot.extend(neg_subgraph_triples)

# Compose the train dataset for the LP classifier
# Each row: head embedding + relation embedding + tail embedding + label
ds_pos = []
ds_neg = []
ds_tot = []

for t in pos_subgraph_triples_tot:
    ds_pos.append([*entity2embedding[t[0]], *relation2embedding[t[1]], *entity2embedding[t[2]], 1])
for t in diff_rel_subgraph_triples_tot + neg_subgraph_triples_tot:
    ds_neg.append([*entity2embedding[t[0]], *relation2embedding[t[1]], *entity2embedding[t[2]], 0])

# Keep at most as many negatives as positives
ds_tot = ds_pos + ds_neg[:len(ds_pos)]

#train_dataset = np.array(ds_tot)

#train_dataset = np.array(ds_tot)

## Undirected

In [None]:
# UNDIRECTED GRAPH #

# Wall-clock start, used at the end of the cell to report the total running time
start_time = time.time()

# Number of nodes for each subgraph
k = 30

# Number of subgraphs
h = 100

# List of subgraphs (each one is the list of visited nodes, in visiting order)
subgraphs = []

# The graph is not modified while sampling, so snapshot its nodes once
# instead of rebuilding the list/set on every iteration
graph_nodes = list(G.nodes())
graph_node_set = set(graph_nodes)

# A walk collects k *distinct* nodes, which is impossible if the graph is empty
# or has fewer than k nodes. Fail early with a clear message instead of the
# opaque IndexError that random.choice raises on an empty sequence.
if h > 0 and (not graph_nodes or k > len(graph_nodes)):
    raise ValueError(
        "Cannot sample subgraphs of %d nodes from a graph with %d nodes"
        % (k, len(graph_nodes))
    )

## RANDOM WALK ##

while len(subgraphs) < h:
    # Start from a random node
    node = random.choice(graph_nodes)
    # Visited nodes in visiting order, plus a set mirror for O(1) exclusion
    visited = [node]
    visited_set = {node}
    while len(visited) < k:
        # Unvisited neighbors of the current node
        neighbors = list(set(G.neighbors(node)) - visited_set)
        if not neighbors:
            # Dead end: jump to any unvisited node of the graph, so the
            # resulting node set is not guaranteed to be connected
            neighbors = list(graph_node_set - visited_set)
        # Choose the new node randomly
        node = random.choice(neighbors)
        # Add the node to the subgraph
        visited.append(node)
        visited_set.add(node)

    # Add the generated subgraph to the list (with optional density)
    #if nx.density(nx.subgraph(G, visited)) > 0.6 or nx.density(nx.subgraph(G, visited)) < 0.4:
    #    print(nx.density(nx.subgraph(G, visited)))
    #    print(visited)
    subgraphs.append(visited)

# Per-subgraph score lists, written to stats.csv at the end of the cell.
# Nothing in this cell appends to them directly; presumably reliability() and
# accuracy() fill them as globals -- TODO confirm against their definitions.
rel_scores      = []
rel_scores_norm = []
rel_scores_sigm = []
acc_scores      = []

# Distinct entities that appear in at least one subgraph (first-seen order)
perc_entities   = []
# Set mirror of perc_entities, used only for O(1) membership tests
covered_entities = set()

# Lists used to evaluate the accuracy on a subgraph
pos_subgraph_triples_sub      = []
diff_rel_subgraph_triples_sub = []
neg_subgraph_triples_sub      = []

# Lists to keep track of all the generated triples
pos_subgraph_triples_tot      = []
diff_rel_subgraph_triples_tot = []
neg_subgraph_triples_tot      = []

# The relation vocabulary is the same for every pair: build the set once
all_relations = set(relations)

# Stays 0 when there are no subgraphs; otherwise the 1-based subgraph index
c = 0

for c, s in enumerate(subgraphs, start=1):

    print(f"*** Subgraph {c} ***\n")

    # Lists of the generated triples of the current subgraph
    pos_subgraph_triples      = []
    diff_rel_subgraph_triples = []
    neg_subgraph_triples      = []
    # Positive triples already emitted for this subgraph; parallel edges may
    # carry the same label and must yield a single triple
    seen_pos = set()

    # Compute all the combinations of e-r-e
    for u, v in itertools.combinations(s, 2):
        # Edge data of the pair, or None when the two entities are not linked
        edge_attrs = G.get_edge_data(u, v)
        if edge_attrs is not None:
            # Save all the relations between the current pair of entities
            edge_data = [attrs['label'] for attrs in edge_attrs.values()]
            # Consider the remaining relations with the set difference
            diff_relations = all_relations - set(edge_data)
            # Add all the 'positive' triples (once each)
            for r in edge_data:
                triple_key = (u, r, v)
                if triple_key not in seen_pos:
                    seen_pos.add(triple_key)
                    pos_subgraph_triples.append([u, r, v])
            # Add all the 'different relation' triples
            for r in diff_relations:
                diff_rel_subgraph_triples.append([u, r, v])
        else:
            # If there are no relations between the current pair of entities
            # Add all the 'negative' triples
            for r in relations:
                neg_subgraph_triples.append([u, r, v])

    print(nx.subgraph(G, s))

    reliability(s, pos_subgraph_triples, diff_rel_subgraph_triples, neg_subgraph_triples)
    accuracy(s, pos_subgraph_triples, diff_rel_subgraph_triples, neg_subgraph_triples)

    # Record which entities have been covered so far
    for e in s:
        if e not in covered_entities:
            covered_entities.add(e)
            perc_entities.append(e)
# NOTE: everything between the triple quotes below is a bare string literal,
# so none of it is executed. It holds a disabled variant of the evaluation:
# its first part is indented like the body of the subgraph loop and would
# store every subgraph's triples in the *_sub / *_tot lists; the rest would
# build one balanced dataset (ds_tot) from all subgraphs and then call
# accuracy() once per subgraph with that dataset.
'''
    if pos_subgraph_triples:
        pos_subgraph_triples_sub.append(pos_subgraph_triples)
        pos_subgraph_triples_tot.extend(pos_subgraph_triples)

    if diff_rel_subgraph_triples:
        diff_rel_subgraph_triples_sub.append(diff_rel_subgraph_triples)
        diff_rel_subgraph_triples_tot.extend(diff_rel_subgraph_triples)

    if neg_subgraph_triples:
        neg_subgraph_triples_sub.append(neg_subgraph_triples)
        neg_subgraph_triples_tot.extend(neg_subgraph_triples)

    

# Compose the train dataset for the LP classifier
ds_pos = []
ds_neg = []
ds_tot = []

for t in pos_subgraph_triples_tot:
    ds_pos.append([*entity2embedding[t[0]], *relation2embedding[t[1]], *entity2embedding[t[2]], 1])
for t in diff_rel_subgraph_triples_tot + neg_subgraph_triples_tot:
    ds_neg.append([*entity2embedding[t[0]], *relation2embedding[t[1]], *entity2embedding[t[2]], 0])

ds_tot = ds_pos + ds_neg[:len(ds_pos)]

#train_dataset = np.array(ds_tot)

print(len(subgraphs), len(pos_subgraph_triples_sub), len(diff_rel_subgraph_triples_sub), len(neg_subgraph_triples_sub))

for i in range(0, len(subgraphs)):
    accuracy(ds_tot, pos_subgraph_triples_sub[i], 
             diff_rel_subgraph_triples_sub[i], neg_subgraph_triples_sub[i])

'''


# Entities of the dataset that none of the sampled subgraphs touched.
# Membership is tested against a set so the scan is linear, not quadratic.
sampled_entities = set(perc_entities)
entity_difference = [item for item in entities if item not in sampled_entities]

# Fraction of entities covered, number of missed entities, total entities
print(1 - len(entity_difference) / len(entities), len(entity_difference), len(entities))


# Add this run's per-subgraph scores to stats.csv as four new columns named
# after the dataset, the subgraph size (k) and the number of subgraphs (h).
# stats.csv must already exist, and DataFrame.assign needs every score list
# to have as many entries as the file has rows.
column_prefix = f"{dataset}_{k}_{h}"

df = pd.read_csv('stats.csv', index_col=[0])
df_total = df.assign(**{column_prefix: rel_scores,
                        column_prefix + "_NORM": rel_scores_norm,
                        column_prefix + "_SIGM": rel_scores_sigm,
                        column_prefix + "_ACC": acc_scores})

df_total.to_csv('stats.csv')


print(rel_scores)
print(rel_scores_norm)
print(rel_scores_sigm)
print(acc_scores)


# Total running time of the cell, measured from start_time
print(f"--- {time.time() - start_time} seconds TOTAL ---\n")

*** Subgraph 1 ***

MultiGraph with 30 nodes and 66 edges
Positive:  66
Negative:  528
Different relation: 7236

Mean and std before: -7.5665728222516915, 1.7236334366612514
-2.2953618 -12.984095
Mean and std after: -7.584829915061952, 1.4931137853218681
-4.120717 -11.0112
Reliability of subgraph              -> -8.19
Normalized Reliability of subgraph   -> 0.59
Sigmoid Reliability of subgraph      -> 0.66

Normalized Reliability of subgraph (without diff_rel)   -> 0.57
Normalized Reliability (end)         -> 0.54
Sigmoid Reliability (end)            -> 0.54

--- 1.4900763034820557 seconds RELIABILITY ---

Accuracy of subgraph ->  90.91 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.920455,0.909091,0.00169


*** Subgraph 2 ***

MultiGraph with 30 nodes and 66 edges
Positive:  58
Negative:  536
Different relation: 7236

Mean and std before: -7.567452833813878, 1.7216348836615862
-2.2887826 -12.984095
Mean and std after: -7.587210041173915, 1.4948342099693432
-4.125046 -11.010347
Reliability of subgraph              -> -8.04
Normalized Reliability of subgraph   -> 0.57
Sigmoid Reliability of subgraph      -> 0.62

Normalized Reliability of subgraph (without diff_rel)   -> 0.56
Normalized Reliability (end)         -> 0.55
Sigmoid Reliability (end)            -> 0.55

--- 1.448333978652954 seconds RELIABILITY ---

Accuracy of subgraph ->  89.74 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.948052,0.897436,0.004464


*** Subgraph 3 ***

MultiGraph with 30 nodes and 54 edges
Positive:  47
Negative:  439
Different relation: 7344

Mean and std before: -7.572820797520731, 1.7202739351509841
-2.3283696 -12.984095
Mean and std after: -7.595962994970301, 1.4908436292176892
-4.135048 -11.009111
Reliability of subgraph              -> -8.25
Normalized Reliability of subgraph   -> 0.60
Sigmoid Reliability of subgraph      -> 0.67

Normalized Reliability of subgraph (without diff_rel)   -> 0.59
Normalized Reliability (end)         -> 0.55
Sigmoid Reliability (end)            -> 0.56

--- 1.4844584465026855 seconds RELIABILITY ---

Accuracy of subgraph ->  87.50 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.935484,0.875,0.001187


*** Subgraph 4 ***

MultiGraph with 30 nodes and 67 edges
Positive:  53
Negative:  559
Different relation: 7218

Mean and std before: -7.5544053898917305, 1.7230637318744368
-2.3756173 -12.984095
Mean and std after: -7.56774587597934, 1.503179024139632
-4.120717 -10.997835
Reliability of subgraph              -> -7.86
Normalized Reliability of subgraph   -> 0.55
Sigmoid Reliability of subgraph      -> 0.60

Normalized Reliability of subgraph (without diff_rel)   -> 0.53
Normalized Reliability (end)         -> 0.54
Sigmoid Reliability (end)            -> 0.54

--- 1.4696552753448486 seconds RELIABILITY ---

Accuracy of subgraph ->  94.29 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.957746,0.942857,0.001326


*** Subgraph 5 ***

MultiGraph with 30 nodes and 62 edges
Positive:  55
Negative:  503
Different relation: 7272

Mean and std before: -7.554229512098953, 1.7168238703352705
-2.2953618 -12.693579
Mean and std after: -7.568365239821492, 1.494413354315023
-4.120717 -10.981315
Reliability of subgraph              -> -8.06
Normalized Reliability of subgraph   -> 0.58
Sigmoid Reliability of subgraph      -> 0.64

Normalized Reliability of subgraph (without diff_rel)   -> 0.56
Normalized Reliability (end)         -> 0.54
Sigmoid Reliability (end)            -> 0.55

--- 1.4431085586547852 seconds RELIABILITY ---

Accuracy of subgraph ->  91.89 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.945205,0.918919,0.001433


*** Subgraph 6 ***

MultiGraph with 30 nodes and 66 edges
Positive:  55
Negative:  539
Different relation: 7236

Mean and std before: -7.558314765154326, 1.7231136744349909
-2.2887826 -12.767479
Mean and std after: -7.571810662025212, 1.4979996016738149
-4.1224604 -10.997835
Reliability of subgraph              -> -8.07
Normalized Reliability of subgraph   -> 0.58
Sigmoid Reliability of subgraph      -> 0.64

Normalized Reliability of subgraph (without diff_rel)   -> 0.57
Normalized Reliability (end)         -> 0.55
Sigmoid Reliability (end)            -> 0.55

--- 1.8083300590515137 seconds RELIABILITY ---

Accuracy of subgraph ->  94.59 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.945205,0.945946,0.001659


*** Subgraph 7 ***

MultiGraph with 30 nodes and 62 edges
Positive:  51
Negative:  507
Different relation: 7272

Mean and std before: -7.565183676096062, 1.7177176543145218
-2.2953618 -12.777033
Mean and std after: -7.584417513809995, 1.487327820630836
-4.1361117 -10.996696
Reliability of subgraph              -> -8.09
Normalized Reliability of subgraph   -> 0.58
Sigmoid Reliability of subgraph      -> 0.63

Normalized Reliability of subgraph (without diff_rel)   -> 0.57
Normalized Reliability (end)         -> 0.54
Sigmoid Reliability (end)            -> 0.54

--- 1.4545066356658936 seconds RELIABILITY ---

Accuracy of subgraph ->  91.18 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.941176,0.911765,0.00125


*** Subgraph 8 ***

MultiGraph with 30 nodes and 54 edges
Positive:  50
Negative:  436
Different relation: 7344

Mean and std before: -7.572596980450558, 1.723106495541171
-2.2887826 -12.984095
Mean and std after: -7.592112213369177, 1.49382541314091
-4.1353984 -11.012741
Reliability of subgraph              -> -8.56
Normalized Reliability of subgraph   -> 0.65
Sigmoid Reliability of subgraph      -> 0.73

Normalized Reliability of subgraph (without diff_rel)   -> 0.63
Normalized Reliability (end)         -> 0.57
Sigmoid Reliability (end)            -> 0.57

--- 1.4251155853271484 seconds RELIABILITY ---

Accuracy of subgraph ->  90.91 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.955224,0.909091,0.001304


*** Subgraph 9 ***

MultiGraph with 30 nodes and 68 edges
Positive:  68
Negative:  544
Different relation: 7218

Mean and std before: -7.5614540803782635, 1.724785862952447
-2.2953618 -12.984095
Mean and std after: -7.5787709577539655, 1.4983635657037822
-4.1133337 -11.01091
Reliability of subgraph              -> -7.97
Normalized Reliability of subgraph   -> 0.56
Sigmoid Reliability of subgraph      -> 0.60

Normalized Reliability of subgraph (without diff_rel)   -> 0.54
Normalized Reliability (end)         -> 0.53
Sigmoid Reliability (end)            -> 0.52

--- 1.4081947803497314 seconds RELIABILITY ---

Accuracy of subgraph ->  93.33 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.923077,0.933333,0.00466


*** Subgraph 10 ***

MultiGraph with 30 nodes and 52 edges
Positive:  49
Negative:  437
Different relation: 7344

Mean and std before: -7.559560813331361, 1.7236005487673571
-2.2953618 -12.902369
Mean and std after: -7.572624798761522, 1.4996726807532739
-4.1133337 -10.999145
Reliability of subgraph              -> -8.29
Normalized Reliability of subgraph   -> 0.61
Sigmoid Reliability of subgraph      -> 0.69

Normalized Reliability of subgraph (without diff_rel)   -> 0.60
Normalized Reliability (end)         -> 0.55
Sigmoid Reliability (end)            -> 0.55

--- 1.4366211891174316 seconds RELIABILITY ---

Accuracy of subgraph ->  90.91 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.938462,0.909091,0.001229


*** Subgraph 11 ***

MultiGraph with 30 nodes and 64 edges
Positive:  60
Negative:  516
Different relation: 7254

Mean and std before: -7.571879088010824, 1.7195605959774662
-2.3756173 -12.984095
Mean and std after: -7.59108464623959, 1.4909920576969256
-4.134748 -11.0047035
Reliability of subgraph              -> -7.94
Normalized Reliability of subgraph   -> 0.56
Sigmoid Reliability of subgraph      -> 0.60

Normalized Reliability of subgraph (without diff_rel)   -> 0.54
Normalized Reliability (end)         -> 0.54
Sigmoid Reliability (end)            -> 0.52

--- 1.4897172451019287 seconds RELIABILITY ---

Accuracy of subgraph ->  87.50 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.95,0.875,0.001455


*** Subgraph 12 ***

MultiGraph with 30 nodes and 60 edges
Positive:  54
Negative:  486
Different relation: 7290

Mean and std before: -7.58448211230233, 1.720048990191651
-2.2887826 -12.984095
Mean and std after: -7.602490713787579, 1.49265335505204
-4.146079 -11.021315
Reliability of subgraph              -> -8.33
Normalized Reliability of subgraph   -> 0.61
Sigmoid Reliability of subgraph      -> 0.68

Normalized Reliability of subgraph (without diff_rel)   -> 0.59
Normalized Reliability (end)         -> 0.55
Sigmoid Reliability (end)            -> 0.55

--- 1.6937007904052734 seconds RELIABILITY ---

Accuracy of subgraph ->  88.89 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.944444,0.888889,0.001273


*** Subgraph 13 ***

MultiGraph with 30 nodes and 54 edges
Positive:  49
Negative:  437
Different relation: 7344

Mean and std before: -7.555465779176617, 1.7229281113825785
-2.2887826 -12.984095
Mean and std after: -7.574519599093932, 1.499352454540951
-4.1133337 -10.988977
Reliability of subgraph              -> -8.38
Normalized Reliability of subgraph   -> 0.62
Sigmoid Reliability of subgraph      -> 0.70

Normalized Reliability of subgraph (without diff_rel)   -> 0.61
Normalized Reliability (end)         -> 0.56
Sigmoid Reliability (end)            -> 0.55

--- 1.4327514171600342 seconds RELIABILITY ---

Accuracy of subgraph ->  90.91 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.953846,0.909091,0.003846


*** Subgraph 14 ***

MultiGraph with 30 nodes and 60 edges
Positive:  54
Negative:  504
Different relation: 7272

Mean and std before: -7.5657385012503635, 1.7205996118208031
-2.2887826 -12.984095
Mean and std after: -7.576900482370007, 1.4966048525216553
-4.125046 -10.997835
Reliability of subgraph              -> -8.27
Normalized Reliability of subgraph   -> 0.61
Sigmoid Reliability of subgraph      -> 0.69

Normalized Reliability of subgraph (without diff_rel)   -> 0.59
Normalized Reliability (end)         -> 0.56
Sigmoid Reliability (end)            -> 0.56

--- 1.46537184715271 seconds RELIABILITY ---

Accuracy of subgraph ->  86.11 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.930556,0.861111,0.001455


*** Subgraph 15 ***

MultiGraph with 30 nodes and 65 edges
Positive:  53
Negative:  523
Different relation: 7254

Mean and std before: -7.561172747337955, 1.724182256067789
-2.2887826 -12.984095
Mean and std after: -7.577130641896281, 1.5011751921184695
-4.1133337 -11.009111
Reliability of subgraph              -> -8.07
Normalized Reliability of subgraph   -> 0.58
Sigmoid Reliability of subgraph      -> 0.65

Normalized Reliability of subgraph (without diff_rel)   -> 0.56
Normalized Reliability (end)         -> 0.55
Sigmoid Reliability (end)            -> 0.54

--- 1.432107925415039 seconds RELIABILITY ---

Accuracy of subgraph ->  88.57 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.943662,0.885714,0.001326


*** Subgraph 16 ***

MultiGraph with 30 nodes and 62 edges
Positive:  61
Negative:  497
Different relation: 7272

Mean and std before: -7.578237464662408, 1.7188070402271924
-2.2887826 -12.984095
Mean and std after: -7.593580716660233, 1.491322222973778
-4.1429634 -11.0112
Reliability of subgraph              -> -8.40
Normalized Reliability of subgraph   -> 0.62
Sigmoid Reliability of subgraph      -> 0.69

Normalized Reliability of subgraph (without diff_rel)   -> 0.61
Normalized Reliability (end)         -> 0.56
Sigmoid Reliability (end)            -> 0.57

--- 1.5041205883026123 seconds RELIABILITY ---

Accuracy of subgraph ->  87.80 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.950617,0.878049,0.001453


*** Subgraph 17 ***

MultiGraph with 30 nodes and 60 edges
Positive:  59
Negative:  481
Different relation: 7290

Mean and std before: -7.576324632310929, 1.7192136256035992
-2.2887826 -12.984095
Mean and std after: -7.595345353766027, 1.4879498891491176
-4.1429634 -11.0112
Reliability of subgraph              -> -8.52
Normalized Reliability of subgraph   -> 0.64
Sigmoid Reliability of subgraph      -> 0.72

Normalized Reliability of subgraph (without diff_rel)   -> 0.62
Normalized Reliability (end)         -> 0.55
Sigmoid Reliability (end)            -> 0.57

--- 1.4653465747833252 seconds RELIABILITY ---

Accuracy of subgraph ->  89.74 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.949367,0.897436,0.001418


*** Subgraph 18 ***

MultiGraph with 30 nodes and 64 edges
Positive:  51
Negative:  525
Different relation: 7254

Mean and std before: -7.566346691973212, 1.7151386914669577
-2.3275867 -12.902369
Mean and std after: -7.5812575573562295, 1.4895088026144725
-4.1429634 -10.988649
Reliability of subgraph              -> -7.62
Normalized Reliability of subgraph   -> 0.52
Sigmoid Reliability of subgraph      -> 0.54

Normalized Reliability of subgraph (without diff_rel)   -> 0.50
Normalized Reliability (end)         -> 0.53
Sigmoid Reliability (end)            -> 0.52

--- 1.7088205814361572 seconds RELIABILITY ---

Accuracy of subgraph ->  91.18 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.970588,0.911765,0.001296


*** Subgraph 19 ***

MultiGraph with 30 nodes and 56 edges
Positive:  45
Negative:  459
Different relation: 7326

Mean and std before: -7.582086111728868, 1.7181035740783255
-2.2887826 -12.984095
Mean and std after: -7.598682907266579, 1.48858832370427
-4.146079 -11.012741
Reliability of subgraph              -> -8.18
Normalized Reliability of subgraph   -> 0.59
Sigmoid Reliability of subgraph      -> 0.66

Normalized Reliability of subgraph (without diff_rel)   -> 0.58
Normalized Reliability (end)         -> 0.56
Sigmoid Reliability (end)            -> 0.57

--- 1.5583231449127197 seconds RELIABILITY ---

Accuracy of subgraph ->  90.00 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.95,0.9,0.001142


*** Subgraph 20 ***

MultiGraph with 30 nodes and 60 edges
Positive:  53
Negative:  487
Different relation: 7290

Mean and std before: -7.570358909927261, 1.7181205762698766
-2.2887826 -12.984095
Mean and std after: -7.584973345277768, 1.4882048477115972
-4.1353984 -10.996696
Reliability of subgraph              -> -8.28
Normalized Reliability of subgraph   -> 0.61
Sigmoid Reliability of subgraph      -> 0.69

Normalized Reliability of subgraph (without diff_rel)   -> 0.59
Normalized Reliability (end)         -> 0.56
Sigmoid Reliability (end)            -> 0.56

--- 1.4557161331176758 seconds RELIABILITY ---

Accuracy of subgraph ->  88.57 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.943662,0.885714,0.001417


*** Subgraph 21 ***

MultiGraph with 30 nodes and 58 edges
Positive:  48
Negative:  474
Different relation: 7308

Mean and std before: -7.554705270283676, 1.7225888862376868
-2.3756173 -12.984095
Mean and std after: -7.570136273713094, 1.4990598391674295
-4.11405 -10.997835
Reliability of subgraph              -> -7.88
Normalized Reliability of subgraph   -> 0.55
Sigmoid Reliability of subgraph      -> 0.60

Normalized Reliability of subgraph (without diff_rel)   -> 0.54
Normalized Reliability (end)         -> 0.55
Sigmoid Reliability (end)            -> 0.54

--- 1.4898126125335693 seconds RELIABILITY ---

Accuracy of subgraph ->  90.62 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.9375,0.90625,0.001252


*** Subgraph 22 ***

MultiGraph with 30 nodes and 66 edges
Positive:  64
Negative:  530
Different relation: 7236

Mean and std before: -7.55925888473199, 1.7198050283748496
-2.2887826 -12.902369
Mean and std after: -7.571937921033118, 1.4994048236268218
-4.120717 -10.997835
Reliability of subgraph              -> -8.21
Normalized Reliability of subgraph   -> 0.60
Sigmoid Reliability of subgraph      -> 0.66

Normalized Reliability of subgraph (without diff_rel)   -> 0.59
Normalized Reliability (end)         -> 0.55
Sigmoid Reliability (end)            -> 0.56

--- 1.4273021221160889 seconds RELIABILITY ---

Accuracy of subgraph ->  90.70 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.952941,0.906977,0.001658


*** Subgraph 23 ***

MultiGraph with 30 nodes and 62 edges
Positive:  51
Negative:  489
Different relation: 7290

Mean and std before: -7.565819150369286, 1.7229186753456036
-2.2887826 -12.902369
Mean and std after: -7.580957858070657, 1.500723068542174
-4.122908 -11.0112
Reliability of subgraph              -> -8.02
Normalized Reliability of subgraph   -> 0.57
Sigmoid Reliability of subgraph      -> 0.62

Normalized Reliability of subgraph (without diff_rel)   -> 0.56
Normalized Reliability (end)         -> 0.55
Sigmoid Reliability (end)            -> 0.54

--- 1.43174409866333 seconds RELIABILITY ---

Accuracy of subgraph ->  91.18 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.970588,0.911765,0.001323


*** Subgraph 24 ***

MultiGraph with 30 nodes and 58 edges
Positive:  54
Negative:  468
Different relation: 7308

Mean and std before: -7.558916658886243, 1.7231880226678626
-2.2887826 -12.767479
Mean and std after: -7.578138359959954, 1.4987305982233898
-4.1133337 -11.0047035
Reliability of subgraph              -> -7.93
Normalized Reliability of subgraph   -> 0.56
Sigmoid Reliability of subgraph      -> 0.59

Normalized Reliability of subgraph (without diff_rel)   -> 0.54
Normalized Reliability (end)         -> 0.53
Sigmoid Reliability (end)            -> 0.52

--- 1.761763334274292 seconds RELIABILITY ---

Accuracy of subgraph ->  88.89 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.944444,0.888889,0.001365


*** Subgraph 25 ***

MultiGraph with 30 nodes and 58 edges
Positive:  57
Negative:  465
Different relation: 7308

Mean and std before: -7.5572871348867015, 1.7165205470947562
-2.3756173 -12.697769
Mean and std after: -7.570014304876584, 1.4987339904316415
-4.126313 -10.988649
Reliability of subgraph              -> -8.18
Normalized Reliability of subgraph   -> 0.59
Sigmoid Reliability of subgraph      -> 0.65

Normalized Reliability of subgraph (without diff_rel)   -> 0.58
Normalized Reliability (end)         -> 0.54
Sigmoid Reliability (end)            -> 0.54

--- 1.455503225326538 seconds RELIABILITY ---

Accuracy of subgraph ->  86.84 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.947368,0.868421,0.00226


*** Subgraph 26 ***

MultiGraph with 30 nodes and 58 edges
Positive:  50
Negative:  472
Different relation: 7308

Mean and std before: -7.563360340841886, 1.720291945706821
-2.2887826 -12.697769
Mean and std after: -7.577074145310032, 1.4980335048821838
-4.122908 -10.999145
Reliability of subgraph              -> -8.17
Normalized Reliability of subgraph   -> 0.59
Sigmoid Reliability of subgraph      -> 0.66

Normalized Reliability of subgraph (without diff_rel)   -> 0.58
Normalized Reliability (end)         -> 0.55
Sigmoid Reliability (end)            -> 0.55

--- 1.470276117324829 seconds RELIABILITY ---

Accuracy of subgraph ->  93.94 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.955224,0.939394,0.001276


*** Subgraph 27 ***

MultiGraph with 30 nodes and 66 edges
Positive:  47
Negative:  547
Different relation: 7236

Mean and std before: -7.5729161805424505, 1.720061101912742
-2.2887826 -12.902369
Mean and std after: -7.590106372241767, 1.4970978863458404
-4.134748 -11.012741
Reliability of subgraph              -> -7.72
Normalized Reliability of subgraph   -> 0.53
Sigmoid Reliability of subgraph      -> 0.56

Normalized Reliability of subgraph (without diff_rel)   -> 0.51
Normalized Reliability (end)         -> 0.55
Sigmoid Reliability (end)            -> 0.53

--- 1.4592030048370361 seconds RELIABILITY ---

Accuracy of subgraph ->  93.75 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.935484,0.9375,0.002301


*** Subgraph 28 ***

MultiGraph with 30 nodes and 61 edges
Positive:  52
Negative:  506
Different relation: 7272

Mean and std before: -7.553053857053072, 1.724060875302918
-2.2887826 -12.693579
Mean and std after: -7.569867066478216, 1.500733441554159
-4.107461 -10.997835
Reliability of subgraph              -> -8.27
Normalized Reliability of subgraph   -> 0.61
Sigmoid Reliability of subgraph      -> 0.69

Normalized Reliability of subgraph (without diff_rel)   -> 0.59
Normalized Reliability (end)         -> 0.56
Sigmoid Reliability (end)            -> 0.55

--- 1.4436726570129395 seconds RELIABILITY ---

Accuracy of subgraph ->  94.29 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.956522,0.942857,0.001304


*** Subgraph 29 ***

MultiGraph with 30 nodes and 62 edges
Positive:  57
Negative:  501
Different relation: 7272

Mean and std before: -7.561984390987138, 1.7184883296015088
-2.2953618 -12.693579
Mean and std after: -7.58011715053582, 1.491784411229088
-4.126313 -10.997835
Reliability of subgraph              -> -8.16
Normalized Reliability of subgraph   -> 0.59
Sigmoid Reliability of subgraph      -> 0.66

Normalized Reliability of subgraph (without diff_rel)   -> 0.58
Normalized Reliability (end)         -> 0.55
Sigmoid Reliability (end)            -> 0.56

--- 1.4337337017059326 seconds RELIABILITY ---

Accuracy of subgraph ->  89.47 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.947368,0.894737,0.001323


*** Subgraph 30 ***

MultiGraph with 30 nodes and 60 edges
Positive:  52
Negative:  488
Different relation: 7290

Mean and std before: -7.57529149975052, 1.7114179548134445
-2.2953618 -12.984095
Mean and std after: -7.5898998386200915, 1.482287193167191
-4.1675663 -10.988977
Reliability of subgraph              -> -7.99
Normalized Reliability of subgraph   -> 0.56
Sigmoid Reliability of subgraph      -> 0.61

Normalized Reliability of subgraph (without diff_rel)   -> 0.55
Normalized Reliability (end)         -> 0.55
Sigmoid Reliability (end)            -> 0.54

--- 1.4237620830535889 seconds RELIABILITY ---

Accuracy of subgraph ->  88.57 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.942029,0.885714,0.001315


*** Subgraph 31 ***

MultiGraph with 30 nodes and 62 edges
Positive:  51
Negative:  507
Different relation: 7272

Mean and std before: -7.568022379442802, 1.7200014910166332
-2.3283696 -12.984095
Mean and std after: -7.587550376663444, 1.4927625796009165
-4.1353984 -11.007502
Reliability of subgraph              -> -7.93
Normalized Reliability of subgraph   -> 0.56
Sigmoid Reliability of subgraph      -> 0.59

Normalized Reliability of subgraph (without diff_rel)   -> 0.54
Normalized Reliability (end)         -> 0.55
Sigmoid Reliability (end)            -> 0.53

--- 1.7776548862457275 seconds RELIABILITY ---

Accuracy of subgraph ->  88.24 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.941176,0.882353,0.001268


*** Subgraph 32 ***

MultiGraph with 30 nodes and 78 edges
Positive:  72
Negative:  630
Different relation: 7128

Mean and std before: -7.574550306264164, 1.7121938042608205
-2.2953618 -12.984095
Mean and std after: -7.597236018051868, 1.4798826916066716
-4.1501994 -10.996696
Reliability of subgraph              -> -7.64
Normalized Reliability of subgraph   -> 0.52
Sigmoid Reliability of subgraph      -> 0.53

Normalized Reliability of subgraph (without diff_rel)   -> 0.50
Normalized Reliability (end)         -> 0.52
Sigmoid Reliability (end)            -> 0.49

--- 1.414686679840088 seconds RELIABILITY ---

Accuracy of subgraph ->  91.67 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.916667,0.916667,0.001636


*** Subgraph 33 ***

MultiGraph with 30 nodes and 64 edges
Positive:  55
Negative:  521
Different relation: 7254

Mean and std before: -7.5652855526594065, 1.7249168955295964
-2.3275867 -12.902369
Mean and std after: -7.581965809175839, 1.5031104408371903
-4.115623 -11.012741
Reliability of subgraph              -> -8.31
Normalized Reliability of subgraph   -> 0.61
Sigmoid Reliability of subgraph      -> 0.67

Normalized Reliability of subgraph (without diff_rel)   -> 0.59
Normalized Reliability (end)         -> 0.55
Sigmoid Reliability (end)            -> 0.54

--- 1.4540469646453857 seconds RELIABILITY ---

Accuracy of subgraph ->  89.19 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.945205,0.891892,0.001386


*** Subgraph 34 ***

MultiGraph with 30 nodes and 76 edges
Positive:  60
Negative:  624
Different relation: 7146

Mean and std before: -7.5711550734509, 1.7191510692536693
-2.2887826 -12.984095
Mean and std after: -7.585386763967507, 1.4948392159817454
-4.1361117 -11.007502
Reliability of subgraph              -> -7.74
Normalized Reliability of subgraph   -> 0.54
Sigmoid Reliability of subgraph      -> 0.57

Normalized Reliability of subgraph (without diff_rel)   -> 0.52
Normalized Reliability (end)         -> 0.54
Sigmoid Reliability (end)            -> 0.54

--- 1.4468584060668945 seconds RELIABILITY ---

Accuracy of subgraph ->  90.00 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.95,0.9,0.00131


*** Subgraph 35 ***

MultiGraph with 30 nodes and 75 edges
Positive:  55
Negative:  593
Different relation: 7182

Mean and std before: -7.564918221002338, 1.7196801501589924
-2.2953618 -12.902369
Mean and std after: -7.579914661108078, 1.495860432431925
-4.131158 -10.999145
Reliability of subgraph              -> -7.76
Normalized Reliability of subgraph   -> 0.53
Sigmoid Reliability of subgraph      -> 0.56

Normalized Reliability of subgraph (without diff_rel)   -> 0.52
Normalized Reliability (end)         -> 0.54
Sigmoid Reliability (end)            -> 0.52

--- 1.413991928100586 seconds RELIABILITY ---

Accuracy of subgraph ->  86.49 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.945205,0.864865,0.00178


*** Subgraph 36 ***

MultiGraph with 30 nodes and 66 edges
Positive:  59
Negative:  535
Different relation: 7236

Mean and std before: -7.570458387293273, 1.7224453206752315
-2.3275867 -12.984095
Mean and std after: -7.589735036189079, 1.4940226034492725
-4.126313 -11.009111
Reliability of subgraph              -> -8.24
Normalized Reliability of subgraph   -> 0.60
Sigmoid Reliability of subgraph      -> 0.68

Normalized Reliability of subgraph (without diff_rel)   -> 0.59
Normalized Reliability (end)         -> 0.56
Sigmoid Reliability (end)            -> 0.57

--- 1.4425663948059082 seconds RELIABILITY ---

Accuracy of subgraph ->  82.05 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.924051,0.820513,0.00351


*** Subgraph 37 ***

MultiGraph with 30 nodes and 56 edges
Positive:  47
Negative:  457
Different relation: 7326

Mean and std before: -7.558150277192565, 1.713981957235839
-2.2887826 -12.693579
Mean and std after: -7.573016112786467, 1.490677600083483
-4.131158 -10.984157
Reliability of subgraph              -> -8.41
Normalized Reliability of subgraph   -> 0.63
Sigmoid Reliability of subgraph      -> 0.71

Normalized Reliability of subgraph (without diff_rel)   -> 0.61
Normalized Reliability (end)         -> 0.57
Sigmoid Reliability (end)            -> 0.57

--- 1.7197051048278809 seconds RELIABILITY ---

Accuracy of subgraph ->  90.62 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.935484,0.90625,0.004796


*** Subgraph 38 ***

MultiGraph with 30 nodes and 58 edges
Positive:  45
Negative:  477
Different relation: 7308

Mean and std before: -7.5668834257247655, 1.7187382129248052
-2.2887826 -12.984095
Mean and std after: -7.582479370751175, 1.4882690221520483
-4.133936 -10.988977
Reliability of subgraph              -> -8.14
Normalized Reliability of subgraph   -> 0.59
Sigmoid Reliability of subgraph      -> 0.65

Normalized Reliability of subgraph (without diff_rel)   -> 0.57
Normalized Reliability (end)         -> 0.55
Sigmoid Reliability (end)            -> 0.55

--- 1.5280582904815674 seconds RELIABILITY ---

Accuracy of subgraph ->  86.67 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.933333,0.866667,0.002004


*** Subgraph 39 ***

MultiGraph with 30 nodes and 68 edges
Positive:  59
Negative:  553
Different relation: 7218

Mean and std before: -7.561181636392537, 1.7146290398216728
-2.2887826 -12.693579
Mean and std after: -7.580553886840447, 1.4913563799548455
-4.133936 -10.988649
Reliability of subgraph              -> -8.33
Normalized Reliability of subgraph   -> 0.62
Sigmoid Reliability of subgraph      -> 0.70

Normalized Reliability of subgraph (without diff_rel)   -> 0.60
Normalized Reliability (end)         -> 0.56
Sigmoid Reliability (end)            -> 0.57

--- 1.4698100090026855 seconds RELIABILITY ---

Accuracy of subgraph ->  89.74 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.949367,0.897436,0.001446


*** Subgraph 40 ***

MultiGraph with 30 nodes and 70 edges
Positive:  65
Negative:  565
Different relation: 7200

Mean and std before: -7.558369579747566, 1.7260397244305559
-2.3275867 -12.984095
Mean and std after: -7.57190430588769, 1.501166599240502
-4.11405 -11.009111
Reliability of subgraph              -> -7.99
Normalized Reliability of subgraph   -> 0.57
Sigmoid Reliability of subgraph      -> 0.62

Normalized Reliability of subgraph (without diff_rel)   -> 0.55
Normalized Reliability (end)         -> 0.54
Sigmoid Reliability (end)            -> 0.54

--- 1.5327346324920654 seconds RELIABILITY ---

Accuracy of subgraph ->  88.37 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.942529,0.883721,0.003744


*** Subgraph 41 ***

MultiGraph with 30 nodes and 58 edges
Positive:  55
Negative:  467
Different relation: 7308

Mean and std before: -7.5738677567753605, 1.7167790975788646
-2.3756173 -12.902369
Mean and std after: -7.590448238150821, 1.4894964436412537
-4.1429634 -11.0047035
Reliability of subgraph              -> -8.48
Normalized Reliability of subgraph   -> 0.64
Sigmoid Reliability of subgraph      -> 0.72

Normalized Reliability of subgraph (without diff_rel)   -> 0.62
Normalized Reliability (end)         -> 0.56
Sigmoid Reliability (end)            -> 0.56

--- 1.4918344020843506 seconds RELIABILITY ---

Accuracy of subgraph ->  89.19 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.945205,0.891892,0.001369


*** Subgraph 42 ***

MultiGraph with 30 nodes and 69 edges
Positive:  62
Negative:  568
Different relation: 7200

Mean and std before: -7.572628436874155, 1.719520934593408
-2.2887826 -12.984095
Mean and std after: -7.58839781993698, 1.494973419566564
-4.134748 -11.0112
Reliability of subgraph              -> -7.92
Normalized Reliability of subgraph   -> 0.56
Sigmoid Reliability of subgraph      -> 0.62

Normalized Reliability of subgraph (without diff_rel)   -> 0.55
Normalized Reliability (end)         -> 0.54
Sigmoid Reliability (end)            -> 0.55

--- 1.5087261199951172 seconds RELIABILITY ---

Accuracy of subgraph ->  82.93 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.939759,0.829268,0.001484


*** Subgraph 43 ***

MultiGraph with 30 nodes and 68 edges
Positive:  53
Negative:  559
Different relation: 7218

Mean and std before: -7.574701076418656, 1.7208808563869942
-2.2953618 -12.984095
Mean and std after: -7.58805274271289, 1.4944467795253142
-4.133936 -11.012741
Reliability of subgraph              -> -8.23
Normalized Reliability of subgraph   -> 0.60
Sigmoid Reliability of subgraph      -> 0.67

Normalized Reliability of subgraph (without diff_rel)   -> 0.59
Normalized Reliability (end)         -> 0.56
Sigmoid Reliability (end)            -> 0.56

--- 1.5120863914489746 seconds RELIABILITY ---

Accuracy of subgraph ->  85.71 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.943662,0.857143,0.001328


*** Subgraph 44 ***

MultiGraph with 30 nodes and 58 edges
Positive:  58
Negative:  464
Different relation: 7308

Mean and std before: -7.579244324561127, 1.7164579443122499
-2.2953618 -12.902369
Mean and std after: -7.59990294911248, 1.4844215564366516
-4.149559 -11.0112
Reliability of subgraph              -> -8.19
Normalized Reliability of subgraph   -> 0.59
Sigmoid Reliability of subgraph      -> 0.64

Normalized Reliability of subgraph (without diff_rel)   -> 0.57
Normalized Reliability (end)         -> 0.54
Sigmoid Reliability (end)            -> 0.53

--- 1.4726099967956543 seconds RELIABILITY ---

Accuracy of subgraph ->  89.74 %

trained Linear SVM in 0.01 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.948052,0.897436,0.008716


*** Subgraph 45 ***

MultiGraph with 30 nodes and 62 edges
Positive:  57
Negative:  501
Different relation: 7272

Mean and std before: -7.5684886873301265, 1.712725087121177
-2.2887826 -12.693579
Mean and std after: -7.584546814822021, 1.4850545360061191
-4.1434093 -10.988977
Reliability of subgraph              -> -8.39
Normalized Reliability of subgraph   -> 0.62
Sigmoid Reliability of subgraph      -> 0.70

Normalized Reliability of subgraph (without diff_rel)   -> 0.60
Normalized Reliability (end)         -> 0.55
Sigmoid Reliability (end)            -> 0.56

--- 1.4457933902740479 seconds RELIABILITY ---

Accuracy of subgraph ->  89.47 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.947368,0.894737,0.001325


*** Subgraph 46 ***

MultiGraph with 30 nodes and 79 edges
Positive:  72
Negative:  630
Different relation: 7128

Mean and std before: -7.577726939233022, 1.7162673653342222
-2.3275867 -12.984095
Mean and std after: -7.599549440918092, 1.4842849690100204
-4.149559 -11.007502
Reliability of subgraph              -> -8.00
Normalized Reliability of subgraph   -> 0.57
Sigmoid Reliability of subgraph      -> 0.62

Normalized Reliability of subgraph (without diff_rel)   -> 0.55
Normalized Reliability (end)         -> 0.54
Sigmoid Reliability (end)            -> 0.52

--- 1.4180469512939453 seconds RELIABILITY ---

Accuracy of subgraph ->  93.75 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.9375,0.9375,0.001858


*** Subgraph 47 ***

MultiGraph with 30 nodes and 66 edges
Positive:  66
Negative:  528
Different relation: 7236

Mean and std before: -7.578863381182401, 1.713357216276467
-2.2953618 -12.984095
Mean and std after: -7.597417828664244, 1.4864325151414284
-4.1758533 -11.0047035
Reliability of subgraph              -> -8.07
Normalized Reliability of subgraph   -> 0.57
Sigmoid Reliability of subgraph      -> 0.61

Normalized Reliability of subgraph (without diff_rel)   -> 0.55
Normalized Reliability (end)         -> 0.53
Sigmoid Reliability (end)            -> 0.52

--- 1.4129817485809326 seconds RELIABILITY ---

Accuracy of subgraph ->  93.18 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.943182,0.931818,0.00437


*** Subgraph 48 ***

MultiGraph with 30 nodes and 61 edges
Positive:  57
Negative:  501
Different relation: 7272

Mean and std before: -7.573407847062229, 1.7209813134095693
-2.2887826 -12.984095
Mean and std after: -7.591414150917549, 1.4952454310402326
-4.133936 -11.009111
Reliability of subgraph              -> -8.19
Normalized Reliability of subgraph   -> 0.59
Sigmoid Reliability of subgraph      -> 0.66

Normalized Reliability of subgraph (without diff_rel)   -> 0.58
Normalized Reliability (end)         -> 0.54
Sigmoid Reliability (end)            -> 0.55

--- 1.4233744144439697 seconds RELIABILITY ---

Accuracy of subgraph ->  86.84 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.947368,0.868421,0.001116


*** Subgraph 49 ***

MultiGraph with 30 nodes and 66 edges
Positive:  58
Negative:  536
Different relation: 7236

Mean and std before: -7.5627427065205, 1.722962014845065
-2.3275867 -12.902369
Mean and std after: -7.584448218778315, 1.4957377715820006
-4.120717 -11.0047035
Reliability of subgraph              -> -7.85
Normalized Reliability of subgraph   -> 0.55
Sigmoid Reliability of subgraph      -> 0.59

Normalized Reliability of subgraph (without diff_rel)   -> 0.53
Normalized Reliability (end)         -> 0.53
Sigmoid Reliability (end)            -> 0.52

--- 1.5341370105743408 seconds RELIABILITY ---

Accuracy of subgraph ->  94.87 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.974026,0.948718,0.00136


*** Subgraph 50 ***

MultiGraph with 30 nodes and 80 edges
Positive:  80
Negative:  640
Different relation: 7110

Mean and std before: -7.568826126230174, 1.7204860256619972
-2.3275867 -12.984095
Mean and std after: -7.587929683486418, 1.49226887040162
-4.133936 -11.009111
Reliability of subgraph              -> -7.95
Normalized Reliability of subgraph   -> 0.55
Sigmoid Reliability of subgraph      -> 0.58

Normalized Reliability of subgraph (without diff_rel)   -> 0.53
Normalized Reliability (end)         -> 0.52
Sigmoid Reliability (end)            -> 0.51

--- 1.764153003692627 seconds RELIABILITY ---

Accuracy of subgraph ->  96.23 %

trained Linear SVM in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
0,Linear SVM,0.925234,0.962264,0.002021


0.034120606697115474 39546 40943
[-8.191, -8.039, -8.249, -7.864, -8.061, -8.068, -8.095, -8.562, -7.971, -8.287, -7.938, -8.327, -8.376, -8.274, -8.067, -8.398, -8.522, -7.624, -8.18, -8.285, -7.882, -8.208, -8.016, -7.929, -8.177, -8.173, -7.717, -8.269, -8.164, -7.986, -7.925, -7.644, -8.308, -7.743, -7.759, -8.236, -8.412, -8.142, -8.335, -7.988, -8.477, -7.923, -8.233, -8.193, -8.395, -8.002, -8.069, -8.185, -7.85, -7.949]
[0.593, 0.573, 0.602, 0.55, 0.579, 0.579, 0.581, 0.645, 0.561, 0.61, 0.559, 0.61, 0.623, 0.609, 0.579, 0.624, 0.638, 0.515, 0.592, 0.609, 0.553, 0.599, 0.57, 0.557, 0.593, 0.593, 0.527, 0.607, 0.593, 0.565, 0.556, 0.517, 0.609, 0.535, 0.534, 0.604, 0.627, 0.588, 0.618, 0.568, 0.635, 0.56, 0.601, 0.59, 0.622, 0.569, 0.57, 0.594, 0.548, 0.555]
[0.655, 0.622, 0.673, 0.595, 0.641, 0.638, 0.633, 0.734, 0.598, 0.691, 0.602, 0.679, 0.698, 0.685, 0.646, 0.692, 0.722, 0.539, 0.664, 0.691, 0.597, 0.662, 0.618, 0.593, 0.651, 0.661, 0.558, 0.686, 0.655, 0.612, 0.593, 0.534,

## Undirected BFS

In [None]:
# UNDIRECTED GRAPH #
#
# Samples `h` node sets of size `k` from G, builds the positive /
# different-relation / negative triples for each set, and passes them to
# reliability() and accuracy(). The per-subgraph score lists are then added
# as new columns to stats.csv.

# To obtain the running time of this task
start_time = time.time()

# Number of nodes for each subgraph
k = 30

# Number of subgraphs
h = 100

# List of subgraphs (each entry is a list of k node identifiers)
subgraphs = []

# Every node added to any subgraph so far; used below to steer the expansion
# of later subgraphs towards nodes that were never sampled before.
all_visited = []

## BREADTH-STYLE EXPANSION FROM A RANDOM SEED ##

while len(subgraphs) < h:
    # List of visited nodes
    visited = []
    # Random node
    node = random.choice(list(G.nodes()))
    # Add the node to the subgraph
    visited.append(node)
    all_visited.append(node)
    while len(visited) < k:
        # Neighbors of the chosen entity that no subgraph has used yet
        neighbors = list(set(G.neighbors(node)) - set(all_visited))
        if len(neighbors) == 0:
            # Fallback: every node of G that is not in the current subgraph.
            # These candidates are not required to be adjacent to `node`.
            neighbors = list(set(G.nodes()) - set(visited))
        # Add neighbors to visited nodes (all of them, until k is reached)
        for n in neighbors:
            if n not in visited and len(visited) < k:
                # Add the node to the subgraph
                visited.append(n)
                all_visited.append(n)

        # Continue the expansion from a random node already in the subgraph
        node = random.choice(visited)

    # Add the generated subgraph to the list (with optional density)
    #if nx.density(nx.subgraph(G, visited)) > 0.6 or nx.density(nx.subgraph(G, visited)) < 0.4:
    #    print(nx.density(nx.subgraph(G, visited)))
    #    print(visited)
    subgraphs.append(visited)

# Result lists written to stats.csv at the end of the cell. Nothing in this
# cell appends to them.
# NOTE(review): presumably reliability()/accuracy() fill them as globals - confirm.
rel_scores      = []
rel_scores_norm = []
rel_scores_sigm = []
acc_scores      = []

# Distinct entities covered by at least one subgraph
perc_entities   = []

# Lists used to evaluate the accuracy on a subgraph
# (not read or written anywhere else in this cell)
pos_subgraph_triples_sub      = []
diff_rel_subgraph_triples_sub = []
neg_subgraph_triples_sub      = []

# Lists to keep track of all the generated triples
# (not read or written anywhere else in this cell)
pos_subgraph_triples_tot      = []
diff_rel_subgraph_triples_tot = []
neg_subgraph_triples_tot      = []

# 1-based counter of the subgraph being processed (used only for printing)
c = 0

for s in subgraphs:

    c += 1

    print("*** Subgraph " + str(c) + " ***\n")

    # Lists of the generated triples of the current subgraph
    pos_subgraph_triples      = []
    diff_rel_subgraph_triples = []
    neg_subgraph_triples      = []

    # Compute all the combinations of e-r-e (each unordered pair once)
    for comb in itertools.combinations(s, 2):
        #print(comb)
        # n1/n2 are reset here but never used in this cell
        n1 = n2 = 0
        # Check if exists at least one relation between the current pair of entities
        if G.get_edge_data(comb[0], comb[1]) != None:
            # Save all the relations between the current pair of entities
            # (one 'label' attribute per parallel edge)
            edge_data = [value['label'] for key, value in G[comb[0]][comb[1]].items()]
            # Consider the remaining relations with the set difference
            diff_relations = set(relations) - set(edge_data)
            # Add all the 'positive' triples (duplicates are skipped)
            for r in edge_data:
                if [comb[0], r, comb[1]] not in pos_subgraph_triples:
                    pos_subgraph_triples.append([comb[0], r, comb[1]])
            # Add all the 'different relation' triples
            for r in diff_relations:
                diff_rel_subgraph_triples.append([comb[0], r, comb[1]])
        else:
            # If there are no relations between the current pair of entities
            # Add all the 'negative' triples
            for r in relations:
                neg_subgraph_triples.append([comb[0], r, comb[1]])

    # Summary of the induced subgraph
    print(nx.subgraph(G, s))

    reliability(s, pos_subgraph_triples, diff_rel_subgraph_triples, neg_subgraph_triples)
    accuracy(s, pos_subgraph_triples, diff_rel_subgraph_triples, neg_subgraph_triples)

    # Record which entities have been covered so far
    for e in s:
        if e not in perc_entities:
            perc_entities.append(e)

# Entities that never appeared in any sampled subgraph
entity_difference = [item for item in entities if item not in perc_entities]

# Coverage ratio, number of uncovered entities, total number of entities
print(1 - len(entity_difference) / len(entities), len(entity_difference), len(entities))


# Add one column per score type, named <dataset>_<k>_<h>[_SUFFIX], to stats.csv
df = pd.read_csv('stats.csv', index_col=[0])
df_total = df.assign(**{dataset + "_" + str(k) + "_" + str(h): rel_scores, 
                        dataset + "_" + str(k) + "_" + str(h) + "_NORM": rel_scores_norm, 
                        dataset + "_" + str(k) + "_" + str(h) + "_SIGM": rel_scores_sigm,
                        dataset + "_" + str(k) + "_" + str(h) + "_ACC": acc_scores})

df_total.to_csv('stats.csv')


print(rel_scores)
print(rel_scores_norm)
print(rel_scores_sigm)
print(acc_scores)


print("--- %s seconds TOTAL ---\n" % (time.time() - start_time))

## Directed

In [None]:
# DIRECTED GRAPH #
#
# Samples `h` node sets of size `k` with a random walk on G, builds the
# positive / different-relation / negative triples for both orientations of
# every node pair, and passes them to reliability() and accuracy(). The
# per-subgraph score lists are then added as new columns to stats.csv.

# To obtain the running time of this task
start_time = time.time()

# Number of nodes for each subgraph
k = 30

# Number of subgraphs
h = 50

# List of subgraphs (each entry is a list of k node identifiers)
subgraphs = []

## RANDOM WALK ##

while len(subgraphs) < h:
    # List of visited nodes
    visited = []
    # Random node
    node = random.choice(list(G.nodes()))
    # Add the node to the subgraph
    visited.append(node)
    while len(visited) < k:
        # Neighbors of the chosen entity not yet in this subgraph
        neighbors = list(set(G.neighbors(node)) - set(visited))
        if len(neighbors) == 0:
            # Dead end: jump to any node of G not yet in this subgraph
            neighbors = list(set(G.nodes()) - set(visited))
        # Choose the new node randomly
        node = random.choice(neighbors)
        # Add the node to the subgraph
        visited.append(node)

    # Add the generated subgraph to the list (with optional density)
    #if nx.density(nx.subgraph(G, visited)) > 0.1 or nx.density(nx.subgraph(G, visited)) < 0.06:
    #    print(nx.density(nx.subgraph(G, visited)))
    #    print(visited)
    subgraphs.append(visited)

# Result lists written to stats.csv at the end of the cell. Nothing in this
# cell appends to them.
# NOTE(review): presumably reliability()/accuracy() fill them as globals - confirm.
rel_scores      = []
rel_scores_norm = []
rel_scores_sigm = []
rel_scores_sigm_end = []
acc_scores      = []

# Distinct entities covered by at least one subgraph
perc_entities   = []

# 1-based counter of the subgraph being processed (used only for printing)
c = 0

for s in subgraphs:

    c += 1

    print("*** Subgraph " + str(c) + " ***\n")

    #print(s)
    #print(s[::-1])

    # Triples of the current subgraph, accumulated over both directions
    pos_subgraph_triples      = []
    diff_rel_subgraph_triples = []
    neg_subgraph_triples      = []

    # Only referenced by the commented-out merge below
    edge_data_g   = []
    edge_data_rev = []

    # Compute all the combinations of e-r-e (only in one direction)
    for comb in itertools.combinations(s, 2):
        #print(comb)
        # n1 is incremented below but never read in this cell; n2 is unused
        n1 = n2 = 0
        # Check if exists at least one relation between the current pair of entities
        if G.get_edge_data(comb[0], comb[1]) != None:
            # Save all the relations between the current pair of entities
            edge_data = [value['label'] for key, value in G[comb[0]][comb[1]].items()]
            # Merge the sets
            #edge_data = list(set(edge_data_g).union(set(edge_data_rev)))
            # Consider the remaining relations with the set difference
            diff_relations = set(relations) - set(edge_data)
            # Add all the 'positive' triples (no duplicate check here)
            for r in edge_data:
                n1 += 1
                pos_subgraph_triples.append([comb[0], r, comb[1]])
            # Add all the 'different relation' triples
            for r in diff_relations:
                #n2 += 1
                diff_rel_subgraph_triples.append([comb[0], r, comb[1]])
        else:
            # If there are no relations between the current pair of entities
            # Add all the 'negative' triples
            for r in relations:
                neg_subgraph_triples.append([comb[0], r, comb[1]])

    # Compute all the combinations of e-r-e (in the opposite direction):
    # combinations over the reversed list yields every pair of the first
    # loop with head and tail swapped.
    for comb in itertools.combinations(s[::-1], 2):
        #print(comb)
        n1 = n2 = 0
        # Check if exists at least one relation between the current pair of entities
        if G.get_edge_data(comb[0], comb[1]) != None:
            # Save all the relations between the current pair of entities
            edge_data = [value['label'] for key, value in G[comb[0]][comb[1]].items()]
            # Merge the sets
            #edge_data = list(set(edge_data_g).union(set(edge_data_rev)))
            # Consider the remaining relations with the set difference
            diff_relations = set(relations) - set(edge_data)
            # Add all the 'positive' triples
            for r in edge_data:
                n1 += 1
                pos_subgraph_triples.append([comb[0], r, comb[1]])
            # Add all the 'different relation' triples
            for r in diff_relations:
                #n2 += 1
                diff_rel_subgraph_triples.append([comb[0], r, comb[1]])
        else:
            # If there are no relations between the current pair of entities
            # Add all the 'negative' triples
            for r in relations:
                neg_subgraph_triples.append([comb[0], r, comb[1]])

    # Summary of the induced subgraph
    print(nx.subgraph(G, s))

    reliability(s, pos_subgraph_triples, diff_rel_subgraph_triples, neg_subgraph_triples)
    accuracy(s, pos_subgraph_triples, diff_rel_subgraph_triples, neg_subgraph_triples)

    # Record which entities have been covered so far
    for e in s:
        if e not in perc_entities:
            perc_entities.append(e)

# Entities that never appeared in any sampled subgraph
entity_difference = [item for item in entities if item not in perc_entities]

# Coverage ratio, number of uncovered entities, total number of entities
print(1 - len(entity_difference) / len(entities), len(entity_difference), len(entities))


# Add one column per score type, named <dataset>_<k>_<h>[_SUFFIX], to stats.csv
df = pd.read_csv('stats.csv', index_col=[0])
df_total = df.assign(**{dataset + "_" + str(k) + "_" + str(h): rel_scores, 
                        dataset + "_" + str(k) + "_" + str(h) + "_NORM": rel_scores_norm, 
                        dataset + "_" + str(k) + "_" + str(h) + "_SIGM": rel_scores_sigm,
                        dataset + "_" + str(k) + "_" + str(h) + "_SIGMEND": rel_scores_sigm_end,
                        dataset + "_" + str(k) + "_" + str(h) + "_ACC": acc_scores})

df_total.to_csv('stats.csv')

print("--- %s seconds TOTAL ---\n" % (time.time() - start_time))

# Reliability/Accuracy of the graph

In [None]:
# Add an empty "<dataset>_total" column to stats.csv, then evaluate the
# classifier on the triples of the whole graph.
stats_path = "stats.csv"
total_column = dataset + "_total"

df = pd.read_csv(stats_path)
df[total_column] = ""
df.to_csv(stats_path, index=False)

# The whole-graph reliability call is currently disabled:
#total_reliability(entities, pos_triples, diff_rel_triples, neg_triples)
accuracy(entities, pos_triples, diff_rel_triples, neg_triples)

# Compute scores

In [None]:
# Score every positive / different-relation / negative triple with the model
# and dump the three score columns (truncated to the shortest one) to
# scores.csv.
#
# score_pos, score_diff_rel and score_neg are not created in this cell: they
# must already exist, and each run appends to whatever they already hold.

# Start from a fresh scores.csv (written from an empty DataFrame)
df = pd.DataFrame(list())
df.to_csv("scores.csv")

# Positive triples: one score_hrt() call per [head, relation, tail] triple
for triple in pos_triples:
    hrt_ids = torch.LongTensor([[tf.entity_to_id[triple[0]],
                                 tf.relation_to_id[triple[1]],
                                 tf.entity_to_id[triple[2]]]])
    score_triple = model.score_hrt(hrt_ids)
    score_np = score_triple.detach().numpy()
    score_pos.append(score_np)

# Triples whose entities are linked, but by a different relation
for triple in diff_rel_triples:
    hrt_ids = torch.LongTensor([[tf.entity_to_id[triple[0]],
                                 tf.relation_to_id[triple[1]],
                                 tf.entity_to_id[triple[2]]]])
    score_triple = model.score_hrt(hrt_ids)
    score_np = score_triple.detach().numpy()
    score_diff_rel.append(score_np)

# Negative triples; when there are none, the 'different relation' scores are
# reused (same list object, not a copy)
if len(neg_triples) > 0:
    for triple in neg_triples:
        hrt_ids = torch.LongTensor([[tf.entity_to_id[triple[0]],
                                     tf.relation_to_id[triple[1]],
                                     tf.entity_to_id[triple[2]]]])
        score_triple = model.score_hrt(hrt_ids)
        score_np = score_triple.detach().numpy()
        score_neg.append(score_np)
else:
    score_neg = score_diff_rel

# The three columns must have equal length, so cut them to the shortest list
min_len = min(len(score_pos), len(score_diff_rel), len(score_neg))

score_columns = {'pos': score_pos[:min_len],
                 'diff_rel': score_diff_rel[:min_len],
                 'neg': score_neg[:min_len]}
df = pd.DataFrame(score_columns, columns=['pos','diff_rel','neg'])

# Append the rows, without a header, below the line written above
df.to_csv('scores.csv', mode='a', index=False, header=False)

In [None]:
# Rewrite scores.csv with plain scalar scores: every entry of score_pos /
# score_diff_rel / score_neg is reduced to its [0][0] element.

# Start from a fresh scores.csv (written from an empty DataFrame)
df = pd.DataFrame(list())
df.to_csv("scores.csv")

# Unwrap each stored score array into a single Python number
new_score_pos = [entry.tolist()[0][0] for entry in score_pos]
new_score_diff_rel = [entry.tolist()[0][0] for entry in score_diff_rel]
new_score_neg = [entry.tolist()[0][0] for entry in score_neg]

# The three columns must have equal length, so cut them to the shortest list
min_len = min(len(new_score_pos), len(new_score_diff_rel), len(new_score_neg))

flat_columns = {'pos': new_score_pos[:min_len],
                'diff_rel': new_score_diff_rel[:min_len],
                'neg': new_score_neg[:min_len]}
df = pd.DataFrame(flat_columns, columns=['pos','diff_rel','neg'])

# Append the rows, without a header, below the line written above
df.to_csv('scores.csv', mode='a', index=False, header=False)

# Plot scores

In [None]:
# Plot the score distributions stored in scores.csv and print their mean and
# standard deviation.
import ast


def _cell_to_float(cell):
    """Convert one CSV cell to a float.

    scores.csv is written by two different cells of this notebook: one stores
    each score as a nested array literal (e.g. "[[-7.5]]"), the other as a
    plain number (e.g. "-7.5"). Both forms are accepted here.
    ast.literal_eval only parses literals, so nothing read from the file is
    ever executed (the previous eval() call would run arbitrary expressions).
    """
    value = ast.literal_eval(str(cell).strip())
    # ravel() turns both a scalar and a nested list into a 1-D array
    return float(np.asarray(value, dtype=float).ravel()[0])


# The first line of the file is the one left by the empty DataFrame that
# resets scores.csv, hence skiprows=1.
a = np.loadtxt(r'scores.csv',dtype=str,delimiter=',',skiprows=1,usecols=(0,1,2))

pos = a[:,0]
pos_elem = [_cell_to_float(i) for i in pos]

diff_rel = a[:,1]
diff_rel_elem = [_cell_to_float(i) for i in diff_rel]

neg = a[:,2]
neg_elem = [_cell_to_float(i) for i in neg]

# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# kdeplot + rugplot is the documented replacement - confirm the installed version.
sns.distplot(pos_elem, hist=False, rug=True)
sns.distplot(diff_rel_elem, hist=False, rug=True)
sns.distplot(neg_elem, hist=False, rug=True)
plt.legend(labels=["Positive","Different relation","Negative"], fontsize='small')
plt.xlabel("Score")
plt.show()

print("** Mean **")
print("Positive: " + str(statistics.mean(pos_elem)))
print("Different relation: " + str(statistics.mean(diff_rel_elem)))
print("Negative: " + str(statistics.mean(neg_elem)) + "\n")

print("** Standard deviation **")
print("Positive: " + str(statistics.stdev(pos_elem)))
print("Different relation: " + str(statistics.stdev(diff_rel_elem)))
print("Negative: " + str(statistics.stdev(neg_elem)))


# Evaluate triples

In [None]:
# Rank every relation between the first two entities, and score the single
# triple (entities[0], 'affects', entities[1]) directly with the model.
head_entity = entities[0]
tail_entity = entities[1]
affects_id = tf.relation_to_id['affects']

df = get_relation_prediction_df(
    pipeline_result.model,
    head_entity,
    tail_entity,
    triples_factory=pipeline_result.training,
)

# score_hrt() takes a batch of [head_id, relation_id, tail_id] rows
triple_ids = torch.LongTensor([[tf.entity_to_id[head_entity], affects_id, tf.entity_to_id[tail_entity]]])
score_trip = pipeline_result.model.score_hrt(triple_ids)
result_np = score_trip.detach().numpy()
#df = get_relation_prediction_df(pipeline_result.model, 'a-51', 'a-52', triples_factory=pipeline_result.training)
print(df)

print(affects_id)
#print(score_trip, result_np)

# Evaluation metrics

In [None]:
# Precision@k (fixed k)
def pk(r_list, k):
    """Return a lenient precision over the first ``k`` entries of ``r_list``.

    ``r_list`` is a ranked list of booleans (True = relevant). A position
    among the first ``k`` is credited when it is relevant, or when every
    relevant entry of that window has already been credited, so only
    non-relevant entries ranked ahead of a relevant one lower the score.

    Returns 1 when the window holds no relevant entry, otherwise the credited
    fraction rounded to two decimals.
    """
    window = r_list[:k]
    relevant_in_window = window.count(True)

    # Nothing relevant to find in the window: treated as a perfect score
    if relevant_in_window == 0:
        return 1

    credited = 0
    for flag in window:
        if flag == True or credited >= relevant_in_window:
            credited += 1

    return float(f"{credited / len(window):.2f}")

# Precision@k (variable k)
def variable_pk(r_list):
    """Return precision@k with ``k`` set to the number of relevant entries.

    ``r_list`` is a ranked list of booleans (True = relevant). With ``k``
    relevant entries in the whole list, the result is the share of the first
    ``k`` positions that are relevant, rounded to two decimals.

    Returns 1 when the list holds no relevant entry.
    """
    k = r_list.count(True)

    # Nothing relevant in the list: treated as a perfect score
    if k == 0:
        return 1

    # The former "credit once all k hits are found" branch could never fire
    # inside a window of exactly k items, so the score is simply hits / k.
    hits = r_list[:k].count(True)

    return float("{:.2f}".format(hits / k))


# Mean Reciprocal Rank
def mrr(r_list):
    """Return the reciprocal rank of the first relevant entry of ``r_list``.

    ``r_list`` is a ranked list of booleans (True = relevant). The result is
    1 / (1-based position of the first True), rounded to two decimals.
    Callers sum this value over many lists and divide to obtain the mean.

    Returns 1 when the list holds no relevant entry.
    """
    try:
        first_hit = r_list.index(True)
    except ValueError:
        # No relevant entry: treated as a perfect score, as in pk()
        return 1

    return float("{:.2f}".format(1 / (first_hit + 1)))

# Evaluation of subgraph A

In [None]:
# Rank the relations for every pair 'a-i' / 'a-j' (1 <= i <= j <= 99) and
# check whether relation 'a' comes first.
positives = [0, 0]      # [pairs with 'a' ranked first, pairs with another label first]
all_scores = [0, 0]     # [sum of the 'a' scores, sum of the other scores]
to_append_a = []        # per-pair score credited to 'a'
to_append_b = []        # per-pair score credited to the other relation

for i in range(1,100):
    for j in range(i,100):
        df = get_relation_prediction_df(pipeline_result.model, 'a-' + str(i), 'a-' + str(j), triples_factory=pipeline_result.training)
        labels = df.relation_label.tolist()
        scores = df.score.tolist()

        # Only the first two rows of the prediction table are used
        if labels[0] == 'a':
            positives[0] += 1
            score_a, score_b = scores[0], scores[1]
        else:
            positives[1] += 1
            score_a, score_b = scores[1], scores[0]

        all_scores[0] += score_a
        all_scores[1] += score_b
        to_append_a.append(score_a)
        to_append_b.append(score_b)

In [None]:
# Report precision and average scores for subgraph A, then plot the two
# score distributions.

# Number of scored pairs. The scoring loop (i in 1..99, j in i..99) visits
# 4950 pairs, so the averages use the real count instead of a fixed 5000.
total_pairs = positives[0] + positives[1]

print("True positives/False positives: " + str(positives))
print("Precision: " + str(positives[0]/total_pairs) + "\n")
print("Average score of all 'a' edges: " + str(all_scores[0]/total_pairs))
print("Average score of all 'b' edges: " + str(all_scores[1]/total_pairs) + "\n")

plt.hist(to_append_a, bins=82, label="A")
plt.hist(to_append_b, bins=82, label="B")
plt.legend(loc="upper left")
plt.ylabel("Count")
plt.xlabel("Scores")
plt.title("Distribution of scores in the subgraph A")

# Evaluation of the toy graph

In [None]:
# Rank the relations for every pair inside community 'a' and inside
# community 'b' (1 <= i <= j <= 99) and check whether the community's own
# relation comes first.
positives = [0, 0]      # [pairs with the expected label first, pairs with another label first]
all_scores = [0, 0]     # [sum of the expected-relation scores, sum of the other scores]
to_append_a = []        # per-pair score credited to the expected relation
to_append_b = []        # per-pair score credited to the other relation

for i in range(1,100):
    for j in range(i,100):
        # Same check for both communities; 'a' is processed before 'b'
        for community in ('a', 'b'):
            df = get_relation_prediction_df(pipeline_result.model, community + '-' + str(i), community + '-' + str(j), triples_factory=pipeline_result.training)
            labels = df.relation_label.tolist()
            scores = df.score.tolist()

            # Only the first two rows of the prediction table are used
            if labels[0] == community:
                positives[0] += 1
                expected_score, other_score = scores[0], scores[1]
            else:
                positives[1] += 1
                expected_score, other_score = scores[1], scores[0]

            all_scores[0] += expected_score
            all_scores[1] += other_score
            to_append_a.append(expected_score)
            to_append_b.append(other_score)

In [None]:
# Report precision and average scores for the toy graph, then plot the two
# score distributions.

# Number of scored pairs. The scoring loop (i in 1..99, j in i..99, two
# communities) makes 9900 predictions, so the averages use the real count
# instead of a fixed 10000.
total_pairs = positives[0] + positives[1]

print("True positives/False positives: " + str(positives))
print("Precision: " + str(positives[0]/total_pairs) + "\n")
print("Average score of all positive edges: " + str(all_scores[0]/total_pairs))
print("Average score of all negative edges: " + str(all_scores[1]/total_pairs) + "\n")

# NOTE(review): in this cell to_append_a / to_append_b hold the scores of the
# expected / other relation, so "A" and "B" here mean positive and negative.
plt.hist(to_append_a, bins=82, label="A")
plt.hist(to_append_b, bins=82, label="B")
plt.legend(loc="upper left")
plt.ylabel("Count")
plt.xlabel("Scores")
plt.title("Distribution of scores in the toy graph")

# Evaluation of the UMLS graph

In [None]:
# VERSION 1 #
#
# For every unordered pair of entities, compare each predicted relation with
# the known triples (tf_all) and with the training-set flag reported by the
# prediction table.

positives       = [0, 0] # True positives/False positives
pos_scores      = []
negatives       = [0, 0] # True negatives/False negatives
neg_scores      = []
scores_count    = 0

for e1, head in enumerate(entities):
    for e2 in range(e1 + 1, len(entities)):
        tail = entities[e2]
        df = get_relation_prediction_df(pipeline_result.model, str(head), str(tail), triples_factory=pipeline_result.training)
        labels = df.relation_label.tolist()
        scores = df.score.tolist()
        in_training = df.in_training.tolist()

        for i, label in enumerate(labels):
            # Look the triple up once and reuse the answer for every case
            is_known = [head, label, tail] in tf_all
            was_trained = in_training[i]

            if is_known:
                if was_trained:
                    # Known triple that was part of the training set
                    positives[0] += 1
                    pos_scores.append(scores[i])
                else:
                    # Known triple that the training set did not contain
                    negatives[1] += 1
            elif not was_trained:
                # Unknown triple, absent from the training set as well
                negatives[0] += 1
                neg_scores.append(scores[i])
            scores_count += 1


In [None]:
# VERSION 2 # (with P@k, MRR)
#
# For every unordered pair of entities, mark each predicted relation as
# known / unknown (membership in tf_all), in the order given by the
# prediction table, and accumulate the ranking metrics of that list.

positives       = 0     # Number of positive triples 
pos_scores      = []    # List of positives' scores
negatives       = 0     # Number of negative triples
neg_scores      = []    # List of negatives' scores
pairs_count     = 0     # Total pairs of entities
scores_count    = 0     # Total triples (positives/negatives)
prec_at_3       = 0     # Sum of all P@3
prec_at_5       = 0     # Sum of all P@5
prec_at_10      = 0     # Sum of all P@10
var_prec_at_k   = 0     # Sum of all P@k (with variable k)
mrr_value       = 0     # Sum of MRR

for e1, head in enumerate(entities):
    for e2 in range(e1 + 1, len(entities)):
        tail = entities[e2]
        df = get_relation_prediction_df(pipeline_result.model, str(head), str(tail), triples_factory=pipeline_result.training)
        labels = df.relation_label.tolist()
        scores = df.score.tolist()
        in_training = df.in_training.tolist()

        # One boolean per predicted relation, in table order
        ranking_list = []

        for i, label in enumerate(labels):
            is_positive = [head, label, tail] in tf_all
            if is_positive:
                positives += 1
                pos_scores.append(scores[i])
            else:
                negatives += 1
                neg_scores.append(scores[i])
            ranking_list.append(is_positive)
            scores_count += 1

        pairs_count += 1

        # Per-pair metrics are summed here and averaged when reported
        prec_at_3       += pk(ranking_list, 3)
        prec_at_5       += pk(ranking_list, 5)
        prec_at_10      += pk(ranking_list, 10)
        var_prec_at_k   += variable_pk(ranking_list)
        mrr_value       += mrr(ranking_list)

        


In [None]:
# Print each accumulated metric averaged over the number of entity pairs,
# rounded to five decimals.
#
# Precision / recall and the raw positive / negative counters belong to
# VERSION 1 of the evaluation and are not printed here.

# (label, accumulated sum, text printed after the value)
metric_sums = [
    ("Precision@3", prec_at_3, "\n"),
    ("Precision@5", prec_at_5, "\n"),
    ("Precision@10", prec_at_10, "\n"),
    ("Variable Precision@k", var_prec_at_k, "\n"),
    ("MRR", mrr_value, ""),
]

for metric_label, metric_sum, trailer in metric_sums:
    metric_mean = float("{:.5f}".format(metric_sum / pairs_count))
    print(metric_label + ": " + str(metric_mean) + trailer)

In [None]:
# Histogram (50 bins) of the scores collected in pos_scores
plt.hist(pos_scores, bins=50, label="Pos")
plt.legend(loc="upper left")
plt.ylabel("Count")
plt.xlabel("Scores")
plt.title("Positive triples")

In [None]:
# Histogram (50 bins) of the scores collected in neg_scores
plt.hist(neg_scores, bins=50, label="Neg")
plt.legend(loc="upper left")
plt.ylabel("Count")
plt.xlabel("Scores")
plt.title("Negative triples")

# Empty the stats document

In [None]:
# Overwrite stats.csv with the CSV form of an empty DataFrame, discarding
# every column stored so far.
df = pd.DataFrame([])
df.to_csv("stats.csv")