In [1]:
import os
import sys

# Put the directory two levels above the notebook on the import path.
sys.path.append('../..')

# The environment is configured before TensorFlow is imported below;
# keep this ordering when editing the cell.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '0',
    'TF_ENABLE_ONEDNN_OPTS': '0',
    'TF_CPP_MIN_LOG_LEVEL': '3',
})

import tensorflow as tf

# Only let TensorFlow's Python logger emit messages at ERROR level or above.
tf.get_logger().setLevel('ERROR')

## Load the dataset

In [2]:
# NOTE(review): the bare `ampligraph` name is not referenced in the visible cells.
import ampligraph
# Benchmark dataset loaders are exposed by the ampligraph.datasets module.
from ampligraph.datasets import load_fb15k_237
# Load FB15k-237. Later cells index the result with the keys 'train', 'valid'
# and 'test', and slice each split as a 2-D array (e.g. dataset['train'][:, 1]).
dataset = load_fb15k_237()

## Train the model

In [3]:
# Import the KGE model
from ampligraph.latent_features import ScoringBasedEmbeddingModel

# Keras callback that writes training logs to ./transe_train_logs so that the
# run can be inspected in TensorBoard (see the command at the end of this cell).
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='./transe_train_logs')

# Create the model with the TransE scoring function.
# eta and k are model hyperparameters; see the AmpliGraph API docs for details.
model = ScoringBasedEmbeddingModel(eta=5,
                                   k=300,
                                   scoring_type='TransE')

# Compile the model with its optimizer and loss.
model.compile(optimizer='adam', loss='multiclass_nll')

# Fit the model on the training split. This call is kept as the last
# expression of the cell so that its return value is displayed.
model.fit(dataset['train'],
          batch_size=10000,
          epochs=10,
          callbacks=[tensorboard_callback])

# The training can be visualised using the following command:
# tensorboard --logdir='./transe_train_logs' --port=8891
# then open the browser and go to the following URL: http://127.0.0.1:8891/

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f0b600acca0>

## Predict scores

In [4]:
# Score the first five triples of the test split.
first_test_triples = dataset['test'][:5]
pred = model.predict(first_test_triples, batch_size=100)
pred

array([-0.29495624, -0.10703138, -0.0309352 , -3.0869765 , -2.8302596 ],
      dtype=float32)

## Evaluate the model (without filter)

### Both subject and object side (s,o) evaluation (standard protocol)

In [5]:
# Ranking metrics used to summarise the evaluation.
from ampligraph.evaluation.metrics import mrr_score, hits_at_n_score, mr_score

# Evaluate on the test split, corrupting both the subject and the object side.
# No use_filter argument is passed in this cell.
ranks = model.evaluate(
    dataset['test'],       # test set
    batch_size=100,        # evaluation batch size
    corrupt_side='s,o',    # sides to corrupt for scoring and ranking
)

# Compute each metric from the ranks, then print them in a fixed order.
ranking_metrics = {
    'MR:': mr_score(ranks),
    'MRR:': mrr_score(ranks),
    'hits@1:': hits_at_n_score(ranks, 1),
    'hits@10:': hits_at_n_score(ranks, 10),
}
for label, value in ranking_metrics.items():
    print(label, value)

MR: 593.0106174772483
MRR: 0.08586959767225415
hits@1: 0.0
hits@10: 0.2306243272335845


### object side evaluation

In [6]:
# Ranking metrics used to summarise the evaluation.
from ampligraph.evaluation.metrics import mrr_score, hits_at_n_score, mr_score

# Evaluate on the test split, corrupting only the object side.
ranks = model.evaluate(
    dataset['test'],
    batch_size=100,
    corrupt_side='o',
)

# Compute each metric from the ranks, then print them in a fixed order.
ranking_metrics = {
    'MR:': mr_score(ranks),
    'MRR:': mrr_score(ranks),
    'hits@1:': hits_at_n_score(ranks, 1),
    'hits@10:': hits_at_n_score(ranks, 10),
}
for label, value in ranking_metrics.items():
    print(label, value)

MR: 378.83540463841865
MRR: 0.12770093015142223
hits@1: 0.0
hits@10: 0.33785106174772483


### subject side evaluation

In [7]:
# Ranking metrics used to summarise the evaluation.
from ampligraph.evaluation.metrics import mrr_score, hits_at_n_score, mr_score

# Evaluate on the test split, corrupting only the subject side.
ranks = model.evaluate(
    dataset['test'],
    batch_size=100,
    corrupt_side='s',
)

# Compute each metric from the ranks, then print them in a fixed order.
ranking_metrics = {
    'MR:': mr_score(ranks),
    'MRR:': mrr_score(ranks),
    'hits@1:': hits_at_n_score(ranks, 1),
    'hits@10:': hits_at_n_score(ranks, 10),
}
for label, value in ranking_metrics.items():
    print(label, value)

MR: 807.1858303160778
MRR: 0.044038265193086054
hits@1: 0.0
hits@10: 0.12339759271944417


## Evaluation with filters

In [8]:
# Ranking metrics used to summarise the evaluation.
from ampligraph.evaluation.metrics import mrr_score, hits_at_n_score, mr_score

# Triples handed to the evaluation as its filter: every split of the dataset.
evaluation_filter = {'train': dataset['train'],
                     'valid': dataset['valid'],
                     'test': dataset['test']}

# Evaluate on the test split with the filter above.
ranks = model.evaluate(dataset['test'],
                       batch_size=100,
                       corrupt_side='s,o',  # corrupt both subject and object sides
                       use_filter=evaluation_filter)

# Compute each metric from the ranks, then print them in a fixed order.
ranking_metrics = {
    'MR:': mr_score(ranks),
    'MRR:': mrr_score(ranks),
    'hits@1:': hits_at_n_score(ranks, 1),
    'hits@10:': hits_at_n_score(ranks, 10),
}
for label, value in ranking_metrics.items():
    print(label, value)

MR: 464.04254330169294
MRR: 0.19242388145244288
hits@1: 0.1279479401115569
hits@10: 0.3219493101086212


## Evaluation using a subset of entities for corruption

In [9]:
# Collect every month that appears in the training split: select the rows
# whose middle column is the monthly-climate predicate and keep the distinct
# values found in their last column.
train_triples = dataset['train']
is_month_triple = (
    train_triples[:, 1] ==
    '/travel/travel_destination/climate./travel/travel_destination_monthly_climate/month'
)
months = set(train_triples[is_month_triple][:, 2])
len(months)

12

In [10]:
# Restrict the test split to a single predicate. This predicate gives the
# best time of the year (o) to visit a destination (s).
test_triples = dataset['test']
has_month_predicate = (
    test_triples[:, 1] ==
    '/travel/travel_destination/climate./travel/travel_destination_monthly_climate/month'
)
dest_month_test_triples = test_triples[has_month_predicate]

In [11]:
# Ranking metrics used to summarise the evaluation.
from ampligraph.evaluation.metrics import mrr_score, hits_at_n_score, mr_score

# Let's say we want to evaluate this test set by corrupting the object with
# months only. We can pass the months as entities_subset so that corruptions
# are generated from this subset instead of from all entities in the graph.
# This approach is very useful when the graph is big and/or when our
# hypothesis concerns a specific predicate type. When the graph is big, we can
# also randomly sample a small, fixed-size subset of entities and use that
# for corruption.

# Triples handed to the evaluation as its filter: every split of the dataset.
evaluation_filter = {'train': dataset['train'],
                     'valid': dataset['valid'],
                     'test': dataset['test']}

# Evaluate the predicate-specific test triples.
ranks = model.evaluate(dest_month_test_triples,
                       batch_size=100,
                       corrupt_side='o',  # corrupt only the object side
                       entities_subset=months,
                       use_filter=evaluation_filter)

# Compute each metric from the ranks, then print them in a fixed order.
ranking_metrics = {
    'MR:': mr_score(ranks),
    'MRR:': mrr_score(ranks),
    'hits@1:': hits_at_n_score(ranks, 1),
    'hits@10:': hits_at_n_score(ranks, 10),
}
for label, value in ranking_metrics.items():
    print(label, value)

MR: 1.0333333333333334
MRR: 0.9833333333333333
hits@1: 0.9666666666666667
hits@10: 1.0


## Visualize the embeddings

In [12]:
from ampligraph.utils import create_tensorboard_visualizations

In [13]:
# Parallel lists passed as entities_subset and labels respectively.
selected_entities = ['/m/027rn', '/m/06cx9', '/m/017dcd', '/m/06v8s0', '/m/07s9rl0']
selected_labels = ['ent1', 'ent2', 'ent3', 'ent4', 'ent5']

create_tensorboard_visualizations(
    model,
    entities_subset=selected_entities,
    labels=selected_labels,
    loc='./selected_subset_embeddings_vis',
)


In [14]:
# Same call as for the selected subset, but with entities_subset='all' and a
# separate output location (./full_embeddings_vis).
create_tensorboard_visualizations(model, 
                                  entities_subset='all',
                                  loc = './full_embeddings_vis')

# The embeddings can be visualised using the following command:
# tensorboard --logdir='./full_embeddings_vis' --port=8891 
# then open the browser and go to the following URL: http://127.0.0.1:8891/#projector