# General Example
In this example we show how to load a benchmark dataset, how to train a model on it, and which evaluation protocols can be used.

In [6]:
import sys
# Append the relative path '../..' to the import path
# (presumably so that a local ampligraph checkout is importable — confirm against the repo layout).
sys.path.append('../..')
import os
# These environment variables are set before `import tensorflow` below; keep that order.
# NOTE(review): they look like the usual GPU-selection / oneDNN / C++ log-level switches —
# confirm the exact semantics against the CUDA and TensorFlow documentation.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
# Limit TensorFlow's Python logger to ERROR-level messages.
tf.get_logger().setLevel('ERROR')

## Load the dataset

In [7]:
import ampligraph
# Benchmark datasets are under the ampligraph.datasets module
from ampligraph.datasets import load_fb15k_237
# Load the FB15k-237 dataset; the returned object is indexed in the following cells
# with the 'train', 'valid' and 'test' keys
dataset = load_fb15k_237()

## Train the model

In [8]:
# Import the KGE model
from ampligraph.latent_features import ScoringBasedEmbeddingModel

# TensorBoard callback: writes the training logs to ./transe_train_logs so that the
# training can be inspected with the tensorboard command given at the end of this cell
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='./transe_train_logs')

# create the model with the TransE scoring function
model = ScoringBasedEmbeddingModel(eta=5,
                                   k=300,
                                   scoring_type='TransE')

# You can either pass optimizers/regularizers/losses/initializers by name to use their default
# values, or import them, customize their hyperparameters and pass the objects to compile

# Let's create an Adam optimizer with a customized learning rate of 0.005
adam = tf.keras.optimizers.Adam(learning_rate=0.005)
# Let's compile the model with the self-adversarial loss and its default parameters
model.compile(optimizer=adam, loss='self_adversarial')

# fit the model to the training data
model.fit(dataset['train'],
             batch_size=10000,
             epochs=10,
             callbacks=[tensorboard_callback])

# the training can be visualised using the following command:
# tensorboard --logdir='./transe_train_logs' --port=8891
# then open the browser and go to the following URL: http://127.0.0.1:8891/

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x3836fd1b0>

## Predict scores

In [9]:
# Score the first five test triples; the bare name on the last line displays the scores
pred = model.predict(dataset['test'][:5], batch_size=100)
pred

array([-2.266111 , -2.095364 , -2.289988 , -3.8879156, -4.619501 ],
      dtype=float32)

## Evaluate the model (without filter)

### Subject and object (s,o) corruption
This evaluation protocol creates corrupted triples by corrupting both the subject and the object of existing triples. It is considered the standard evaluation protocol in the Knowledge Graph Embedding literature.

In [10]:
# Metric helpers used to summarise the ranks computed below
from ampligraph.evaluation.metrics import mrr_score, hits_at_n_score, mr_score

# Evaluate on the whole test set, corrupting both the subject and the object side
ranks = model.evaluate(
    dataset['test'],
    batch_size=100,
    corrupt_side='s,o',
)

# Print one "<label> <value>" line per metric
for metric_label, metric_value in [
    ('MR:', mr_score(ranks)),
    ('MRR:', mrr_score(ranks)),
    ('hits@1:', hits_at_n_score(ranks, 1)),
    ('hits@10:', hits_at_n_score(ranks, 10)),
]:
    print(metric_label, metric_value)

MR: 472.3101575496624
MRR: 0.08900155667769008
hits@1: 0.0
hits@10: 0.2428319796457579


### Object corruption
Corruptions are generated by changing only the object of triples.

In [11]:
# Metric helpers used to summarise the ranks computed below
from ampligraph.evaluation.metrics import mrr_score, hits_at_n_score, mr_score

# Evaluate on the whole test set, corrupting only the object side
ranks = model.evaluate(
    dataset['test'],
    batch_size=100,
    corrupt_side='o',
)

# Print one "<label> <value>" line per metric
for metric_label, metric_value in [
    ('MR:', mr_score(ranks)),
    ('MRR:', mrr_score(ranks)),
    ('hits@1:', hits_at_n_score(ranks, 1)),
    ('hits@10:', hits_at_n_score(ranks, 10)),
]:
    print(metric_label, metric_value)

MR: 268.11287797240436
MRR: 0.12974471587250763
hits@1: 0.0
hits@10: 0.3512085331245719


### Subject corruption
Corruptions are generated by changing only the subject of triples.

In [12]:
# Metric helpers used to summarise the ranks computed below
from ampligraph.evaluation.metrics import mrr_score, hits_at_n_score, mr_score

# Evaluate on the whole test set, corrupting only the subject side
ranks = model.evaluate(
    dataset['test'],
    batch_size=100,
    corrupt_side='s',
)

# Print one "<label> <value>" line per metric
for metric_label, metric_value in [
    ('MR:', mr_score(ranks)),
    ('MRR:', mrr_score(ranks)),
    ('hits@1:', hits_at_n_score(ranks, 1)),
    ('hits@10:', hits_at_n_score(ranks, 10)),
]:
    print(metric_label, metric_value)

MR: 676.5074371269204
MRR: 0.048258397482872535
hits@1: 0.0
hits@10: 0.13445542616694392


## Evaluation with Filters
Triples specified inside the filter are removed from the corruptions that are generated to avoid the creation of false negatives.

In [13]:
# Metric helpers used to summarise the ranks computed below
from ampligraph.evaluation.metrics import mrr_score, hits_at_n_score, mr_score

# Evaluate on the whole test set, corrupting both subject and object.
# The train/valid/test triples are passed as the filter to be used for evaluation.
ranks = model.evaluate(
    dataset['test'],
    batch_size=100,
    corrupt_side='s,o',
    use_filter={
        'train': dataset['train'],
        'valid': dataset['valid'],
        'test': dataset['test'],
    },
)

# Print one "<label> <value>" line per metric
for metric_label, metric_value in [
    ('MR:', mr_score(ranks)),
    ('MRR:', mrr_score(ranks)),
    ('hits@1:', hits_at_n_score(ranks, 1)),
    ('hits@10:', hits_at_n_score(ranks, 10)),
]:
    print(metric_label, metric_value)

MR: 364.22042274195127
MRR: 0.19061439893688528
hits@1: 0.12329973578628045
hits@10: 0.32199823857520304


## Evaluation using a subset of entities for corruption
Specify a subset of entities to be used for corruption. Depending on the evaluation strategy chosen, these entities replace the subject, the object, or both. Note that, although this is not the standard evaluation protocol, using a subset of entities can generate more meaningful corruptions and greatly reduces the computational overhead of sampling corruptions among all the entities in the knowledge graph.

In [14]:
# Collect the distinct objects (column 2) of every training triple whose predicate
# (column 1) is the monthly-climate "month" relation, i.e. all months seen in training
train_triples = dataset['train']
is_month_triple = (
    train_triples[:, 1]
    == '/travel/travel_destination/climate./travel/travel_destination_monthly_climate/month'
)
months = set(train_triples[is_month_triple][:, 2])
len(months)

12

In [15]:
# Build a test set restricted to a single predicate: the one that gives the
# best time of the year (o) to visit a destination (s)
test_triples = dataset['test']
dest_month_test_triples = test_triples[
    test_triples[:, 1]
    == '/travel/travel_destination/climate./travel/travel_destination_monthly_climate/month'
]

In [16]:
# Metric helpers used to summarise the ranks computed below
from ampligraph.evaluation.metrics import mrr_score, hits_at_n_score, mr_score

# Evaluate the month-predicate test triples by corrupting only the object, and generate
# the corruptions from `months` (passed as entities_subset) instead of from all the
# entities in the graph.
# This approach is very useful when the graph is big and/or when the hypothesis concerns
# a specific predicate type. For big graphs, a small, fixed-size random sample of
# entities can be used as the corruption subset.
ranks = model.evaluate(
    dest_month_test_triples,
    batch_size=100,
    corrupt_side='o',
    entities_subset=months,
    use_filter={
        'train': dataset['train'],
        'valid': dataset['valid'],
        'test': dataset['test'],
    },
)

# Print one "<label> <value>" line per metric
for metric_label, metric_value in [
    ('MR:', mr_score(ranks)),
    ('MRR:', mrr_score(ranks)),
    ('hits@1:', hits_at_n_score(ranks, 1)),
    ('hits@10:', hits_at_n_score(ranks, 10)),
]:
    print(metric_label, metric_value)

MR: 1.0833333333333333
MRR: 0.9861111111111112
hits@1: 0.9833333333333333
hits@10: 1.0


## Visualize the embeddings

In [17]:
from ampligraph.utils import create_tensorboard_visualizations

In [18]:
# Create the TensorBoard visualizations for five selected entities, giving each one a
# custom label; the location passed as `loc` is ./selected_subset_embeddings_vis
create_tensorboard_visualizations(
    model,
    entities_subset=['/m/027rn', '/m/06cx9', '/m/017dcd', '/m/06v8s0', '/m/07s9rl0'],
    labels=['ent1', 'ent2', 'ent3', 'ent4', 'ent5'],
    loc='./selected_subset_embeddings_vis',
)


In [None]:
# Create the TensorBoard visualizations for all the entities (entities_subset='all');
# the location passed as `loc` is ./full_embeddings_vis
create_tensorboard_visualizations(model, 
                                  entities_subset='all',
                                  loc = './full_embeddings_vis')

# To visualise the embeddings, uncomment and run the following command:
# ! tensorboard --logdir='./full_embeddings_vis' --port=8891
# then open the browser and go to the following URL: http://127.0.0.1:8891/#projector

# Early Stopping
The following example shows how to use early stopping while training a model and how to create checkpoints.

In [2]:
import sys
# Append the relative path '../..' to the import path
# (presumably so that a local ampligraph checkout is importable — confirm against the repo layout).
sys.path.append('../..')
import os
# These environment variables are set before `import tensorflow` below; keep that order.
# Use GPU index '0', as the other sections of this notebook do: a hard-coded higher index
# (previously '2') hides every GPU on machines that have fewer devices.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# NOTE(review): these look like the usual oneDNN / C++ log-level switches —
# confirm the exact semantics against the TensorFlow documentation.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
# Limit TensorFlow's Python logger to ERROR-level messages.
tf.get_logger().setLevel('ERROR')

In [4]:
import ampligraph
# Benchmark datasets are under the ampligraph.datasets module
from ampligraph.datasets import load_fb15k_237
# Load the FB15k-237 dataset; the returned object is indexed in the following cells
# with the 'train', 'valid' and 'test' keys
dataset = load_fb15k_237()

In [6]:
# Import the KGE model
from ampligraph.latent_features import ScoringBasedEmbeddingModel

# create the model with the TransE scoring function
model = ScoringBasedEmbeddingModel(eta=1,
                                   k=10,
                                   scoring_type='TransE')

# compile the model with loss and optimizer
model.compile(optimizer='adam', loss='multiclass_nll')

# Use this instead to checkpoint the best model seen so far.
# MRR is a higher-is-better metric, so the comparison mode must be 'max'.
# checkpoint = tf.keras.callbacks.ModelCheckpoint('./chkpt_transe', monitor='val_mrr', verbose=1,
#                                                 save_best_only=True, mode='max')

# Use this for early stopping
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_mrr",            # which metric to monitor
    patience=3,                   # if the monitored metric doesn't improve for this many
                                  # validation checks, the model stops early
    verbose=1,                    # verbosity
    mode="max",                   # how to compare the monitored metric: "max" means higher is better
    restore_best_weights=True)    # on stopping, restore the weights of the best epoch

# `dataset` was already loaded in the previous cell, so it is not reloaded here.
model.fit(dataset['train'],
          batch_size=10000,
          epochs=100,
          validation_freq=2,                            # Epochs to elapse before next evaluation
          validation_batch_size=100,
          validation_burn_in=1,                         # Epoch to start the validation process
          validation_data=dataset['valid'][::100],      # every 100th validation triple
          callbacks=[early_stopping])                   # Pass the callback to the fit function


Epoch 1/100
Epoch 2/100

2023-02-08 13:21:22.984591: W tensorflow/core/grappler/optimizers/loop_optimizer.cc:907] Skipping loop optimization for Merge node with control input: RaggedFromRowLengths/RowPartitionFromRowLengths/assert_non_negative/assert_less_equal/Assert/AssertGuard/branch_executed/_8


Epoch 3/100
Epoch 4/100

2023-02-08 13:21:26.191442: W tensorflow/core/grappler/optimizers/loop_optimizer.cc:907] Skipping loop optimization for Merge node with control input: RaggedFromRowLengths/RowPartitionFromRowLengths/assert_non_negative/assert_less_equal/Assert/AssertGuard/branch_executed/_8


Epoch 5/100
Epoch 6/100

2023-02-08 13:21:29.466724: W tensorflow/core/grappler/optimizers/loop_optimizer.cc:907] Skipping loop optimization for Merge node with control input: RaggedFromRowLengths/RowPartitionFromRowLengths/assert_non_negative/assert_less_equal/Assert/AssertGuard/branch_executed/_8


Epoch 7/100
Epoch 8/100

2023-02-08 13:21:32.519336: W tensorflow/core/grappler/optimizers/loop_optimizer.cc:907] Skipping loop optimization for Merge node with control input: RaggedFromRowLengths/RowPartitionFromRowLengths/assert_non_negative/assert_less_equal/Assert/AssertGuard/branch_executed/_8


Epoch 9/100
Epoch 10/100

2023-02-08 13:21:35.457800: W tensorflow/core/grappler/optimizers/loop_optimizer.cc:907] Skipping loop optimization for Merge node with control input: RaggedFromRowLengths/RowPartitionFromRowLengths/assert_non_negative/assert_less_equal/Assert/AssertGuard/branch_executed/_8


Epoch 11/100
Epoch 12/100
      2/Unknown - 0s 159ms/step===>..] - ETA: 0s - loss: 5171.8193

2023-02-08 13:21:39.521803: W tensorflow/core/grappler/optimizers/loop_optimizer.cc:907] Skipping loop optimization for Merge node with control input: RaggedFromRowLengths/RowPartitionFromRowLengths/assert_non_negative/assert_less_equal/Assert/AssertGuard/branch_executed/_8


Epoch 13/100
Epoch 14/100


2023-02-08 13:21:40.702638: W tensorflow/core/grappler/optimizers/loop_optimizer.cc:907] Skipping loop optimization for Merge node with control input: RaggedFromRowLengths/RowPartitionFromRowLengths/assert_non_negative/assert_less_equal/Assert/AssertGuard/branch_executed/_8


Epoch 15/100
Epoch 16/100

2023-02-08 13:21:42.851130: W tensorflow/core/grappler/optimizers/loop_optimizer.cc:907] Skipping loop optimization for Merge node with control input: RaggedFromRowLengths/RowPartitionFromRowLengths/assert_non_negative/assert_less_equal/Assert/AssertGuard/branch_executed/_8


Epoch 17/100
Epoch 18/100

2023-02-08 13:21:45.612853: W tensorflow/core/grappler/optimizers/loop_optimizer.cc:907] Skipping loop optimization for Merge node with control input: RaggedFromRowLengths/RowPartitionFromRowLengths/assert_non_negative/assert_less_equal/Assert/AssertGuard/branch_executed/_8


Epoch 19/100
Epoch 20/100

2023-02-08 13:21:47.841319: W tensorflow/core/grappler/optimizers/loop_optimizer.cc:907] Skipping loop optimization for Merge node with control input: RaggedFromRowLengths/RowPartitionFromRowLengths/assert_non_negative/assert_less_equal/Assert/AssertGuard/branch_executed/_8


Epoch 21/100
Epoch 22/100
      2/Unknown - 0s 152ms/step===>..] - ETA: 0s - loss: 4140.2354

2023-02-08 13:21:50.470017: W tensorflow/core/grappler/optimizers/loop_optimizer.cc:907] Skipping loop optimization for Merge node with control input: RaggedFromRowLengths/RowPartitionFromRowLengths/assert_non_negative/assert_less_equal/Assert/AssertGuard/branch_executed/_8


Restoring model weights from the end of the best epoch: 16.
Epoch 22: early stopping


<tensorflow.python.keras.callbacks.History at 0x28a6d6da0>

In [7]:
# Metric helpers used to summarise the ranks computed below
from ampligraph.evaluation.metrics import mrr_score, hits_at_n_score, mr_score

# Evaluate on the whole test set, corrupting both subject and object.
# The train/valid/test triples are passed as the filter to be used for evaluation.
ranks = model.evaluate(
    dataset['test'],
    batch_size=100,
    corrupt_side='s,o',
    use_filter={
        'train': dataset['train'],
        'valid': dataset['valid'],
        'test': dataset['test'],
    },
)

# Print one "<label> <value>" line per metric
for metric_label, metric_value in [
    ('MR:', mr_score(ranks)),
    ('MRR:', mrr_score(ranks)),
    ('hits@1:', hits_at_n_score(ranks, 1)),
    ('hits@10:', hits_at_n_score(ranks, 10)),
]:
    print(metric_label, metric_value)

MR: 600.0142626480086
MRR: 0.18888812773966743
hits@1: 0.1267491926803014
hits@10: 0.3130687934240141


## Save and restore

In [None]:
# If a ModelCheckpoint callback was used during training, the saved checkpoint can be
# restored with restore_model:
# from ampligraph.utils import restore_model
# model = restore_model('chkpt_transe')

In [1]:
import sys
# Append the relative path '../..' to the import path
# (presumably so that a local ampligraph checkout is importable — confirm against the repo layout).
sys.path.append('../..')
import os
# These environment variables are set before `import tensorflow` below; keep that order.
# NOTE(review): they look like the usual GPU-selection / oneDNN / C++ log-level switches —
# confirm the exact semantics against the CUDA and TensorFlow documentation.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
# Limit TensorFlow's Python logger to ERROR-level messages.
tf.get_logger().setLevel('ERROR')

In [2]:
import ampligraph
# Benchmark datasets are under the ampligraph.datasets module
from ampligraph.datasets import load_fb15k_237
# Load the FB15k-237 dataset; the returned object is indexed in the following cells
# with the 'train', 'valid' and 'test' keys
dataset = load_fb15k_237()

In [3]:
# Import the KGE model
from ampligraph.latent_features import ScoringBasedEmbeddingModel

# create the model with the TransE scoring function
model = ScoringBasedEmbeddingModel(eta=1,
                                   k=10,
                                   scoring_type='TransE')

# compile the model with loss and optimizer
model.compile(optimizer='adam', loss='multiclass_nll')

# Checkpoint the best model seen so far, judged on the validation MRR.
# MRR is a higher-is-better metric, so the comparison mode must be 'max':
# with 'min', save_best_only would keep the model with the lowest (worst) MRR.
checkpoint = tf.keras.callbacks.ModelCheckpoint('./chkpt1_transe', monitor='val_mrr', verbose=0,
                                                save_best_only=True, mode='max')

# `dataset` was already loaded in the previous cell, so it is not reloaded here.
model.fit(dataset['train'],
          batch_size=10000,
          epochs=10,
          validation_freq=2,                            # Epochs to elapse before next evaluation
          validation_batch_size=100,
          validation_burn_in=2,                         # Epoch to start the validation process
          validation_data=dataset['valid'][::100],      # every 100th validation triple
          callbacks=[checkpoint])                       # Pass the callback to the fit function


Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2931fd2a0>

In [4]:
# Metric helpers used to summarise the ranks computed below
from ampligraph.evaluation.metrics import mrr_score, hits_at_n_score, mr_score

# Evaluate the trained model on the whole test set, corrupting both subject and object.
# The train/valid/test triples are passed as the filter to be used for evaluation.
ranks = model.evaluate(
    dataset['test'],
    batch_size=100,
    corrupt_side='s,o',
    use_filter={
        'train': dataset['train'],
        'valid': dataset['valid'],
        'test': dataset['test'],
    },
)

# Print one "<label> <value>" line per metric
for metric_label, metric_value in [
    ('MR:', mr_score(ranks)),
    ('MRR:', mrr_score(ranks)),
    ('hits@1:', hits_at_n_score(ranks, 1)),
    ('hits@10:', hits_at_n_score(ranks, 10)),
]:
    print(metric_label, metric_value)

MR: 921.3923084450533
MRR: 0.17730360323410402
hits@1: 0.12151384675604267
hits@10: 0.2851551032390645


In [5]:
from ampligraph.utils import save_model
# explicitly save the trained model under the name 'saved_model_transE'
save_model(model, 'saved_model_transE')




In [6]:
from ampligraph.utils import restore_model

# restore_model restores saved models or checkpoints.
# This rebinds `model` to the restored model, which the next cell evaluates.
model = restore_model('saved_model_transE')

Saved model does not include a db file. Skipping.


In [7]:
# Metric helpers used to summarise the ranks computed below
from ampligraph.evaluation.metrics import mrr_score, hits_at_n_score, mr_score

# Evaluate the restored model with the same settings as before saving:
# whole test set, both sides corrupted, train/valid/test triples used as the filter.
ranks = model.evaluate(
    dataset['test'],
    batch_size=100,
    corrupt_side='s,o',
    use_filter={
        'train': dataset['train'],
        'valid': dataset['valid'],
        'test': dataset['test'],
    },
)

# Print one "<label> <value>" line per metric
for metric_label, metric_value in [
    ('MR:', mr_score(ranks)),
    ('MRR:', mrr_score(ranks)),
    ('hits@1:', hits_at_n_score(ranks, 1)),
    ('hits@10:', hits_at_n_score(ranks, 10)),
]:
    print(metric_label, metric_value)

MR: 921.3923084450533
MRR: 0.17730360323410402
hits@1: 0.12151384675604267
hits@10: 0.2851551032390645


# Partitioned Training
The following example shows how to train a model with partitioning.

In [3]:
import sys
import os
# These environment variables are set before `import tensorflow` below; keep that order.
# NOTE(review): they look like the usual GPU-selection / oneDNN / C++ log-level switches —
# confirm the exact semantics against the CUDA and TensorFlow documentation.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
# Limit TensorFlow's Python logger to ERROR-level messages.
tf.get_logger().setLevel('ERROR')
import numpy as np
import ampligraph

## Train and predict scores

In [4]:
# Import the KGE model
from ampligraph.latent_features import ScoringBasedEmbeddingModel

# Placeholder: set this to the folder that contains the wn18RR/ directory.
# Keep the trailing slash: the file paths below are built by plain string concatenation.
PATH_TO_DATASET = 'your/path/to/dataset/'

# TransE model that is trained with partitioning below
partitioned_model = ScoringBasedEmbeddingModel(
    eta=2,
    k=50,
    scoring_type='TransE',
)
partitioned_model.compile(optimizer='adam', loss='multiclass_nll')

# The training triples are given as a file path here; numpy arrays obtained from the
# default loaders (e.g. load_fb15k_237()) can be passed instead.
# partitioning_k=3 turns on partitioning of the inputs
# (presumably into 3 partitions — confirm in the AmpliGraph documentation).
partitioned_model.fit(
    PATH_TO_DATASET + 'wn18RR/train.txt',
    batch_size=10000,
    partitioning_k=3,
    epochs=10,
)


_split: memory before: 848.0Bytes, after: 4.3447MB, consumed: 4.3439MB; exec time: 29.242s


2023-02-08 16:47:49.873938: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x28f31d630>

In [6]:
from ampligraph.evaluation.metrics import mrr_score, hits_at_n_score, mr_score

# Unfiltered evaluation of the partitioned model; the test triples are read from file
ranks = partitioned_model.evaluate(
    PATH_TO_DATASET + 'wn18RR/test.txt',
    batch_size=400,
)

# Cell output: (MR, MRR, hits@1, hits@10, len(ranks))
(
    mr_score(ranks),
    mrr_score(ranks),
    hits_at_n_score(ranks, 1),
    hits_at_n_score(ranks, 10),
    len(ranks),
)



210 triples containing invalid keys skipped!


(20079.140731874144, 0.011132840015629617, 0.0, 0.03625170998632011, 2924)

In [7]:
# Filtered evaluation of the partitioned model: both sides are corrupted and the
# train/valid/test files are passed as the filter
ranks = partitioned_model.evaluate(
    PATH_TO_DATASET + 'wn18RR/test.txt',
    batch_size=400,
    corrupt_side='s,o',
    use_filter={
        'train': PATH_TO_DATASET + 'wn18RR/train.txt',
        'valid': PATH_TO_DATASET + 'wn18RR/valid.txt',
        'test': PATH_TO_DATASET + 'wn18RR/test.txt',
    },
)

# Cell output: (MR, MRR, hits@1, hits@10, len(ranks))
(
    mr_score(ranks),
    mrr_score(ranks),
    hits_at_n_score(ranks, 1),
    hits_at_n_score(ranks, 10),
    len(ranks),
)



210 triples containing invalid keys skipped!

210 triples containing invalid keys skipped!

210 triples containing invalid keys skipped!


(20066.594562243503,
 0.01583735697421522,
 0.005471956224350205,
 0.038132694938440494,
 2924)

In [8]:
from ampligraph.utils import save_model
# Save the partitioned model under ./partitioned_model
# (the captured output below reports that an existing model at that path is overwritten)
save_model(model=partitioned_model, model_name_path='./partitioned_model')

The path ./partitioned_model already exists. This save operation will overwrite the model                 at the specified path.


In [10]:
from ampligraph.utils import restore_model
# Restore the model saved under ./partitioned_model.
# This binds the restored model to `model`, which the following cells evaluate.
model = restore_model('./partitioned_model')

Saved model does not include a db file. Skipping.


In [11]:
from ampligraph.evaluation.metrics import mrr_score, hits_at_n_score, mr_score

# Unfiltered evaluation of the restored model; the test triples are read from file
ranks = model.evaluate(
    PATH_TO_DATASET + 'wn18RR/test.txt',
    batch_size=400,
)

# Cell output: (MR, MRR, hits@1, hits@10, len(ranks))
(
    mr_score(ranks),
    mrr_score(ranks),
    hits_at_n_score(ranks, 1),
    hits_at_n_score(ranks, 10),
    len(ranks),
)


210 triples containing invalid keys skipped!


(20079.140731874144, 0.011132840015629617, 0.0, 0.03625170998632011, 2924)

In [12]:
# Filtered evaluation of the restored model: both sides are corrupted and the
# train/valid/test files are passed as the filter
ranks = model.evaluate(
    PATH_TO_DATASET + 'wn18RR/test.txt',
    batch_size=400,
    corrupt_side='s,o',
    use_filter={
        'train': PATH_TO_DATASET + 'wn18RR/train.txt',
        'valid': PATH_TO_DATASET + 'wn18RR/valid.txt',
        'test': PATH_TO_DATASET + 'wn18RR/test.txt',
    },
)

# Cell output: (MR, MRR, hits@1, hits@10, len(ranks))
(
    mr_score(ranks),
    mrr_score(ranks),
    hits_at_n_score(ranks, 1),
    hits_at_n_score(ranks, 10),
    len(ranks),
)



210 triples containing invalid keys skipped!

210 triples containing invalid keys skipped!

210 triples containing invalid keys skipped!


(20066.594562243503,
 0.01583735697421522,
 0.005471956224350205,
 0.038132694938440494,
 2924)