In [5]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 
import tensorflow as tf
tf.get_logger().setLevel('ERROR')
import pandas as pd
tf.config.set_soft_device_placement(False)
tf.debugging.set_log_device_placement(True)
import numpy as np
from ampligraph.datasets import load_fb15k_237, load_yago3_10
from ampligraph.evaluation.protocol import create_mappings, to_idx

from ampligraph.evaluation.metrics import mrr_score, hits_at_n_score, mr_score
import time
print(tf.__version__)
assert(tf.__version__.startswith('2.5'))

from ampligraph.datasets import load_fb15k_237, load_fb13, load_fb15k, load_wn11, load_wn18, load_wn18rr, load_yago3_10
from ampligraph.latent_features import ScoringBasedEmbeddingModel

2.5.0-dev20201208


Jump to [Partitioned](#Training/eval-with-partition)

# Discovery

In [6]:
import requests
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns


from ampligraph.datasets import load_from_csv
from ampligraph.discovery import find_clusters

# International football matches triples
# See tutorial here to understand how the triples are created from a tabular dataset:
url = 'https://ampligraph.s3-eu-west-1.amazonaws.com/datasets/football.csv'
open('football.csv', 'wb').write(requests.get(url).content)
X = load_from_csv('.', 'football.csv', sep=',')[:, 1:]

model = ScoringBasedEmbeddingModel(eta=5, 
                                 k=300,
                                 scoring_type='ComplEx')



model.compile(optimizer='adam', loss='multiclass_nll')

model.fit(X,
          batch_size=10000,
          epochs=10)

df = pd.DataFrame(X, columns=["s", "p", "o"])
teams = np.unique(np.concatenate((df.s[df.s.str.startswith("Team")],
                               df.o[df.o.str.startswith("Team")])))
team_embeddings = model.get_embeddings(teams, embedding_type='e')

embeddings_2d = PCA(n_components=2).fit_transform(np.array([i for i in team_embeddings]))

# Find clusters of embeddings using KMeans

kmeans = KMeans(n_clusters=6, n_init=100, max_iter=500)
clusters = find_clusters(teams, model, kmeans, mode='e')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
import pandas as pd
import numpy as np
import re

# The IMDB dataset used here is part of the Movies5 dataset found on:
# The Magellan Data Repository (https://sites.google.com/site/anhaidgroup/projects/data)
import requests
url = 'http://pages.cs.wisc.edu/~anhai/data/784_data/movies5.tar.gz'
open('movies5.tar.gz', 'wb').write(requests.get(url).content)
import tarfile
tar = tarfile.open('movies5.tar.gz', "r:gz")
tar.extractall()
tar.close()

# Reading tabular dataset of IMDB movies and filling the missing values
imdb = pd.read_csv("movies5/csv_files/imdb.csv")
imdb["directors"] = imdb["directors"].fillna("UnknownDirector")
imdb["actors"] = imdb["actors"].fillna("UnknownActor")
imdb["genre"] = imdb["genre"].fillna("UnknownGenre")
imdb["duration"] = imdb["duration"].fillna("0")

# Creating knowledge graph triples from tabular dataset
imdb_triples = []

for _, row in imdb.iterrows():
    movie_id = "ID" + str(row["id"])
    directors = row["directors"].split(",")
    actors = row["actors"].split(",")
    genres = row["genre"].split(",")
    duration = "Duration" + str(int(re.sub("\D", "", row["duration"])) // 30)

    directors_triples = [(movie_id, "hasDirector", d) for d in directors]
    actors_triples = [(movie_id, "hasActor", a) for a in actors]
    genres_triples = [(movie_id, "hasGenre", g) for g in genres]
    duration_triple = (movie_id, "hasDuration", duration)

    imdb_triples.extend(directors_triples)
    imdb_triples.extend(actors_triples)
    imdb_triples.extend(genres_triples)
    imdb_triples.append(duration_triple)

# Training knowledge graph embedding with ComplEx model
model = ScoringBasedEmbeddingModel(eta=5, 
                             k=300,
                             scoring_type='ComplEx')



model.compile(optimizer='adam', loss='multiclass_nll')
imdb_triples = np.array(imdb_triples)
model.fit(imdb_triples,
      batch_size=10000,
      epochs=20)

# Finding duplicates movies (entities)
from ampligraph.discovery import find_duplicates

entities = np.unique(imdb_triples[:, 0])
dups, _ = find_duplicates(entities, model, mode='e', tolerance=0.45)
id_list = []
for data in dups:
    for i in data:
        id_list.append(int(i[2:]))
print(imdb.iloc[id_list[:6]][['movie_name', 'year']])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
                movie_name  year
1785           Interceptor  2009
1786           Interceptor  2009
1101  Last Flight to Abuja  2012
1102  Last Flight to Abuja  2012
2183   Showdown at Area 51  2007
2184   Showdown at Area 51  2007


In [11]:
import requests
from ampligraph.datasets import load_from_csv
from ampligraph.discovery import discover_facts
from ampligraph.discovery import query_topn

# Game of Thrones relations dataset
url = 'https://ampligraph.s3-eu-west-1.amazonaws.com/datasets/GoT.csv'
open('GoT.csv', 'wb').write(requests.get(url).content)
X = load_from_csv('.', 'GoT.csv', sep=',')

model = ScoringBasedEmbeddingModel(eta=5, 
                             k=150,
                             scoring_type='TransE')



model.compile(optimizer='adagrad', loss='pairwise')
model.fit(X,
      batch_size=100,
      epochs=20)
query_topn(model, top_n=5,
        head='Eddard Stark', relation='ALLIED_WITH', tail=None,
        ents_to_consider=None, rels_to_consider=None)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


(array([['Eddard Stark', 'ALLIED_WITH', 'House Baelish of the Fingers'],
        ['Eddard Stark', 'ALLIED_WITH', 'Larys Strong'],
        ['Eddard Stark', 'ALLIED_WITH', 'Harlan Hunter'],
        ['Eddard Stark', 'ALLIED_WITH', 'House Dalt of Lemonwood'],
        ['Eddard Stark', 'ALLIED_WITH', 'House Tarbeck of Tarbeck Hall']],
       dtype='<U44'),
 array([-1.3960195, -1.3976465, -1.4125105, -1.4159653, -1.4202976],
       dtype=float32))

# Train/eval without partition

In [12]:
optim = tf.optimizers.Adam(learning_rate=0.01)
# optim = 'adam'

# loss = nll
# loss = 'self_adversarial'
from ampligraph.latent_features.loss_functions import SelfAdversarialLoss, NLLMulticlass
loss = SelfAdversarialLoss({'margin': 0.1, 'alpha': 5, 'reduction': 'sum'})
loss = NLLMulticlass({'reduction': 'sum'})
model = ScoringBasedEmbeddingModel(eta=5, 
                                     k=300,
                                     scoring_type='ComplEx')



model.compile(optimizer=optim, loss=loss)

from tensorflow.keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint('./chkpt1', monitor='loss', verbose=1, save_best_only=True, mode='min')

dataset = load_fb15k_237()

start = time.time()
model.fit(dataset['train'],
             batch_size=10000,
             epochs=10,
             validation_freq=50,
             validation_batch_size=100,
             validation_data = dataset['valid'],
         callbacks=[checkpoint])

end = time.time()
print('Time taken:', end-start)

Epoch 1/10

Epoch 00001: loss improved from inf to 1.68641, saving model to ./chkpt1
Epoch 2/10

Epoch 00002: loss improved from 1.68641 to 1.07681, saving model to ./chkpt1
Epoch 3/10

Epoch 00003: loss improved from 1.07681 to 0.76761, saving model to ./chkpt1
Epoch 4/10

Epoch 00004: loss improved from 0.76761 to 0.60263, saving model to ./chkpt1
Epoch 5/10

Epoch 00005: loss improved from 0.60263 to 0.50113, saving model to ./chkpt1
Epoch 6/10

Epoch 00006: loss improved from 0.50113 to 0.43178, saving model to ./chkpt1
Epoch 7/10

Epoch 00007: loss improved from 0.43178 to 0.38175, saving model to ./chkpt1
Epoch 8/10

Epoch 00008: loss improved from 0.38175 to 0.34378, saving model to ./chkpt1
Epoch 9/10

Epoch 00009: loss improved from 0.34378 to 0.31407, saving model to ./chkpt1
Epoch 10/10

Epoch 00010: loss improved from 0.31407 to 0.29004, saving model to ./chkpt1
Time taken: 7.73827338218689


In [13]:
from ampligraph.discovery import discover_facts

In [14]:
discover_facts(dataset['train'][:100], 
               model, 
               top_n=100, 
               strategy='random_uniform', 
               max_candidates=100, 
               target_rel='/location/country/form_of_government', 
               seed=0)


100 triples containing invalid keys skipped!


(array([['/m/028_yv', '/location/country/form_of_government', '/m/0m313']],
       dtype=object),
 array([66.]))

In [15]:
ranks = model.evaluate(np.array([['/m/01cr28', '/location/country/form_of_government', '/m/02lkcc'],
                     ['/m/07tw_b', '/location/country/form_of_government', '/m/02lkcc'],
                     ['/m/073tm9', '/location/country/form_of_government', '/m/02lkcc']]), 
                       use_filter={'train': dataset['train']}, 
                       corrupt_side='s,o', 
                       verbose=True)


272115 triples containing invalid keys skipped!


In [16]:
ranks

array([[  128,     4],
       [ 9137, 14157],
       [11342, 14494]], dtype=int32)

In [17]:
start = time.time()
ranks = model.evaluate('/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt', 
                       batch_size=100,
                       corrupt_side='s,o',
                       entities_subset=['/m/08966', '/m/05lf_', '/m/0f8l9c', '/m/04ghz4m'],
                      
                      use_filter={'train':'/home/spai/code/ampligraph_projects/dataset/fb15k-237/train.txt',
                  'valid':'/home/spai/code/ampligraph_projects/dataset/fb15k-237/valid.txt',
                  'test':'/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt'})
mrr_score(ranks)


28 triples containing invalid keys skipped!

9 triples containing invalid keys skipped!

28 triples containing invalid keys skipped!


0.964608816909678

In [9]:
start = time.time()
ranks = model.evaluate('/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt', 
                       batch_size=100,
                       corrupt_side='s,o',
                      use_filter={'train':'/home/spai/code/ampligraph_projects/dataset/fb15k-237/train.txt',
                  'valid':'/home/spai/code/ampligraph_projects/dataset/fb15k-237/valid.txt',
                  'test':'/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt'})
mrr_score(ranks)


28 triples containing invalid keys skipped!

9 triples containing invalid keys skipped!

28 triples containing invalid keys skipped!


0.16188943877767542

In [10]:
start = time.time()
ranks = model.evaluate('/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt', 
                       batch_size=100,
                       corrupt_side='s,o',
                       entities_subset=list(model.data_indexer.entities_dict.values()),
                      
                      use_filter={'train':'/home/spai/code/ampligraph_projects/dataset/fb15k-237/train.txt',
                  'valid':'/home/spai/code/ampligraph_projects/dataset/fb15k-237/valid.txt',
                  'test':'/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt'})
mrr_score(ranks)


28 triples containing invalid keys skipped!

9 triples containing invalid keys skipped!

28 triples containing invalid keys skipped!


0.16188943877767542

In [None]:
np.random.seed(0)
pred = model.predict('/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt', 
                       batch_size=100)
print(pred.shape)
print(np.sort(pred))

In [5]:
model.calibrate('/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt', 
                batch_size=10000, positive_base_rate=0.9, epochs=100)


28 triples containing invalid keys skipped!


In [6]:

out = model.predict_proba('/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt', batch_size=10000)
print(np.sort(out))
print(np.argsort(out))
pred_out = model.predict('/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt')
print(np.sort(pred_out))
print(np.argsort(pred_out))



28 triples containing invalid keys skipped!
[0.4154276  0.41815612 0.42447674 ... 0.9985715  0.9989201  0.9998617 ]
[ 4066   611 18634 ...   990 10437 14612]

28 triples containing invalid keys skipped!
[-2.611742  -2.5582657 -2.4347575 ... 30.218775  31.553257  41.35157  ]
[ 4066   611 18634 ...   990 10437 14612]


In [7]:

start = time.time()
ranks = model.evaluate('/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt', 
                       batch_size=100,
                       corrupt_side='s,o',
         use_filter={'train':'/home/spai/code/ampligraph_projects/dataset/fb15k-237/train.txt',
                  'valid':'/home/spai/code/ampligraph_projects/dataset/fb15k-237/valid.txt',
                  'test':'/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt'})
end = time.time()
print('Time taken:', end-start)
mr_score(ranks), mrr_score(ranks), hits_at_n_score(ranks, 1), hits_at_n_score(ranks, 10), len(ranks)


28 triples containing invalid keys skipped!

9 triples containing invalid keys skipped!

28 triples containing invalid keys skipped!
Time taken: 59.169644355773926


(298.0981994324298,
 0.22008795174458182,
 0.13516488893238085,
 0.3928711224190234,
 20438)

In [8]:
model.save_weights('./calibrated_model')

## Load Checkpoint and evaluate

In [2]:
start = time.time()
loaded_model = ScoringBasedEmbeddingModel(eta=5, 
                                     k=300, 
                                     scoring_type='ComplEx')
loaded_model.load_weights('./calibrated_model')
ranks = loaded_model.evaluate('/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt', 
                       batch_size=100,
                       corrupt_side='s,o',
         use_filter={'train':'/home/spai/code/ampligraph_projects/dataset/fb15k-237/train.txt',
                  'valid':'/home/spai/code/ampligraph_projects/dataset/fb15k-237/valid.txt',
                  'test':'/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt'})
end = time.time()
print('Time taken:', end-start)
mr_score(ranks), mrr_score(ranks), hits_at_n_score(ranks, 1), hits_at_n_score(ranks, 10), len(ranks)

NameError: name 'ScoringBasedEmbeddingModel' is not defined

In [10]:
pred = loaded_model.predict('/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt', 
                       batch_size=100)
print(pred.shape)
print(pred)


28 triples containing invalid keys skipped!
(20438,)
[14.611029   9.369468   7.55066   ...  5.55168    0.9343734  5.4705043]


In [11]:
out = loaded_model.predict_proba('/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt', batch_size=10000)
np.argsort(out), np.argsort(loaded_model.predict('/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt')), np.sort(out)


28 triples containing invalid keys skipped!

28 triples containing invalid keys skipped!


(array([ 4066,   611, 18634, ...,   990, 10437, 14612]),
 array([ 4066,   611, 18634, ...,   990, 10437, 14612]),
 array([0.4154276 , 0.41815612, 0.42447674, ..., 0.9985715 , 0.9989201 ,
        0.9998617 ], dtype=float32))

# Training/eval with partition 

## Training with RandomEdges partitioner

In [3]:
from ampligraph.datasets import DummyBackend, SQLiteAdapter
from ampligraph.datasets import GraphDataLoader
from ampligraph.datasets.graph_partitioner import PARTITION_ALGO_REGISTRY
dataset_loader = GraphDataLoader('/home/spai/code/ampligraph_projects/dataset/fb15k-237/train.txt', 
                                 backend=SQLiteAdapter,
                                    batch_size=1000, 
                                    dataset_type='train', 
                                     use_filter=False,
                                    use_indexer=True)

In [4]:


# Choose the partitioner 
partitioner = PARTITION_ALGO_REGISTRY.get('RandomEdges')(dataset_loader, k=3)

_split: memory before: 896.0Bytes, after: 12.98MB, consumed: 12.979MB; exec time: 31.859s


In [5]:
optim = tf.optimizers.Adam(learning_rate=0.001, amsgrad=True)

partitioned_model = ScoringBasedEmbeddingModel(eta=5, 
                                     k=300, 
                                     scoring_type='TransE')
partitioned_model.compile(optimizer=optim, loss=nll)

In [6]:

start = time.time()
partitioned_model.fit(partitioner,
                     batch_size=1000, use_partitioning=True,             
                     epochs=2)
print((time.time()-start))

Epoch 1/2
Epoch 2/2
51.306065797805786


In [7]:

dataset_loader_test = GraphDataLoader('/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt', 
                                        backend=SQLiteAdapter,
                                        batch_size=400, 
                                        dataset_type='test', 
                                        use_indexer=partitioned_model.data_handler.get_mapper())


28 triples containing invalid keys skipped!


In [8]:

start = time.time()
ranks = partitioned_model.evaluate(dataset_loader_test, 
                                   batch_size=400)
end = time.time()
print('Time taken:', end-start)

mr_score(ranks), mrr_score(ranks), hits_at_n_score(ranks, 1), hits_at_n_score(ranks, 10), len(ranks)

Time taken: 136.147305727005


(1664.7270770134064, 0.08627521188313407, 0.0, 0.23722967022213523, 20438)

## Save and Load

In [13]:
partitioned_model.save_weights('./best_model')

In [14]:
loaded_part_model = ScoringBasedEmbeddingModel(eta=5, 
                                     k=300, 
                                     scoring_type='TransE')

loaded_part_model.load_weights('./best_model')

In [15]:

dataset_loader_test = GraphDataLoader('/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt', 
                                        backend=DummyBackend,
                                        batch_size=400, 
                                        dataset_type='test', 
                                        use_indexer=loaded_part_model.data_indexer)


28 triples containing invalid keys skipped!


In [16]:


start = time.time()
ranks = loaded_part_model.evaluate(dataset_loader_test, 
                                   batch_size=400)
end = time.time()
print('Time taken:', end-start)

mr_score(ranks), mrr_score(ranks), hits_at_n_score(ranks, 1), hits_at_n_score(ranks, 10), len(ranks)

Time taken: 131.47749257087708


(2613.1418925530875, 0.07781069124506454, 0.0, 0.22360309227908798, 20438)

# Training/eval with partition (default Partitioning Approach)

In [9]:
from ampligraph.datasets import DummyBackend, SQLiteAdapter
from ampligraph.datasets import GraphDataLoader
from ampligraph.datasets.graph_partitioner import PARTITION_ALGO_REGISTRY


In [10]:
optim = tf.optimizers.Adam(learning_rate=0.001, amsgrad=True)

partitioned_model = ScoringBasedEmbeddingModel(eta=5, 
                                     k=300, 
                                     scoring_type='TransE')
partitioned_model.compile(optimizer=optim, loss=nll)


start = time.time()
partitioned_model.fit('/home/spai/code/ampligraph_projects/dataset/fb15k-237/train.txt',
                     batch_size=1000, use_partitioning=True,
                     epochs=10)
print((time.time()-start))

_split: memory before: 98.973MB, after: 109.88MB, consumed: 10.909MB; exec time: 88.649s
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
221.17552399635315


In [11]:


start = time.time()
ranks = partitioned_model.evaluate('/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt', 
                                   batch_size=400)
end = time.time()
print('Time taken:', end-start)

mr_score(ranks), mrr_score(ranks), hits_at_n_score(ranks, 1), hits_at_n_score(ranks, 10), len(ranks)


28 triples containing invalid keys skipped!
Time taken: 135.5718469619751


(662.5322193952442, 0.08724452920413757, 0.0, 0.23862413151971817, 20438)

## Filtered evaluation

In [20]:
start = time.time()
ranks = partitioned_model.evaluate('/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt', 
                       batch_size=400,
                       corrupt_side='s,o',
                        use_filter={'train':'/home/spai/code/ampligraph_projects/dataset/fb15k-237/train.txt',
                              'valid':'/home/spai/code/ampligraph_projects/dataset/fb15k-237/valid.txt',
                              'test':'/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt'})
end = time.time()
print('Time taken:', end-start)
mr_score(ranks), mrr_score(ranks), hits_at_n_score(ranks, 1), hits_at_n_score(ranks, 10), len(ranks)


28 triples containing invalid keys skipped!

9 triples containing invalid keys skipped!

28 triples containing invalid keys skipped!
Time taken: 297.1006762981415


(664.6330365006361,
 0.1936046545792393,
 0.13240043057050593,
 0.3117477248263039,
 20438)