In [1]:
# Add the parent directory to the import path (presumably so that a local
# ampligraph checkout is picked up -- confirm).
import sys
sys.path.append('..')

import os
# Both variables are set before TensorFlow is imported below: expose only GPU
# index 2 to this process and raise the TensorFlow C++ log threshold.
os.environ['CUDA_VISIBLE_DEVICES'] = '2'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 
import tensorflow as tf
tf.get_logger().setLevel('ERROR')
import pandas as pd
# Turn soft device placement off and device-placement logging on.
tf.config.set_soft_device_placement(False)
tf.debugging.set_log_device_placement(True)
import numpy as np
from ampligraph.datasets import load_fb15k_237, load_yago3_10
from ampligraph.evaluation.protocol import create_mappings, to_idx

from ampligraph.evaluation.metrics import mrr_score, hits_at_n_score, mr_score
import time
print(tf.__version__)
# The notebook targets TensorFlow 2.4.x; stop immediately on any other version.
assert(tf.__version__.startswith('2.4'))

# NOTE(review): load_fb15k_237 and load_yago3_10 are already imported above.
from ampligraph.datasets import load_fb15k_237, load_fb13, load_fb15k, load_wn11, load_wn18, load_wn18rr, load_yago3_10
from ampligraph.latent_features import ScoringBasedEmbeddingModel

2.4.0


In [2]:
# Load FB15k-237; later cells index the result with the 'train' and 'valid' keys.
dataset = load_fb15k_237()

Jump to 
- [Partitioned](#Training/eval-with-partition)
- [Discovery](#Discovery)

# Train/eval without partition

### Train a random model

In [3]:
from tensorflow.keras.callbacks import ModelCheckpoint
from ampligraph.latent_features.loss_functions import NLLMulticlass

# Baseline run: a model with the 'Random' scoring type, compiled with Adam and
# the multiclass NLL loss. (A SelfAdversarialLoss used to be created here and
# was overwritten on the very next line, so it has been dropped.)
optim = tf.optimizers.Adam(learning_rate=0.001)
loss = NLLMulticlass({'reduction': 'mean'})

model = ScoringBasedEmbeddingModel(eta=2,
                                   k=50,
                                   scoring_type='Random')
model.compile(optimizer=optim, loss=loss)

# Write a checkpoint only when the monitored training loss improves.
checkpoint = ModelCheckpoint('./chkpt1', monitor='loss', verbose=1, save_best_only=True, mode='min')

dataset = load_fb15k_237()

# NOTE(review): validation_freq (25) is larger than epochs (10), so validation
# presumably never runs during this fit -- confirm that this is intended.
start = time.time()
model.fit(dataset['train'],
          batch_size=10000,
          epochs=10,
          validation_freq=25,
          validation_batch_size=100,
          validation_data=dataset['valid'],
          callbacks=[checkpoint])
end = time.time()
print('Time taken:', end-start)

Epoch 1/10

Epoch 00001: loss improved from inf to 6982.92139, saving model to ./chkpt1
Epoch 2/10

Epoch 00002: loss did not improve from 6982.92139
Epoch 3/10

Epoch 00003: loss did not improve from 6982.92139
Epoch 4/10

Epoch 00004: loss did not improve from 6982.92139
Epoch 5/10

Epoch 00005: loss did not improve from 6982.92139
Epoch 6/10

Epoch 00006: loss did not improve from 6982.92139
Epoch 7/10

Epoch 00007: loss did not improve from 6982.92139
Epoch 8/10

Epoch 00008: loss did not improve from 6982.92139
Epoch 9/10

Epoch 00009: loss did not improve from 6982.92139
Epoch 10/10

Epoch 00010: loss did not improve from 6982.92139
Time taken: 2.653794288635254


### Evaluate the random model

In [4]:
# Full evaluation (default protocol) using filters.
# The test triples and the train/valid/test filter triples are all given as
# file paths. The unused `start` timestamp was removed: no elapsed time is
# reported by this cell.
# NOTE(review): the paths are absolute and machine-specific.
ranks = model.evaluate('/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt',
                       batch_size=100,
                       corrupt_side='s,o',
                       use_filter={'train': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/train.txt',
                                   'valid': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/valid.txt',
                                   'test': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt'})
mrr_score(ranks)


28 triples containing invalid keys skipped!

9 triples containing invalid keys skipped!

28 triples containing invalid keys skipped!


0.0005047810301817834

### Train a TransE model

In [5]:
from tensorflow.keras.callbacks import ModelCheckpoint
from ampligraph.latent_features.loss_functions import NLLMulticlass

# TransE model compiled with Adam and the multiclass NLL loss. (A
# SelfAdversarialLoss used to be created here and was overwritten on the very
# next line, so it has been dropped.)
optim = tf.optimizers.Adam(learning_rate=0.001)
loss = NLLMulticlass({'reduction': 'mean'})

# Training logs for TensorBoard go to ./transe_logs.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='./transe_logs')

model = ScoringBasedEmbeddingModel(eta=2,
                                   k=50,
                                   scoring_type='TransE')
model.compile(optimizer=optim, loss=loss)

# Write a checkpoint only when the monitored training loss improves.
# NOTE(review): './chkpt1' is the same path the Random-model cell writes to.
checkpoint = ModelCheckpoint('./chkpt1', monitor='loss', verbose=1, save_best_only=True, mode='min')

dataset = load_fb15k_237()

# NOTE(review): validation_freq (25) is larger than epochs (10), so validation
# presumably never runs during this fit -- confirm that this is intended.
start = time.time()
model.fit(dataset['train'],
          batch_size=10000,
          epochs=10,
          validation_freq=25,
          validation_batch_size=100,
          validation_data=dataset['valid'],
          callbacks=[checkpoint, tensorboard_callback])
end = time.time()
print('Time taken:', end-start)

Epoch 1/10

Epoch 00001: loss improved from inf to 6643.76416, saving model to ./chkpt1
Epoch 2/10

Epoch 00002: loss improved from 6643.76416 to 6524.13721, saving model to ./chkpt1
Epoch 3/10

Epoch 00003: loss improved from 6524.13721 to 6375.65381, saving model to ./chkpt1
Epoch 4/10

Epoch 00004: loss improved from 6375.65381 to 6191.37549, saving model to ./chkpt1
Epoch 5/10

Epoch 00005: loss improved from 6191.37549 to 5990.84033, saving model to ./chkpt1
Epoch 6/10

Epoch 00006: loss improved from 5990.84033 to 5789.72119, saving model to ./chkpt1
Epoch 7/10

Epoch 00007: loss improved from 5789.72119 to 5595.84326, saving model to ./chkpt1
Epoch 8/10

Epoch 00008: loss improved from 5595.84326 to 5412.95654, saving model to ./chkpt1
Epoch 9/10

Epoch 00009: loss improved from 5412.95654 to 5240.85889, saving model to ./chkpt1
Epoch 10/10

Epoch 00010: loss improved from 5240.85889 to 5080.15381, saving model to ./chkpt1
Time taken: 4.028819561004639


### Visualize the embeddings

In [6]:
from ampligraph.utils import create_tensorboard_visualizations

In [7]:
# Write TensorBoard visualisation files for five selected entities, labelled
# ent1..ent5, under ./small_embeddings_vis.
create_tensorboard_visualizations(model, 
                                  entities_subset=['/m/027rn', '/m/06cx9', '/m/017dcd', '/m/06v8s0', '/m/07s9rl0'], 
                                  labels=['ent1', 'ent2', 'ent3', 'ent4', 'ent5'],
                                  loc = './small_embeddings_vis')

In [8]:
# Same export with entities_subset='all', written to ./full_embeddings_vis.
create_tensorboard_visualizations(model, 
                                  entities_subset='all',
                                  loc = './full_embeddings_vis')

In [9]:
# the embeddings can be visualised using the following command:
# tensorboard --logdir='./full_embeddings_vis' --port=8891 
# open the browser and go to the following URL: http://127.0.0.1:8891/#projector

### Evaluate

In [10]:
# Filtered evaluation of three hand-picked triples that share the same
# relation and object; only the training split is passed as the filter.
ranks = model.evaluate(
    np.array([[subject, '/location/country/form_of_government', '/m/02lkcc']
              for subject in ('/m/01cr28', '/m/07tw_b', '/m/073tm9')]),
    use_filter=dict(train=dataset['train']),
    corrupt_side='s,o',
    verbose=True)



In [11]:
# Display the ranks: in the recorded output there is one row per evaluated
# triple and two columns (the evaluation above used corrupt_side='s,o').
ranks

array([[ 5080, 11170],
       [12839, 12575],
       [ 7409,  9715]], dtype=int32)

In [12]:
# Evaluate using filters given as file names instead of numpy arrays.
# Corruptions are generated from the four entities in entities_subset only,
# which is why the resulting MRR is very high.
# The unused `start` timestamp was removed: no elapsed time is reported here.
ranks = model.evaluate('/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt',
                       batch_size=100,
                       corrupt_side='s,o',
                       entities_subset=['/m/08966', '/m/05lf_', '/m/0f8l9c', '/m/04ghz4m'],
                       use_filter={'train': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/train.txt',
                                   'valid': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/valid.txt',
                                   'test': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt'})
mrr_score(ranks) # will give very high mrr


28 triples containing invalid keys skipped!

9 triples containing invalid keys skipped!

28 triples containing invalid keys skipped!


0.9297122190690545

In [13]:
# Full evaluation (default protocol) of the TransE model using filters.
# The unused `start` timestamp was removed: no elapsed time is reported here.
ranks = model.evaluate('/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt',
                       batch_size=100,
                       corrupt_side='s,o',
                       use_filter={'train': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/train.txt',
                                   'valid': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/valid.txt',
                                   'test': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt'})
mrr_score(ranks)


28 triples containing invalid keys skipped!

9 triples containing invalid keys skipped!

28 triples containing invalid keys skipped!


0.17401659700409594

In [14]:
# Sanity check for entities_subset: pass all values of the indexer's
# entities_dict (presumably every known entity) and compare the MRR with the
# full evaluation above; the recorded outputs are identical.
start = time.time()
ranks = model.evaluate(
    '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt',
    batch_size=100,
    corrupt_side='s,o',
    entities_subset=[*model.data_indexer.backend.entities_dict.values()],
    use_filter=dict(
        train='/home/spai/code/ampligraph_projects/dataset/fb15k-237/train.txt',
        valid='/home/spai/code/ampligraph_projects/dataset/fb15k-237/valid.txt',
        test='/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt'))
end = time.time()
print('Time taken:', end - start)
mrr_score(ranks)


28 triples containing invalid keys skipped!

9 triples containing invalid keys skipped!

28 triples containing invalid keys skipped!
Time taken: 61.74235558509827


0.17401659700409594

In [15]:
# Seed NumPy's global RNG, then score the triples of the test file with the
# trained model.
np.random.seed(0)
pred = model.predict('/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt', 
                       batch_size=100)
# Print the shape of the score array and the scores in ascending order.
print(pred.shape)
print(np.sort(pred))


28 triples containing invalid keys skipped!
(20438,)
[-1.9735914e+00 -1.9564396e+00 -1.9247011e+00 ... -7.0771109e-04
 -7.0771069e-04 -7.0771034e-04]


### Model calibration

In [16]:
# Calibrate on the test set.
# positive_base_rate=0.9 is supplied explicitly; presumably this is required
# because no negative triples are passed here -- confirm against calibrate().
model.calibrate('/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt', 
                batch_size=10000, positive_base_rate=0.9, epochs=100)


28 triples containing invalid keys skipped!


In [17]:
# Check whether the ordering of the calibrated probabilities matches the
# ordering of the raw scores. Calibration is expected to leave the ranking
# unchanged and only rescale the range of the scores.
# NOTE(review): in the recorded output the two argsort results are not
# identical -- index 1919 is last in argsort(out) but first in
# argsort(pred_out) -- so the order looks reversed; confirm the direction of
# the calibration.
out = model.predict_proba('/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt', batch_size=10000)
print(np.sort(out))
print(np.argsort(out))
pred_out = model.predict('/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt')
print(np.sort(pred_out))
print(np.argsort(pred_out))



28 triples containing invalid keys skipped!
[0.5706937 0.5706937 0.5706937 ... 0.664212  0.6656726 0.6664607]
[ 3581 18100   105 ... 18952 10367  1919]

28 triples containing invalid keys skipped!
[-1.9735914e+00 -1.9564396e+00 -1.9247011e+00 ... -7.0771109e-04
 -7.0771069e-04 -7.0771034e-04]
[ 1919 10367 18952 ... 11458  1247 17056]


In [18]:
# Re-run the filtered evaluation after calibration; the metrics are expected
# to be the same as before calibrating.
start = time.time()
ranks = model.evaluate(
    '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt',
    batch_size=100,
    corrupt_side='s,o',
    use_filter=dict(
        train='/home/spai/code/ampligraph_projects/dataset/fb15k-237/train.txt',
        valid='/home/spai/code/ampligraph_projects/dataset/fb15k-237/valid.txt',
        test='/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt'))
end = time.time()
print('Time taken:', end - start)
# MR, MRR, Hits@1, Hits@10 and the number of rank rows.
(mr_score(ranks),
 mrr_score(ranks),
 *(hits_at_n_score(ranks, n) for n in (1, 10)),
 len(ranks))


28 triples containing invalid keys skipped!

9 triples containing invalid keys skipped!

28 triples containing invalid keys skipped!
Time taken: 61.1954927444458


(541.0079264115863,
 0.17401659700409594,
 0.11341618553674528,
 0.2935952637244349,
 20438)

In [19]:
# Save the weights of the calibrated TransE model; they are loaded again in
# the "Load Checkpoint and evaluate" section.
model.save_weights('./calibrated_model')

## Load Checkpoint and evaluate

In [20]:
# The reloaded model should return exactly the same results as the original.
# The timer also covers building the model and loading the weights.
start = time.time()
loaded_model = ScoringBasedEmbeddingModel(eta=2, k=50, scoring_type='TransE')
loaded_model.load_weights('./calibrated_model')
ranks = loaded_model.evaluate(
    '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt',
    batch_size=100,
    corrupt_side='s,o',
    use_filter=dict(
        train='/home/spai/code/ampligraph_projects/dataset/fb15k-237/train.txt',
        valid='/home/spai/code/ampligraph_projects/dataset/fb15k-237/valid.txt',
        test='/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt'))
end = time.time()
print('Time taken:', end - start)
# MR, MRR, Hits@1, Hits@10 and the number of rank rows.
(mr_score(ranks),
 mrr_score(ranks),
 *(hits_at_n_score(ranks, n) for n in (1, 10)),
 len(ranks))


28 triples containing invalid keys skipped!

9 triples containing invalid keys skipped!

28 triples containing invalid keys skipped!
Time taken: 60.51377248764038


(541.0079264115863,
 0.17401659700409594,
 0.11341618553674528,
 0.2935952637244349,
 20438)

In [21]:
# Score the test file with the reloaded model and print the scores as
# returned (not sorted).
pred = loaded_model.predict('/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt', 
                       batch_size=100)
print(pred.shape)
print(pred)


28 triples containing invalid keys skipped!
(20438,)
[-0.11748545 -0.05667515 -0.00765535 ... -0.10156396 -1.5409082
 -0.01155617]


In [22]:
# The sort order of the calibrated probabilities is expected to match the
# sort order of the regular predict scores for the reloaded model as well.
# NOTE(review): in the recorded output the two argsort arrays differ -- index
# 1919 is last in the first array and first in the second -- so the order
# looks reversed; confirm the direction of the calibration.
out = loaded_model.predict_proba('/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt', batch_size=10000)
np.argsort(out), np.argsort(loaded_model.predict('/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt')), np.sort(out)


28 triples containing invalid keys skipped!

28 triples containing invalid keys skipped!


(array([ 3581, 18100,   105, ..., 18952, 10367,  1919]),
 array([ 1919, 10367, 18952, ..., 11458,  1247, 17056]),
 array([0.5706937, 0.5706937, 0.5706937, ..., 0.664212 , 0.6656726,
        0.6664607], dtype=float32))

# Training/eval with partition 

## Training with RandomEdges partitioner

In [23]:
from ampligraph.datasets import DummyBackend, GraphDataLoader, SQLiteAdapter
from ampligraph.datasets.graph_partitioner import PARTITION_ALGO_REGISTRY

# Loader over the training file with the SQLiteAdapter backend, unfiltered,
# in batches of 1000; use_indexer=True is passed through unchanged.
dataset_loader = GraphDataLoader(
    '/home/spai/code/ampligraph_projects/dataset/fb15k-237/train.txt',
    backend=SQLiteAdapter,
    batch_size=1000,
    dataset_type='train',
    use_filter=False,
    use_indexer=True)

In [24]:
# Choose the partitioner: look up 'RandomEdges' in the registry and apply it
# to the training loader with k=3 (presumably the number of partitions --
# confirm).
partitioner = PARTITION_ALGO_REGISTRY.get('RandomEdges')(dataset_loader, k=3)


_split: memory before: 896.0Bytes, after: 12.929MB, consumed: 12.928MB; exec time: 31.597s


In [25]:

# TransE model for the partitioned run, compiled with Adam (amsgrad=True) and
# the 'multiclass_nll' loss.
optim = tf.optimizers.Adam(learning_rate=0.001, amsgrad=True)

partitioned_model = ScoringBasedEmbeddingModel(eta=2, k=50, scoring_type='TransE')
partitioned_model.compile(optimizer=optim, loss='multiclass_nll')

In [26]:

# Train on the partitioner output and print the elapsed wall-clock seconds.
start = time.time()
partitioned_model.fit(
    partitioner,
    batch_size=1000,
    use_partitioning=True,
    epochs=10)
print(time.time() - start)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
183.84469532966614


In [27]:

# Loader over the test file with the SQLiteAdapter backend. The mapper
# obtained from the trained model's data handler is passed as use_indexer
# (presumably so test triples use the training-time indices -- confirm).
dataset_loader_test = GraphDataLoader('/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt', 
                                        backend=SQLiteAdapter,
                                        batch_size=400, 
                                        dataset_type='test', 
                                        use_indexer=partitioned_model.data_handler.get_mapper())


28 triples containing invalid keys skipped!


In [28]:

# Evaluate the partition-trained model on the test loader (no use_filter is
# passed) and time the call.
start = time.time()
ranks = partitioned_model.evaluate(dataset_loader_test, batch_size=400)
end = time.time()
print('Time taken:', end - start)

# MR, MRR, Hits@1, Hits@10 and the number of rank rows.
(mr_score(ranks),
 mrr_score(ranks),
 *(hits_at_n_score(ranks, n) for n in (1, 10)),
 len(ranks))

Time taken: 121.93693733215332


(990.8508904980918, 0.08935615089071472, 0.0, 0.2461591153733242, 20438)

## Save and Load

In [29]:
# Save the weights of the partition-trained model to ./best_model.
partitioned_model.save_weights('./best_model')

In [30]:
# Build a model with the same constructor arguments as the partition-trained
# model, then load the weights saved to ./best_model.
loaded_part_model = ScoringBasedEmbeddingModel(eta=2, 
                                     k=50, 
                                     scoring_type='TransE')

loaded_part_model.load_weights('./best_model')

In [31]:

# Test loader for the reloaded model. Unlike the earlier test loader it uses
# the DummyBackend and takes the indexer from loaded_part_model.data_indexer.
dataset_loader_test = GraphDataLoader('/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt', 
                                        backend=DummyBackend,
                                        batch_size=400, 
                                        dataset_type='test', 
                                        use_indexer=loaded_part_model.data_indexer)


28 triples containing invalid keys skipped!


In [32]:


# Evaluate the reloaded model on the test loader and time the call; the
# recorded metrics equal those of the model before saving.
start = time.time()
ranks = loaded_part_model.evaluate(dataset_loader_test, batch_size=400)
end = time.time()
print('Time taken:', end - start)

# MR, MRR, Hits@1, Hits@10 and the number of rank rows.
(mr_score(ranks),
 mrr_score(ranks),
 *(hits_at_n_score(ranks, n) for n in (1, 10)),
 len(ranks))

Time taken: 122.09048652648926


(990.8508904980918, 0.08935615089071472, 0.0, 0.2461591153733242, 20438)

# Training/eval with partition (default Partitioning Approach)

In [33]:
from ampligraph.datasets import DummyBackend, SQLiteAdapter
from ampligraph.datasets import GraphDataLoader
from ampligraph.datasets.graph_partitioner import PARTITION_ALGO_REGISTRY


In [34]:
# Fresh TransE model for the default partitioning approach, compiled with
# Adam (learning rate 0.0001, amsgrad=True) and the 'multiclass_nll' loss.
optim = tf.optimizers.Adam(learning_rate=0.0001, amsgrad=True)

partitioned_model = ScoringBasedEmbeddingModel(eta=2, k=50, scoring_type='TransE')
partitioned_model.compile(optimizer=optim, loss='multiclass_nll')




In [35]:
# Train straight from the training file with use_partitioning=True, logging
# to ./logs_split for TensorBoard, and print the elapsed wall-clock seconds.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='./logs_split')
start = time.time()
partitioned_model.fit(
    '/home/spai/code/ampligraph_projects/dataset/fb15k-237/train.txt',
    batch_size=10000,
    use_partitioning=True,
    epochs=100,
    callbacks=[tensorboard_callback])
print(time.time() - start)

_split: memory before: 47.273MB, after: 59.971MB, consumed: 12.698MB; exec time: 85.994s
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/1

Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
770.9260272979736


In [36]:
# Number of distinct values in the first and third columns of the training
# triples.
len(set(dataset['train'][:, [0, 2]].ravel()))

14505

In [37]:


# Evaluate on the test file without passing use_filter and time the call.
start = time.time()
ranks = partitioned_model.evaluate(
    '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt',
    batch_size=400)
end = time.time()
print('Time taken:', end - start)

# MR, MRR, Hits@1, Hits@10 and the number of rank rows.
(mr_score(ranks),
 mrr_score(ranks),
 *(hits_at_n_score(ranks, n) for n in (1, 10)),
 len(ranks))


28 triples containing invalid keys skipped!
Time taken: 123.25823903083801


(1158.5643898620217, 0.08530392588433558, 0.0, 0.22668558567374497, 20438)

## Filtered evaluation

In [38]:
# Same evaluation with the train/valid/test files passed as filters.
start = time.time()
ranks = partitioned_model.evaluate(
    '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt',
    batch_size=400,
    corrupt_side='s,o',
    use_filter=dict(
        train='/home/spai/code/ampligraph_projects/dataset/fb15k-237/train.txt',
        valid='/home/spai/code/ampligraph_projects/dataset/fb15k-237/valid.txt',
        test='/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt'))
end = time.time()
print('Time taken:', end - start)
# MR, MRR, Hits@1, Hits@10 and the number of rank rows.
(mr_score(ranks),
 mrr_score(ranks),
 *(hits_at_n_score(ranks, n) for n in (1, 10)),
 len(ranks))


28 triples containing invalid keys skipped!

9 triples containing invalid keys skipped!

28 triples containing invalid keys skipped!
Time taken: 323.94514298439026


(1032.1129024366376,
 0.18253165803137616,
 0.12374009198551718,
 0.3002984636461493,
 20438)

## Random model with partitioning

In [39]:
from ampligraph.datasets import DummyBackend, GraphDataLoader, SQLiteAdapter
from ampligraph.datasets.graph_partitioner import PARTITION_ALGO_REGISTRY

# Baseline for the partitioned setting: a model with the 'Random' scoring
# type, trained from the training file with use_partitioning=True.
optim = tf.optimizers.Adam(learning_rate=0.0001, amsgrad=True)

partitioned_model = ScoringBasedEmbeddingModel(eta=2, k=50, scoring_type='Random')
partitioned_model.compile(optimizer=optim, loss='multiclass_nll')

start = time.time()
partitioned_model.fit(
    '/home/spai/code/ampligraph_projects/dataset/fb15k-237/train.txt',
    batch_size=10000,
    use_partitioning=True,
    epochs=10,
    callbacks=[])
print(time.time() - start)

_split: memory before: 98.814MB, after: 111.52MB, consumed: 12.703MB; exec time: 87.679s
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
162.06040239334106


In [40]:
# Evaluate the 'Random' partitioned model on the test file (no use_filter)
# and time the call.
start = time.time()
ranks = partitioned_model.evaluate(
    '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt',
    batch_size=400)
end = time.time()
print('Time taken:', end - start)

# MR, MRR, Hits@1, Hits@10 and the number of rank rows.
(mr_score(ranks),
 mrr_score(ranks),
 *(hits_at_n_score(ranks, n) for n in (1, 10)),
 len(ranks))


28 triples containing invalid keys skipped!
Time taken: 125.02911829948425


(7279.448282610823, 0.0005559095188126157, 0.0, 0.0004892846658185732, 20438)

# Discovery

### Discover Facts

In [50]:
from ampligraph.discovery import discover_facts

# ComplEx model trained on the FB15k-237 training split.
model = ScoringBasedEmbeddingModel(eta=5, k=300, scoring_type='ComplEx')
model.compile(optimizer='adam', loss='multiclass_nll')

start = time.time()
model.fit(
    dataset['train'],
    batch_size=10000,
    epochs=10,
    validation_freq=50,
    validation_batch_size=100,
    validation_data=dataset['valid'])
end = time.time()
print('Time taken:', end - start)

# Look for candidate facts for one target relation, starting from the first
# 100 training triples; seed=0 is passed to discover_facts.
discover_facts(
    dataset['train'][:100],
    model,
    top_n=100,
    strategy='random_uniform',
    max_candidates=100,
    target_rel='/location/country/form_of_government',
    seed=0)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Time taken: 8.281585216522217


(array([['/m/0fvf9q', '/location/country/form_of_government', '/m/0m313']],
       dtype=object),
 array([36.]))

### Find Clusters

In [51]:
import requests
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns


from ampligraph.datasets import load_from_csv
from ampligraph.discovery import find_clusters

# International football matches triples.
# See the AmpliGraph clustering tutorial to understand how the triples are
# created from a tabular dataset.
url = 'https://ampligraph.s3-eu-west-1.amazonaws.com/datasets/football.csv'
response = requests.get(url)
# Stop on an HTTP error instead of saving an error page as the CSV.
response.raise_for_status()
# The context manager closes the file handle even if the write fails.
with open('football.csv', 'wb') as csv_file:
    csv_file.write(response.content)
# Drop the first CSV column; the remaining three are used as (s, p, o) below.
X = load_from_csv('.', 'football.csv', sep=',')[:, 1:]

model = ScoringBasedEmbeddingModel(eta=5,
                                   k=300,
                                   scoring_type='ComplEx')
model.compile(optimizer='adam', loss='multiclass_nll')

model.fit(X,
          batch_size=10000,
          epochs=10)

df = pd.DataFrame(X, columns=["s", "p", "o"])
# Team entities: every distinct subject or object whose label starts with "Team".
teams = np.unique(np.concatenate((df.s[df.s.str.startswith("Team")],
                                  df.o[df.o.str.startswith("Team")])))
team_embeddings = model.get_embeddings(teams, embedding_type='e')

# 2-D PCA projection of the team embeddings (not used further in this cell).
embeddings_2d = PCA(n_components=2).fit_transform(np.asarray(team_embeddings))

# Find clusters of embeddings using KMeans
kmeans = KMeans(n_clusters=6, n_init=100, max_iter=500)
clusters = find_clusters(teams, model, kmeans, mode='e')
# Cluster labels together with the number of teams assigned to each.
print(np.unique(clusters, return_counts=True))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
(array([0, 1, 2, 3, 4, 5], dtype=int32), array([ 56,   7,  25, 154,  22,  49]))


### Find Duplicates

In [52]:
import re
import tarfile

import numpy as np
import pandas as pd
import requests

from ampligraph.discovery import find_duplicates

# The IMDB dataset used here is part of the Movies5 dataset found on:
# The Magellan Data Repository (https://sites.google.com/site/anhaidgroup/projects/data)
url = 'http://pages.cs.wisc.edu/~anhai/data/784_data/movies5.tar.gz'
response = requests.get(url)
# Stop on an HTTP error instead of saving an error page as the archive.
response.raise_for_status()
# Context managers close the file and the archive even when an error occurs.
with open('movies5.tar.gz', 'wb') as archive_file:
    archive_file.write(response.content)
# NOTE(review): extractall() trusts the member paths stored in the downloaded
# archive; only run this against a source you trust.
with tarfile.open('movies5.tar.gz', "r:gz") as tar:
    tar.extractall()

# Reading tabular dataset of IMDB movies and filling the missing values
imdb = pd.read_csv("movies5/csv_files/imdb.csv")
imdb["directors"] = imdb["directors"].fillna("UnknownDirector")
imdb["actors"] = imdb["actors"].fillna("UnknownActor")
imdb["genre"] = imdb["genre"].fillna("UnknownGenre")
imdb["duration"] = imdb["duration"].fillna("0")

# Creating knowledge graph triples from tabular dataset
imdb_triples = []

for _, row in imdb.iterrows():
    movie_id = "ID" + str(row["id"])
    directors = row["directors"].split(",")
    actors = row["actors"].split(",")
    genres = row["genre"].split(",")
    # Keep only the digits of the duration (raw string avoids the invalid
    # "\D" escape) and fall back to "0" when no digit is present, matching
    # the fillna("0") default above; then bucket the value in steps of 30.
    duration_digits = re.sub(r"\D", "", row["duration"]) or "0"
    duration = "Duration" + str(int(duration_digits) // 30)

    # One triple per director, actor and genre, plus one duration triple.
    imdb_triples.extend((movie_id, "hasDirector", d) for d in directors)
    imdb_triples.extend((movie_id, "hasActor", a) for a in actors)
    imdb_triples.extend((movie_id, "hasGenre", g) for g in genres)
    imdb_triples.append((movie_id, "hasDuration", duration))

# Training knowledge graph embedding with ComplEx model
model = ScoringBasedEmbeddingModel(eta=5,
                                   k=300,
                                   scoring_type='ComplEx')
model.compile(optimizer='adam', loss='multiclass_nll')
imdb_triples = np.array(imdb_triples)
model.fit(imdb_triples,
          batch_size=10000,
          epochs=20)

# Finding duplicate movies (entities)
entities = np.unique(imdb_triples[:, 0])
dups, _ = find_duplicates(entities, model, mode='e', tolerance=0.45)
# Strip the "ID" prefix added above to recover the numeric ids.
id_list = [int(entity[2:]) for group in dups for entity in group]
# NOTE(review): iloc is positional, so this lookup assumes each movie's `id`
# equals its row position in imdb.csv -- confirm.
print(imdb.iloc[id_list[:6]][['movie_name', 'year']])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
                                             movie_name  year
2259         Pirates of the Caribbean: Dead Man's Chest  2006
2101           Pirates of the Caribbean: At World's End  2007
125   Rod the Stormtrooper: Episode V - The Hidden D...  2015
126   Rod the Stormtrooper: Episode V - The Hidden D...  2015
6820                       Tarzan and the Green Goddess  1938
6819                       Tarzan and the Green Goddess  1938


### Query TopN

In [53]:
import requests
from ampligraph.datasets import load_from_csv
from ampligraph.discovery import discover_facts
from ampligraph.discovery import query_topn

# Game of Thrones relations dataset
url = 'https://ampligraph.s3-eu-west-1.amazonaws.com/datasets/GoT.csv'
response = requests.get(url)
# Stop on an HTTP error instead of saving an error page as the CSV.
response.raise_for_status()
# The context manager closes the file handle even if the write fails.
with open('GoT.csv', 'wb') as csv_file:
    csv_file.write(response.content)
X = load_from_csv('.', 'GoT.csv', sep=',')

model = ScoringBasedEmbeddingModel(eta=5,
                                   k=150,
                                   scoring_type='TransE')
model.compile(optimizer='adagrad', loss='pairwise')
model.fit(X,
          batch_size=100,
          epochs=20)

# Head and relation are fixed and tail is None, so the query asks for the top
# five completions of the tail; no candidate restriction is passed.
query_topn(model, top_n=5,
           head='Eddard Stark', relation='ALLIED_WITH', tail=None,
           ents_to_consider=None, rels_to_consider=None)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


(array([['Eddard Stark', 'ALLIED_WITH', 'House Frey of the Crossing'],
        ['Eddard Stark', 'ALLIED_WITH', 'House Stark of Winterfell'],
        ['Eddard Stark', 'ALLIED_WITH',
         "House Targaryen of King's Landing"],
        ['Eddard Stark', 'ALLIED_WITH',
         'House Lannister of Casterly Rock'],
        ['Eddard Stark', 'ALLIED_WITH', 'House Greyjoy of Pyke']],
       dtype='<U44'),
 array([-1.0268682, -1.051951 , -1.0546845, -1.1503534, -1.1699506],
       dtype=float32))