In [1]:
import sys
# Put the parent directory on the import path
# (presumably so a local ampligraph checkout is importable — TODO confirm).
sys.path.append('..')

import os
# Set before TensorFlow is imported so the device mask is in place when TF starts up.
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

import tensorflow as tf
# Restrict the TensorFlow logger to ERROR records.
tf.get_logger().setLevel('ERROR')
import pandas as pd
# Turn soft device placement off and log the device chosen for each op.
tf.config.set_soft_device_placement(False)
tf.debugging.set_log_device_placement(True)
import numpy as np
from ampligraph.datasets import load_fb15k_237, load_yago3_10
from ampligraph.evaluation.protocol import create_mappings, to_idx

from ampligraph.evaluation.metrics import mrr_score, hits_at_n_score, mr_score
import time
print(tf.__version__)
# Fail fast when the installed TensorFlow is not a 2.4.x release.
assert(tf.__version__.startswith('2.4'))

# NOTE: load_fb15k_237 and load_yago3_10 were already imported above; the repetition is harmless.
from ampligraph.datasets import load_fb15k_237, load_fb13, load_fb15k, load_wn11, load_wn18, load_wn18rr, load_yago3_10
from ampligraph.latent_features import ScoringBasedEmbeddingModel


2.4.0


In [2]:
# Load FB15k-237; later cells index the result with the 'train', 'valid' and 'test' keys.
dataset = load_fb15k_237()

Jump to:
- [Partitioned](#Training/eval-with-partition)
- [Discovery](#Discovery)

# Train/eval without partition

### Train a random model

In [3]:
from tensorflow.keras.callbacks import ModelCheckpoint
from ampligraph.latent_features.loss_functions import SelfAdversarialLoss, NLLMulticlass

# Baseline run: a model with the 'Random' scoring type, trained for a single epoch.
optim = tf.optimizers.Adam(learning_rate=0.001)

# Multiclass NLL is the loss actually used for training.
# To try the self-adversarial loss instead, swap in:
#   SelfAdversarialLoss({'margin': 0.1, 'alpha': 5, 'reduction': 'sum'})
loss = NLLMulticlass({'reduction': 'mean'})

model = ScoringBasedEmbeddingModel(eta=2,
                                   k=50,
                                   scoring_type='Random')
model.compile(optimizer=optim, loss=loss)

# Keep only the weights with the lowest training loss seen so far.
checkpoint = ModelCheckpoint('./chkpt1', monitor='loss', verbose=1, save_best_only=True, mode='min')

dataset = load_fb15k_237()

start = time.time()
# NOTE(review): validation_freq (25) exceeds epochs (1), so validation presumably never runs here — confirm.
model.fit(dataset['train'],
          batch_size=10000,
          epochs=1,
          validation_freq=25,
          validation_batch_size=100,
          validation_data=dataset['valid'],
          callbacks=[checkpoint])

end = time.time()
print('Time taken:', end-start)


Epoch 00001: loss improved from inf to 6982.92139, saving model to ./chkpt1
Time taken: 13.374548196792603


### Evaluate the random model

In [4]:
# Evaluate the random baseline on the test file, corrupting both subject and
# object, with the train/valid/test files passed as filters.
start = time.time()
test_triples_file = '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt'
known_triples = {
    'train': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/train.txt',
    'valid': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/valid.txt',
    'test': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt',
}
ranks = model.evaluate(test_triples_file,
                       batch_size=100,
                       corrupt_side='s,o',
                       use_filter=known_triples)
mrr_score(ranks)


28 triples containing invalid keys skipped!

9 triples containing invalid keys skipped!


0.0005052450328641726

### Train a model

In [5]:
from tensorflow.keras.callbacks import ModelCheckpoint
from ampligraph.latent_features.loss_functions import SelfAdversarialLoss, NLLMulticlass

# Train a DistMult model for 10 epochs with L2 regularisation on the embeddings.
optim = tf.optimizers.Adam(learning_rate=0.001)

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='./distmult_logs')

# Multiclass NLL is the loss actually used for training.
# To try the self-adversarial loss instead, swap in:
#   SelfAdversarialLoss({'margin': 0.1, 'alpha': 5, 'reduction': 'mean'})
loss = NLLMulticlass({'reduction': 'mean'})

model = ScoringBasedEmbeddingModel(eta=30,
                                   k=350,
                                   scoring_type='DistMult')
model.compile(optimizer=optim,
              loss=loss,
              entity_relation_regularizer=tf.keras.regularizers.L2(0.0001))

# Keep only the weights with the lowest training loss seen so far.
# NOTE: this reuses the './chkpt1' path that the random-model cell also writes to.
checkpoint = ModelCheckpoint('./chkpt1', monitor='loss', verbose=1, save_best_only=True, mode='min')

dataset = load_fb15k_237()

start = time.time()
# NOTE(review): validation_freq (50) exceeds epochs (10), so validation presumably never runs here — confirm.
model.fit(dataset['train'],
          batch_size=10000,
          epochs=10,
          validation_freq=50,
          validation_batch_size=100,
          validation_data=dataset['valid'],
          callbacks=[checkpoint, tensorboard_callback])

end = time.time()
print('Time taken:', end-start)

# the training can be visualised using the following command:
# tensorboard --logdir='./distmult_logs' --port=8891 

Epoch 1/10

Epoch 00001: loss improved from inf to 6736.08936, saving model to ./chkpt1
Epoch 2/10

Epoch 00002: loss improved from 6736.08936 to 6730.60938, saving model to ./chkpt1
Epoch 3/10

Epoch 00003: loss improved from 6730.60938 to 6694.96289, saving model to ./chkpt1
Epoch 4/10

Epoch 00004: loss improved from 6694.96289 to 6556.94189, saving model to ./chkpt1
Epoch 5/10

Epoch 00005: loss improved from 6556.94189 to 6240.37549, saving model to ./chkpt1
Epoch 6/10

Epoch 00006: loss improved from 6240.37549 to 5774.27100, saving model to ./chkpt1
Epoch 7/10

Epoch 00007: loss improved from 5774.27100 to 5272.26270, saving model to ./chkpt1
Epoch 8/10

Epoch 00008: loss improved from 5272.26270 to 4811.17578, saving model to ./chkpt1
Epoch 9/10

Epoch 00009: loss improved from 4811.17578 to 4411.64844, saving model to ./chkpt1
Epoch 10/10

Epoch 00010: loss improved from 4411.64844 to 4070.07446, saving model to ./chkpt1
Time taken: 25.957428216934204


In [6]:
# Evaluate the trained model on the test file, corrupting both subject and
# object, with the train/valid/test files passed as filters.
start = time.time()
test_triples_file = '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt'
known_triples = {
    'train': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/train.txt',
    'valid': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/valid.txt',
    'test': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt',
}
ranks = model.evaluate(test_triples_file,
                       batch_size=100,
                       corrupt_side='s,o',
                       use_filter=known_triples)

mrr_score(ranks), hits_at_n_score(ranks, 1)


28 triples containing invalid keys skipped!

9 triples containing invalid keys skipped!


(0.2602269005785717, 0.18793424014091398)

### Visualize the embeddings

In [7]:
from ampligraph.utils import create_tensorboard_visualizations

In [8]:
# Export TensorBoard projector files for five entities, each with a short display label.
subset_entities = ['/m/027rn', '/m/06cx9', '/m/017dcd', '/m/06v8s0', '/m/07s9rl0']
subset_labels = ['ent1', 'ent2', 'ent3', 'ent4', 'ent5']
create_tensorboard_visualizations(model,
                                  entities_subset=subset_entities,
                                  labels=subset_labels,
                                  loc='./small_embeddings_vis')

In [9]:
# Export TensorBoard projector files with entities_subset set to 'all'.
create_tensorboard_visualizations(model, entities_subset='all', loc='./full_embeddings_vis')

In [10]:
# the embeddings can be visualised using the following command:
# tensorboard --logdir='./full_embeddings_vis' --port=8891 
# open the browser and go to the following URL: http://127.0.0.1:8891/#projector

### Evaluate

In [11]:
# Filtered evaluation of three in-memory triples; the filters are the
# train and test arrays of the loaded dataset.
query_triples = np.array([
    ['/m/01cr28', '/location/country/form_of_government', '/m/02lkcc'],
    ['/m/07tw_b', '/location/country/form_of_government', '/m/02lkcc'],
    ['/m/073tm9', '/location/country/form_of_government', '/m/02lkcc'],
])
known_triples = {'train': dataset['train'],
                 'test': dataset['test']}
ranks = model.evaluate(query_triples,
                       batch_size=3,
                       use_filter=known_triples,
                       corrupt_side='s,o',
                       verbose=True)

mrr_score(ranks), hits_at_n_score(ranks, 10)



(0.00023974997689174383, 0.0)

In [12]:
# Filtered evaluation where the filters are given as file names rather than
# numpy arrays, and corruptions are drawn from a four-entity subset only.
start = time.time()
test_triples_file = '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt'
corruption_entities = ['/m/08966', '/m/05lf_', '/m/0f8l9c', '/m/04ghz4m']
known_triples = {
    'train': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/train.txt',
    'valid': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/valid.txt',
    'test': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt',
}
ranks = model.evaluate(test_triples_file,
                       batch_size=500,
                       corrupt_side='s,o',
                       entities_subset=corruption_entities,
                       use_filter=known_triples)
# With so few candidate corruptions the MRR comes out very high.
mrr_score(ranks), hits_at_n_score(ranks, 1)


28 triples containing invalid keys skipped!

9 triples containing invalid keys skipped!


(0.9626798121146883, 0.9292739015559253)

In [13]:
# Evaluation on the full test file WITHOUT filters: no use_filter argument is
# passed here, unlike the filtered runs in the neighbouring cells.
start = time.time()
test_triples_file = '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt'
ranks = model.evaluate(test_triples_file, batch_size=100, corrupt_side='s,o')

mrr_score(ranks), hits_at_n_score(ranks, 1)


28 triples containing invalid keys skipped!


(0.10250035472836136, 0.0)

In [14]:
# Evaluation on the full test file with the train/valid/test files as filters,
# corrupting both subject and object.
start = time.time()
test_triples_file = '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt'
known_triples = {
    'train': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/train.txt',
    'valid': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/valid.txt',
    'test': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt',
}
ranks = model.evaluate(test_triples_file,
                       batch_size=100,
                       corrupt_side='s,o',
                       use_filter=known_triples)

mrr_score(ranks), hits_at_n_score(ranks, 1)


28 triples containing invalid keys skipped!

9 triples containing invalid keys skipped!


(0.2602269005785717, 0.18793424014091398)

In [15]:
# Sanity check for entities_subset: passing every entity known to the model's
# indexer should reproduce the metrics of the plain filtered evaluation.
start = time.time()
test_triples_file = '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt'
all_entities = model.data_indexer.backend.get_all_entities()
known_triples = {
    'train': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/train.txt',
    'valid': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/valid.txt',
    'test': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt',
}
ranks = model.evaluate(test_triples_file,
                       batch_size=100,
                       corrupt_side='s,o',
                       entities_subset=all_entities,
                       use_filter=known_triples)
end = time.time()
print('Time taken:', end - start)
mrr_score(ranks), hits_at_n_score(ranks, 1)


28 triples containing invalid keys skipped!

9 triples containing invalid keys skipped!
Time taken: 120.94262504577637


(0.2602269005785717, 0.18793424014091398)

In [16]:
# Score every triple in the test file and show the scores in ascending order.
np.random.seed(0)
test_triples_file = '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt'
pred = model.predict(test_triples_file, batch_size=100)
print(pred.shape)
sorted_scores = np.sort(pred)
print(sorted_scores)


28 triples containing invalid keys skipped!
(20438,)
[-2.897735  -1.8650669 -1.8186188 ...  7.3988943  7.3988943  7.887154 ]


### Model calibration

In [17]:
# Calibrate the model's scores using the test file.
model.calibrate(
    '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt',
    batch_size=10000,
    positive_base_rate=0.9,
    epochs=100,
)


28 triples containing invalid keys skipped!


In [18]:
# Check that sorting by calibrated probability gives the same index order as
# sorting by raw score. They should match: calibration doesn't change the
# ranking, it only rescales the range of the scores.
test_triples_file = '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt'

out = model.predict_proba(test_triples_file, batch_size=10000)
print(np.sort(out))
print(np.argsort(out))

pred_out = model.predict(test_triples_file, batch_size=10000)
print(np.sort(pred_out))
print(np.argsort(pred_out))



28 triples containing invalid keys skipped!
[0.3826677  0.44528604 0.44815987 ... 0.89083356 0.89083356 0.9021644 ]
[ 3834 18634  4066 ...  2021  9247 14612]

28 triples containing invalid keys skipped!
[-2.897735  -1.8650669 -1.8186188 ...  7.3988943  7.3988943  7.887154 ]
[ 3834 18634  4066 ...  2021  9247 14612]


In [19]:
# Calibration should leave the ranking metrics unchanged; re-run the filtered
# evaluation to confirm.
start = time.time()
test_triples_file = '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt'
known_triples = {
    'train': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/train.txt',
    'valid': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/valid.txt',
    'test': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt',
}
ranks = model.evaluate(test_triples_file,
                       batch_size=100,
                       corrupt_side='s,o',
                       use_filter=known_triples)
end = time.time()
print('Time taken:', end - start)
mr_score(ranks), mrr_score(ranks), hits_at_n_score(ranks, 1), hits_at_n_score(ranks, 10), len(ranks)


28 triples containing invalid keys skipped!

9 triples containing invalid keys skipped!
Time taken: 120.24336504936218


(335.3337410705548,
 0.2602269005785717,
 0.18793424014091398,
 0.4024611018690674,
 20438)

In [20]:
# Save the model's weights; the next section reloads them from this path.
model.save_weights('./calibrated_model')

## Load Checkpoint and evaluate

In [21]:
# The reloaded model should reproduce exactly the metrics obtained before saving.
# Note that eta here (2) differs from the value the model was trained with (30).
start = time.time()
loaded_model = ScoringBasedEmbeddingModel(eta=2, k=350, scoring_type='DistMult')
loaded_model.load_weights('./calibrated_model')

test_triples_file = '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt'
known_triples = {
    'train': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/train.txt',
    'valid': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/valid.txt',
    'test': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt',
}
ranks = loaded_model.evaluate(test_triples_file,
                              batch_size=100,
                              corrupt_side='s,o',
                              use_filter=known_triples)
end = time.time()
print('Time taken:', end - start)
mr_score(ranks), mrr_score(ranks), hits_at_n_score(ranks, 1), hits_at_n_score(ranks, 10), len(ranks)


28 triples containing invalid keys skipped!

9 triples containing invalid keys skipped!
Time taken: 120.79659652709961


(335.3337410705548,
 0.2602269005785717,
 0.18793424014091398,
 0.4024611018690674,
 20438)

In [22]:
# Raw scores from the reloaded model for every triple in the test file.
test_triples_file = '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt'
pred = loaded_model.predict(test_triples_file, batch_size=100)
print(pred.shape)
print(pred)


28 triples containing invalid keys skipped!
(20438,)
[3.7724204  2.6032825  5.378133   ... 1.8153479  0.29437166 4.713559  ]


In [23]:
# The index order obtained by sorting calibrated probabilities must match the
# order obtained by sorting the raw predict scores.
test_triples_file = '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt'
out = loaded_model.predict_proba(test_triples_file, batch_size=10000)
raw_scores = loaded_model.predict(test_triples_file)
(np.argsort(out), np.argsort(raw_scores), np.sort(out))



28 triples containing invalid keys skipped!

28 triples containing invalid keys skipped!


(array([ 3834, 18634,  4066, ...,  2021,  9247, 14612]),
 array([ 3834, 18634,  4066, ...,  2021,  9247, 14612]),
 array([0.3826677 , 0.44528604, 0.44815987, ..., 0.89083356, 0.89083356,
        0.9021644 ], dtype=float32))

# Training/eval with partition 

## Training with RandomEdges partitioner

In [24]:
from ampligraph.datasets import DummyBackend, SQLiteAdapter
from ampligraph.datasets import GraphDataLoader
from ampligraph.datasets.graph_partitioner import PARTITION_ALGO_REGISTRY

# Data loader over the training file, using the SQLite backend and an indexer.
train_triples_file = '/home/spai/code/ampligraph_projects/dataset/fb15k-237/train.txt'
dataset_loader = GraphDataLoader(
    train_triples_file,
    backend=SQLiteAdapter,
    batch_size=1000,
    dataset_type='train',
    use_filter=False,
    use_indexer=True,
)


In [25]:
# Look up the 'RandomEdges' partitioner in the registry and instantiate it
# over the training loader with k=3.
partitioner_cls = PARTITION_ALGO_REGISTRY.get('RandomEdges')
partitioner = partitioner_cls(dataset_loader, k=3)


_split: memory before: 0.0Bytes, after: 12.897MB, consumed: 12.897MB; exec time: 9.9758s


In [26]:

# DistMult model to be trained on the partitioned data, using Adam with AMSGrad.
optim = tf.optimizers.Adam(learning_rate=0.001, amsgrad=True)

partitioned_model = ScoringBasedEmbeddingModel(eta=2, k=50, scoring_type='DistMult')
partitioned_model.compile(optimizer=optim, loss='multiclass_nll')


In [27]:

# Train on the partitioner for 10 epochs and print the elapsed seconds.
start = time.time()
partitioned_model.fit(partitioner,
                      batch_size=1000,
                      use_partitioning=True,
                      epochs=10)
elapsed_seconds = time.time() - start
print(elapsed_seconds)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
163.33203172683716


In [28]:

# Test-set loader that reuses the mapper obtained from the trained model's data handler.
trained_mapper = partitioned_model.data_handler.get_mapper()
dataset_loader_test = GraphDataLoader(
    '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt',
    backend=SQLiteAdapter,
    batch_size=400,
    dataset_type='test',
    use_indexer=trained_mapper,
)



28 triples containing invalid keys skipped!


In [29]:

# Evaluate the partition-trained model on the test loader (no filters passed) and time it.
start = time.time()
ranks = partitioned_model.evaluate(dataset_loader_test, batch_size=400)
end = time.time()
print('Time taken:', end - start)

(mr_score(ranks),
 mrr_score(ranks),
 hits_at_n_score(ranks, 1),
 hits_at_n_score(ranks, 10),
 len(ranks))


Time taken: 118.36480283737183


(1178.6854633525786, 0.07375494493608606, 0.0, 0.20351795674723555, 20438)

## Save and Load

In [30]:
# Save the partition-trained model's weights; the next cell reloads them from this path.
partitioned_model.save_weights('./best_model')


In [31]:
# Rebuild a model with the same hyperparameters and restore the saved weights into it.
loaded_part_model = ScoringBasedEmbeddingModel(eta=2, k=50, scoring_type='DistMult')
loaded_part_model.load_weights('./best_model')


In [32]:

# Test-set loader that reuses the reloaded model's data indexer.
restored_indexer = loaded_part_model.data_indexer
dataset_loader_test = GraphDataLoader(
    '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt',
    backend=SQLiteAdapter,
    batch_size=400,
    dataset_type='test',
    use_indexer=restored_indexer,
)



28 triples containing invalid keys skipped!


In [33]:


# Evaluate the reloaded model on the test loader; the metrics should match the pre-save run.
start = time.time()
ranks = loaded_part_model.evaluate(dataset_loader_test, batch_size=400)
end = time.time()
print('Time taken:', end - start)

(mr_score(ranks),
 mrr_score(ranks),
 hits_at_n_score(ranks, 1),
 hits_at_n_score(ranks, 10),
 len(ranks))


Time taken: 120.21392798423767


(1178.6854633525786, 0.07375494493608606, 0.0, 0.20351795674723555, 20438)

# Training/eval with partition (default Partitioning Approach)

In [34]:
from ampligraph.datasets import DummyBackend, SQLiteAdapter
from ampligraph.datasets import GraphDataLoader
from ampligraph.datasets.graph_partitioner import PARTITION_ALGO_REGISTRY


In [35]:
# Fresh DistMult model for the default partitioning approach, using Adam with AMSGrad.
optim = tf.optimizers.Adam(learning_rate=0.001, amsgrad=True)

partitioned_model = ScoringBasedEmbeddingModel(eta=2, k=50, scoring_type='DistMult')
partitioned_model.compile(optimizer=optim, loss='multiclass_nll')


In [36]:
# Train straight from the training file with use_partitioning=True, logging
# to TensorBoard, and print the elapsed seconds.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='./logs_split')
start = time.time()
train_triples_file = '/home/spai/code/ampligraph_projects/dataset/fb15k-237/train.txt'
partitioned_model.fit(train_triples_file,
                      batch_size=10000,
                      use_partitioning=True,
                      epochs=10,
                      callbacks=[tensorboard_callback])
elapsed_seconds = time.time() - start
print(elapsed_seconds)


_split: memory before: 34.015MB, after: 46.787MB, consumed: 12.772MB; exec time: 124.36s
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
210.38515615463257


In [37]:
# Number of distinct values appearing in column 0 or column 2 of the training triples.
train_triples = dataset['train']
train_entities = set(train_triples[:, 0]) | set(train_triples[:, 2])
len(train_entities)


14505

In [38]:


# Evaluate the partition-trained model on the test file (no filters passed) and time it.
start = time.time()
test_triples_file = '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt'
ranks = partitioned_model.evaluate(test_triples_file, batch_size=400)
end = time.time()
print('Time taken:', end - start)

(mr_score(ranks),
 mrr_score(ranks),
 hits_at_n_score(ranks, 1),
 hits_at_n_score(ranks, 10),
 len(ranks))



28 triples containing invalid keys skipped!
Time taken: 122.74676465988159


(1336.0252470887563, 0.06651532603609149, 0.0, 0.18416674821411097, 20438)

## Filtered evaluation

In [39]:
# Filtered evaluation of the partition-trained model: the train/valid/test
# files are passed as filters and both subject and object are corrupted.
start = time.time()
test_triples_file = '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt'
known_triples = {
    'train': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/train.txt',
    'valid': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/valid.txt',
    'test': '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt',
}
ranks = partitioned_model.evaluate(test_triples_file,
                                   batch_size=400,
                                   corrupt_side='s,o',
                                   use_filter=known_triples)
end = time.time()
print('Time taken:', end - start)
mr_score(ranks), mrr_score(ranks), hits_at_n_score(ranks, 1), hits_at_n_score(ranks, 10), len(ranks)



28 triples containing invalid keys skipped!

9 triples containing invalid keys skipped!
Time taken: 358.242479801178


(1194.9367110284763,
 0.1584834152025046,
 0.10822976807906841,
 0.2574126626871514,
 20438)

## Random model with partitioning

In [40]:
from ampligraph.datasets import DummyBackend, SQLiteAdapter
from ampligraph.datasets import GraphDataLoader
from ampligraph.datasets.graph_partitioner import PARTITION_ALGO_REGISTRY

# Random-scoring baseline trained with partitioning enabled.
optim = tf.optimizers.Adam(learning_rate=0.001, amsgrad=True)

partitioned_model = ScoringBasedEmbeddingModel(eta=2, k=50, scoring_type='Random')
partitioned_model.compile(optimizer=optim, loss='multiclass_nll')

start = time.time()
train_triples_file = '/home/spai/code/ampligraph_projects/dataset/fb15k-237/train.txt'
partitioned_model.fit(train_triples_file,
                      batch_size=10000,
                      use_partitioning=True,
                      epochs=10,
                      callbacks=[])
elapsed_seconds = time.time() - start
print(elapsed_seconds)


_split: memory before: 67.406MB, after: 80.176MB, consumed: 12.77MB; exec time: 124.95s
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
208.36215567588806


In [41]:
# Evaluate the random partitioned baseline on the test file (no filters passed) and time it.
start = time.time()
test_triples_file = '/home/spai/code/ampligraph_projects/dataset/fb15k-237/test.txt'
ranks = partitioned_model.evaluate(test_triples_file, batch_size=400)
end = time.time()
print('Time taken:', end - start)

(mr_score(ranks),
 mrr_score(ranks),
 hits_at_n_score(ranks, 1),
 hits_at_n_score(ranks, 10),
 len(ranks))



28 triples containing invalid keys skipped!
Time taken: 123.16092610359192


(7279.448282610823, 0.0005559095188126157, 0.0, 0.0004892846658185732, 20438)

# Discovery

### Discover Facts

In [4]:
from ampligraph.discovery import discover_facts

# Train a ComplEx model on the FB15k-237 training split, then ask
# discover_facts for candidate triples of a single target relation.
model = ScoringBasedEmbeddingModel(eta=5, k=50, scoring_type='ComplEx')

adam = tf.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=adam, loss='multiclass_nll')

start = time.time()
model.fit(dataset['train'],
          batch_size=10000,
          epochs=10,
          validation_freq=50,
          validation_batch_size=100,
          validation_data=dataset['valid'])

end = time.time()
print('Time taken:', end - start)

# Only the first 100 training triples are handed to discover_facts.
seed_triples = dataset['train'][:100]
discover_facts(seed_triples,
               model,
               top_n=100,
               strategy='random_uniform',
               max_candidates=100,
               target_rel='/location/country/form_of_government',
               seed=0)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Time taken: 24.852190494537354


(array([['/m/0fvf9q', '/location/country/form_of_government', '/m/0k049']],
       dtype=object),
 array([94.]))

### Find Clusters

In [43]:
import requests
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns


from ampligraph.datasets import load_from_csv
from ampligraph.discovery import find_clusters

# International football matches triples.
# How these triples are derived from a tabular dataset is covered in AmpliGraph's
# clustering-with-embeddings tutorial (see the AmpliGraph documentation).
url = 'https://ampligraph.s3-eu-west-1.amazonaws.com/datasets/football.csv'
response = requests.get(url)
# Stop here on an HTTP error instead of saving an error page as the dataset.
response.raise_for_status()
# The context manager closes the file even if the write fails.
with open('football.csv', 'wb') as csv_file:
    csv_file.write(response.content)
# Drop the first column of the loaded CSV, keeping the remaining (s, p, o) columns.
X = load_from_csv('.', 'football.csv', sep=',')[:, 1:]

model = ScoringBasedEmbeddingModel(eta=5,
                                   k=300,
                                   scoring_type='ComplEx')

model.compile(optimizer='adam', loss='multiclass_nll')

model.fit(X,
          batch_size=10000,
          epochs=10)

# Collect every subject/object whose name starts with "Team".
df = pd.DataFrame(X, columns=["s", "p", "o"])
teams = np.unique(np.concatenate((df.s[df.s.str.startswith("Team")],
                                  df.o[df.o.str.startswith("Team")])))
team_embeddings = model.get_embeddings(teams, embedding_type='e')

# 2-D PCA projection of the team embeddings (computed here, not used further in this cell).
embeddings_2d = PCA(n_components=2).fit_transform(np.array([i for i in team_embeddings]))

# Find clusters of embeddings using KMeans
# NOTE: KMeans is created without a random_state, so cluster ids and sizes may vary between runs.
kmeans = KMeans(n_clusters=6, n_init=100, max_iter=500)
clusters = find_clusters(teams, model, kmeans, mode='e')
print(np.unique(clusters, return_counts=True))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
(array([0, 1, 2, 3, 4, 5], dtype=int32), array([ 49,   1,  56,  54,  43, 110]))


### Find Duplicates

In [44]:
import os
import pandas as pd
import numpy as np
import re

# The IMDB dataset used here is part of the Movies5 dataset found on:
# The Magellan Data Repository (https://sites.google.com/site/anhaidgroup/projects/data)
import requests
import tarfile

url = 'http://pages.cs.wisc.edu/~anhai/data/784_data/movies5.tar.gz'
response = requests.get(url)
# Stop here on an HTTP error instead of saving an error page as the archive.
response.raise_for_status()
with open('movies5.tar.gz', 'wb') as archive_file:
    archive_file.write(response.content)

# The archive arrives over plain HTTP, so treat it as untrusted: refuse any
# member whose path would resolve outside the extraction directory.
extract_root = os.path.realpath('.')
with tarfile.open('movies5.tar.gz', "r:gz") as tar:
    for member in tar.getmembers():
        member_path = os.path.realpath(os.path.join(extract_root, member.name))
        if os.path.commonpath([extract_root, member_path]) != extract_root:
            raise ValueError("Refusing to extract %r outside of %r" % (member.name, extract_root))
    tar.extractall(extract_root)

# Reading tabular dataset of IMDB movies and filling the missing values
imdb = pd.read_csv("movies5/csv_files/imdb.csv")
imdb["directors"] = imdb["directors"].fillna("UnknownDirector")
imdb["actors"] = imdb["actors"].fillna("UnknownActor")
imdb["genre"] = imdb["genre"].fillna("UnknownGenre")
imdb["duration"] = imdb["duration"].fillna("0")

# Creating knowledge graph triples from tabular dataset
imdb_triples = []

for _, row in imdb.iterrows():
    movie_id = "ID" + str(row["id"])
    directors = row["directors"].split(",")
    actors = row["actors"].split(",")
    genres = row["genre"].split(",")
    # Keep only the digits of the duration and bucket it into 30-unit bins.
    # A value with no digits at all falls back to 0 instead of raising on int("").
    duration_digits = re.sub(r"\D", "", row["duration"]) or "0"
    duration = "Duration" + str(int(duration_digits) // 30)

    directors_triples = [(movie_id, "hasDirector", d) for d in directors]
    actors_triples = [(movie_id, "hasActor", a) for a in actors]
    genres_triples = [(movie_id, "hasGenre", g) for g in genres]
    duration_triple = (movie_id, "hasDuration", duration)

    imdb_triples.extend(directors_triples)
    imdb_triples.extend(actors_triples)
    imdb_triples.extend(genres_triples)
    imdb_triples.append(duration_triple)

# Training knowledge graph embedding with ComplEx model
model = ScoringBasedEmbeddingModel(eta=5,
                                   k=300,
                                   scoring_type='ComplEx')

model.compile(optimizer='adam', loss='multiclass_nll')
imdb_triples = np.array(imdb_triples)
model.fit(imdb_triples,
          batch_size=10000,
          epochs=20)

# Finding duplicates movies (entities)
from ampligraph.discovery import find_duplicates

entities = np.unique(imdb_triples[:, 0])
dups, _ = find_duplicates(entities, model, mode='e', tolerance=0.45)
id_list = []
for data in dups:
    for i in data:
        # Strip the "ID" prefix added when the triples were built.
        id_list.append(int(i[2:]))
# NOTE(review): iloc is positional, so this assumes the 'id' column equals the row position — confirm.
print(imdb.iloc[id_list[:6]][['movie_name', 'year']])


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
        movie_name  year
841         Tattah  2013
840         Tattah  2013
1677  Re-Generator  2010
1676  Re-Generator  2010
2477     Ambulance  2005
2476     Ambulance  2005


### Query TopN

In [45]:
import requests
from ampligraph.datasets import load_from_csv
from ampligraph.discovery import discover_facts
from ampligraph.discovery import query_topn

# Game of Thrones relations dataset
url = 'https://ampligraph.s3-eu-west-1.amazonaws.com/datasets/GoT.csv'
response = requests.get(url)
# Stop here on an HTTP error instead of saving an error page as the dataset.
response.raise_for_status()
# The context manager closes the file even if the write fails.
with open('GoT.csv', 'wb') as csv_file:
    csv_file.write(response.content)
X = load_from_csv('.', 'GoT.csv', sep=',')

model = ScoringBasedEmbeddingModel(eta=5,
                                   k=150,
                                   scoring_type='DistMult')

model.compile(optimizer='Adam', loss='pairwise')
model.fit(X,
          batch_size=100,
          epochs=20)

# Ask for the five best-scoring completions of (Eddard Stark, ALLIED_WITH, ?);
# tail=None leaves the object side open.
query_topn(model, top_n=5,
           head='Eddard Stark', relation='ALLIED_WITH', tail=None,
           ents_to_consider=None, rels_to_consider=None)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


(array([['Eddard Stark', 'ALLIED_WITH', 'House Stark of Winterfell'],
        ['Eddard Stark', 'ALLIED_WITH', 'The Vale'],
        ['Eddard Stark', 'ALLIED_WITH', 'House Goodbrother of Hammerhorn'],
        ['Eddard Stark', 'ALLIED_WITH', 'House Locke of Oldcastle'],
        ['Eddard Stark', 'ALLIED_WITH', 'House Greyjoy of Pyke']],
       dtype='<U44'),
 array([2.2265291 , 0.5242001 , 0.5207645 , 0.48392776, 0.46780267],
       dtype=float32))