# Partitioned Training
The following example shows how to train a model when the dataset is too large to fit in memory and thus requires partitioning.

In [24]:
# Add the directory two levels above the working directory to the import path
# (presumably so that the local ampligraph checkout is picked up - confirm).
import sys
sys.path.append('../..')
# NOTE: the environment variables below are set BEFORE `import tensorflow`
# on purpose; keep this ordering when editing the cell.
import os
# Expose only the GPU with index 1 to CUDA.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
# Turn off the oneDNN custom operations in TensorFlow.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
# Silence the TensorFlow C++ backend logs.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
# Only report errors from the TensorFlow Python logger.
tf.get_logger().setLevel('ERROR')
# NOTE(review): `np` and `ampligraph` are not referenced by name in the cells
# visible here; they are kept because other parts of the notebook may use them.
import numpy as np
import ampligraph

In [25]:
# Import the KGE model, the SQLite backend and the graph data loader
# used to feed the data to the model
from ampligraph.latent_features import ScoringBasedEmbeddingModel
from ampligraph.datasets.sqlite_adapter import SQLiteAdapter
from ampligraph.datasets.graph_data_loader import GraphDataLoader

In [26]:
# Root folder of the datasets: change it to match your local setup.
PATH_TO_DATASET = 'your/path/to/dataset/'

# The graph data loader reads the data (from a file, a numpy array, etc.) and
# generates batches to iterate over.
# Internally, it first builds the mapping from raw data to indices and stores
# it in a database; it then maps the raw triples to those indices and stores
# them in a second database.
dataset_loader = GraphDataLoader(
    PATH_TO_DATASET + 'fb15k-237/train.txt',
    backend=SQLiteAdapter,   # type of backend to use
    batch_size=1000,         # batch size used while iterating over this dataset
    dataset_type='train',    # dataset type
    use_filter=False,        # whether to use a filter or not
    use_indexer=True,        # the data needs to be mapped to indices
)


In [27]:
from ampligraph.datasets import BucketGraphPartitioner

# Choose the partitioner: in this case, BucketGraphPartitioner.
# It is created by passing it the graph data loader object; the partitioner
# then partitions the data and internally creates a graph data loader for
# each partition.
partitioner = BucketGraphPartitioner(
    dataset_loader,
    k=3,  # see the BucketGraphPartitioner documentation for the meaning of k
)


_split: memory before: 52.366MB, after: 65.027MB, consumed: 12.661MB; exec time: 95.922s


In [28]:
# Create and compile the model exactly as in the non-partitioned case.
partitioned_model = ScoringBasedEmbeddingModel(
    eta=2,
    k=50,
    scoring_type='DistMult',
)
partitioned_model.compile(optimizer='adam', loss='multiclass_nll')

# Train the model by passing the partitioner object as the input of fit():
# the partitioner generates the data for the model during training.
# There is no need to pass the partitioning_k parameter, as it would be
# overridden by the k of the input partitioner.
partitioned_model.fit(
    partitioner,
    batch_size=1000,  # batch size
    epochs=10,        # number of epochs
)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x28d5002e0>

In [29]:
# Build a graph (triple) loader for the test set, again with the SQLite backend.
# Passing the data indexer of the trained model makes the loader map the test
# concepts to the same indices used at training time; the indexed triples are
# stored in a database in chunks.
dataset_loader_test = GraphDataLoader(
    PATH_TO_DATASET + 'fb15k-237/test.txt',
    backend=SQLiteAdapter,                       # type of backend to use
    batch_size=400,                              # batch size used while iterating over this dataset
    dataset_type='test',                         # dataset type
    use_indexer=partitioned_model.data_indexer,  # indexer of the trained model
)



28 triples containing invalid keys skipped!


In [30]:
from ampligraph.evaluation.metrics import mrr_score, hits_at_n_score, mr_score

# Pass the data loader object as the input of evaluate(): the loader
# generates the data for the model during evaluation.
ranks = partitioned_model.evaluate(
    dataset_loader_test,
    batch_size=400,
)

# Displayed as: (MR, MRR, Hits@1, Hits@10, number of ranks)
(
    mr_score(ranks),
    mrr_score(ranks),
    hits_at_n_score(ranks, 1),
    hits_at_n_score(ranks, 10),
    len(ranks),
)



(672.3955377238477, 0.0827018782039032, 0.0, 0.22237988061454153, 20438)

In [31]:
from ampligraph.utils import save_model

# Save the trained model under ./partitioned_model_bucket
save_model(
    model=partitioned_model,
    model_name_path='./partitioned_model_bucket',
)



In [32]:
from ampligraph.utils import restore_model
# Load the model back from the path it was saved to; the restored object is
# bound to a new name (`model`) so it can be evaluated on its own.
model = restore_model('./partitioned_model_bucket')

In [33]:
# Re-create the test set loader, this time taking the mapper (data indexer)
# from the restored model, so that the concepts are mapped to the same
# indices as used during training.
dataset_loader_test = GraphDataLoader(
    PATH_TO_DATASET + 'fb15k-237/test.txt',
    backend=SQLiteAdapter,           # type of backend to use
    batch_size=400,                  # batch size used while iterating over this dataset
    dataset_type='test',             # dataset type
    use_indexer=model.data_indexer,  # mapper of the restored model
)



28 triples containing invalid keys skipped!


In [34]:
from ampligraph.evaluation.metrics import mrr_score, hits_at_n_score, mr_score

# Pass the data loader object as the input of evaluate(): the loader
# generates the data for the restored model during evaluation.
ranks = model.evaluate(
    dataset_loader_test,
    batch_size=400,
)

# Displayed as: (MR, MRR, Hits@1, Hits@10, number of ranks)
(
    mr_score(ranks),
    mrr_score(ranks),
    hits_at_n_score(ranks, 1),
    hits_at_n_score(ranks, 10),
    len(ranks),
)



(672.3955377238477, 0.0827018782039032, 0.0, 0.22237988061454153, 20438)