In [1]:
# Environment variables are set before TensorFlow is imported below.
import os
# Expose only the CUDA device with index 1 to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
# Silence INFO and WARNING messages from TensorFlow's C++ logging.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 
import tensorflow as tf
# Only show errors from the Python-side TensorFlow logger.
tf.get_logger().setLevel('ERROR')
import pandas as pd
# Turn soft device placement off and log the device every op is placed on.
tf.config.set_soft_device_placement(False)
tf.debugging.set_log_device_placement(True)
import numpy as np
from ampligraph.datasets import load_fb15k_237, load_yago3_10
from ampligraph.evaluation.protocol import create_mappings, to_idx

from ampligraph.evaluation.metrics import mrr_score, hits_at_n_score, mr_score
import time
# The notebook is pinned to TensorFlow 2.3.x; fail fast on any other version.
print(tf.__version__)
assert(tf.__version__.startswith('2.3'))


from ampligraph.datasets import GraphDataLoader
from ampligraph.datasets.graph_partitioner import PARTITION_ALGO_REGISTRY

# NOTE(review): load_fb15k_237 and load_yago3_10 are already imported above; this line repeats them.
from ampligraph.datasets import load_fb15k_237, load_fb13, load_fb15k, load_wn11, load_wn18, load_wn18rr, load_yago3_10


2.3.0


In [2]:
from ampligraph.latent_features import ScoringBasedEmbeddingModel

In [3]:
# Load the FB15k-237 dataset; the cells below read its 'train', 'valid' and 'test' entries.
full_dataset = load_fb15k_237()

In [4]:
# this would go into separate classes, like we had in ampligraph 1 (for loss functions and initializers)
# initializer
def xavier(in_shape, out_shape):
    """Sample an ``(in_shape, out_shape)`` float32 matrix from a zero-mean normal distribution.

    The standard deviation is ``sqrt(2 / (in_shape + out_shape))``. Values are drawn from
    NumPy's global random state, so call ``np.random.seed`` first for reproducible output.
    """
    fan_sum = in_shape + out_shape
    scale = np.sqrt(2 / fan_sum)
    samples = np.random.normal(loc=0, scale=scale, size=(in_shape, out_shape))
    return samples.astype(np.float32)

#loss function
def nll(scores_pred, eta):
    """Multiclass negative log-likelihood of each positive against its ``eta`` corruptions.

    For every positive score ``p`` with corruption scores ``n_1 .. n_eta`` this sums
    ``-log(exp(p) / (exp(p) + sum_j exp(n_j)))`` over the batch.

    The value is computed as ``logsumexp([p, n_1, .., n_eta]) - p``, which is the same
    quantity but never exponentiates a raw score, so large scores can no longer overflow
    to ``inf`` and turn the loss into ``NaN``.

    Parameters
    ----------
    scores_pred : sequence
        ``scores_pred[0]`` holds the positive scores and ``scores_pred[1]`` the corruption
        scores. The corruption scores must reshape to ``[eta, batch]``.
    eta : int
        Number of corruptions per positive.

    Returns
    -------
    Scalar tensor with the summed (not averaged) loss.
    """
    # Flatten so that the positives line up with the columns of the reshaped negatives.
    scores_pos = tf.reshape(scores_pred[0], [-1])
    batch_size = tf.shape(scores_pos)[0]
    scores_neg = tf.reshape(scores_pred[1], [eta, batch_size])

    # Row 0 holds the positives, rows 1..eta the corruptions of the same triple.
    all_scores = tf.concat([tf.expand_dims(scores_pos, axis=0), scores_neg], axis=0)
    log_partition = tf.reduce_logsumexp(all_scores, axis=0)

    return tf.reduce_sum(log_partition - scores_pos)

In [5]:
# Build the relation/entity index mappings from the training split and convert that split with them.
rel_to_idx, ent_to_idx = create_mappings(full_dataset['train'])
train_dataset = to_idx(full_dataset['train'], ent_to_idx=ent_to_idx, rel_to_idx=rel_to_idx)

# Write the converted triples as a header-less, tab-separated file in the working directory.
train_frame = pd.DataFrame(train_dataset, columns=['s', 'p', 'o'])
train_frame.to_csv('fb15k_237_train.csv', sep='\t', header=False, index=False)

In [6]:
# NOTE(review): the mappings come from the training split only - confirm that to_idx copes
# with entities that appear solely in the valid/test splits.

# Test split: convert with the training mappings and write it as a header-less TSV.
test_dataset = to_idx(full_dataset['test'], ent_to_idx=ent_to_idx, rel_to_idx=rel_to_idx)
test_frame = pd.DataFrame(test_dataset, columns=['s', 'p', 'o'])
test_frame.to_csv('fb15k_237_test.csv', sep='\t', header=False, index=False)

# Filter set: train + valid + test stacked row-wise, converted and written the same way.
x_filter = np.concatenate(
    [full_dataset[split] for split in ('train', 'valid', 'test')], axis=0)
filter_dataset = to_idx(x_filter, ent_to_idx=ent_to_idx, rel_to_idx=rel_to_idx)
filter_frame = pd.DataFrame(filter_dataset, columns=['s', 'p', 'o'])
filter_frame.to_csv('fb15k_237_filter.csv', sep='\t', header=False, index=False)


In [7]:
# ComplEx-scored embedding model; the entity/relation capacities are hard-coded here.
model = ScoringBasedEmbeddingModel(
    eta=5,
    k=300,
    max_ent_size=14505,
    max_rel_size=237,
    scoring_type='ComplEx',
)


In [8]:
# uncomment if you have already saved the checkpoint
# model.load_weights('./chkpt1')

In [9]:
# Compile with the Adam optimizer and the custom nll loss defined in an earlier cell.
model.compile(optimizer='adam', loss=nll)

In [10]:
from tensorflow.keras.callbacks import ModelCheckpoint
# Save to ./chkpt1 only when the monitored training loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    filepath='./chkpt1',
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [11]:
# Train on the TSV written by the export cell above. The path is relative to the working
# directory (where that cell wrote the file) rather than to one user's home directory,
# so the notebook also runs on other machines.
model.fit('fb15k_237_train.csv',
          batch_size=30000,
          epochs=20,
          callbacks=[checkpoint])

Epoch 1/20
     10/Unknown - 1s 124ms/step - loss: 53752.6797
Epoch 00001: loss improved from inf to 53752.67969, saving model to ./chkpt1
Epoch 2/20
Epoch 00002: loss improved from 53752.67969 to 53745.41406, saving model to ./chkpt1
Epoch 3/20
Epoch 00003: loss improved from 53745.41406 to 53732.83984, saving model to ./chkpt1
Epoch 4/20
Epoch 00004: loss improved from 53732.83984 to 53706.23047, saving model to ./chkpt1
Epoch 5/20
Epoch 00005: loss improved from 53706.23047 to 53650.63281, saving model to ./chkpt1
Epoch 6/20
Epoch 00006: loss improved from 53650.63281 to 53542.53516, saving model to ./chkpt1
Epoch 7/20
Epoch 00007: loss improved from 53542.53516 to 53348.93359, saving model to ./chkpt1
Epoch 8/20
Epoch 00008: loss improved from 53348.93359 to 53028.23438, saving model to ./chkpt1
Epoch 9/20
Epoch 00009: loss improved from 53028.23438 to 52533.30078, saving model to ./chkpt1
Epoch 10/20
Epoch 00010: loss improved from 52533.30078 to 51818.27734, saving model to ./chk

<tensorflow.python.keras.callbacks.History at 0x7fdd6d7a63d0>

In [12]:
# Show the model's layers and parameter counts.
model.summary()

Model: "scoring_based_embedding_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
ComplEx (ComplEx)            multiple                  0         
_________________________________________________________________
corruption_generation_layer_ multiple                  0         
_________________________________________________________________
embedding_lookup_layer (Embe multiple                  8845200   
_________________________________________________________________
loss (Mean)                  multiple                  2         
Total params: 8,845,202
Trainable params: 8,845,200
Non-trainable params: 2
_________________________________________________________________


In [13]:
# Time the evaluation on the test TSV written by the export cell above. The path is
# relative to the working directory instead of a hard-coded home directory.
start = time.time()
ranks = model.evaluate('fb15k_237_test.csv')
end = time.time()



In [None]:
from ampligraph.datasets import SQLiteAdapter
from ampligraph.datasets import GraphDataLoader
import tensorflow as tf
class Test():
    """Smoke test: stream the test TSV through ``GraphDataLoader`` with the SQLite backend.

    All work happens in the constructor. It builds the loader, reads the entity and
    relation counts from the backend's mapper, iterates every batch and prints the
    total number of rows seen.

    Attributes
    ----------
    data_loader : GraphDataLoader over ``fb15k_237_test.csv`` (relative to the working directory).
    num_ents, num_rels : values of ``mapper.ents_length`` / ``mapper.rels_length``.
    total_triples : sum of ``len(batch)`` over all batches.
    """

    def __init__(self):
        # Relative path: this is the file the export cell wrote to the working directory.
        self.data_loader = GraphDataLoader('fb15k_237_test.csv',
                                           backend=SQLiteAdapter,
                                           batch_size=100, dataset_type="test", verbose=True)

        # Kept as attributes so the values read from the mapper can be inspected afterwards.
        self.num_ents = self.data_loader.backend.mapper.ents_length
        self.num_rels = self.data_loader.backend.mapper.rels_length

        dataset = self.data_loader.get_batch()

        print('iterating...', dataset)
        self.total_triples = 0
        # NOTE(review): an earlier draft wrapped this loop in `with self.data_loader.backend:`;
        # confirm whether the backend needs to be opened explicitly.
        for inputs in dataset:
            self.total_triples += len(inputs)

        print(self.total_triples)
            
# Run the smoke test: all of the work happens inside Test.__init__.
Test()

# Ignore the below code

In [None]:
# this class is responsible for training the embeddings on multiple GPUs
class ModelTrainer():
    """Train a ComplEx ``ScoringBasedEmbeddingModel`` with a hand-written loop on ``GPU:0``.

    Parameters
    ----------
    batch_size : int
        Batch size handed to ``GraphDataLoader``.
    max_ents : int
        Stored as both ``num_ents`` and ``max_ent_size`` and passed to the model as
        ``max_ent_size``.
    max_rels : int
        Stored as ``num_rels`` and passed to the model as ``max_rel_size``.
    k : int
        Passed to the model as ``k``.
    dataset
        Data source forwarded unchanged to ``GraphDataLoader``.
    seed : int
        Seed for NumPy's global random state.
    """

    def __init__(self, batch_size, max_ents, max_rels, k, dataset, seed=0):
        np.random.seed(seed)
        self.eta = 10
        self.models = None  # would be a list later (one per device)
        self.optimizers = None  # would be a list later (one per device)

        self.batch_size = batch_size
        self.num_ents = max_ents
        self.num_rels = max_rels
        # max_ent_size - is the max embeddings that can be loaded in memory
        self.max_ent_size = max_ents

        # create the dataset handle
        self.data_loader = GraphDataLoader(dataset,
                                           batch_size=batch_size, dataset_type="train")
        # create the model and optimizer
        with tf.device('GPU:0'):
            # `learning_rate` is the supported argument name; `lr` is only a deprecated alias.
            self.optimizers = tf.optimizers.Adam(learning_rate=0.001)
            self.models = ScoringBasedEmbeddingModel(eta=self.eta,
                                                     k=k,
                                                     max_ent_size=self.max_ent_size,
                                                     max_rel_size=self.num_rels,
                                                     scoring_type='ComplEx')

        self.k = self.models.k

    @tf.function(experimental_relax_shapes=True)
    def train_step(self, inputs, optimizer):
        """Run one optimisation step on ``inputs`` and return the regularised loss.

        The loss is ``nll`` of the model output plus ``0.0001 * sum(|emb| ** 3)`` over the
        entity and relation embedding matrices. Gradients are taken with respect to, and
        applied to, exactly those two matrices.
        """
        with tf.GradientTape() as tape:
            # get the model predictions
            # NOTE(review): the model is called with training=0 even though this is the
            # training step - confirm that the model does not depend on this flag.
            preds = self.models(inputs, training=0)
            # compute the loss
            loss = nll(preds, self.eta)
            # regularizer - will be in a separate class like ampligraph 1
            loss += (0.0001 * (tf.reduce_sum(tf.pow(tf.abs(self.models.encoding_layer.ent_emb), 3)) + \
                              tf.reduce_sum(tf.pow(tf.abs(self.models.encoding_layer.rel_emb), 3))))

        # compute the grads
        gradients = tape.gradient(loss, [self.models.encoding_layer.ent_emb,
                                         self.models.encoding_layer.rel_emb])
        # update the trainable params
        optimizer.apply_gradients(zip(gradients, [self.models.encoding_layer.ent_emb,
                                                  self.models.encoding_layer.rel_emb]))
        return loss

    def train(self, epochs = 5):
        """Train for ``epochs`` passes over the loader and print the mean per-row loss of each.

        Each batch loss is divided by the number of rows in that batch before averaging.
        Returns ``None``.
        """
        # create the generator
        dataset = tf.data.Dataset.from_generator(self.data_loader.get_batch,
                                                 output_types=(tf.int32),
                                                 output_shapes=((None, 3)))
        dataset = dataset.prefetch(0)

        for i in range(epochs):
            total_loss = []
            print(i)
            self.global_epoch = i
            # train on batches
            for j, inputs in dataset.enumerate():
                # NOTE(review): always reset to 0 rather than tracking `j`; nothing in this
                # class reads the attribute - confirm the intent.
                self.global_batch = 0
                with tf.device('GPU:0'):
                    loss = self.train_step(inputs, self.optimizers)

                total_loss.append(loss/inputs.shape[0])

            print('\n\n\n\nloss------------------{}:{}'.format(i, np.mean(total_loss)))
        print('done')
        return

In [None]:

# The training TSV is read from the working directory, where the export cell wrote it,
# instead of from a hard-coded home directory.
model_trainer = ModelTrainer(batch_size = 30000, 
                             max_ents = 14505, # ideally we should get from the dataset apis/partition manager
                             max_rels=237,
                             k=300,  
                             dataset='fb15k_237_train.csv')

start = time.time()
model_trainer.train(30)
end = time.time()

In [None]:
class ModelEvaluator():
    """Rank test triples against the entity embeddings, one chunk of entities at a time.

    Parameters
    ----------
    ent_emb, rel_emb : np.ndarray
        Entity and relation embedding matrices, indexed by the ids found in the test data.
    models
        Model exposing ``k`` and ``get_ranks(inputs, ent_embs)``.
    max_ent_size : int
        Maximum number of entity rows passed to ``get_ranks`` in one call.
    dataset
        Test data source forwarded to ``GraphDataLoader`` (batch size 100).
    filterset
        Accepted for interface compatibility; not used by this class.
    """

    def __init__(self, ent_emb, rel_emb, models, max_ent_size, dataset, filterset):
        self.ent_emb = ent_emb
        self.rel_emb = rel_emb
        self.models = models
        self.max_ent_size = max_ent_size
        self.dataset_handle = GraphDataLoader(dataset,
                                              batch_size=100, dataset_type="test")

    @staticmethod
    def _num_chunks(num_rows, chunk_size):
        """Return how many ``chunk_size``-row slices are needed to cover ``num_rows`` rows."""
        # Round up: rounding to nearest silently dropped the trailing partial chunk
        # (or every chunk when num_rows < chunk_size / 2).
        return int(np.ceil(num_rows / chunk_size))

    def get_next_batch(self):
        """Yield the (subject, predicate, object) embeddings of every batch of test triples."""
        for out in self.dataset_handle:
            yield self.ent_emb[out[:, 0]], self.rel_emb[out[:, 1]], self.ent_emb[out[:, 2]]

    def evaluate(self):
        """Fill ``self.all_ranks`` with one ``[subject_rank, object_rank]`` pair per test triple.

        The values returned by ``get_ranks`` are summed over all entity chunks and then
        incremented by one, so the stored ranks start at 1.
        """
        dataset = tf.data.Dataset.from_generator(self.get_next_batch,
                                                 output_types=(tf.float32, tf.float32, tf.float32),
                                                 output_shapes=((None, self.models.k),
                                                                (None, self.models.k),
                                                                (None, self.models.k)))
        dataset = dataset.prefetch(1)

        # The chunking of the entity matrix does not depend on the test batch.
        chunk_size = self.max_ent_size
        chunk_count = self._num_chunks(self.ent_emb.shape[0], chunk_size)

        self.all_ranks = []
        for inputs in dataset:
            overall_rank = np.zeros((inputs[0].shape[0], 2))
            for j in range(chunk_count):
                ent_embs = self.ent_emb[j * chunk_size:(j + 1) * chunk_size, :]
                with tf.device('GPU:0'):
                    sub_rank, obj_rank = self.models.get_ranks(inputs, ent_embs)
                overall_rank[:, 0] += sub_rank.numpy()
                overall_rank[:, 1] += obj_rank.numpy()
            overall_rank = overall_rank + 1
            self.all_ranks.extend(overall_rank.tolist())

In [None]:
# The test/filter TSVs are read from the working directory, where the export cell wrote
# them, instead of from a hard-coded home directory.
model_evaluator = ModelEvaluator(model_trainer.models.encoding_layer.ent_emb.numpy(),
                                      model_trainer.models.encoding_layer.rel_emb.numpy(),
                                      model_trainer.models,
                                      max_ent_size=14505,
                                      dataset='fb15k_237_test.csv',
                                      filterset='fb15k_237_filter.csv')



start=time.time()
model_evaluator.evaluate()
print((time.time()-start)/60)

In [None]:
# MRR and Hits@10 computed from the ranks collected by model_evaluator.evaluate().
mrr_score(model_evaluator.all_ranks), hits_at_n_score(model_evaluator.all_ranks, 10)

# Ignore this - partition related

In [None]:
# Partitioning based

In [None]:
gpus = tf.config.experimental.list_physical_devices('GPU')

# Split the first physical GPU into two logical devices, each with memory_limit=8000.
virtual_devices = [
    tf.config.experimental.VirtualDeviceConfiguration(memory_limit=8000)
    for _ in range(2)
]
tf.config.experimental.set_virtual_device_configuration(gpus[0], virtual_devices)

logical_gpus = tf.config.experimental.list_logical_devices('GPU')
print(len(gpus), "Physical GPU,", len(logical_gpus), "Logical GPUs")

logical_gpus

In [None]:
# this class is responsible for training the embeddings on multiple GPUs
class ModelTrainer():
    """Train embeddings partition by partition, swapping per-partition state in and out.

    The complete entity/relation embedding matrices are kept as NumPy arrays in the nested
    ``PartitionManager``. For each partition the rows it needs are loaded into the model,
    trained on, and written back together with the optimizer slot values of those rows.

    NOTE(review): ``num_devices`` is accepted, but the constructor loop overwrites
    ``self.models`` / ``self.optimizers`` on every iteration and ``train`` always runs on
    ``GPU:0``, so only one model/optimizer pair is ever kept.

    Parameters
    ----------
    batch_size : int
        Number of triples per training batch.
    max_ent_size : int
        Passed to the model as ``max_ent_size``; also the row count the entity optimizer
        slots are padded to.
    k : int
        Passed to the model as ``k``.
    dataset : np.ndarray
        Triples with subject, predicate and object in columns 0, 1 and 2.
    strategy : str
        Key into ``PARTITION_ALGO_REGISTRY``.
    num_buckets : int
        Forwarded to the partitioner as ``k``.
    num_devices : int
        See the note above.
    seed : int
        Seed for NumPy's global random state.
    """

    # this class manages the partitions
    # This would later be responsible for persisting the input data and creating the partitions
    # during training time, it will load the partitions and related embeddings
    class PartitionManager():
        """Own the partitions of the dataset and the full embedding matrices."""

        def __init__(self, num_buckets, k, dataset, strategy, num_devices=1):
            self.num_buckets = num_buckets

            num_entities = len(set(dataset[:, 0]).union(dataset[:, 2]))
            num_rels = len(set(dataset[:, 1]))

            self.rel_to_idx, self.ent_to_idx = create_mappings(dataset)
            dataset = to_idx(dataset, self.ent_to_idx, self.rel_to_idx)

            partitioner = PARTITION_ALGO_REGISTRY.get(strategy)(dataset, k=num_buckets)
            self.partitions = partitioner.split()

            if num_buckets > 1:
                # 1 + 2 + ... + num_buckets
                self.num_partitions = np.sum(np.arange(1, num_buckets+1))
            else:
                self.num_partitions = num_buckets

            # needs to go into the database
            self.entity_embeddings = xavier(num_entities, k)
            self.rel_embeddings = xavier(num_rels, k)

            self.num_devices = 1

            # use multiple GPUs only if we do multiple partitions
            if self.num_partitions > 1:
                self.num_devices = num_devices

        def get_next_partition(self):
            """Yield, per partition, its entity map, relation map, remapped triples and embeddings.

            The entity map sends every entity id occurring in the partition to a local row
            index ``0..n-1``; the relation map is the identity over all relations.
            """
            # get the next partition to train on, along with the embeddings of nodes in that partition
            for i in range(len(self.partitions)):
                entities_of_partition = set(self.partitions[i][:, 0]).union(self.partitions[i][:, 2])
                partition_dict = dict(zip(entities_of_partition,
                                          np.arange(len(entities_of_partition))))
                new_rel_dict = dict(zip(np.arange(len(self.rel_to_idx)), np.arange(len(self.rel_to_idx))))
                # remap the triples to reflect the position in the embedding matrix
                remapped_triples = to_idx(self.partitions[i], partition_dict, new_rel_dict)
                yield partition_dict, \
                        new_rel_dict, \
                        remapped_triples, \
                        self.entity_embeddings[list(partition_dict.keys()), :], \
                        self.rel_embeddings[list(new_rel_dict.keys()), :]

    def __init__(self, batch_size, max_ent_size, k, dataset, strategy, num_buckets=1, num_devices=1, seed=0):
        # max_ent_size - is the max embeddings that can be loaded in memory
        np.random.seed(seed)
        self.eta = 10
        self.models = None  # would be a list later (one per device)
        self.optimizers = None  # would be a list later (one per device)
        self.max_ent_size = max_ent_size

        self.batch_size = batch_size
        self.num_devices = num_devices
        self.num_ents = len(set(dataset[:, 0]).union(dataset[:, 2]))
        self.num_rels = len(set(dataset[:, 1]))

        # create embedding model and optimizer
        for i in range(num_devices):
            with tf.device('GPU:{}'.format(i)):
                # `learning_rate` is the supported argument name; `lr` is only a deprecated alias.
                self.optimizers = tf.optimizers.Adam(learning_rate=0.001)
                self.models = ScoringBasedEmbeddingModel(eta=self.eta,
                                                         k=k,
                                                         max_ent_size=max_ent_size,
                                                         max_rel_size=self.num_rels,
                                                         scoring_type='ComplEx')

        self.k = self.models.k
        self.partition_manager = self.PartitionManager(num_buckets, self.k, dataset, strategy, num_devices)

        # this is the hyperparams of the optimizer - would be later moved
        # to database calls and retrieved from partition manager
        # axis 1 holds the two optimizer slots that are saved per embedding row
        self.optimizer_hyperparams_ent = np.zeros(shape=(self.num_ents, 2, self.k),
                                                  dtype=np.float32)
        self.optimizer_hyperparams_rel = np.zeros(shape=(self.num_rels, 2, self.k),
                                                  dtype=np.float32)

    @staticmethod
    def _pad_rows(values, total_rows):
        """Append zero rows to the 2-D array ``values`` until it has ``total_rows`` rows."""
        missing = total_rows - values.shape[0]
        return np.pad(values, ((0, missing), (0, 0)), 'constant', constant_values=0)

    def train_dataset_generator(self, dataset):
        """Yield ``dataset`` as consecutive ``int32`` slices of at most ``self.batch_size`` rows.

        The batch count is rounded up, so a row count that is an exact multiple of the
        batch size does not produce an empty trailing batch (``train`` divides each batch
        loss by the batch's row count).
        """
        # generator for the training data
        batch_count = int(np.ceil(dataset.shape[0] / self.batch_size))
        for j in range(batch_count):
            inputs = dataset[j * self.batch_size : (j+1) * self.batch_size, :].astype(np.int32)
            yield inputs

    def update_partion_embeddings_after_train(self):
        """Copy the trained rows and their optimizer slots back into the global stores."""
        # before changing the partition, save the trained embeddings and optimizer params
        ent_ids = list(self.ent_dict.keys())
        rel_ids = list(self.rel_dict.keys())
        num_ent = len(self.ent_dict)
        num_rel = len(self.rel_dict)

        # Only the first rows of the model matrices belong to the current partition.
        self.partition_manager.entity_embeddings[ent_ids, :] = \
            self.models.encoding_layer.ent_emb.numpy()[:num_ent, :]
        self.partition_manager.rel_embeddings[rel_ids, :] = \
            self.models.encoding_layer.rel_emb.numpy()[:num_rel, :]

        opt_weights = self.optimizers.get_weights()
        if len(opt_weights) > 0:
            # assumes the weight list is [iterations, ent slot 1, rel slot 1, ent slot 2,
            # rel slot 2] - TODO confirm against the optimizer implementation in use
            self.optimizer_hyperparams_rel[rel_ids, :, :] = np.concatenate(
                [opt_weights[2][:num_rel][:, np.newaxis, :],
                 opt_weights[4][:num_rel][:, np.newaxis, :]], 1)

            self.optimizer_hyperparams_ent[ent_ids, :, :] = np.concatenate(
                [opt_weights[1][:num_ent][:, np.newaxis, :],
                 opt_weights[3][:num_ent][:, np.newaxis, :]], 1)

    def change_partition(self):
        """Load the next partition into the model and, after epoch 1, restore its optimizer slots.

        Raises ``StopIteration`` when there is no partition left; ``get_next_batch``
        relies on that to detect the end of an epoch.
        """
        # load a new partition and update the trainable params and optimizer hyperparams
        self.ent_dict, self.rel_dict, remapped_triples, ent_embs, rel_embs = next(self.partition_iterator)
        print('partition has {} triples'.format(remapped_triples.shape[0]))
        self.partition_dataset_iterator = iter(self.train_dataset_generator(remapped_triples))
        self.models.partition_change_updates(len(self.ent_dict), ent_embs, rel_embs)
        # NOTE(review): epochs are numbered from 0, so the saved slots are not restored
        # during the first two epochs - confirm that `> 1` is intended.
        if self.global_epoch >1:
            # needs to be better handled
            ent_ids = list(self.ent_dict.keys())
            rel_ids = list(self.rel_dict.keys())

            # Saved slot values for this partition, zero-padded to the size of the model variables.
            rel_slot_1 = self._pad_rows(self.optimizer_hyperparams_rel[rel_ids, 0, :], self.num_rels)
            rel_slot_2 = self._pad_rows(self.optimizer_hyperparams_rel[rel_ids, 1, :], self.num_rels)
            ent_slot_1 = self._pad_rows(self.optimizer_hyperparams_ent[ent_ids, 0, :], self.max_ent_size)
            ent_slot_2 = self._pad_rows(self.optimizer_hyperparams_ent[ent_ids, 1, :], self.max_ent_size)

            self.optimizers.set_weights([self.optimizers.iterations.numpy(),
                                         ent_slot_1,
                                         rel_slot_1,
                                         ent_slot_2,
                                         rel_slot_2
                                        ])

    def get_next_batch(self):
        """Yield every batch of every partition once, persisting state between partitions."""
        try:
            self.partition_iterator = iter(self.partition_manager.get_next_partition())
            # get new partition
            self.change_partition()
            while True:
                try:
                    # get batches from the current partition
                    out = next(self.partition_dataset_iterator)
                    yield out
                except StopIteration:
                    # if no more batch data - save the trained params and load next partition
                    self.update_partion_embeddings_after_train()
                    self.change_partition()
        except StopIteration:
            # raised by change_partition() once the partition iterator is exhausted
            self.update_partion_embeddings_after_train()
            #if no more partitions, end
            return

    @tf.function(experimental_relax_shapes=True)
    def train_step(self, inputs, optimizer):
        """Run one optimisation step on ``inputs`` and return the regularised loss.

        The loss is ``nll`` of the model output plus ``0.0001 * sum(|emb| ** 3)`` over the
        entity and relation embedding matrices.
        """
        with tf.GradientTape() as tape:
            # get the model predictions
            preds = self.models(inputs, training=0)
            # compute the loss
            loss = nll(preds, self.eta)
            # regularizer - will be in a separate class like ampligraph 1
            loss += (0.0001 * (tf.reduce_sum(tf.pow(tf.abs(self.models.encoding_layer.ent_emb), 3)) + \
                              tf.reduce_sum(tf.pow(tf.abs(self.models.encoding_layer.rel_emb), 3))))

        # compute the grads
        gradients = tape.gradient(loss, [self.models.encoding_layer.ent_emb,
                                         self.models.encoding_layer.rel_emb])
        # update the trainable params
        optimizer.apply_gradients(zip(gradients, [self.models.encoding_layer.ent_emb,
                                                  self.models.encoding_layer.rel_emb]))
        return loss

    def train(self, epochs = 5):
        """Train for ``epochs`` passes over all partitions and print the mean per-row loss of each.

        Returns ``None``.
        """
        dataset = tf.data.Dataset.from_generator(self.get_next_batch,
                                                 output_types=(tf.int32),
                                                 output_shapes=((None, 3)))
        dataset = dataset.prefetch(0)

        for i in range(epochs):
            total_loss = []
            print(i)
            self.global_epoch = i

            for j, inputs in dataset.enumerate():
                self.global_batch = 0
                with tf.device('GPU:0'):
                    loss = self.train_step(inputs, self.optimizers)

                total_loss.append(loss/inputs.shape[0])

            print('\n\n\n\nloss------------------{}:{}'.format(i, np.mean(total_loss)))
        print('done')
        return


In [None]:
# Partitioned run: 'RandomVertices' strategy with 3 buckets, trained for one epoch and timed.
num_buckets = 3
model_trainer = ModelTrainer(
    batch_size=30000,
    max_ent_size=14505,
    k=300,
    dataset=full_dataset['train'],
    strategy='RandomVertices',
    num_buckets=num_buckets,
    num_devices=1,
)

start = time.time()
model_trainer.train(epochs=1)
end = time.time()

print(end - start)

In [None]:
# continue training the same model_trainer for 1 more epoch
start = time.time()
model_trainer.train(1)
end = time.time()

In [None]:
# Baseline for comparing the loss with the partitioned run: same settings, but the
# 'Bucket' strategy with a single bucket, trained for 2 epochs. Rebinds model_trainer.
model_trainer = ModelTrainer(
    batch_size=30000,
    max_ent_size=14505,
    k=300,
    dataset=full_dataset['train'],
    strategy='Bucket',
    num_buckets=1,
    num_devices=1,
)

start = time.time()
model_trainer.train(epochs=2)
end = time.time()

print(end - start)

In [None]:
from functools import partial



from ampligraph.evaluation import filter_unseen_entities

from ampligraph.datasets import NumpyDatasetAdapter
# Filter set: every known triple (train + valid + test) as strings.
# The builtin `str` replaces the deprecated `np.str` alias, which newer NumPy releases removed.
filter_triples = np.concatenate([full_dataset['train'],
                                full_dataset['valid'],
                                full_dataset['test']], 0).astype(str)

test_dataset = full_dataset['test']
dataset_handle = NumpyDatasetAdapter()
dataset_handle.use_mappings(model_trainer.partition_manager.rel_to_idx, 
                            model_trainer.partition_manager.ent_to_idx)

# need to fix filter_unseen_entities - this is temp workaround
# (it is handed the model object, so the trainer's mappings are attached to the model first)
model_trainer.models.ent_to_idx = model_trainer.partition_manager.ent_to_idx
model_trainer.models.rel_to_idx = model_trainer.partition_manager.rel_to_idx
test_dataset = filter_unseen_entities(test_dataset.astype(str), model_trainer.models, verbose=True)
print(test_dataset.shape)
dataset_handle.set_data(test_dataset, 'test')

print(filter_triples.shape)
filter_triples = filter_unseen_entities(filter_triples, model_trainer.models, verbose=True)
dataset_handle.set_filter(filter_triples)

print(filter_triples.shape)

class ModelEvaluator():
    """Rank test triples against the entity embeddings, one chunk of entities at a time.

    Parameters
    ----------
    ent_emb, rel_emb : np.ndarray
        Entity and relation embedding matrices, indexed by the ids found in the test data.
    models
        Model exposing ``k`` and ``get_ranks(inputs, ent_embs)``.
    max_ent_size : int
        Maximum number of entity rows passed to ``get_ranks`` in one call.
    dataset_handle
        Adapter exposing ``get_size('test')`` and
        ``get_next_batch(dataset_type, use_filter, batches_count)``.
    """

    def __init__(self, ent_emb, rel_emb, models, max_ent_size, dataset_handle):
        self.ent_emb = ent_emb
        self.rel_emb = rel_emb
        self.models = models
        self.max_ent_size = max_ent_size
        self.dataset_handle = dataset_handle

    @staticmethod
    def _num_chunks(num_rows, chunk_size):
        """Return how many ``chunk_size``-row slices are needed to cover ``num_rows`` rows."""
        # Round up: rounding to nearest drops the trailing partial chunk and yields
        # zero chunks when num_rows < chunk_size / 2.
        return int(np.ceil(num_rows / chunk_size))

    def get_next_batch(self):
        """Yield the (subject, predicate, object) embeddings of every batch of test triples.

        The adapter is asked for enough batches to cover the test set at roughly 100
        triples per batch.
        """
        batch_size = 100
        batch_count = self._num_chunks(self.dataset_handle.get_size('test'), batch_size)
        test_generator = partial(self.dataset_handle.get_next_batch,
                                 dataset_type='test',
                                 use_filter=False,
                                 batches_count=batch_count)

        print('Batches: ', batch_count)
        # zip() caps the iteration at batch_count and ends cleanly if the adapter yields
        # fewer batches; a bare next() inside a generator would raise RuntimeError instead.
        for _, out in zip(range(batch_count), test_generator()):
            yield self.ent_emb[out[:, 0]], self.rel_emb[out[:, 1]], self.ent_emb[out[:, 2]]

    def evaluate(self):
        """Fill ``self.all_ranks`` with one ``[subject_rank, object_rank]`` pair per test triple.

        The values returned by ``get_ranks`` are summed over all entity chunks and then
        incremented by one, so the stored ranks start at 1.
        """
        dataset = tf.data.Dataset.from_generator(self.get_next_batch,
                                                 output_types=(tf.float32, tf.float32, tf.float32),
                                                 output_shapes=((None, self.models.k),
                                                                (None, self.models.k),
                                                                (None, self.models.k)))
        dataset = dataset.prefetch(1)

        # The chunking of the entity matrix does not depend on the test batch.
        chunk_size = self.max_ent_size
        chunk_count = self._num_chunks(self.ent_emb.shape[0], chunk_size)

        self.all_ranks = []
        for inputs in dataset:
            overall_rank = np.zeros((inputs[0].shape[0], 2))
            for j in range(chunk_count):
                ent_embs = self.ent_emb[j * chunk_size:(j + 1) * chunk_size, :]
                with tf.device('GPU:0'):
                    sub_rank, obj_rank = self.models.get_ranks(inputs, ent_embs)
                overall_rank[:, 0] += sub_rank.numpy()
                overall_rank[:, 1] += obj_rank.numpy()
            overall_rank = overall_rank + 1
            self.all_ranks.extend(overall_rank.tolist())
    

In [None]:
# `max_entities` and `i` were never defined anywhere in the notebook, so this cell raised
# NameError on a fresh kernel; the chunk size now comes from the trainer itself.
model_evaluator = ModelEvaluator(model_trainer.partition_manager.entity_embeddings,
                                      model_trainer.partition_manager.rel_embeddings,
                                      model_trainer.models,
                                      max_ent_size=model_trainer.max_ent_size,
                                      dataset_handle=dataset_handle)



start=time.time()
model_evaluator.evaluate()
# elapsed minutes
print((time.time()-start)/60)


In [None]:
# evaluate() already adds 1 to the summed per-chunk ranks, so only the list -> array
# conversion happens here. Adding 1 again shifted every rank by one and made the cell
# change its result each time it was re-run.
model_evaluator.all_ranks = np.array(model_evaluator.all_ranks)

In [None]:
# MRR and Hits@2 computed from the ranks collected by model_evaluator.evaluate().
mrr_score(model_evaluator.all_ranks), hits_at_n_score(model_evaluator.all_ranks, 2)