<h1>Imports</h1>

In [1]:
import os
import re
import time
import warnings
from functools import partial

import keras
import numpy as np
import pandas as pd
from keras import backend as K
from keras import layers
from keras import optimizers
from keras.layers import merge
from sklearn.utils import shuffle

Using TensorFlow backend.


<h1>Functions for the training/test model</h1>

In [2]:
def rel_init(k,shape, dtype=None):
    """Weight initializer producing unit-length vectors along the last axis.

    A tensor of the requested ``shape`` is sampled uniformly from
    [-6/sqrt(k), 6/sqrt(k)] and then divided by its own L2 norm, computed
    along the last axis.

    k     : number whose square root scales the sampling interval
    shape : shape of the tensor to create
    dtype : forwarded unchanged to K.random_uniform
    """
    bound   = 6/np.sqrt(k)
    sample  = K.random_uniform(shape=shape, minval=-bound, maxval=bound, dtype=dtype)
    # keepdims=True so the norm broadcasts against the sample in the division
    lengths = K.sqrt(K.sum(K.pow(sample,2), axis=-1, keepdims=True))
    return sample/lengths

def L2_norm(x, keepdims=True):
    """Return the Euclidean (L2) norm of ``x`` taken along its last axis.

    keepdims : when True the reduced axis is kept with size 1.
    """
    squared_sum = K.sum(K.pow(x,2), axis=-1, keepdims=keepdims)
    return K.sqrt(squared_sum)

def L1_norm(x, keepdims=True):
    """Return the L1 norm (sum of absolute values) of ``x`` along its last axis.

    keepdims : when True the reduced axis is kept with size 1.
    """
    magnitudes = K.abs(x)
    return K.sum(magnitudes, axis=-1, keepdims=keepdims)


def loss_function(y_true, y_pred):
    """Loss used to compile the model: the sum of all values in ``y_pred``.

    ``y_true`` is never read. It is only part of the signature because the
    keras "fit" method requires it, so a y argument must still be passed
    when calling "fit".
    """
    total = K.sum(y_pred)
    return total

<h1>Dataset manipulation functions</h1>

In [3]:
def create_indexed_dataset(dataset, entities, rel):
    """Replace entity/relationship labels in ``dataset`` with their integer indices.

    Parameters
    ----------
    dataset  : DataFrame with the label columns 'h' (head), 'l' (relationship)
               and 't' (tail).
    entities : DataFrame with the columns 'index' and 'entity'.
    rel      : DataFrame with the columns 'index' and 'relationship'.

    Returns
    -------
    DataFrame with the columns 'index', 'h', 't', 'l' (in that order), where
    'index' is a fresh 0..n-1 row counter and the other columns hold the
    indices looked up in ``entities`` / ``rel``.

    Notes
    -----
    The lookups are inner joins (the pd.merge default), so rows whose head,
    tail or relationship label is missing from the lookup tables are dropped.
    """
    # (column to translate, lookup table, name of the label column in that table)
    lookups = (("h", entities, "entity"),
               ("t", entities, "entity"),
               ("l", rel, "relationship"))

    index_dataset = dataset
    for column, table, label in lookups:
        # Join on the label column only. Passing left_index=True together with
        # `on` mixes two key specifications, which recent pandas rejects.
        index_dataset = (
            pd.merge(index_dataset, table.rename(columns={label: column}), on=column)
            .drop(columns=column)                  # discard the textual label...
            .rename(columns={"index": column})     # ...and keep its index under the same name
        )

    # Renumber the rows, then expose the row number as the 'index' column.
    return index_dataset.reset_index(drop=True).reset_index()


def generate_pos_neg_set(index_dataset, entities_size=None):
    """Pair every positive triplet with a corrupted (negative) one.

    Half of the shuffled triplets get a random head, the other half a random
    tail. Corrupted triplets that coincide with a triplet of ``index_dataset``
    are discarded together with their positive counterpart.

    Parameters
    ----------
    index_dataset : DataFrame with the integer columns 'index', 'h', 't', 'l'.
    entities_size : number of entities to sample replacement indices from
                    (exclusive upper bound). When omitted it is read from the
                    first weight matrix of ``model.layers[2]`` of the global
                    ``model``, which keeps the original one-argument call working.

    Returns
    -------
    [e, l] : ``e`` is an (n, 4) array with the columns h, t, h1, t1 (positive
             head/tail followed by corrupted head/tail); ``l`` is an (n, 1)
             array with the relationship index shared by both triplets.
    """
    if entities_size is None:
        # Backward-compatible fallback on the notebook-level model.
        entities_size = np.shape(model.layers[2].get_weights()[0])[0]

    #generate negative triplets
    shuffled = index_dataset.sample(frac=1, random_state=np.random.randint(0, 10000))
    half = len(shuffled) // 2
    # Positional split works for odd lengths too; .copy() makes the column
    # assignments below operate on independent frames rather than on slices.
    corrupted_head = shuffled.iloc[:half].copy()
    corrupted_tail = shuffled.iloc[half:].copy()
    corrupted_head['h'] = np.random.randint(0, entities_size, size=len(corrupted_head))
    corrupted_tail['t'] = np.random.randint(0, entities_size, size=len(corrupted_tail))
    negative = pd.concat((corrupted_head, corrupted_tail))

    #remove fake negative triplets (those that are actually positive triplets)
    flagged = pd.merge(index_dataset.drop(columns='index'), negative,
                       on=['h', 't', 'l'], how='right', indicator='Exist')
    negative = (flagged.loc[flagged['Exist'] != 'both', ['index', 'h', 't']]
                       .rename(columns={"h": "h1", "t": "t1"}))

    #merge positive and negative triplets through the shared row id
    total = pd.merge(index_dataset, negative, on='index')

    e = total[['h', 't', 'h1', 't1']].to_numpy()
    l = total[['l']].to_numpy()

    return [e, l]

<h1>Training model definition</h1>

In [4]:
def training_model_creation(rel_set, entities_set, k, norm_type='L2', optimizer='sgd', margin=None):
    """Build and compile the training model.

    The model takes two inputs: a (batch, 4) array of entity indices laid out
    as [h, t, h1, t1] (positive head/tail, negative head/tail) and a
    (batch, 1) array with the relationship index. Its output is
    max(margin + d(h + l, t) - d(h1 + l, t1), 0), where d is the selected norm.

    Parameters
    ----------
    rel_set      : collection of relationships; only its length is used.
    entities_set : collection of entities; only its length is used.
    k            : embedding dimension.
    norm_type    : 'L2' or 'L1', the norm used as dissimilarity measure.
    optimizer    : forwarded to ``model.compile``.
    margin       : the margin added before clipping at zero. When omitted the
                   notebook-level ``gamma`` is used, as before.

    Returns
    -------
    The compiled keras.Model.

    Raises
    ------
    ValueError if ``norm_type`` is neither 'L2' nor 'L1'.

    Notes
    -----
    Other cells of this notebook address the embedding layers by position
    (``model.layers[2]`` for entities, ``model.layers[3]`` for relationships),
    so the order in which the layers are created below must not change.
    """
    #Reject an unknown norm up front (it used to fail later with an UnboundLocalError)
    norm_functions={'L2': L2_norm, 'L1': L1_norm}
    if norm_type not in norm_functions:
        raise ValueError("norm_type must be 'L2' or 'L1', got {!r}".format(norm_type))
    dissimilarity=norm_functions[norm_type]

    if margin is None:
        margin=gamma

    #get the sizes of the sets of unique relationships and entities

    rel_size=np.shape(rel_set)[0]
    entities_size=np.shape(entities_set)[0]

    #Define initializers for the embeddings

    init=keras.initializers.RandomUniform(minval=-6/np.sqrt(k), maxval=6/np.sqrt(k))
    rel_initializer=partial(rel_init,k)

    #Define norm constraint on entities' embeddings
    norm=keras.constraints.UnitNorm(axis=1)

    #Define input shape
    in_e=keras.Input((4,))
    in_r=keras.Input((1,))

    #Define embedding layers
    embedding_e = layers.Embedding(entities_size, k, input_length=4, embeddings_initializer=init, embeddings_constraint=norm)(in_e)
    embedding_r = layers.Embedding(rel_size, k, input_length=1, embeddings_initializer=rel_initializer)(in_r)

    #Concatenate embedding layers into one layer
    embedding   = layers.Concatenate(axis=1)([embedding_e,embedding_r])

    #Find the values for the triplets (h,l,t) (positive) and (h1,l,t1) (negative)

    h           = layers.Lambda( lambda y: y[:,0,:])(embedding)
    t           = layers.Lambda( lambda y: y[:,1,:])(embedding)
    h1          = layers.Lambda( lambda y: y[:,2,:])(embedding)
    t1          = layers.Lambda( lambda y: y[:,3,:])(embedding)
    l           = layers.Lambda( lambda y: y[:,4,:])(embedding)

    pos         = merge.subtract([merge.add([h,l]),t])
    neg         = merge.subtract([merge.add([h1,l]),t1])

    #Compute the dissimilarities/energies of the two triplets...

    pos_mid     = layers.Lambda(dissimilarity)(pos)
    neg_mid     = layers.Lambda(dissimilarity)(neg)

    #...then subtract them

    sub         = merge.subtract([pos_mid,neg_mid])

    #finally compute [margin+d(positive_triplet)-d(negative_triplet)]_+

    out         = layers.Lambda(lambda y: K.maximum(y+margin,0))(sub)


    model=keras.Model([in_e,in_r], out)

    model.compile(loss=loss_function, optimizer=optimizer)
    return model

<h1>Function for test model definition and execution</h1>

<h3>The function performs prediction for either the head, the tail or the relationship of the triplet</h3>

In [5]:
def Multitask_Tester(model, index_dataset, rel_set, entities_set, task='tail', norm_type='L2', batch_test=25):
    """Evaluate trained embeddings by ranking every candidate for one slot of each triplet.

    For each test triplet the missing element (head, tail or relationship) is
    scored against every possible candidate; candidates are ordered by
    increasing dissimilarity and the position of the true element is its rank
    (0 = best).

    Parameters
    ----------
    model         : trained model whose ``layers[2]`` holds the entity
                    embeddings and ``layers[3]`` the relationship embeddings.
    index_dataset : DataFrame with the column 'l' (relationship index); once
                    'l' is removed the remaining columns are read positionally,
                    column 0 as head index and column 1 as tail index.
    rel_set       : collection of relationships; only its length is used.
    entities_set  : collection of entities; only its length is used.
    task          : 'relationship', 'head' or 'tail' - the slot to predict.
    norm_type     : 'L2' or 'L1', the norm used as dissimilarity measure.
    batch_test    : batch size of the test model; the number of test triplets
                    must be a non-zero multiple of it because the candidate
                    list is fed as a constant tensor with exactly
                    ``batch_test`` rows.

    Returns
    -------
    (mean_rank, hit10, hit1) : truncated mean rank, and the percentage of
    triplets (rounded to two decimals) whose true element is ranked in the
    first 10 positions / in the first position.

    Raises
    ------
    ValueError for an unknown ``task`` or ``norm_type``, or when the number of
    test triplets is zero or not a multiple of ``batch_test``.
    """
    #fail early with a clear message instead of an UnboundLocalError further down
    if task not in ('relationship', 'head', 'tail'):
        raise ValueError("task must be 'relationship', 'head' or 'tail', got {!r}".format(task))
    norm_functions={'L2': L2_norm, 'L1': L1_norm}
    if norm_type not in norm_functions:
        raise ValueError("norm_type must be 'L2' or 'L1', got {!r}".format(norm_type))
    dissimilarity=norm_functions[norm_type]

    #computing sizes to provide to the embedding layers
    rel_size=np.shape(rel_set)[0]
    entities_size=np.shape(entities_set)[0]

    #embedding dimension read from the trained entity embeddings (instead of a
    #notebook-level global), so it always agrees with the weights copied below
    emb_dim=np.shape(model.layers[2].get_weights()[0])[1]

    #dividing the dataset for testing
    Test_entities_set=index_dataset.drop(columns=['l']).to_numpy()  #Head [0], Tail[1]
    Test_rel_set=index_dataset['l'].to_numpy() #Relationship

    n=np.shape(Test_rel_set)[0]
    if n==0 or n%batch_test!=0:
        raise ValueError("the number of test triplets ({}) must be a non-zero multiple of "
                         "batch_test ({})".format(n, batch_test))

    if task=='relationship':

        in_e=keras.Input(batch_shape=(batch_test,2))

        #using a constant input for the relationships
        in_r=keras.Input(tensor=K.constant(np.array(list(range(rel_size))*batch_test), shape=(batch_test,rel_size)))

        embedding_e = layers.Embedding(entities_size, emb_dim, input_length=2)(in_e)
        embedding_r = layers.Embedding(rel_size, emb_dim, input_length=rel_size)(in_r)

        embedding   = layers.Concatenate(axis=1)([embedding_e,embedding_r])

        h           = layers.Lambda( lambda y: y[:,0,:])(embedding)
        t           = layers.Lambda( lambda y: y[:,1,:])(embedding)
        l           = layers.Lambda( lambda y: y[:,2:,:])(embedding)

        ents        = merge.subtract([h,t])

        dist        = merge.add([l,ents])

    elif task=='head':

        in_t=keras.Input(batch_shape=(batch_test,1))

        #using a constant input for the heads
        in_h=keras.Input(tensor=K.constant(np.array(list(range(entities_size))*batch_test), shape=(batch_test,entities_size)))

        in_r=keras.Input(batch_shape=(batch_test,1))

        embedding_r = layers.Embedding(rel_size, emb_dim, input_length=1)(in_r)
        embedding_t = layers.Embedding(entities_size, emb_dim, input_length=1)(in_t)
        embedding_h = layers.Embedding(entities_size, emb_dim, input_length=entities_size)(in_h)

        embedding   = layers.Concatenate(axis=1)([embedding_r,embedding_t,embedding_h])

        l           = layers.Lambda( lambda y: y[:,0,:])(embedding)
        t           = layers.Lambda( lambda y: y[:,1,:])(embedding)
        h           = layers.Lambda( lambda y: y[:,2:,:])(embedding)

        rhs         = merge.subtract([l,t])

        dist        = merge.add([h,rhs])

    else: #task=='tail'

        in_h=keras.Input(batch_shape=(batch_test,1))

        #using a constant input for the tails
        in_t=keras.Input(tensor=K.constant(np.array(list(range(entities_size))*batch_test), shape=(batch_test,entities_size)))

        in_r=keras.Input(batch_shape=(batch_test,1))

        embedding_r = layers.Embedding(rel_size, emb_dim, input_length=1)(in_r)
        embedding_h = layers.Embedding(entities_size, emb_dim, input_length=1)(in_h)
        embedding_t = layers.Embedding(entities_size, emb_dim, input_length=entities_size)(in_t)

        embedding   = layers.Concatenate(axis=1)([embedding_r,embedding_h,embedding_t])

        l           = layers.Lambda( lambda y: y[:,0,:])(embedding)
        h           = layers.Lambda( lambda y: y[:,1,:])(embedding)
        t           = layers.Lambda( lambda y: y[:,2:,:])(embedding)

        lhs         = merge.add([h,l])

        dist        = merge.subtract([lhs,t])

    #computing the inverse of the dissimilarity measure, so that the candidate
    #with the smallest dissimilarity gets the largest score
    res         = layers.Lambda( lambda y: K.pow(dissimilarity(y,False),-1))(dist)

    #finalization of the model
    if task=='relationship':

        #sorting the candidates by decreasing score, i.e. increasing dissimilarity
        sorted_data = layers.Lambda(lambda y: K.tf.nn.top_k(y, k=rel_size, sorted=True).indices)(res)

        #creating the model with the layers defined above
        model_test=keras.Model([in_e,in_r], sorted_data)

        #copying the embedding layers from the trained model
        model_test.layers[2].set_weights(model.layers[2].get_weights())
        model_test.layers[3].set_weights(model.layers[3].get_weights())

        #only the non-constant input is fed
        feed=Test_entities_set.reshape(n,2)
        y_true=Test_rel_set.reshape(n,1)

    elif task=='head':

        sorted_data = layers.Lambda(lambda y: K.tf.nn.top_k(y, k=entities_size , sorted=True).indices)(res)
        model_test=keras.Model([in_r,in_h,in_t], sorted_data)

        #layers[3] is the relationship embedding, layers[4] and layers[5] the entity ones
        model_test.layers[3].set_weights(model.layers[3].get_weights())
        model_test.layers[4].set_weights(model.layers[2].get_weights())
        model_test.layers[5].set_weights(model.layers[2].get_weights())

        rel=Test_rel_set.reshape(n,1)
        tails= Test_entities_set[:,1].reshape(n,1)

        feed=[rel,tails]
        y_true=Test_entities_set[:,0].reshape(n,1)

    else: #task=='tail'

        sorted_data = layers.Lambda(lambda y: K.tf.nn.top_k(y, k=entities_size , sorted=True).indices)(res)
        model_test=keras.Model([in_r,in_t,in_h], sorted_data)

        #layers[3] is the relationship embedding, layers[4] and layers[5] the entity ones
        model_test.layers[3].set_weights(model.layers[3].get_weights())
        model_test.layers[4].set_weights(model.layers[2].get_weights())
        model_test.layers[5].set_weights(model.layers[2].get_weights())

        rel=Test_rel_set.reshape(n,1)
        heads= Test_entities_set[:,0].reshape(n,1)

        feed=[rel,heads]
        y_true=Test_entities_set[:,1].reshape(n,1)

    #actual testing: each output row lists the candidate indices from best to worst,
    #so the column in which the true index appears is its rank
    out=model_test.predict(feed, batch_size=batch_test)
    ranks=np.argwhere(out==y_true)[:,-1]

    mean_rank=int(np.mean(ranks))
    hit10=float("{:.2f}".format(np.count_nonzero(ranks<10)*100/len(ranks)))
    hit1=float("{:.2f}".format(np.count_nonzero(ranks==0)*100/len(ranks)))

    return mean_rank, hit10, hit1

<h1>Main</h1>

<h3>Hyperparameters' Definition</h3>

In [6]:
# Embedding dimension (alternative: 20)
k=50
# Margin of the ranking loss
gamma=1
# Norm used as dissimilarity measure (alternative: 'L2')
norm_type='L1'
# Optimizer name (alternative: 'Adam')
optimizer='SGD'
# Learning rate (alternative: 0.01)
eta=0.001
# Training batch size
batch=100

<h3>Importing training set to create model with dummy embeddings</h3>

In [7]:
#reading the training triplets: tab-separated columns head, relationship, tail
train_path=os.getcwd()+'/FB15k/freebase_mtr100_mte100-train.txt'
dataset=pd.read_csv(train_path, sep='\t', names=['h','l','t'])

#entities dataframe: every distinct entity found among heads and tails, in order
#of first appearance (all heads are scanned before the tails), plus its index
all_entities=np.hstack([dataset['h'], dataset['t']])
entities=pd.DataFrame(pd.Series(all_entities).unique(), columns=['entity']).reset_index()

#relationships dataframe: every distinct relationship plus its index
rel=pd.DataFrame(dataset['l'].unique(), columns=['relationship']).reset_index()

#training triplets expressed through the indices defined above
index_dataset=create_indexed_dataset(dataset, entities, rel)

<h3>Creating model with dummy embeddings</h3>

In [8]:
#the embeddings of this model are placeholders until the saved ones are loaded
model=training_model_creation(rel_set=rel, entities_set=entities, k=k, norm_type=norm_type)







<h3>Load the embeddings</h3>

In [9]:
#folder holding the embeddings saved for the current hyperparameter configuration
path=os.getcwd()+'/Results/BATCH'+str(batch)+'/'+norm_type+'/k'+str(k)+'/'+optimizer+'/'

#the learning rate must appear in the file name as a complete number: it is
#escaped (so that "." is a literal dot) and may not touch another digit,
#otherwise e.g. eta=0.01 would also select a file saved for eta=0.001
eta_pattern=r'(?<!\d)'+re.escape(str(eta))+r'(?!\d)'

entities_emb=None
rel_emb=None

try:
    #sorted for a deterministic choice should several files match
    for el in sorted(os.listdir(path)):
        if re.search('entities.*'+eta_pattern, el):
            entities_emb=np.load(path+el)
        if re.search('rel.*'+eta_pattern, el):
            rel_emb=np.load(path+el)
except (OSError, ValueError) as err:
    warnings.warn('Could not read the saved embeddings from '+path+': '+str(err))

#loading stays best-effort, but a failure is reported instead of being silently
#ignored: without the saved weights the model keeps its initial embeddings and
#the test results below do not describe a trained model
if entities_emb is None or rel_emb is None:
    warnings.warn('No complete pair of entity/relationship embeddings for eta='+str(eta)+
                  ' was loaded from '+path+'; the model keeps its initial embeddings.')
else:
    try:
        model.layers[2].set_weights(entities_emb) #set entities embeddings
        model.layers[3].set_weights(rel_emb) #set relationships embeddings
    except ValueError as err:
        warnings.warn('The saved embeddings do not fit the model: '+str(err))









<h3>Importing test set</h3>

In [10]:
#reading the test triplets and translating them with the indices built on the training set
test_dataset=pd.read_csv(os.getcwd()+'/FB15k/freebase_mtr100_mte100-test.txt', sep='\t',
                         names=['h','l','t'])

#the row counter is not needed for testing; `columns=` replaces the positional
#axis argument, which recent pandas versions no longer accept.
#Only the first 19000 triplets are kept.
index_test=create_indexed_dataset(test_dataset, entities, rel).drop(columns='index').head(19000)

<h3>Testing the model on the test set</h3>

In [11]:
batch_test=19 #test set size must be a multiple of test batch size

#the three prediction tasks are evaluated one after the other with identical settings
R,H,T=[Multitask_Tester(model, index_test, rel, entities,
                        task=task_name, norm_type=norm_type, batch_test=batch_test)
       for task_name in ('relationship','head','tail')]

#order: relationship, head, tail
Test_Res=[R,H,T]

In [12]:
#entity prediction is reported as the average of the head and tail results
entity_scores=np.mean(Test_Res[1:], axis=0)
relationship_scores=Test_Res[0]

print("\t\t\tMean Rank\tHit@10\tHit@1")
print('Entity prediction\t:',entity_scores)
print('Relationship prediction\t:',relationship_scores)

			Mean Rank	Hit@10	Hit@1
Entity prediction	: [159.5    41.73    7.695]
Relationship prediction	: (33, 74.33, 20.83)
