In [1]:
import os,json,pickle,collections,time
import numpy as np
from utils import *
from keras.backend.tensorflow_backend import set_session

# Expose only CUDA device "0" to this process and lower TensorFlow's native log verbosity.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TF_CPP_MIN_LOG_LEVEL"]="2"
# Only show Python-side TensorFlow log messages at ERROR level.
# NOTE(review): `tf` is not imported in this cell; presumably it arrives via `from utils import *` — confirm.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Create a TF1-style session whose GPU options request on-demand memory growth.
# NOTE(review): `set_session` is imported above but never called in the visible code,
# so `sess` is not explicitly registered with Keras — confirm this is intended.
config = tf.ConfigProto()  
config.gpu_options.allow_growth=True  
sess = tf.Session(config=config)  

Using TensorFlow backend.


In [2]:
import keras
from keras.layers import *
from layer import GAT
from keras import activations, constraints, initializers, regularizers
from keras import backend as K
from keras.layers import Layer, Dropout, LeakyReLU
import tensorflow as tf

class TokenEmbedding(keras.layers.Embedding):
    """Embedding layer that outputs its whole weight matrix.

    The tensor supplied when the layer is called is not used in the
    computation: calling the layer always yields the full embedding table
    rather than a per-index lookup.
    """

    def call(self, inputs):
        """Return the embedding table, wrapped in an identity op."""
        table = self.embeddings
        return K.identity(table)

    def compute_mask(self, inputs, mask=None):
        """Propagate no mask to downstream layers."""
        return None

    def compute_output_shape(self, input_shape):
        """Report the shape of the table: ``(input_dim, output_dim)``."""
        return (self.input_dim, self.output_dim)
    
def get_attr_model(node_size,prop_size,node_hidden,batch_size,
                   n_attn_heads = 2,dropout_rate = 0.3,gamma = 3,lr = 0.005,activation='relu'):
    """Build the attribute-based alignment model.

    Entity property vectors are passed through Dense -> BatchNormalization ->
    Dense -> Dropout to obtain one embedding per entity. Training uses a
    margin-based loss over index quadruples supplied through a second input.

    Args:
        node_size: Unused by this function; kept for signature compatibility.
        prop_size: Width of each entity's property vector.
        node_hidden: Output width of both Dense layers (embedding size).
        batch_size: Divisor applied to the summed loss.
        n_attn_heads: Unused by this function; kept for signature compatibility.
        dropout_rate: Dropout rate applied to the final embeddings.
        gamma: Margin of the ranking loss.
        lr: Learning rate of the Adam optimizer.
        activation: Activation used by both Dense layers.

    Returns:
        ``(train_model, feature_model)``. ``train_model`` takes
        ``[properties, alignment quadruples]`` and outputs the loss value itself;
        ``feature_model`` takes ``[properties]`` and outputs the entity embeddings.
    """
    # Inputs carry a dummy leading batch axis of size 1 (callers use
    # np.expand_dims(..., axis=0)); it is squeezed away before use.
    ent_p = Input(shape=(None,prop_size))
    ent_input = Lambda(lambda x: K.cast(K.squeeze(x,axis=0), dtype='float32'))(ent_p)
    ent_hidden = Dense(node_hidden,activation=activation)(ent_input)
    ent_hidden = BatchNormalization()(ent_hidden)
    output = Dense(node_hidden,activation=activation)(ent_hidden)
    output = Dropout(dropout_rate)(output)

    # Each row of the alignment input holds four entity indices; gather the
    # corresponding four embeddings per row.
    alignment_input = Input(shape=(None,4))
    find = Lambda(lambda x:K.gather(reference=x[0],indices=K.cast(K.squeeze(x[1],axis=0), 'int32')))([output,alignment_input])

    def loss_function(tensor):
        """Margin loss over (l, r, fl, fr) embedding quadruples, using L1 distance."""
        def dis(ll,rr):
            return K.sum(K.abs(ll-rr),axis=-1,keepdims=True)
        l,r,fl,fr = [tensor[:,0,:],tensor[:,1,:],tensor[:,2,:],tensor[:,3,:]]
        loss = K.relu(gamma + dis(l,r) - dis(l,fr)) + K.relu(gamma + dis(l,r) - dis(fl,r))
        # K.sum(..., keepdims=True) replaces tf.reduce_sum(..., keep_dims=True):
        # the `keep_dims` keyword is deprecated in TF 1.x and removed in TF 2.
        return K.sum(loss,keepdims=True) / (batch_size)
    loss = Lambda(loss_function)(find)

    inputs = [ent_p]
    train_model = keras.Model(inputs = inputs + [alignment_input],outputs = loss)
    # The model's output already is the loss, so the Keras loss just passes it through.
    train_model.compile(loss=lambda y_true,y_pred: y_pred,optimizer=keras.optimizers.Adam(lr=lr))

    feature_model = keras.Model(inputs = inputs,outputs = [output])
    return train_model,feature_model

def get_struc_model(node_size,rel_size,node_hidden,rel_hidden,triple_size,batch_size,
                    n_attn_heads = 2,dropout_rate = 0.3,gamma = 3,lr = 0.005,depth = 2):
    """Build the structure-based alignment model.

    Trainable entity and relation embedding tables are fed, together with three
    index inputs, into the project's ``GAT`` layer. Training uses a margin-based
    loss over index quadruples supplied through a fourth input.

    Args:
        node_size: Number of rows in the entity embedding table; also passed to ``GAT``.
        rel_size: Number of rows in the relation embedding table; also passed to ``GAT``.
        node_hidden: Width of the entity embeddings.
        rel_hidden: Width of the relation embeddings.
        triple_size: Forwarded to ``GAT``.
        batch_size: Divisor applied to the summed loss.
        n_attn_heads: Forwarded to ``GAT`` as ``attn_heads``.
        dropout_rate: Forwarded to ``GAT`` and also applied to its output.
        gamma: Margin of the ranking loss.
        lr: Learning rate of the Adam optimizer.
        depth: Forwarded to ``GAT``.

    Returns:
        ``(train_model, feature_model)``. ``train_model`` takes
        ``[adj_input, rel_adj, ent_adj, alignment quadruples]`` and outputs the loss
        value itself; ``feature_model`` takes the first three inputs and outputs
        the entity features.
    """
    adj_input = Input(shape=(None,2))
    rel_adj = Input(shape=(None,2))
    ent_adj = Input(shape=(None,2))

    # TokenEmbedding ignores the tensor it is called on and returns its full
    # weight table; adj_input is passed only to connect the layer to the graph.
    org_feature = TokenEmbedding(node_size,node_hidden,trainable = True)(adj_input)
    rel_feature = TokenEmbedding(rel_size,rel_hidden,trainable = True)(adj_input)
    gat_in = [org_feature,rel_feature,adj_input,rel_adj,ent_adj]

    ent_feature = GAT(node_size,activation='relu',
                               rel_size = rel_size,
                               depth = depth,
                               attn_heads=n_attn_heads,
                               triple_size = triple_size,
                               attn_heads_reduction='average',
                               dropout_rate=dropout_rate)(gat_in)
    ent_feature = Dropout(dropout_rate)(ent_feature)

    # Each row of the alignment input holds four entity indices; gather the
    # corresponding four feature vectors per row.
    alignment_input = Input(shape=(None,4))
    find = Lambda(lambda x:K.gather(reference=x[0],indices=K.cast(K.squeeze(x[1],axis=0), 'int32')))([ent_feature,alignment_input])

    def loss_function(tensor):
        """Margin loss over (l, r, fl, fr) feature quadruples, using L1 distance."""
        def dis(ll,rr):
            return K.sum(K.abs(ll-rr),axis=-1,keepdims=True)
        l,r,fl,fr = [tensor[:,0,:],tensor[:,1,:],tensor[:,2,:],tensor[:,3,:]]
        loss = K.relu(gamma + dis(l,r) - dis(l,fr)) + K.relu(gamma + dis(l,r) - dis(fl,r))
        # K.sum(..., keepdims=True) replaces tf.reduce_sum(..., keep_dims=True):
        # the `keep_dims` keyword is deprecated in TF 1.x and removed in TF 2.
        return K.sum(loss,keepdims=True) / (batch_size)
    loss = Lambda(loss_function)(find)

    inputs = [adj_input,rel_adj,ent_adj]
    train_model = keras.Model(inputs = inputs + [alignment_input],outputs = loss)
    # The model's output already is the loss, so the Keras loss just passes it through.
    train_model.compile(loss=lambda y_true,y_pred: y_pred,optimizer=keras.optimizers.Adam(lr=lr))

    feature_model = keras.Model(inputs = inputs,outputs = [ent_feature])
    return train_model,feature_model

In [3]:
def model_attr(ent_prop_matrix,train_pair,dim=100,lr=0.005):
    """Train the attribute model and return the resulting entity embeddings.

    Reads the notebook-level globals ``node_size``, ``prop_size``, ``batch_size``,
    ``ents1`` and ``ents2``, which must be defined before this is called.

    Args:
        ent_prop_matrix: Entity property matrix fed to the model.
        train_pair: Aligned entity pairs used to build training batches.
        dim: Hidden/embedding width of the model.
        lr: Learning rate forwarded to ``get_attr_model``.

    Returns:
        The output of the feature model's ``predict_on_batch`` on the property matrix.
    """
    # `lr` is forwarded here; it was previously ignored in favour of a hard-coded 0.005.
    model,get_emb = get_attr_model(lr=lr,dropout_rate=0.3,node_size=node_size,
                              prop_size=prop_size,batch_size = batch_size,gamma = 3,node_hidden=dim)
    losses,ave_loss =[],[]
    batch = get_train_set(train_pair,ents1,ents2,batch_size=batch_size)
    for epoch in range(2000):
        train_set = next(batch)
        inputs = [ent_prop_matrix,train_set]
        # The model expects a leading batch axis of size 1 on every input.
        inputs = [np.expand_dims(item,axis=0) for item in inputs]

        # The target is a dummy: the compiled loss returns the model output itself.
        loss = model.train_on_batch(inputs, np.array([0]))
        losses.append(loss)
        # Running mean over the most recent 100 steps.
        ave_loss.append(np.mean(losses[-100:]))

        if epoch%100 == 0:
            print('\tEpoch %d \tloss=%.5f \tave_loss=%.4f'%(epoch,loss,ave_loss[-1]))

        # Stop once the running mean is higher than it was 99 steps earlier,
        # on a step whose own loss is below the running mean.
        if epoch > 500 and loss<ave_loss[-1] and ave_loss[-1] > ave_loss[-100]:
            print('\tEpoch %d \tloss=%.5f \tave_loss=%.4f'%(epoch,loss,ave_loss[-1]))
            break # Early stop.

    # Drop the alignment input: the feature model only takes the property matrix.
    vec = get_emb.predict_on_batch(inputs[:-1])
    return vec
        
def model_struc(adj_matrix,rel_matrix,ent_matrix,train_pair,dim=100,lr=0.005):
    """Train the structure model and return the resulting entity embeddings.

    Reads the notebook-level globals ``node_size``, ``rel_size``, ``triple_size``,
    ``batch_size``, ``ents1`` and ``ents2``, which must be defined before this is called.

    Args:
        adj_matrix: First index array fed to the model.
        rel_matrix: Second index array fed to the model.
        ent_matrix: Third index array fed to the model.
        train_pair: Aligned entity pairs used to build training batches.
        dim: Width of both the entity and the relation embeddings.
        lr: Learning rate forwarded to ``get_struc_model``.

    Returns:
        The output of the feature model's ``predict_on_batch`` on the three index arrays.
    """
    # `lr` is forwarded here; it was previously ignored in favour of a hard-coded 0.005.
    model,get_emb = get_struc_model(lr=lr,dropout_rate=0.3,node_size=node_size,rel_size=rel_size,n_attn_heads = 2,
                                      depth=2,gamma = 3,node_hidden=dim,rel_hidden=dim,
                                      triple_size = triple_size,batch_size = batch_size)
    losses,ave_loss =[],[]
    batch = get_train_set(train_pair,ents1,ents2,batch_size=batch_size)
    for epoch in range(2000):
        train_set = next(batch)
        inputs = [adj_matrix,rel_matrix,ent_matrix,train_set]
        # The model expects a leading batch axis of size 1 on every input.
        inputs = [np.expand_dims(item,axis=0) for item in inputs]

        # The target is a dummy: the compiled loss returns the model output itself.
        loss = model.train_on_batch(inputs, np.array([0]))
        losses.append(loss)
        # Running mean over the most recent 100 steps.
        ave_loss.append(np.mean(losses[-100:]))

        if epoch%100 == 0:
            print('\tEpoch %d \tloss=%.5f \tave_loss=%.4f'%(epoch,loss,ave_loss[-1]))

        # Stop once the running mean is higher than it was 99 steps earlier,
        # on a step whose own loss is below the running mean.
        if epoch > 500 and ave_loss[-1] > ave_loss[-100] and loss<ave_loss[-1]:
            print('\tEpoch %d \tloss=%.5f \tave_loss=%.4f'%(epoch,loss,ave_loss[-1]))
            break # Early stop.

    # Drop the alignment input: the feature model only takes the three index arrays.
    vec = get_emb.predict_on_batch(inputs[:-1])
    return vec

In [4]:
def get_pairs(vec,anchors,ents1,ents2,a0,q):
    """Propose new aligned pairs among entities that are not yet anchored.

    A pair is proposed when the two entities are each other's top-ranked match
    under cosine similarity and that similarity exceeds a threshold.

    Reads the notebook-level global ``n_iter``: the threshold is
    ``max(a0 - n_iter*q, 0.8)``.

    Args:
        vec: Embeddings indexable by entity id.
        anchors: 2-D array of known pairs; column 0 is removed from ``ents1``
            and column 1 from ``ents2``.
        ents1: Set of entity ids on the left side.
        ents2: Set of entity ids on the right side.
        a0: Initial similarity threshold.
        q: Amount by which the threshold is lowered per ``n_iter``.

    Returns:
        A set of strings, each the ``str`` of a two-element list ``[e1, e2]``.
        Empty when either side has no remaining candidates.
    """
    new_pair = set()
    re_ents1 = sorted(ents1 - set(anchors[:,0]))
    re_ents2 = sorted(ents2 - set(anchors[:,1]))

    # With no candidates on one side the similarity matrix cannot be built;
    # there is nothing to propose.
    if not re_ents1 or not re_ents2:
        return new_pair

    Lvec = np.array([vec[i] for i in re_ents1])
    Rvec = np.array([vec[i] for i in re_ents2])

    # Row-normalise so the dot product is the cosine similarity.
    Lvec = Lvec / np.linalg.norm(Lvec,axis=-1,keepdims=True)
    Rvec = Rvec / np.linalg.norm(Rvec,axis=-1,keepdims=True)
    # Negated so that an ascending argsort ranks the most similar first.
    sim_o = -Lvec.dot(Rvec.T)
    Lsim = sim_o.argsort(-1)
    Rsim = sim_o.argsort(0)

    # Loop-invariant, so computed once.
    thresh = max(a0 - n_iter*q,0.8)
    for i,j in enumerate(Lsim[:,0]):
        # Keep only mutual best matches.
        if Rsim[0,j] != i:
            continue
        similarity = -sim_o[i,j]
        if similarity > thresh:
            new_pair.add(str([re_ents1[i],re_ents2[j]]))
    return new_pair

def get_train_set(train_pair,ents1,ents2,batch_size):
    """Endlessly yield training batches of index quadruples.

    The positive pairs are chosen once, on the first ``next()``: ``train_pair``
    is repeated, shuffled in place and cut to ``batch_size`` rows. Every yielded
    batch reuses those rows and appends two freshly drawn random indices per row.

    Reads the notebook-level global ``node_size`` as the exclusive upper bound
    of the random indices. ``ents1`` and ``ents2`` are accepted but not used.

    Yields:
        An array with the positive pair in the first two columns and the random
        indices in the last two.
    """
    # Repeat often enough that at least `batch_size` rows are available.
    repeats = batch_size // len(train_pair) + 1
    positives = np.repeat(train_pair, repeats, axis=0)
    np.random.shuffle(positives)
    positives = positives[:batch_size]

    while True:
        negatives = np.random.randint(0, node_size, positives.shape)
        yield np.concatenate([positives, negatives], axis=-1)

In [5]:
# Experiment driver: for each dataset and seed, alternate between the structure
# model and the attribute model, letting each one add newly proposed pairs to
# the training set before the other runs.
n_train = 0.3
dim = 100
for dataset in ['zh_en/','ja_en/','fr_en/']:
    for seed in range(5):
        result=[]
        train_pair,test_pair = load_anchor('data/'+dataset, train_ratio = n_train,seed=seed)
        adj_matrix,adj_features,rel_features,ent_prop,ents1,ents2 =load_data('data/'+dataset,p=0.0001)
        # Replace each matrix by the (row, col) coordinates of its non-zero entries.
        adj_matrix = np.stack(adj_matrix.nonzero(),axis = 1)
        rel_matrix = np.stack(rel_features.nonzero(),axis = 1)
        ent_matrix = np.stack(adj_features.nonzero(),axis = 1)
        ent_prop = ent_prop.todense()
        # These names are read as globals by model_attr, model_struc,
        # get_pairs and get_train_set.
        prop_size = ent_prop.shape[1]
        node_size = adj_features.shape[1]
        rel_size = rel_features.shape[1]
        triple_size = len(adj_matrix)
        batch_size = node_size
        
        score_list=[]
        for n_iter in range(10):
            print('-----------------------------------------------------------------------------------------------')    
            print(time.ctime(),'\tSeed:%d Iter: %d, Dataset: %s, # of train pairs: %d'%(seed,n_iter,dataset,len(train_pair)))

            # Structure model
            K.clear_session()
            print('\tRunning the structure model...')
            embeddings = model_struc(adj_matrix,rel_matrix,ent_matrix,train_pair,dim=dim,lr=0.005)
            score = get_hits(embeddings,test_pair)
            struc_model_score = score[-1]
            record = ['Sturc model',dataset,seed,n_iter,n_train,dim]+score
            result.append(record)
            print('\tScore: ',score)
            
            pairs = get_pairs(embeddings,train_pair,ents1,ents2,a0=0.9,q=0.05)
            # get_pairs returns each pair as the str() of a list; eval turns it
            # back into a list. The strings are produced inside this notebook.
            new_pair = np.array([eval(i) for i in pairs])
            # With no new pairs, new_pair has shape (0,) and vstack would raise.
            if len(new_pair) > 0:
                train_pair = np.vstack((train_pair,new_pair))
            print('\tStruc model found %d new pairs in Iter %d'%(len(new_pair),n_iter))
            
            # Attribute model
            ent_prop_matrix = extension_attr(ent_prop,train_pair)
            ent_prop_matrix = adj_features.dot(ent_prop_matrix)
            
            K.clear_session()
            print('\tRunning the attribute model...')
            embeddings = model_attr(ent_prop_matrix,train_pair,dim=dim,lr=0.005)
            score = get_hits(embeddings,test_pair)
            attr_model_score = score[-1]
            record = ['Attr model',dataset,seed,n_iter,n_train,dim]+score
            result.append(record)
            print('\tScore:',score)
            
            pairs = get_pairs(embeddings,train_pair,ents1,ents2,a0=0.95,q=0.05)
            new_pair = np.array([eval(i) for i in pairs])
            # With no new pairs, new_pair has shape (0,) and vstack would raise.
            if len(new_pair) > 0:
                train_pair = np.vstack((train_pair,new_pair))
            print('\tAttr model found %d new pairs in Iter %d'%(len(new_pair),n_iter))   
            
            # Early stop: after the fifth iteration, stop as soon as the better
            # of the two scores improved by less than 0.1 over the previous iteration.
            score_list.append(max(attr_model_score,struc_model_score))
            if n_iter>3 and score_list[-1]-score_list[-2]<0.1:
                print('Early stop...')
                break
                
        # The str/eval round trip is kept from the original before JSON encoding;
        # the file is now closed deterministically.
        with open('result_dataset_{}_seed_{}.txt'.format(dataset[:-1],seed),'w') as result_file:
            json.dump(eval(str(result)),result_file)
        break # Comment out this line to repeat evaluation on other random seeds.
    break # Comment out this line to evaluate on other datasets.

Tue Dec 29 22:29:45 2020 	Loading data...
-----------------------------------------------------------------------------------------------
Tue Dec 29 22:29:55 2020 	Seed:0 Iter: 0, Dataset: zh_en/, # of train pairs: 4500
	Running the structure model...


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


	Epoch 0 	loss=5.41147 	ave_loss=5.4115
	Epoch 100 	loss=0.00669 	ave_loss=0.2032
	Epoch 200 	loss=0.00529 	ave_loss=0.0066
	Epoch 300 	loss=0.00337 	ave_loss=0.0046
	Epoch 400 	loss=0.00326 	ave_loss=0.0036
	Epoch 500 	loss=0.00269 	ave_loss=0.0030
	Epoch 600 	loss=0.00200 	ave_loss=0.0026
	Epoch 700 	loss=0.00302 	ave_loss=0.0024
	Epoch 800 	loss=0.00260 	ave_loss=0.0022
	Epoch 900 	loss=0.00164 	ave_loss=0.0021
	Epoch 1000 	loss=0.00229 	ave_loss=0.0019
	Epoch 1026 	loss=0.00059 	ave_loss=0.0020
	Score:  [59.32380952380952, 88.3047619047619, 0.694690007435666, 56.5047619047619, 86.52380952380952, 0.6691877775592217]
	Struc model found 5171 new pairs in Iter 0
	Running the attribute model...
	Epoch 0 	loss=2.66541 	ave_loss=2.6654
	Epoch 100 	loss=0.04206 	ave_loss=0.1136
	Epoch 200 	loss=0.02853 	ave_loss=0.0340
	Epoch 300 	loss=0.02181 	ave_loss=0.0261
	Epoch 400 	loss=0.01999 	ave_loss=0.0221
	Epoch 500 	loss=0.02006 	ave_loss=0.0197
	Epoch 600 	loss=0.01721 	ave_loss=0.0183
	Epoc