Google Colab setup. If the notebook is running in Colab, enable the following cell by removing the triple quotes (`'''`) that wrap its contents.

In [None]:
# Colab bootstrap, kept inert inside a string literal so that running this cell does nothing.
# To enable it, delete the opening and closing ''' lines. The commands then clone the KGAT
# repository, copy its files into the working directory, install googledrivedownloader,
# download 2hop.pickle into ./data/FB15k-237/, list that directory and request TensorFlow 2.x.
'''
!git clone https://github.com/EthanPhan/KGAT.git
!cp -r KGAT/* .

!pip install googledrivedownloader
from google_drive_downloader import GoogleDriveDownloader as gdd

gdd.download_file_from_google_drive(file_id='1WAkulkadNFiOmH3uu6JH1QC3XkAkASlq',
                                    dest_path='./data/FB15k-237/2hop.pickle',
                                    unzip=False)

!ls ./data/FB15k-237/

# Install TensorFlow 2.0
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
'''

In [1]:
import tensorflow as tf
from models import SpKBGATModified
import numpy as np
from copy import deepcopy

from preprocess import read_entity_from_id, read_relation_from_id, init_embeddings, build_data
from create_batch import Corpus
from utils import save_model

import random
import argparse
import os
import sys
import logging
import time
import pickle

Arguments to pass around

In [5]:
class Args:
    """Plain attribute bag holding every setting the notebook passes around.

    All values are class attributes, so they are readable both on the class
    and on the `args` instance created below.
    """

    # --- locations -------------------------------------------------------
    data = "./data/FB15k-237/"
    output_folder = "./checkpoints/fb/out/"

    # --- embedding initialisation ----------------------------------------
    pretrained_emb = True
    embedding_size = 50  # only read by load_data when pretrained_emb is False

    # --- 2-hop neighbourhood switches ------------------------------------
    get_2hop = True
    use_2hop = True
    partial_2hop = True

    # --- learning rate -----------------------------------------------------
    lr = 1e-3

    # --- GAT training ------------------------------------------------------
    epochs_gat = 3000
    weight_decay_gat = 1e-5
    batch_size_gat = 272115
    valid_invalid_ratio_gat = 2  # invalid (corrupted) triples per valid triple
    drop_GAT = 0.3
    entity_out_dim = [100, 200]
    nheads_GAT = [2, 2]
    margin = 1

    # --- `*_conv` settings -------------------------------------------------
    epochs_conv = 150
    weight_decay_conv = 1e-5
    batch_size_conv = 128
    valid_invalid_ratio_conv = 40
    out_channels = 50
    drop_conv = 0.3


args = Args()

In [6]:
def load_data(args):
    """Build the training corpus and the starting embedding matrices.

    Args:
        args: settings object; reads `data`, `pretrained_emb`,
            `embedding_size`, `batch_size_gat`, `valid_invalid_ratio_gat`
            and `get_2hop`.

    Returns:
        A `(corpus, entity_embeddings, relation_embeddings)` tuple. The
        embeddings are loaded from `entity2vec.txt` / `relation2vec.txt`
        under `args.data` when `args.pretrained_emb` is truthy; otherwise
        they are drawn from a standard normal distribution with
        `args.embedding_size` columns.
    """
    (train_data, validation_data, test_data,
     entity2id, relation2id,
     headTailSelector, unique_entities_train) = build_data(
        args.data, is_unweigted=False, directed=True)

    if not args.pretrained_emb:
        # One random row per known entity / relation id.
        entity_embeddings = np.random.randn(len(entity2id), args.embedding_size)
        relation_embeddings = np.random.randn(len(relation2id), args.embedding_size)
        print("Initialised relations and entities randomly")
    else:
        entity_vec_path = os.path.join(args.data, 'entity2vec.txt')
        relation_vec_path = os.path.join(args.data, 'relation2vec.txt')
        entity_embeddings, relation_embeddings = init_embeddings(
            entity_vec_path, relation_vec_path)
        print("Initialised relations and entities from TransE")

    corpus = Corpus(
        args,
        train_data, validation_data, test_data,
        entity2id, relation2id, headTailSelector,
        args.batch_size_gat, args.valid_invalid_ratio_gat,
        unique_entities_train, args.get_2hop,
    )
    return corpus, entity_embeddings, relation_embeddings

# Build the corpus and starting embeddings once; train_gat reads these module-level names.
Corpus_, entity_embeddings, relation_embeddings = load_data(args)

number of unique_entities -> 14505
number of unique_entities -> 9809
number of unique_entities -> 10348
Initialised relations and entities from TransE
Graph created
Total triples count 310116, training triples 272115, validation_triples 17535, test_triples 20466


In [7]:
# Independent deep copies of the initial embeddings, taken before the training cell runs.
# NOTE(review): no visible cell reads these copies — presumably kept for a later stage; confirm.
entity_embeddings_copied = deepcopy(entity_embeddings)
relation_embeddings_copied = deepcopy(relation_embeddings)

In [8]:
def batch_gat_loss(gat_loss_func, train_indices, entity_embed, relation_embed):
    """Margin ranking loss over one batch of valid and invalid triples.

    The leading `1 / (valid_invalid_ratio_gat + 1)` share of `train_indices`
    rows is treated as valid triples and the remaining rows as invalid ones.
    Valid triples are tiled so that they pair up row-for-row with the
    invalid triples. Each triple is scored as the L1 norm of
    `head + relation - tail` on L2-normalised embeddings.

    Args:
        gat_loss_func: accepted for signature compatibility; not used here.
        train_indices: 2-D integer array whose columns 0, 1, 2 index the
            head entity, the relation and the tail entity.
        entity_embed: entity embedding matrix, one row per entity.
        relation_embed: relation embedding matrix, one row per relation.

    Returns:
        Scalar tensor: mean of `max(0, positive + margin - negative)`.

    Reads `valid_invalid_ratio_gat` and `margin` from the module-level `args`.
    """
    neg_per_pos = int(args.valid_invalid_ratio_gat)
    n_valid = int(train_indices.shape[0] / (neg_per_pos + 1))

    # Repeat the valid block so it lines up with the invalid block.
    valid_triples = np.tile(train_indices[:n_valid], (neg_per_pos, 1))
    invalid_triples = train_indices[n_valid:]

    unit_entities = tf.nn.l2_normalize(entity_embed, axis=1)
    unit_relations = tf.nn.l2_normalize(relation_embed, axis=1)

    def translation_distance(triples):
        """L1 norm of head + relation - tail for every row of `triples`."""
        heads = tf.nn.embedding_lookup(unit_entities, triples[:, 0])
        relations = tf.nn.embedding_lookup(unit_relations, triples[:, 1])
        tails = tf.nn.embedding_lookup(unit_entities, triples[:, 2])
        return tf.reduce_sum(tf.abs(heads + relations - tails), axis=1)

    score_positive = translation_distance(valid_triples)
    score_negative = translation_distance(invalid_triples)

    hinge = tf.maximum(0., score_positive + args.margin - score_negative)
    return tf.reduce_mean(hinge)

In [9]:
def train_gat(args):
    """Train the GAT embedding model and return it with its loss history.

    Reads the module-level `Corpus_`, `entity_embeddings` and
    `relation_embeddings`, and shuffles `Corpus_.train_triples` in place at
    the start of every epoch.

    Args:
        args: settings object; reads `nheads_GAT`, `entity_out_dim`,
            `drop_GAT`, `lr`, `weight_decay_gat`, `use_2hop`, `epochs_gat`
            and `batch_size_gat`.

    Returns:
        `(gat, epoch_losses)`: the trained model and a list holding the
        average ranking loss (a Python float) of each epoch. In a notebook,
        assign the result or end the call with `;` to avoid echoing it.
    """
    # Creating the gat model here.
    ####################################

    print("Defining model")

    print(
        "\nModel type -> GAT layer with {} heads used , Initital Embeddings training".format(args.nheads_GAT[0]))
    gat = SpKBGATModified(entity_embeddings, relation_embeddings, args.entity_out_dim, args.entity_out_dim,
                          args.drop_GAT, args.nheads_GAT)

    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        args.lr,
        decay_steps=500,
        decay_rate=0.5,
        staircase=True)

    # Keras' `decay` argument is a learning-rate decay, not weight decay, and
    # newer Keras optimizers reject it. The weight-decay coefficient is
    # therefore applied below as an explicit L2 penalty on the trainable
    # weights instead of being passed to the optimizer.
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
    weight_decay = float(args.weight_decay_gat)

    # Passed through to batch_gat_loss, whose signature requires it.
    gat_loss_func = tf.keras.losses.Hinge()

    current_batch_2hop_indices = np.array([])
    if args.use_2hop:
        current_batch_2hop_indices = Corpus_.get_batch_nhop_neighbors_all(args)

    epoch_losses = []   # average loss of every epoch
    print("Number of epochs {}".format(args.epochs_gat))

    for epoch in range(args.epochs_gat):
        print("\nepoch-> ", epoch)
        random.shuffle(Corpus_.train_triples)
        Corpus_.train_indices = np.array(
            list(Corpus_.train_triples)).astype(np.int32)

        # Ceiling division: a trailing partial batch still gets an iteration.
        num_train = len(Corpus_.train_indices)
        num_iters_per_epoch = (
            num_train + args.batch_size_gat - 1) // args.batch_size_gat

        batch_losses = []

        for iters in range(num_iters_per_epoch):
            # Batch assembly needs no gradient tracking, so it stays outside the tape.
            train_indices, _ = Corpus_.get_iteration_batch(iters)
            train_indices = np.array(train_indices)

            with tf.GradientTape() as tape:
                # forward pass
                entity_embed, relation_embed = gat([Corpus_.train_adj_matrix,
                                                    train_indices,
                                                    current_batch_2hop_indices])

                # calculate loss
                loss = batch_gat_loss(
                    gat_loss_func, train_indices, entity_embed, relation_embed)

                # L2 weight decay: d/dw [wd * sum(w**2) / 2] = wd * w.
                weights = gat.trainable_weights
                objective = loss
                if weight_decay > 0 and weights:
                    l2_terms = [tf.cast(tf.nn.l2_loss(w), loss.dtype)
                                for w in weights]
                    objective = loss + weight_decay * tf.add_n(l2_terms)

            grads = tape.gradient(objective, weights)
            optimizer.apply_gradients(zip(grads, weights))

            # Report and record the ranking loss only, without the L2 term.
            batch_losses.append(float(loss.numpy()))

            print('Iteration ', iters, loss.numpy())

        mean_loss = sum(batch_losses) / len(batch_losses)
        print("Epoch {} , average loss {}".format(epoch, mean_loss))
        epoch_losses.append(mean_loss)

    return gat, epoch_losses

In [None]:
# Run GAT training with the settings in `args` (args.epochs_gat epochs).
train_gat(args)

Defining model

Model type -> GAT layer with 2 heads used , Initital Embeddings training
length of unique_entities  14505
Number of epochs 3000

epoch->  0
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
Use tf.identity instead.
Iteration  0 0.96977437
Epoch 0 , average loss 0.9697743654251099

epoch->  1
