In [1]:
import os,sys,inspect
import joblib
import tensorflow as tf
import numpy as np
import h5py
import scipy.sparse.linalg as la
import scipy.sparse as sp
import scipy
import time
import pickle

import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
%matplotlib inline

import scipy.io as sio
from utils import process_data

In [2]:
# ---- Optimisation hyper-parameters ---------------------------------------
learning_rate = 0.005            # step size handed to the model's optimizer
gamma = 5e-4                     # regularisation weight handed to the model
num_hidden_feat = 8              # hidden feature width

# ---- Training schedule ----------------------------------------------------
num_total_iter_training = 1000   # number of training iterations per run
val_test_interval = 5            # validate/test once every this many iterations

# ---- Cora dataset ---------------------------------------------------------
# load_data returns the graph, the node features, the labels and the three
# index sets used for training, validation and testing.
A, X, Y, train_idx, val_idx, test_idx = process_data.load_data("cora")
X = process_data.preprocess_features(X)

(2708, 2708)
(2708, 1433)


In [3]:
class GAT_new:
    """Two-layer graph attention network assembled as a TensorFlow 1.x graph.

    The first layer runs ``num_heads[0]`` attention heads, each producing
    ``num_hidden_feat`` features, and concatenates their outputs.  The second
    layer is a single attention head producing ``Y.shape[1]`` logits per node.

    Constructing an instance builds a private ``tf.Graph``, defines the loss
    (softmax cross-entropy over the nodes selected through ``idx_nodes`` plus
    ``gamma`` times the accumulated L2 penalty), an Adam update op, accuracy
    and gradient-norm tensors, and opens an initialised session.

    Tensors/ops intended to be fed or fetched by callers:
        idx_nodes  -- int32 placeholder: indices of the nodes to evaluate on.
        keep_prob  -- float32 placeholder: dropout keep probability.
        loss, data_loss, accuracy, norm_grad, opt_step
        session    -- the ``tf.Session`` bound to this model's graph.
    """

    def frobenius_norm(self, tensor):
        """Return the square root of the sum of squared entries of `tensor`."""
        return tf.sqrt(tf.reduce_sum(tf.square(tensor)))

    def _masked_softmax(self, scores):
        """Normalise ``exp(scores) * self.A`` so that every row sums to one.

        Args:
            scores: N x N tensor of (already activated) attention scores.

        Returns:
            N x N tensor of attention coefficients.

        The row maximum is subtracted before exponentiating.  This leaves the
        normalised result mathematically unchanged but prevents ``exp`` from
        overflowing to inf (and the division from producing NaN).  Rows whose
        masked sum is exactly zero (a node without any neighbour in ``A``) are
        returned as all zeros instead of 0/0 = NaN.
        """
        row_max = tf.stop_gradient(tf.reduce_max(scores, axis=1, keepdims=True))
        weights = tf.math.multiply(tf.exp(scores - row_max), self.A)
        row_sum = tf.reduce_sum(weights, axis=1, keepdims=True)
        safe_row_sum = tf.where(tf.not_equal(row_sum, 0.0), row_sum, tf.ones_like(row_sum))
        return tf.math.divide(weights, safe_row_sum)

    def close(self):
        """Close the TensorFlow session owned by this model."""
        self.session.close()

    def __init__(self, A, X, Y, num_layers, num_hidden_feat, num_heads, learning_rate=5e-2, gamma=1e-3, idx_gpu = '/gpu:0', seed=None):
        """Build the graph, create the session and initialise all variables.

        Args:
            A: N x N adjacency matrix; used both to mask and to weight the
                attention scores (include self-loops if nodes should attend
                to themselves).
            X: N x F0 node feature matrix.
            Y: N x C label matrix (presumably one-hot; it is consumed by
                softmax cross-entropy and ``argmax``).
            num_layers: accepted for interface compatibility; not read, the
                network always has two layers.
            num_hidden_feat: output features per first-layer head.
            num_heads: sequence whose first entry is the number of first-layer
                heads; the second layer always uses a single head.
            learning_rate: learning rate of the Adam optimizer.
            gamma: weight of the L2 penalty in the loss.
            idx_gpu: device string the graph is placed on.
            seed: optional graph-level random seed; when given, variable
                initialisation and dropout become reproducible.
        """
        self.num_hidden_feat = num_hidden_feat
        self.num_heads = num_heads
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.l2_reg = 0
        N1 = np.shape(A)[0]

        with tf.Graph().as_default() as g:
            self.graph = g

            # Must be set before any op is created so that it applies to the
            # initialisers and dropout ops of this graph.
            if seed is not None:
                tf.set_random_seed(seed)

            with tf.device(idx_gpu):

                self.A = tf.constant(A, dtype=tf.float32)
                self.X = tf.constant(X, dtype=tf.float32)
                self.Y = tf.constant(Y, dtype=tf.float32)

                #placeholder definition
                self.idx_nodes = tf.placeholder(tf.int32)
                self.keep_prob = tf.placeholder(tf.float32)

                self.l_input = tf.nn.dropout(self.X, self.keep_prob) # N X F0

                #-------First Layer (multi-head)------------------------
                # NOTE: the per-head attributes below are overwritten on every
                # iteration; after the loop they refer to the last head only.
                attns = []

                for k in range(num_heads[0]):
                    self.W0 = tf.get_variable("W0_" + str(k), shape=[X.shape[1], self.num_hidden_feat], initializer=tf.contrib.layers.xavier_initializer())

                    self.l1_a0 = tf.get_variable("L1_A0_" + str(k), shape=[1, self.num_hidden_feat], initializer=tf.contrib.layers.xavier_initializer())
                    self.l1_a1 = tf.get_variable("L1_A1_" + str(k), shape=[1, self.num_hidden_feat], initializer=tf.contrib.layers.xavier_initializer())

                    self.x1_new = tf.matmul(self.l_input, self.W0) # N X F0 . F0 X F1 = N X F1

                    self.half_0_attn = tf.matmul(self.l1_a0, tf.transpose(self.x1_new)) # 1 X F1 . F1 X N = 1 X N
                    self.x1_new = tf.nn.dropout(self.x1_new, self.keep_prob)
                    self.half_1_attn = tf.matmul(self.l1_a1, tf.transpose(self.x1_new)) # 1 X F1 . F1 X N = 1 X N

                    self.e_half_0 = tf.multiply(self.A, self.half_0_attn) # N X N * 1 X N, broadcast over rows: scales each COLUMN
                    self.e_half_1 = tf.multiply(self.A, tf.transpose(self.half_1_attn)) # N X N * N X 1, broadcast over columns: scales each ROW

                    self.res = tf.nn.leaky_relu(self.e_half_0 + self.e_half_1, alpha=0.2) # N X N
                    self.res = self._masked_softmax(self.res) # N X N attention coefficients

                    self.x1_new = tf.nn.dropout(self.x1_new, self.keep_prob)
                    self.res = tf.nn.elu(tf.matmul(self.res, self.x1_new)) # N X N . N X F1 = N X F1

                    attns.append(self.res)

                    self.l2_reg += tf.nn.l2_loss(self.W0)
                    self.l2_reg += tf.nn.l2_loss(self.l1_a0)
                    self.l2_reg += tf.nn.l2_loss(self.l1_a1)

                self.res = tf.concat(attns, axis=-1) # N X (K * F1)
                self.res = tf.reshape(self.res, [N1, self.num_heads[0] * self.num_hidden_feat])

                self.W1 = tf.get_variable("W1", shape=[self.num_heads[0] * self.num_hidden_feat, Y.shape[1]], initializer=tf.contrib.layers.xavier_initializer())

                self.l2_a0 = tf.get_variable("L2_A0", shape=[1, Y.shape[1]], initializer=tf.contrib.layers.xavier_initializer())
                self.l2_a1 = tf.get_variable("L2_A1", shape=[1, Y.shape[1]], initializer=tf.contrib.layers.xavier_initializer())

                #-------Second Layer (single head)----------------------

                self.l2_input = tf.nn.dropout(self.res, self.keep_prob)

                self.x2_new = tf.matmul(self.l2_input, self.W1) # N X (K * F1) . (K * F1) X C = N X C
                self.half_0_attn_2 = tf.matmul(self.l2_a0, tf.transpose(self.x2_new)) # 1 X C . C X N = 1 X N
                self.x2_new = tf.nn.dropout(self.x2_new, self.keep_prob)
                self.half_1_attn_2 = tf.matmul(self.l2_a1, tf.transpose(self.x2_new)) # 1 X C . C X N = 1 X N

                self.e_half_0_2 = tf.multiply(self.A, self.half_0_attn_2) # N X N * 1 X N, broadcast over rows
                self.e_half_1_2 = tf.multiply(self.A, tf.transpose(self.half_1_attn_2)) # N X N * N X 1, broadcast over columns

                self.res_2 = tf.nn.leaky_relu(self.e_half_0_2 + self.e_half_1_2, alpha=0.2)
                self.res_2 = self._masked_softmax(self.res_2)

                self.x2_new = tf.nn.dropout(self.x2_new, self.keep_prob)
                self.logits = tf.matmul(self.res_2, self.x2_new) # N X C

                # Restrict logits and labels to the nodes requested by the caller.
                self.l_out = tf.gather(self.logits, self.idx_nodes)
                self.c_Y = tf.gather(self.Y, self.idx_nodes)

                #loss function definition
                self.l2_reg += tf.nn.l2_loss(self.W1)
                self.l2_reg += tf.nn.l2_loss(self.l2_a0)
                self.l2_reg += tf.nn.l2_loss(self.l2_a1)
                self.data_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.l_out, labels=self.c_Y))
                self.loss = self.data_loss + self.gamma*self.l2_reg

                #solver definition
                self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
                self.opt_step = self.optimizer.minimize(self.loss)

                #predictions and accuracy extraction
                self.c_predictions = tf.argmax(tf.nn.softmax(self.l_out), 1)
                self.accuracy = tf.contrib.metrics.accuracy(self.c_predictions, tf.argmax(self.c_Y, 1))

                #gradients computation: norm of all gradients flattened into one vector
                self.trainable_variables = tf.trainable_variables()
                self.var_grad = tf.gradients(self.loss, self.trainable_variables)
                self.norm_grad = self.frobenius_norm(tf.concat([tf.reshape(g, [-1]) for g in self.var_grad], 0))

                #session creation
                config = tf.ConfigProto(allow_soft_placement = True)
                config.gpu_options.allow_growth = True
                self.session = tf.Session(config=config)

                #session initialization
                init = tf.global_variables_initializer()
                self.session.run(init)

In [4]:
# Build the adjacency matrix with self-loops: set every diagonal entry to 1
# on a CSR version of A, then hand the model a dense matrix.
_adj_self_loops = sp.csr_matrix(A)
_adj_self_loops.setdiag(1)
A_tilde = _adj_self_loops.todense()
del _adj_self_loops



In [5]:
num_exp = 1 #number of times training GAT over the given dataset

# One entry per experiment.
list_all_acc = []                 # test accuracy (%) at the iteration with the lowest validation data loss
list_all_cost_val_avg  = []
list_all_data_cost_val_avg = []
list_all_acc_val_avg   = []
list_all_cost_test_avg = []
list_all_acc_test_avg  = []

num_done = 0
# NOTE: `seed` only counts the runs here; nothing in this cell seeds
# TensorFlow or NumPy, so individual runs are not reproducible.
for seed in range(num_exp):
    GCNN = GAT_new(A_tilde, X, Y, num_layers=2, num_hidden_feat=num_hidden_feat, num_heads=[8,1], learning_rate=learning_rate, gamma=gamma)

    # Histories of the current experiment.
    cost_train_avg      = []
    grad_norm_train_avg = []
    acc_train_avg       = []
    cost_test_avg       = []
    grad_norm_test_avg  = []
    acc_test_avg        = []
    cost_val_avg        = []
    data_cost_val_avg   = []
    acc_val_avg         = []
    iter_test           = []
    list_training_time  = []

    #Training code
    for i in range(num_total_iter_training):
        # `i` equals the number of training steps completed so far.
        if (i % val_test_interval) == 0:
            #Print last training performance
            if i > 0:
                print("[TRN] epoch = %03i, cost = %3.2e, |grad| = %.2e, acc = %3.2e (%03.2fs)" %
                      (i, cost_train_avg[-1], grad_norm_train_avg[-1], acc_train_avg[-1], time.time() - tic))

            #Validate the model (dropout disabled through keep_prob = 1.0)
            tic = time.time()

            feed_dict = {GCNN.idx_nodes: val_idx, GCNN.keep_prob:1.0}
            acc_val, cost_val, data_cost_val = GCNN.session.run([GCNN.accuracy, GCNN.loss, GCNN.data_loss], feed_dict)

            data_cost_val_avg.append(data_cost_val)
            cost_val_avg.append(cost_val)
            acc_val_avg.append(acc_val)
            print("[VAL] epoch = %03i, data_cost = %3.2e, cost = %3.2e, acc = %3.2e (%03.2fs)" %
                  (i, data_cost_val_avg[-1], cost_val_avg[-1], acc_val_avg[-1],  time.time() - tic))

            #Test the model (dropout disabled through keep_prob = 1.0)
            tic = time.time()

            feed_dict = {GCNN.idx_nodes: test_idx, GCNN.keep_prob:1.0}
            acc_test, cost_test = GCNN.session.run([GCNN.accuracy, GCNN.loss], feed_dict)

            cost_test_avg.append(cost_test)
            acc_test_avg.append(acc_test)
            print("[TST] epoch = %03i, cost = %3.2e, acc = %3.2e (%03.2fs)" %
                  (i, cost_test_avg[-1], acc_test_avg[-1],  time.time() - tic))
            iter_test.append(i)

        #One optimisation step on the training nodes (dropout keep probability 0.6)
        tic = time.time()
        feed_dict = {GCNN.idx_nodes: train_idx, GCNN.keep_prob: 0.6}

        _, current_training_loss, norm_grad, current_acc_training = GCNN.session.run([GCNN.opt_step, GCNN.loss, GCNN.norm_grad, GCNN.accuracy], feed_dict)

        training_time = time.time() - tic
        list_training_time.append(training_time)

        cost_train_avg.append(current_training_loss)
        grad_norm_train_avg.append(norm_grad)
        acc_train_avg.append(current_acc_training)

    #Compute and print statistics of the last realized experiment
    # Model selection: take the test accuracy at the evaluation point with the
    # lowest validation data loss.  argmin picks exactly one point (the first
    # one on ties), so every experiment contributes a single value; it is kept
    # as a length-1 array.
    acc_test_arr = np.asarray(acc_test_avg)
    best_idx = int(np.argmin(data_cost_val_avg))
    list_all_acc.append(100*acc_test_arr[[best_idx]])
    list_all_cost_val_avg.append(cost_val_avg)
    list_all_data_cost_val_avg.append(data_cost_val_avg)
    list_all_acc_val_avg.append(acc_val_avg)
    list_all_cost_test_avg.append(cost_test_avg)
    list_all_acc_test_avg.append(acc_test_avg)

    print('Num done: %d' % num_done)
    print('Max accuracy on test set achieved: %f%%' % np.max(acc_test_arr*100))
    print('Max suggested accuracy: %f%%' % (100*acc_test_arr[best_idx],))
    print('Current mean: %f%%' % np.mean(list_all_acc))
    print('Current std: %f' % np.std(list_all_acc))

    num_done += 1

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.

Instructions for updating:
Use tf.cast instead.
[VAL] epoch = 000, data_cost = 1.95e+00, cost = 1.99e+00, acc = 1.62e-01 (7.80s)
[TST] epoch = 000, cost = 1.99e+00, acc = 1.62e-01 (7.89s)
[TRN] epoch = 005, cost = 1.95e+00, |grad| = 4.81e-02, acc = 5.71e-01 (1.26s)
[VAL] epoch = 005, data_cost = 1.92e+00, cost = 1.95e+00, acc = 7.34e-01 (0.28s)
[TST] epoch = 005, cost = 1.95e+00, acc

[TST] epoch = 150, cost = 1.17e+00, acc = 7.99e-01 (0.28s)
[TRN] epoch = 155, cost = 9.40e-01, |grad| = 9.47e-02, acc = 8.21e-01 (1.21s)
[VAL] epoch = 155, data_cost = 9.45e-01, cost = 1.21e+00, acc = 7.94e-01 (0.26s)
[TST] epoch = 155, cost = 1.16e+00, acc = 8.04e-01 (0.27s)
[TRN] epoch = 160, cost = 1.03e+00, |grad| = 1.03e-01, acc = 7.93e-01 (1.24s)
[VAL] epoch = 160, data_cost = 9.37e-01, cost = 1.20e+00, acc = 7.92e-01 (0.27s)
[TST] epoch = 160, cost = 1.16e+00, acc = 8.04e-01 (0.28s)
[TRN] epoch = 165, cost = 9.84e-01, |grad| = 9.32e-02, acc = 8.29e-01 (1.21s)
[VAL] epoch = 165, data_cost = 9.26e-01, cost = 1.19e+00, acc = 7.92e-01 (0.28s)
[TST] epoch = 165, cost = 1.15e+00, acc = 8.08e-01 (0.28s)
[TRN] epoch = 170, cost = 9.85e-01, |grad| = 8.86e-02, acc = 7.93e-01 (1.17s)
[VAL] epoch = 170, data_cost = 9.18e-01, cost = 1.19e+00, acc = 7.92e-01 (0.27s)
[TST] epoch = 170, cost = 1.14e+00, acc = 8.07e-01 (0.28s)
[TRN] epoch = 175, cost = 9.46e-01, |grad| = 8.36e-02, acc = 8.36e-01

[VAL] epoch = 340, data_cost = 7.50e-01, cost = 1.07e+00, acc = 8.00e-01 (0.28s)
[TST] epoch = 340, cost = 1.03e+00, acc = 8.26e-01 (0.26s)
[TRN] epoch = 345, cost = 8.56e-01, |grad| = 1.04e-01, acc = 8.21e-01 (1.24s)
[VAL] epoch = 345, data_cost = 7.46e-01, cost = 1.06e+00, acc = 7.98e-01 (0.26s)
[TST] epoch = 345, cost = 1.03e+00, acc = 8.26e-01 (0.27s)
[TRN] epoch = 350, cost = 8.63e-01, |grad| = 8.28e-02, acc = 8.57e-01 (1.31s)
[VAL] epoch = 350, data_cost = 7.43e-01, cost = 1.06e+00, acc = 8.02e-01 (0.28s)
[TST] epoch = 350, cost = 1.03e+00, acc = 8.26e-01 (0.29s)
[TRN] epoch = 355, cost = 7.89e-01, |grad| = 7.85e-02, acc = 8.86e-01 (1.29s)
[VAL] epoch = 355, data_cost = 7.48e-01, cost = 1.07e+00, acc = 8.08e-01 (0.29s)
[TST] epoch = 355, cost = 1.03e+00, acc = 8.24e-01 (0.29s)
[TRN] epoch = 360, cost = 8.82e-01, |grad| = 7.85e-02, acc = 8.07e-01 (1.27s)
[VAL] epoch = 360, data_cost = 7.47e-01, cost = 1.07e+00, acc = 8.06e-01 (0.30s)
[TST] epoch = 360, cost = 1.03e+00, acc = 8.25e

[TRN] epoch = 530, cost = 8.13e-01, |grad| = 8.89e-02, acc = 8.64e-01 (1.28s)
[VAL] epoch = 530, data_cost = 7.13e-01, cost = 1.06e+00, acc = 8.02e-01 (0.27s)
[TST] epoch = 530, cost = 1.02e+00, acc = 8.27e-01 (0.27s)
[TRN] epoch = 535, cost = 8.71e-01, |grad| = 1.68e-01, acc = 8.64e-01 (1.24s)
[VAL] epoch = 535, data_cost = 7.14e-01, cost = 1.06e+00, acc = 8.06e-01 (0.27s)
[TST] epoch = 535, cost = 1.02e+00, acc = 8.26e-01 (0.27s)
[TRN] epoch = 540, cost = 7.16e-01, |grad| = 7.16e-02, acc = 9.14e-01 (1.29s)
[VAL] epoch = 540, data_cost = 7.16e-01, cost = 1.06e+00, acc = 8.02e-01 (0.28s)
[TST] epoch = 540, cost = 1.02e+00, acc = 8.28e-01 (0.28s)
[TRN] epoch = 545, cost = 8.77e-01, |grad| = 8.79e-02, acc = 8.36e-01 (1.27s)
[VAL] epoch = 545, data_cost = 7.18e-01, cost = 1.06e+00, acc = 8.00e-01 (0.27s)
[TST] epoch = 545, cost = 1.02e+00, acc = 8.27e-01 (0.26s)
[TRN] epoch = 550, cost = 7.68e-01, |grad| = 7.96e-02, acc = 8.93e-01 (1.20s)
[VAL] epoch = 550, data_cost = 7.19e-01, cost = 1.

[TST] epoch = 715, cost = 1.02e+00, acc = 8.25e-01 (0.26s)
[TRN] epoch = 720, cost = 8.20e-01, |grad| = 1.04e-01, acc = 8.79e-01 (1.27s)
[VAL] epoch = 720, data_cost = 7.10e-01, cost = 1.07e+00, acc = 7.98e-01 (0.28s)
[TST] epoch = 720, cost = 1.02e+00, acc = 8.27e-01 (0.26s)
[TRN] epoch = 725, cost = 8.52e-01, |grad| = 7.83e-02, acc = 8.36e-01 (1.22s)
[VAL] epoch = 725, data_cost = 7.11e-01, cost = 1.07e+00, acc = 8.00e-01 (0.28s)
[TST] epoch = 725, cost = 1.02e+00, acc = 8.28e-01 (0.26s)
[TRN] epoch = 730, cost = 8.47e-01, |grad| = 1.06e-01, acc = 8.57e-01 (1.27s)
[VAL] epoch = 730, data_cost = 7.15e-01, cost = 1.07e+00, acc = 7.96e-01 (0.28s)
[TST] epoch = 730, cost = 1.03e+00, acc = 8.22e-01 (0.27s)
[TRN] epoch = 735, cost = 8.14e-01, |grad| = 1.21e-01, acc = 8.79e-01 (1.26s)
[VAL] epoch = 735, data_cost = 7.20e-01, cost = 1.08e+00, acc = 7.96e-01 (0.28s)
[TST] epoch = 735, cost = 1.03e+00, acc = 8.22e-01 (0.28s)
[TRN] epoch = 740, cost = 9.17e-01, |grad| = 9.68e-02, acc = 8.07e-01

[VAL] epoch = 905, data_cost = 7.19e-01, cost = 1.08e+00, acc = 7.94e-01 (0.27s)
[TST] epoch = 905, cost = 1.04e+00, acc = 8.25e-01 (0.31s)
[TRN] epoch = 910, cost = 7.68e-01, |grad| = 8.10e-02, acc = 8.64e-01 (1.25s)
[VAL] epoch = 910, data_cost = 7.20e-01, cost = 1.08e+00, acc = 7.98e-01 (0.27s)
[TST] epoch = 910, cost = 1.04e+00, acc = 8.20e-01 (0.26s)
[TRN] epoch = 915, cost = 7.56e-01, |grad| = 8.01e-02, acc = 8.79e-01 (1.27s)
[VAL] epoch = 915, data_cost = 7.18e-01, cost = 1.08e+00, acc = 7.96e-01 (0.29s)
[TST] epoch = 915, cost = 1.04e+00, acc = 8.20e-01 (0.28s)
[TRN] epoch = 920, cost = 7.83e-01, |grad| = 1.07e-01, acc = 8.64e-01 (1.32s)
[VAL] epoch = 920, data_cost = 7.14e-01, cost = 1.08e+00, acc = 7.98e-01 (0.28s)
[TST] epoch = 920, cost = 1.03e+00, acc = 8.22e-01 (0.27s)
[TRN] epoch = 925, cost = 7.70e-01, |grad| = 9.95e-02, acc = 9.07e-01 (1.25s)
[VAL] epoch = 925, data_cost = 7.11e-01, cost = 1.08e+00, acc = 7.98e-01 (0.26s)
[TST] epoch = 925, cost = 1.03e+00, acc = 8.21e