# Datasets for GNN
- cora benchmark dataset 
- Open Graph benchmark dataset
- Benchmarking-GNNs


# Usefull resources for further study
Blogs: 
- GCN by Thomas Kipf
- GAT by Petar Velickovic
- Graph Deep Learning by Michael Bronstein

Guest lecture by Xavier Bresson for NYU's Deep Learning Course

Libraries for Tensorflow: GraphNets, Spektral, DGL

Book:
- Graph Representation Learning by Will Hamilton

University Courses: 
- CS224W at Standford
- COMP766 at McGill

all compiled in: https://twitter.com/PetarV_93/status/1306689702020382720

In [2]:
!pip install numpy
!pip install pandas
!pip install tensorflow
!pip install spektral # for graph representation learning used now for loading and preprocessing the dataset in a nice form
# spektral allows us to get acces to adjacency matrix, feature matrix, labels (topic of each paper), mask arrays (which nodes belong to the train, val, test set) of the graph

Collecting tf-estimator-nightly==2.8.0.dev2021122109
  Downloading tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
[K     |████████████████████████████████| 462 kB 2.6 MB/s 
Installing collected packages: tf-estimator-nightly
Successfully installed tf-estimator-nightly-2.8.0.dev2021122109
Collecting spektral
  Downloading spektral-1.0.8-py3-none-any.whl (123 kB)
[K     |████████████████████████████████| 123 kB 2.7 MB/s 
Collecting numpy<1.20
  Downloading numpy-1.19.5-cp37-cp37m-manylinux2010_x86_64.whl (14.8 MB)
[K     |████████████████████████████████| 14.8 MB 17.5 MB/s 
Collecting tensorflow>=2.1.0
  Downloading tensorflow-2.7.1-cp37-cp37m-manylinux2010_x86_64.whl (495.0 MB)
[K     |████████████████████████████████| 495.0 MB 27 kB/s 
Collecting gast<0.5.0,>=0.2.1
  Downloading gast-0.4.0-py3-none-any.whl (9.8 kB)
Collecting keras<2.8,>=2.7.0rc0
  Downloading keras-2.7.0-py2.py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 43.2 MB

In [1]:
import numpy as np
import tensorflow as tf
import spektral

In [None]:
pip list

In [4]:
# adj, features, labels, train_mask, val_mask, test_mask = spektral.datasets.citation.load_data(dataset_name='cora')
# for spektral == 1.0.8
cora_dataset = spektral.datasets.citation.Citation(name='cora')
test_mask = cora_dataset.mask_te
train_mask = cora_dataset.mask_tr
val_mask = cora_dataset.mask_va
graph = cora_dataset.graphs[0] # zero since its just one graph inside, there could be more for other datasets
features = graph.x
adj = graph.a
labels = graph.y


# spektral retrieves in sparse format, for previous spektral versions
# features = features.todense()
  # also adj doesnt come with self edges, common thing to do at the beginning->
  # add the identity matrix to de adj matrix to make sure is okay
adj = adj.todense() + np.eye(adj.shape[0])

# to make sure everything plays nicely with tensorflow, convert to 32bit floating number
features = features.astype('float32')
adj = adj.astype('float32')

print('total nodes:', features.shape[0])
print('total features for each node:', features.shape[1])
print(adj.shape)
print(labels.shape)

print('training nodes: ',np.sum(train_mask))
print('validation nodes: ',np.sum(val_mask))
print('test nodes: ',np.sum(test_mask))



total nodes: 2708
total features for each node: 1433
(2708, 2708)
(2708, 7)
training nodes:  140
validation nodes:  500
test nodes:  1000


  self._set_arrayXarray(i, j, x)


In [12]:
# functions that will allow us to do loss and eval metrics for masked nodes
def masked_softmax_cross_entropy(logits, labels, mask): # returns the crossentropy loss of the nodes of the graph ONLY taking nodes that are masked by the mask array
  loss = tf.nn.softmax_cross_entropy_with_logits(logits= logits, labels=labels)
  mask = tf.cast(mask, dtype=tf.float32)
  mask /= tf.reduce_mean(mask)
  loss *= mask
  return tf.reduce_mean(loss)

# accuracy metric over a mask
def masked_accuracy(logits, lables, mask):
  correct_prediction = tf.equal(tf.argmax(logits,1), tf.argmax(labels, 1))
  accuracy_all = tf.cast(correct_prediction, tf.float32)
  mask = tf.cast(mask, tf.float32)
  mask /= tf.reduce_mean(mask)
  accuracy_all *= mask
  return tf.reduce_mean(accuracy_all)

In [8]:
# definition of simple gnn layer
  #node feature matrix,
  #adjacency matrix,
  #transformation that we will like to apply to every node
  #activation function

def gnn(fts, adj, transform, activation):
  seq_fts = transform(fts) # transform each of the nodes individually
  ret_fts = tf.matmul(adj, seq_fts) # recombine to neighborhoods
  return activation(ret_fts) # apply activation function

In [9]:
# define 2 layer gnn to classify the cora dataset
  #node feature matrix
  #adj matrix
  #gnn model function
  #how many units we want our neural network to compute in each node
  # how many epochs
  # learning rate
def train_cora(fts, adj, gnn_fn, units, epochs, lr):
  lyr_1 = tf.keras.layers.Dense(units)
  lyr_2 = tf.keras.layers.Dense(7) # classification of 7 classes

  # define gnn that is used to solve this problem on a particular set of features and adjacencies
  def cora_gnn(fts, adj):
    hidden = gnn_fn(fts, adj, lyr_1, tf.nn.relu) # compute the hidden features on every node
    # define our logits by applying the second graph nn layer which starts from the hidden features, and then the adj matrix
    logits = gnn_fn(hidden, adj, lyr_2, tf.identity) # we use tf.identity as we don't want to transform (they are logits)
    return logits # return predictions

  # use of a standard optimization pipeline - Adam Optimizer
  optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

  # standard training pipeline with early stopping, for that, keep track of the validation data accuracy so far
  best_accuracy = 0.0
  for ep in range(epochs +1):
    with tf.GradientTape() as t: # to record all the gradients - GradientTape
      logits = cora_gnn(fts, adj) # apply cora_gnn to compute predictions at this step
      loss = masked_softmax_cross_entropy(logits, labels, train_mask) # compute loss

    # specify gradients to update variables based on this loss
    variables = t.watched_variables() # get the variables that the GradientTape is watching
    grads = t.gradient(loss, variables) # define gradients
    optimizer.apply_gradients(zip(grads, variables)) # apply the optimizer to apply this gradients

    # track validation and test accuracy
    logits = cora_gnn(fts, adj) # after the gradients have been updated
    val_accuracy = masked_accuracy(logits, labels, val_mask) # compute the validation accuracy as the masked accuracy on the logits against the labels
    test_accuracy = masked_accuracy(logits, labels, test_mask)

    if val_accuracy > best_accuracy:
      best_accuracy = val_accuracy
      print('Epoch', ep, '| Training loss:', loss.numpy(), '| Val accuracy:',
            val_accuracy.numpy(), '| Test accuracy:', test_accuracy.numpy())


In [14]:
# call just one line of code to try to train on the Cora data set for some particular adj matrix

# by passing the raw adj matrix means that we're going to be multiplying the features with just 0 or 1 matrix ->
  # Therefore implementing SUM-POOLING
  # we're expecting this to have some problems with the scale of the features, and as a result it might not give us the best results possible
train_cora(features, adj, gnn, 32, 200, 0.01)


Epoch 0 | Training loss: 4.4897547 | Val accuracy: 0.22399999 | Test accuracy: 0.22400002
Epoch 1 | Training loss: 7.0553784 | Val accuracy: 0.34399998 | Test accuracy: 0.39799997
Epoch 3 | Training loss: 3.614131 | Val accuracy: 0.36999997 | Test accuracy: 0.39
Epoch 4 | Training loss: 2.9831219 | Val accuracy: 0.406 | Test accuracy: 0.431
Epoch 5 | Training loss: 2.6912107 | Val accuracy: 0.444 | Test accuracy: 0.44500002
Epoch 7 | Training loss: 1.8758388 | Val accuracy: 0.47599998 | Test accuracy: 0.519
Epoch 8 | Training loss: 1.3678852 | Val accuracy: 0.546 | Test accuracy: 0.57899994
Epoch 9 | Training loss: 0.9143429 | Val accuracy: 0.59 | Test accuracy: 0.62299997
Epoch 10 | Training loss: 0.62095714 | Val accuracy: 0.62 | Test accuracy: 0.665
Epoch 11 | Training loss: 0.4301379 | Val accuracy: 0.66 | Test accuracy: 0.695
Epoch 12 | Training loss: 0.32929707 | Val accuracy: 0.684 | Test accuracy: 0.70000005
Epoch 13 | Training loss: 0.2792171 | Val accuracy: 0.702 | Test accur

In [15]:
# Verys useful thing to verify is that it's usefull to use the graph at all
# so lets test it by using the identity matrix instead the adj matrix
# This will basically render the operation of multiplying with the adj matrix as not really changing anything
# so we just have basically a point-wise classifier in each of our nodes -> so a standard MLP model that is shared accross the vertices

train_cora(features, tf.eye(adj.shape[0]), gnn, 32, 200, 0.01)

# so we can see the progress is generally far more steady, but it doesn't actually end up surpassing around 50% or so
# we can see that if you're not effectively exploiting the graph structure, 
# you're going to end up not completely capturing the interesting structure in your data 
# and this point-wise MLP will be unable to go beyond 50% testing_accuracy or so

Epoch 0 | Training loss: 1.9650178 | Val accuracy: 0.18399999 | Test accuracy: 0.21299998
Epoch 1 | Training loss: 1.7205448 | Val accuracy: 0.27 | Test accuracy: 0.301
Epoch 2 | Training loss: 1.514502 | Val accuracy: 0.334 | Test accuracy: 0.343
Epoch 3 | Training loss: 1.2945721 | Val accuracy: 0.35999998 | Test accuracy: 0.376
Epoch 4 | Training loss: 1.0721266 | Val accuracy: 0.38799998 | Test accuracy: 0.401
Epoch 5 | Training loss: 0.86842245 | Val accuracy: 0.38999996 | Test accuracy: 0.41200003
Epoch 6 | Training loss: 0.6934445 | Val accuracy: 0.40199998 | Test accuracy: 0.41999996
Epoch 7 | Training loss: 0.5486387 | Val accuracy: 0.40999997 | Test accuracy: 0.431
Epoch 8 | Training loss: 0.4328917 | Val accuracy: 0.424 | Test accuracy: 0.44499996
Epoch 9 | Training loss: 0.3424882 | Val accuracy: 0.438 | Test accuracy: 0.45999998
Epoch 10 | Training loss: 0.27245322 | Val accuracy: 0.45399997 | Test accuracy: 0.468
Epoch 11 | Training loss: 0.21771894 | Val accuracy: 0.474 

In [16]:
# We have shown that the graph is actually useful by comparing the above models
# lets explore more kinds of graph convolutional layers

# The first one we can explore is MEAN-POOLING
# so we can first compute the degree matrix as the degree of each node and then spread across the diagonal
deg = tf.reduce_sum(adj, axis = 1)

# and we can now rerun our train Cora setup using the features
# dividing the adj matrix by the degree matrix, which is equivalent to multiplying to the inverse of the degree matrix
# This will now give us a normalized propagation rule, which should hopefully deal with any exploding signal that we might have
# And this should hopefully be more stable than the update we had before
train_cora(features, adj / deg, gnn, 32, 200, 0.01)
# we see that after some epocs is behaving better than the SUM-POOLING model, 
# The overall performance is more stable and strong improvement from the SUM-POOLING

# this says that it is a good idea to normalize our adjacency matrix in this way

Epoch 0 | Training loss: 1.9528046 | Val accuracy: 0.366 | Test accuracy: 0.439
Epoch 1 | Training loss: 1.7760805 | Val accuracy: 0.532 | Test accuracy: 0.585
Epoch 2 | Training loss: 1.5735261 | Val accuracy: 0.63 | Test accuracy: 0.684
Epoch 3 | Training loss: 1.3514179 | Val accuracy: 0.69 | Test accuracy: 0.73099995
Epoch 4 | Training loss: 1.136811 | Val accuracy: 0.73200005 | Test accuracy: 0.77399987
Epoch 5 | Training loss: 0.9328334 | Val accuracy: 0.766 | Test accuracy: 0.8039999
Epoch 6 | Training loss: 0.75224274 | Val accuracy: 0.7839999 | Test accuracy: 0.81799984
Epoch 7 | Training loss: 0.60083675 | Val accuracy: 0.79199994 | Test accuracy: 0.8159998


In [17]:
# And finally we're going to try out the specific version of the normalization 
# that Thomas Kipf has proposed in the Graph Convolution Network model
# This requires us to compute one over the square root of the degree and 
# then multiply that on both sides with the adj matrix

# So we can get the normalized adj matrix by first having this half normalized degree matrix
norm_deg = tf.linalg.diag(1.0 / tf.sqrt(deg))
# and then multiplying that with the product of the adj and the normalized degree matrix
norm_adj = tf.matmul(norm_deg, tf.matmul(adj, norm_deg)) # so this is the equivalent of taking d to the minus 1/2 and then multiplying it with the adj matrix on both sides

train_cora(features, norm_adj, gnn, 32, 200, 0.01)

# In average you should not see a significant difference between this one and the division by degree, at least not in this dataset
# but both imporove over the SUM-POOLING and are expected to perform roughly comparably in this particular setting


Epoch 0 | Training loss: 1.9455462 | Val accuracy: 0.536 | Test accuracy: 0.574
Epoch 1 | Training loss: 1.7705743 | Val accuracy: 0.628 | Test accuracy: 0.662
Epoch 2 | Training loss: 1.5604495 | Val accuracy: 0.642 | Test accuracy: 0.66400003
Epoch 3 | Training loss: 1.3300091 | Val accuracy: 0.658 | Test accuracy: 0.67800003
Epoch 4 | Training loss: 1.1126056 | Val accuracy: 0.69 | Test accuracy: 0.70699996
Epoch 5 | Training loss: 0.91319835 | Val accuracy: 0.73199993 | Test accuracy: 0.74399996
Epoch 6 | Training loss: 0.7381033 | Val accuracy: 0.75600004 | Test accuracy: 0.7789999
Epoch 7 | Training loss: 0.5881834 | Val accuracy: 0.77000004 | Test accuracy: 0.7999998
Epoch 8 | Training loss: 0.46325633 | Val accuracy: 0.778 | Test accuracy: 0.8019998
Epoch 9 | Training loss: 0.3614521 | Val accuracy: 0.782 | Test accuracy: 0.8069998
