# Geometric Deep Learning Project - Towards Sparse Hierarchical Graph Classifiers

*Alessia Ruggeri*

### Implementation of the paper *Towards Sparse Hierarchical Graph Classifiers* tested on Enzymes, Proteins and D&D biological datasets using Tensorflow 2.0.

In [0]:
!pip install tensorflow-gpu==2.0.0-alpha0

In [0]:
import os,sys,inspect
import networkx as nx
import numpy as np
import scipy
import scipy.sparse as sp
import matplotlib.pyplot as plt
import time
import math

import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Layer, Input, Dense, Flatten, Activation, Dropout, ReLU
from tensorflow.keras.regularizers import l2
from tensorflow.keras import Model
from sklearn.utils import shuffle

from load_data import read_graphfile

np.random.seed(0)

In [0]:
### Unzip datasets folders

# !unzip -o data.zip
!unzip -o data_ENZYMES.zip
# !unzip -o data_PROTEINS.zip
# !unzip -o data_DD.zip
# !unzip -o data_COLLAB.zip

In [0]:
### Load datasets from data.zip

# print("\nLoading ENZYMES...")
# graphs_ENZYMES = read_graphfile(datadir="data", dataname="ENZYMES", max_nodes=None)

# print("\nLoading DD...")
# graphs_DD = read_graphfile(datadir="data", dataname="DD", max_nodes=None)

# print("\nLoading PROTEINS...")
# graphs_PROTEINS = read_graphfile(datadir="data", dataname="PROTEINS", max_nodes=None)

# print("\nLoading COLLAB...")
# graphs_COLLAB = read_graphfile(datadir="data", dataname="COLLAB", max_nodes=None)


### Load datasets from data_DATASET.zip

print("\nLoading ENZYMES...")
graphs_ENZYMES = read_graphfile(datadir="data_ENZYMES", dataname="ENZYMES", max_nodes=None)

# print("\nLoading PROTEINS...")
# graphs_PROTEINS = read_graphfile(datadir="data_PROTEINS", dataname="PROTEINS", max_nodes=None)

# print("\nLoading DD...")
# graphs_DD = read_graphfile(datadir="data_DD", dataname="DD", max_nodes=None)

# print("\nLoading COLLAB...")
# graphs_COLLAB = read_graphfile(datadir="data_COLLAB", dataname="COLLAB", max_nodes=None)

print("\nDone.")

In [0]:
### Generic functions

def get_numberof_features(dataset_name):
  if dataset_name == "ENZYMES":
    return 18
  elif dataset_name == "PROTEINS":
    return 1
  return None

def preprocess_features(features):
  '''Row-normalize feature matrix and convert it to dense representation'''
  rowsum = np.array(features.sum(1))
  r_inv = np.power(rowsum, -1).flatten()
  r_inv[np.isinf(r_inv)] = 0.
  r_mat_inv = sp.diags(r_inv)
  features = r_mat_inv.dot(features)
  return features

def get_node_features_matrix(graph):
  '''It returns the node feature matrix of the graph with already preprocessed features'''
  Xdict = nx.get_node_attributes(graph, 'feat')
  X = np.array([Xdict[i] for i in range(nx.number_of_nodes(graph))])
  X = preprocess_features(X)
  X = X.astype(np.float32)
  return X

def get_adjacency_matrix(graph):
  '''It returns the adjacency matrix of the graph with inserted self-loops'''
  A = nx.adjacency_matrix(graph)
  A = np.array(A + np.eye(A.shape[0]))
  A = sp.coo_matrix(A.astype(np.float32))
  return A

def get_normalization_matrix(A):
  '''It returns the normalized adjacency matrix of the graph'''
  degrees = np.array(np.sum(A.todense(), axis=1)).flatten()
  degrees = np.power(degrees, -1)
  degrees[np.isinf(degrees)] = 0
  degrees = degrees.astype(np.float32)
  D = sp.diags(degrees, offsets=0).tocoo()
  return D

def get_normalized_adjacency_matrix(A):
  D = get_normalization_matrix(A)
  A_norm = D @ A
  return A_norm.tocoo()

def get_graphs_labels(dataset):
  '''It returns the class labels of all the graphs in the dataset'''
  labels = []
  for graph in dataset:
    labels.append(graph.graph['label'])
  labels = np.array([[labels[i]] for i in range(len(labels))])
  return labels

def dot(x, y, sparse=False):
  '''Wrapper for tf.matmul (sparse vs dense)'''
  if sparse:
      res = tf.sparse.sparse_dense_matmul(x, y)
  else:
      res = tf.matmul(x, y)
  return res
  
def convert_sparse_matrix_to_sparse_tensor(coo):
  indices = np.transpose(np.array([coo.row, coo.col]))
  return tf.SparseTensor(indices, coo.data.astype(np.float32), coo.shape)

def convert_nparray_to_sparse_tensor(nparray):
  tf_tensor = tf.constant(nparray)
  idx = tf.where(tf.not_equal(tf_tensor, 0))
  sparse_tensor = tf.SparseTensor(idx, tf.gather_nd(tf_tensor, idx), tf_tensor.get_shape())
  return sparse_tensor

def one_hot_encoding(data, n_classes):
    '''It one-hot encode data'''
    targets = np.array(data).reshape(-1)
    targets = np.eye(n_classes)[targets]
    return targets
  

In [0]:
### Execution functions

def convert_dataset_to_lists(dataset):
  feat = []
  adj = []
  for graph in dataset:
    X = get_node_features_matrix(graph)
    A = get_adjacency_matrix(graph)
    A_norm = get_normalized_adjacency_matrix(A)
    feat.append(X)
    adj.append(A_norm)
  return feat, adj

def create_batch_elements(X, A):
  '''It takes X and A lists and creates respective stack of nodes, block diagonal adjacency matrix and graph idx array'''
  X_stack = np.vstack(X)
  A_diag = sp.block_diag(A)
  A_diag = convert_sparse_matrix_to_sparse_tensor(A_diag)
  n_nodes = np.array([a.shape[0] for a in A])
  graph_idx = np.repeat(np.arange(len(n_nodes)), n_nodes)
  return X_stack, A_diag, graph_idx

def batch_generator(data, batch_size=32):
  '''It takes a list of arrays or matrices and it yields batches of given size'''
  len_data = len(data[0])
  batches_per_epoch = math.ceil(len_data/batch_size)
  for batch in range(batches_per_epoch):
    start = batch * batch_size
    end = min(start+batch_size, len_data)
    out = [item[start:end] for item in data]
    yield out

# @tf.function
def train_step(data, labels):
  with tf.GradientTape() as tape:
    predictions = model(data)
    loss = loss_object(labels, predictions)
  gradients = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(gradients, model.trainable_variables))

  train_loss(loss)
  train_accuracy(labels, predictions)
  
# @tf.function
def val_step(data, labels):
  predictions = model(data)
  t_loss = loss_object(labels, predictions)

  val_loss(t_loss)
  val_accuracy(labels, predictions)
  
# @tf.function
def test_step(data, labels):
  predictions = model(data)
  t_loss = loss_object(labels, predictions)

  test_loss(t_loss)
  test_accuracy(labels, predictions)

In [0]:
### Define the layers

class Convolutional(Layer):
  
  def __init__(self, F, F_1, **kwargs):
    self.F = F
    self.F_1 = F_1
    super(Convolutional, self).__init__(**kwargs)

  def build(self, input_shape):
    self.W1 = self.add_weight(name='W1', 
                             shape=(self.F, F_1),
                             initializer='uniform',
                             regularizer=None,
                             trainable=True)

    self.W2 = self.add_weight(name='W2', 
                             shape=(self.F, F_1),
                             initializer='uniform',
                             regularizer=None,
                             trainable=True)

    super(Convolutional, self).build(input_shape)  # Be sure to call this at the end
    
  def kernel(self, X, A):
    
    res = dot(A, X, sparse=True)
#     res = (N, F) -> AX
    res = dot(res, self.W1, sparse=False)
#     res = (N, F_1) -> AXW1
    skip_connection = dot(X, self.W2, sparse=False)
#     skip_connection = (N, F_1) -> XW2
    
    res = tf.math.add(res, skip_connection)
    res = ReLU()(res)
#     res = (N, F_1) -> sigma(AXW1 + XW2)
    
    return res
   
  def call(self, inputs):
    X = inputs[0]
    A = inputs[1]

    res = self.kernel(X, A)

    return res

  def compute_output_shape(self, input_shape):
    return (input_shape[0][0], self.F_1)

  
class GlobalAvgPooling(Layer):
  
  def __init__(self, **kwargs):
    super(GlobalAvgPooling, self).__init__(**kwargs)
  
  def call(self, inputs):
    nodes_feat = inputs[0]
    idx = inputs[1]
    
    res = tf.math.segment_mean(nodes_feat, idx)
    
    return res

  
class HierarchicalPooling(Layer):
  
  def __init__(self, **kwargs):
    super(HierarchicalPooling, self).__init__(**kwargs)
  
  def build(self, input_shape):
    self.W1 = self.add_weight(name='W1', 
                             shape=(self.F, F_1),
                             initializer='uniform',
                             regularizer=None,
                             trainable=True)
    
    super(HierarchicalPooling, self).build(input_shape)  # Be sure to call this at the end
  
  def call(self, inputs):
    
    return res

  def compute_output_shape(self, input_shape):
    return (input_shape[0][0], self.output_dim)


class MyModel(Model):
  def __init__(self, F, F_1, n_classes, dropout, reg):
    super(MyModel, self).__init__()
    
    self.conv1 = Convolutional(F=F, F_1=F_1)
    self.conv2 = Convolutional(F=F_1, F_1=F_1)
    self.conv3 = Convolutional(F=F_1, F_1=F_1)
    self.pool = GlobalAvgPooling()
    self.flat = Flatten()
    self.dense1 = Dense(256, activation='relu', kernel_regularizer=reg)
    self.drop = Dropout(dropout)
    self.dense2 = Dense(n_classes, activation='softmax', kernel_regularizer=reg)

  def call(self, inputs):
    X = inputs[0]
    A = inputs[1]
    idx = inputs[2]
    
    res = self.conv1([X, A])
    res = self.conv2([res, A])
    res = self.conv3([res, A])
    res = self.pool([res, idx])
    res = self.flat(res)
    res = self.dense1(res)
    res = self.drop(res)
    res = self.dense2(res)
    
    return res

In [16]:
dataset = graphs_ENZYMES
dataset_name = "ENZYMES"
# dataset = graphs_PROTEINS
# dataset_name = "PROTEINS"

labels = get_graphs_labels(dataset)
n_classes=len(np.unique(labels))

dataset, labels = shuffle(dataset, labels)

train_n = len(dataset)//100 * 80
val_n = len(dataset)//100 * 15

(x_val, y_val) = dataset[0:val_n], labels[0:val_n]
(x_train, y_train) = dataset[val_n:train_n], labels[val_n:train_n]
(x_test, y_test) = dataset[train_n:], labels[train_n:]

print('train:', len(x_train))
print('val:', len(x_val))
print('test:', len(x_test))

X_train, A_train = convert_dataset_to_lists(x_train)
X_val, A_val = convert_dataset_to_lists(x_val)
X_test, A_test = convert_dataset_to_lists(x_test)

y_train = one_hot_encoding(y_train, n_classes)
y_val = one_hot_encoding(y_val, n_classes)
y_test = one_hot_encoding(y_test, n_classes)


train: 390
val: 90
test: 120


In [0]:
# Hyperparameters

epochs=3000
batch_size = 32

dropout = 0.5
learning_rate = 5e-4
reg = l2(5e-4)

F = get_numberof_features(dataset_name)
F_1 = 128

In [0]:
# layer = Convolutional(F=F, F_1=F_1)
model = MyModel(F, F_1, n_classes, dropout, reg)
generator = batch_generator([X_val, A_val, y_val], batch_size=45)

for X, A, y in generator:
  X_batch, A_batch, idx_batch = create_batch_elements(X, A)
  
#   result = layer([X_batch, A_batch])
#   print("Results from conv:\n", result)
#   result = Flatten()(result)
#   result = Dense(n_classes, activation='softmax', kernel_regularizer=reg)(result)
#   print("Results from dense:\n", result)
#   print("\n\n")
  
  result = model([X_batch, A_batch, idx_batch])
  print(result)

In [0]:
### Set up model and training variables
model = MyModel(F, F_1, n_classes, dropout, reg)

loss_object = tf.keras.losses.CategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)


In [14]:
### Training loop
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.CategoricalAccuracy(name='train_accuracy')
val_loss = tf.keras.metrics.Mean(name='val_loss')
val_accuracy = tf.keras.metrics.CategoricalAccuracy(name='val_accuracy')

for epoch in range(epochs):
  
  X_train, A_train, y_train = shuffle(X_train, A_train, y_train)
  train_generator = batch_generator([X_train, A_train, y_train], batch_size=batch_size)
  val_generator = batch_generator([X_val, A_val, y_val], batch_size=batch_size)
  
  for X, A, y in train_generator:
    X_batch, A_batch, idx_batch = create_batch_elements(X, A)
    train_step([X_batch, A_batch, idx_batch], y)

  for X, A, y in val_generator:
    X_batch, A_batch, idx_batch = create_batch_elements(X, A)
    val_step([X_batch, A_batch, idx_batch], y)

  template = 'Epoch {} \t train_loss: {:.4f}\t train_accuracy: {:.4f}\t val_loss: {:.4f}\t val_accuracy: {:.4f}'
  print (template.format(epoch+1,
                         train_loss.result(),
                         train_accuracy.result()*100,
                         val_loss.result(),
                         val_accuracy.result()*100))

W0510 07:54:00.711981 140070750967680 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/array_grad.py:425: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.


Epoch 1 	 train_loss: 1.7917	 train_accuracy: 15.1282	 val_loss: 1.7913	 val_accuracy: 20.0000
Epoch 2 	 train_loss: 1.7905	 train_accuracy: 17.5641	 val_loss: 1.7912	 val_accuracy: 19.4444
Epoch 3 	 train_loss: 1.7867	 train_accuracy: 18.5470	 val_loss: 1.7901	 val_accuracy: 19.2593
Epoch 4 	 train_loss: 1.7810	 train_accuracy: 18.5256	 val_loss: 1.7880	 val_accuracy: 18.3333
Epoch 5 	 train_loss: 1.7724	 train_accuracy: 19.6410	 val_loss: 1.7869	 val_accuracy: 17.7778
Epoch 6 	 train_loss: 1.7664	 train_accuracy: 20.4274	 val_loss: 1.7867	 val_accuracy: 17.4074
Epoch 7 	 train_loss: 1.7584	 train_accuracy: 20.9524	 val_loss: 1.7876	 val_accuracy: 17.1429
Epoch 8 	 train_loss: 1.7560	 train_accuracy: 21.2500	 val_loss: 1.7886	 val_accuracy: 16.9444
Epoch 9 	 train_loss: 1.7519	 train_accuracy: 21.5670	 val_loss: 1.7884	 val_accuracy: 16.7901
Epoch 10 	 train_loss: 1.7493	 train_accuracy: 21.7949	 val_loss: 1.7880	 val_accuracy: 16.6667
Epoch 11 	 train_loss: 1.7468	 train_accuracy: 21

In [15]:
### Test loop
test_loss = tf.keras.metrics.Mean(name='test_loss')
test_accuracy = tf.keras.metrics.CategoricalAccuracy(name='test_accuracy')

test_generator = batch_generator([X_test, A_test, y_test], batch_size=batch_size)

for X, A, y in test_generator:
  X_batch, A_batch, idx_batch = create_batch_elements(X, A)
  test_step([X_batch, A_batch, idx_batch], y)

template = 'test_loss: {:.4f}\t test_accuracy: {:.4f}'
print (template.format(test_loss.result(),
                       test_accuracy.result()*100))

test_loss: 1.7228	 test_accuracy: 59.1667
