In [None]:
# !pip install tensorflow==2.8.0
# !pip install spektral==1.0.6

In [None]:
import os
import numpy as np

import spektral as spktrl
import tensorflow as tf
keras = tf.keras

from spektral.datasets import Citation, TUDataset
from spektral.data import SingleLoader, DisjointLoader
from spektral.transforms import LayerPreprocess
from spektral.data import Dataset, Graph

import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

In [None]:
# Report library versions and the GPUs TensorFlow can see
spektral_version = spktrl.__version__
tensorflow_version = tf.__version__
print(f'Using Spektral {spektral_version}')
print(f'Using TensorFlow {tensorflow_version}')
print('Physical GPUs:', tf.config.list_physical_devices('GPU'))

# Practical graph neural networks in Python with TensorFlow and Spektral

*PyData Berlin, 2022.04.13*

**Abstract**


Graph neural networks (GNNs) have become one of the hottest research topics in recent years. Their popularity is reinforced by hugely successful industry applications in social networks, biology, chemistry, neuroscience and many other areas. One of the main challenges faced by data scientists and researchers who want to apply graph networks in their work is that they require different data structures and a slightly different training approach than traditional deep learning models. During the workshop we’ll demonstrate how to implement graph neural networks, how to prepare your data and – finally – how to train a GNN model for node-level and graph-level tasks using Spektral and TensorFlow.



## 1. Node classification with functional API

We'll perform node classification on the [CORA](https://relational.fit.cvut.cz/dataset/CORA) citation dataset.

### 1.1 Get the data

In [None]:
# Load the CORA citation graph, asking Spektral to normalise the node features
dataset = Citation(name="cora", normalize_x=True)

### 1.2 EDA

In [None]:
# Inspect the adjacency matrix of the first graph in the dataset
first_graph = dataset[0]
first_graph.a

In [None]:
# Inspect the shape of the label array of the first graph
node_labels = dataset[0].y
node_labels.shape

#### Exercise 1.2.1

Display the label of node 77. 


What is the label of this node?

In [None]:
# YOUR CODE HERE
# Hint: the labels of the first graph are available as `dataset[0].y`
...

In [None]:
# Let's understand the node features
node_features = dataset[0].x
node_features.shape

In [None]:
# Check the distribution of non-zero features over nodes:
# count the strictly positive entries in every row, then histogram the counts
nonzero_per_node = (dataset[0].x > 0).sum(axis=1)
plt.hist(nonzero_per_node, bins=100, alpha=.7)
plt.show()

In [None]:
# Understand the masks: display them in the order training, validation, test
split_masks = (dataset.mask_tr, dataset.mask_va, dataset.mask_te)
split_masks

#### Exercise 1.2.2

Compute the number of training, validation and test examples. 

What are these numbers?

In [None]:
# YOUR CODE HERE
# Replace each `...` placeholder; the validation line below is already
# filled in and shows the pattern (summing a mask counts its selected nodes).
print(f'Number of training examples: {...}')
print(f'Number of validation examples: {dataset.mask_va.sum()}')
print(f'Number of test examples: {...}')

### 1.3 Prepare dataloaders

In [None]:
def mask_to_weights(mask):
    """Turn a node mask into per-node sample weights.

    Selected nodes get weight 1 / (number of selected nodes), all other
    nodes get weight 0. Combined with a loss that uses `reduction='sum'`
    this yields the mean loss over the selected nodes only.

    :param mask: array marking the nodes that belong to a split
    :return: float32 array of the same shape as `mask`
    """
    return mask.astype(np.float32) / np.count_nonzero(mask)


# The whole graph is always fed to the model, so the train / validation
# split has to be expressed through sample weights. Without them the model
# would be fitted (and validated) on every node, test nodes included.
loader_tr = SingleLoader(dataset, sample_weights=mask_to_weights(dataset.mask_tr))
loader_va = SingleLoader(dataset, sample_weights=mask_to_weights(dataset.mask_va))

### 1.4 Build and compile the model

#### 1.4.1 Build

In [None]:
# Inputs: one feature row per node and one (sparse) adjacency row per node
n_node_features = dataset[0].x.shape[1]
n_nodes = dataset[0].a.shape[0]

in_x = keras.Input(shape=(n_node_features,))
in_a = keras.Input(shape=(n_nodes,), sparse=True)

In [None]:
# Apply dropout to the node features only; the adjacency matrix is left as is
feature_dropout = keras.layers.Dropout(rate=.1)
dropout_1 = feature_dropout(in_x)

In [None]:
# Hidden graph-attention layer: 8 attention heads with 16 channels each,
# whose outputs are concatenated (concat_heads=True)
hidden_gat = spktrl.layers.GATConv(
    channels=16,
    attn_heads=8,
    concat_heads=True,
    dropout_rate=.05,
    activation='selu',
    kernel_initializer='lecun_normal'
)
gat_layer_1 = hidden_gat([dropout_1, in_a])

In [None]:
# Apply dropout to the output of the hidden GAT layer
hidden_dropout = keras.layers.Dropout(rate=.1)
dropout_2 = hidden_dropout(gat_layer_1)

In [None]:
# Output graph-attention layer: one channel per class, heads are not
# concatenated (concat_heads=False), softmax turns the result into class scores
n_classes = dataset[0].n_labels
output_gat = spktrl.layers.GATConv(
    channels=n_classes,
    attn_heads=8,
    concat_heads=False,
    dropout_rate=.05,
    activation='softmax'
)
gat_out = output_gat([dropout_2, in_a])

In [None]:
# Wrap the layer graph into a Keras model: (features, adjacency) -> class scores
model_inputs = [in_x, in_a]
model = keras.Model(inputs=model_inputs, outputs=gat_out)

#### 1.4.2 Setup and compile

In [None]:
# Training hyper-parameters
LR = 0.005       # Learning rate
EPOCHS = 10_000  # Upper bound on the number of training epochs
PATIENCE = 30    # Patience (in epochs) for early stopping

In [None]:
# Compile the model
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias
# that makes TensorFlow 2.x emit a deprecation warning.
optimizer = keras.optimizers.Adam(learning_rate=LR)
model.compile(
    optimizer=optimizer,
    # reduction='sum' adds up the per-node losses instead of averaging them
    loss=keras.losses.CategoricalCrossentropy(reduction='sum'),
    # Metrics listed here are weighted by the sample weights, when the data provides them
    weighted_metrics=['acc'],
)
model.summary()

#### 1.4.3 Train

In [None]:
# Stop training after PATIENCE epochs without improvement (restoring the best
# weights) and shrink the learning rate after half as many stagnant epochs.
early_stopping = keras.callbacks.EarlyStopping(
    patience=PATIENCE,
    restore_best_weights=True,
)
reduce_lr = keras.callbacks.ReduceLROnPlateau(
    factor=.9,
    patience=PATIENCE//2,
    min_lr=5e-6,
)
callbacks = [early_stopping, reduce_lr]

train_data = loader_tr.load()
val_data = loader_va.load()

history = model.fit(
    train_data,
    steps_per_epoch=loader_tr.steps_per_epoch,
    validation_data=val_data,
    validation_steps=loader_va.steps_per_epoch,
    epochs=EPOCHS,
    callbacks=callbacks,
)

In [None]:
# Plot the training and validation loss curves recorded by `fit`
for history_key, curve_label in (('loss', 'Train'), ('val_loss', 'Val')):
    plt.plot(history.history[history_key], label=curve_label, lw=2)
plt.legend()
plt.show()

### 1.5 Evaluate

In [None]:
# Evaluate model on the test nodes only
print("Evaluating model.")
# Per-node weights: 1 / n_test for test nodes and 0 for all others, so that
# the summed loss and the weighted accuracy only reflect held-out nodes.
# Without these weights the "test" metrics would cover every node in the graph.
weights_te = dataset.mask_te.astype(np.float32) / np.count_nonzero(dataset.mask_te)
loader_te = SingleLoader(dataset, sample_weights=weights_te)
eval_results = model.evaluate(loader_te.load(), steps=loader_te.steps_per_epoch)
print("Done.\n" "Test loss: {}\n" "Test accuracy: {}".format(*eval_results))

#### Exercise 1.4.1

Add one more hidden GAT layer, build, compile, train and evaluate the model.

In [None]:
# Exercise scaffold: the same architecture as above with one extra hidden
# GAT layer + dropout to be filled in between the two markers below.

# Inputs
in_x = keras.Input(shape=(dataset[0].x.shape[1],))
in_a = keras.Input(shape=(dataset[0].a.shape[0],), sparse=True)

# Add dropout on features (but not adjacency matrix)
dropout_1 = keras.layers.Dropout(.1)(in_x)

# Add GAT layer
gat_layer_1 = spktrl.layers.GATConv(
    channels=16,
    attn_heads=8,
    concat_heads=True,
    dropout_rate=.05,
    activation='selu',
    kernel_initializer='lecun_normal'
)([dropout_1, in_a])

# Add dropout
dropout_2 = keras.layers.Dropout(.1)(gat_layer_1)



######## YOUR CODE STARTS HERE ########

# Add another GAT layer
# (it should consume `dropout_2` together with the adjacency input `in_a`)
gat_layer_2 = ...

# Add another dropout layer
# (the final layer below reads its features from `dropout_3`)
dropout_3 = ...

######## YOUR CODE ENDS HERE ########


# Final GAT layer
gat_out = spktrl.layers.GATConv(
    channels=dataset[0].n_labels,
    attn_heads=8,
    concat_heads=False,
    dropout_rate=.05,
    activation='softmax'
)([dropout_3, in_a])

In [None]:
# Enclose the layers in the model
model = keras.Model(inputs=[in_x, in_a], outputs=gat_out)

# Set some params
LR = 5e-3  # Learning rate
EPOCHS = 10000  # Number of training epochs
PATIENCE = 30  # Patience for early stopping

# Compile the model
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias
# that makes TensorFlow 2.x emit a deprecation warning.
optimizer = keras.optimizers.Adam(learning_rate=LR)
model.compile(
    optimizer=optimizer,
    # reduction='sum' adds up the per-node losses instead of averaging them
    loss=keras.losses.CategoricalCrossentropy(reduction='sum'),
    # Metrics listed here are weighted by the sample weights, when the data provides them
    weighted_metrics=['acc'],
)
model.summary()

In [None]:
# Early stopping (restoring the best weights) plus learning-rate reduction
# after PATIENCE // 2 epochs without improvement.
callbacks = [
    keras.callbacks.EarlyStopping(patience=PATIENCE, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(patience=PATIENCE//2, min_lr=5e-6, factor=.9)
]



######## YOUR CODE STARTS HERE ########
# Replace the two `...` placeholders: the number of validation steps and the
# callbacks to use during training.
history = model.fit(
    loader_tr.load(),
    steps_per_epoch=loader_tr.steps_per_epoch,
    validation_data=loader_va.load(),
    validation_steps=...,
    epochs=EPOCHS,
    callbacks=...
)
######## YOUR CODE ENDS HERE ########

In [None]:
# Plot the training and validation loss curves recorded by `fit`
for history_key, curve_label in (('loss', 'Train'), ('val_loss', 'Val')):
    plt.plot(history.history[history_key], label=curve_label, lw=2)
plt.legend()
plt.show()

In [None]:
# Evaluate model on the test nodes only
print("Evaluating model.")
# Per-node weights: 1 / n_test for test nodes and 0 for all others, so that
# the summed loss and the weighted accuracy only reflect held-out nodes.
# Without these weights the "test" metrics would cover every node in the graph.
weights_te = dataset.mask_te.astype(np.float32) / np.count_nonzero(dataset.mask_te)
loader_te = SingleLoader(dataset, sample_weights=weights_te)
eval_results = model.evaluate(loader_te.load(), steps=loader_te.steps_per_epoch)
print("Done.\n" "Test loss: {}\n" "Test accuracy: {}".format(*eval_results))

## 2. Graph classification with model sub-classing API

We'll use the **Proteins** dataset, part of the [TU Datasets](https://chrsmrrs.github.io/datasets/) collection.

Proteins dataset is stored in a **disjoint** format.



<img src="img/disjoint.png" width=400>


We'll need not only the adjacency matrix and the feature matrix, but also an index vector that identifies which graph in the batch each node belongs to.

### 2.1 Get the data

In [None]:
# Load the PROTEINS graph-classification dataset (using its `clean` variant)
dataset = TUDataset(name="PROTEINS", clean=True)

#### Exercise 2.1.1

Check how many nodes are in the 8th graph of **Proteins** dataset.

How many are there in 172nd?

In [None]:
# YOUR CODE HERE
# Hint: graphs are accessed by index (`dataset[i]`); remember that Python
# indices start at 0.
...

### 2.2 Split + dataloaders

In [None]:
# Train / test split
SPLIT_SEED = 42     # Fixed seed so that the random split is reproducible
LOADER_EPOCHS = 10  # Must match the `epochs` passed to `model.fit` below

rng = np.random.default_rng(SPLIT_SEED)
idxs = rng.permutation(len(dataset))  # Random (but repeatable) split
split = int(0.9 * len(dataset))       # 90% of the graphs go to training
idx_tr, idx_te = np.split(idxs, [split])

# Get train and test datasets
dataset_tr, dataset_te = dataset[idx_tr], dataset[idx_te]

# Get loaders
# A loader stops yielding batches after `epochs` passes over its data.
# `loader_te` is used as validation data during `fit`, i.e. it is consumed
# once per training epoch, so it needs as many epochs as the training loader;
# with `epochs=1` validation would run out of data after the first epoch.
loader_tr = DisjointLoader(dataset_tr, batch_size=32, epochs=LOADER_EPOCHS)
loader_te = DisjointLoader(dataset_te, batch_size=32, epochs=LOADER_EPOCHS)

### 2.3 Define the model

In [None]:
class GCN(keras.models.Model):
    """Graph-level classifier built from stacked `GCNConv` layers.

    Node features go through `n_layers` graph convolutions, are averaged per
    graph with `GlobalAvgPool`, and are then classified by a small dense head
    (Dense + Dropout + softmax Dense).

    NOTE(review): no activation is passed to the `GCNConv` layers, so they use
    the layer's default — confirm this is intended. Also confirm whether the
    adjacency matrices should be pre-processed for `GCNConv` before training.
    """

    def __init__(self, channels, n_layers, dropout_rate=.2, n_labels=None):
        """
        :param channels: Number of output channels of every `GCNConv` layer and of the hidden dense layer
        :type channels: int
        :param n_layers: Total number of `GCNConv` layers; one layer is always created, even for `n_layers < 1`
        :type n_layers: int
        :param dropout_rate: Dropout rate applied between the two dense layers
        :type dropout_rate: float
        :param n_labels: Size of the softmax output. When omitted, the value is read from the
            notebook-level `dataset` (the original behaviour)
        :type n_labels: int or None
        """

        super().__init__()

        # Prefer an explicit output size; fall back to the global dataset only
        # when none is given, so the model does not silently depend on which
        # dataset the name `dataset` happens to be bound to.
        if n_labels is None:
            n_labels = dataset.n_labels

        # First convolution plus (n_layers - 1) further convolutions
        self.conv1 = spktrl.layers.GCNConv(channels)
        self.convs = [spktrl.layers.GCNConv(channels) for _ in range(1, n_layers)]

        self.pool = spktrl.layers.GlobalAvgPool()
        self.dense1 = keras.layers.Dense(channels, activation='relu')
        self.dropout = keras.layers.Dropout(dropout_rate)
        self.dense2 = keras.layers.Dense(n_labels, activation='softmax')

    def call(self, inputs):
        """Run the forward pass.

        :param inputs: Triple `(x, a, i)`: node features, adjacency matrix and
            the index passed to the pooling layer together with the features
        :return: Output of the final softmax layer
        """
        x, a, i = inputs
        x = self.conv1([x, a])
        for conv in self.convs:
            x = conv([x, a])
        # Collapse node-level features into one vector per graph
        x = self.pool([x, i])
        x = self.dense1(x)
        x = self.dropout(x)
        return self.dense2(x)

### 2.3 Compile, train & evaluate 

#### 2.3.1 Setup

In [None]:
# Training hyper-parameters
LR = 0.005     # Learning rate
EPOCHS = 10    # Number of training epochs
PATIENCE = 30  # Patience (in epochs) for early stopping

In [None]:
# Instantiate the model: two GCN layers with 16 channels each
gcn_config = dict(channels=16, n_layers=2, dropout_rate=.1)
model = GCN(**gcn_config)

In [None]:
# Define the optimizer and the loss
optimizer = keras.optimizers.RMSprop(learning_rate=LR)
summed_crossentropy = keras.losses.CategoricalCrossentropy(reduction='sum')

# Compile the model
model.compile(
    optimizer=optimizer,
    loss=summed_crossentropy,
    weighted_metrics=['acc'],
)


In [None]:
# Train on the training loader; the test loader doubles as validation data
train_data = loader_tr.load()
val_data = loader_te.load()

history = model.fit(
    train_data,
    steps_per_epoch=loader_tr.steps_per_epoch,
    validation_data=val_data,
    validation_steps=loader_te.steps_per_epoch,
    epochs=EPOCHS
)

#### Exercise 2.3.1

Train a GCN with:

* 32 channels 
* 6 layers
* Adam optimizer (use the same learning rate, `LR`)

Are the results better?

In [None]:
# Get loaders
# A loader stops yielding batches after `epochs` passes over its data.
# Both loaders are consumed once per training epoch (`loader_te` feeds the
# per-epoch validation pass), so both must provide EPOCHS epochs of batches;
# with `epochs=1` validation would run out of data after the first epoch.
loader_tr = DisjointLoader(dataset_tr, batch_size=32, epochs=EPOCHS)
loader_te = DisjointLoader(dataset_te, batch_size=32, epochs=EPOCHS)

######## YOUR CODE STARTS HERE ########
model = ...

# Define the optimizer
optimizer = ...

# Compile the model
model.compile(
    optimizer=optimizer,
    loss=keras.losses.CategoricalCrossentropy(reduction='sum'),
    weighted_metrics=['acc'],
)

history = model.fit(
    loader_tr.load(),
    steps_per_epoch=loader_tr.steps_per_epoch,
    validation_data=loader_te.load(),
    validation_steps=loader_te.steps_per_epoch,
    epochs=EPOCHS
)

## 3. Building a custom dataset

To build your own dataset, you should store your data in a specific location. 

Locally it's: `~/.spektral/datasets/[ClassName]`

You can overwrite it by defining the `path` property of a `Dataset` class. 



Path on **Colab**: `/usr/local/lib/python3.7/dist-packages/spektral/datasets`


___________________________

<img src="img/tensorcell.png" width=150>

<br>

Now, we're going to look at a dataset class that we used in one of our experiments at [TensorCell](https://www.tensorcell.com/)




___________________________

In [None]:
class TensorcellDataset(Dataset):

    """A Tensorcell dataset.

    Two text files are read from `self.path`: `<district>_A.txt` holds the
    adjacency matrix shared by all graphs, and `<district>_X_<n_rows>.txt`
    holds one comma-separated sample per row. In every row the last column is
    the graph-level target and the remaining columns are per-node values.

    NOTE(review): `circular_mapping=True` calls `self.get_circular_components`,
    which is not defined in the visible part of this class — confirm it is
    provided elsewhere before enabling that option.
    """

    def __init__(self, dataset_variant, allow_self_loops=True, circular_mapping=False, add_constant_feature=False, add_one_hot_index=False, **kwargs):
        """
        :param dataset_variant: A dataset to pick, in the form `<district>_<n_rows>`.
            Currently takes: `ochota_100k`, `centrum_100k`, `mokotow_100k`
        :type dataset_variant: str
        :param allow_self_loops: If True the diagonal of the adjacency matrix is set to 1,
            otherwise it is set to 0
        :type allow_self_loops: bool
        :param circular_mapping: If node values should be mapped to a unit circle
        :type circular_mapping: bool
        :param add_constant_feature: If a constant feature equal to 1 should be appended to every node
        :type add_constant_feature: bool
        :param add_one_hot_index: If a one-hot encoding of the node index should be appended to every node
        :type add_one_hot_index: bool
        :param kwargs: Forwarded unchanged to `Dataset.__init__`
        :return: None
        :rtype: None
        """

        self.dataset_variant = dataset_variant
        self.allow_self_loops = allow_self_loops
        self.circular_mapping = circular_mapping
        self.add_constant_feature = add_constant_feature
        self.add_one_hot_index = add_one_hot_index

        # Construct filenames
        dataset_info = dataset_variant.split('_')
        district = dataset_info[0]
        n_rows = dataset_info[1]

        self.filename_A = f'{district}_A.txt'
        self.filename_Xy = f'{district}_X_{n_rows}.txt'

        # The attributes above are assigned before the base-class constructor
        # runs; the original code does the same (presumably because the base
        # class calls `read()` during construction — confirm).
        super().__init__(**kwargs)


    def read(self):

        """
        Build one `Graph` per row of the feature file.

        :return: output
        :rtype: list
        """

        # Read files
        adjacency_matrix = np.loadtxt(os.path.join(self.path, self.filename_A))
        # ndmin=2 keeps a file with a single sample two-dimensional, so that
        # iterating over `features` always walks over samples (rows)
        features = np.loadtxt(
            os.path.join(self.path, self.filename_Xy),
            delimiter=',',
            ndmin=2,
        )

        # Add/remove self loops in the adjacency matrix
        np.fill_diagonal(adjacency_matrix, 1 if self.allow_self_loops else 0)

        # We must return a list of Graph objects
        output = []

        # Construct graph objects
        for sample in features:

            node_values, target = sample[:-1], sample[-1]

            # If `circular_mapping` -> map to a circular representation
            if self.circular_mapping:
                x = self.get_circular_components(node_values).T
            else:
                # One scalar feature per node -> column vector
                x = node_values[:, np.newaxis]

            n_nodes = x.shape[0]

            # Add constant feature 1
            if self.add_constant_feature:
                x = np.hstack([x, np.ones((n_nodes, 1))])

            # Add one-hot encoded node index: row i of the identity matrix
            # is the one-hot vector of node i
            if self.add_one_hot_index:
                x = np.hstack([x, np.eye(n_nodes)])

            # Construct a graph (all graphs share the same adjacency matrix object)
            output.append(
                Graph(
                    x=x,
                    a=adjacency_matrix,
                    y=target)
            )

        return output

In [None]:
# Build the dataset for the `ochota` district; the class derives the file
# names `ochota_A.txt` and `ochota_X_100k.txt` from this variant string
dataset = TensorcellDataset(dataset_variant='ochota_100k')