# Tutorial on a toy example

We will design a graph with four communities that overlap in a single central node.
The nodes in each community have a certain weight, and the weight for the central node will be the average
of the weights of all the communities.

A GNN will be trained to predict the weights of all nodes.

The explainer algorithm will then be used to identify the nodes and communities that are most relevant for the prediction on the central node.

# This experiment is a replica of the Experiment in Section 5.1 of "Community explanations in knowledge graphs with XP-GNN", by Andrés Martínez Mora, Dimitris Polychronopoulos, Michaël Ughetto, and Sebastian Nilsson

# Display example graph with communities colored differently, and all community weights

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Open a fresh figure, then read the pre-rendered picture of the toy graph
# from disk and draw it in that figure
plt.figure()
graph_picture = mpimg.imread("artificial_graph_networkx.png")
plt.imshow(graph_picture)

# Import libraries for analysis

In [None]:
import pandas as pd
import os,sys
import numpy as np
import networkx
import torch
from torch import nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv,Linear
from sklearn.metrics import r2_score
from torch_geometric.data import Data
import time

from pathway_explanations.explainer import Explainer, set_seed

In [None]:
# Fix the random seed once for the whole notebook; the same value is passed on
# to the explanation pipeline through `explanation_params["seed"]`.
# NOTE(review): `set_seed` comes from pathway_explanations.explainer; presumably
# it seeds the random number generators used below -- confirm in the package.
seed = 0
set_seed(seed)

# Set up graph data

In [None]:
# Auxiliary function

def manual_graph_labels(num_nodes,communities,weights):
    """
    Build node-wise labels for an artificial graph from a
    community structure and one weight per community.

    Every node is labelled with the mean of the weights of the
    communities it belongs to. A node that belongs to no community
    is labelled 0.

    Params
    ------
    num_nodes : int
        Node number
    communities : list of lists of int
        Node indices of every community (communities may overlap)
    weights : list of float
        Community-wise weights, in the same order as `communities`

    Returns
    -------
    Y : torch.tensor
        Node-wise labels of shape (num_nodes,), based on the
        community-wise weights

    """

    # Per node: running sum of community weights and number of memberships
    weight_sum = torch.zeros(num_nodes)
    weight_num = torch.zeros(num_nodes)
    for weight,community in zip(weights,communities):
        # Index with an explicit long tensor so that an empty community
        # is a harmless no-op
        community_tensor = torch.tensor(community,
                                        dtype=torch.long)
        weight_sum[community_tensor] += weight
        weight_num[community_tensor] += 1

    # Average over memberships; clamping the count to at least 1 avoids a
    # division by zero for nodes without community (their sum is 0 anyway)
    Y = weight_sum/weight_num.clamp(min=1)

    return Y

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Edge list of the toy graph, written as one row of [node, neighbour] pairs per
# node and transposed further below to shape (2, num_edges).
# NOTE(review): a few entries look irregular and should be checked against the
# figure of the graph:
#   - [24,27] is listed twice in the row of node 24;
#   - the last row (node 35) ends with [34,29] instead of a pair starting with 35;
#   - there is no row for node 15, it only appears as a neighbour (14, 16, 17);
#   - some pairs have no reversed counterpart (e.g. [3,2], [31,28]).

edge_index = torch.tensor([[0,1],[0,2],[0,4],
                             [1,0],[1,2],[1,5],
                             [2,1],[2,0],[2,4],[2,5],[2,6],[2,7],
                             [3,2],[3,4],[3,9],
                             [4,0],[4,2],[4,3],[4,9],
                             [5,1],[5,2],[5,6],
                             [6,5],[6,2],[6,7],
                             [7,6],[7,2],[7,8],
                             [8,7],[8,9],
                             [9,3],[9,4],[9,8],[9,10],
                             [10,9],[10,11],[10,19],[10,28],
                             [11,10],[11,16],[11,12],[11,18],
                             [12,11],[12,18],[12,13],
                             [13,12],[13,17],[13,14],
                             [14,13],[14,15],
                             [16,11],[16,18],[16,17],[16,15],
                             [17,15],[17,13],[17,18],[17,16],
                             [18,17],[18,12],[18,11],[18,16],
                             [19,10],[19,25],[19,24],
                             [20,25],[20,26],[20,21],
                             [21,20],[21,26],[21,27],[21,22],
                             [22,21],[22,27],[22,23],
                             [23,22],[23,24],
                             [24,23],[24,27],[24,19],[24,27],
                             [25,24],[25,19],[25,20],
                             [26,27],[26,21],[26,20],[26,25],
                             [27,24],[27,26],[27,21],[27,22],
                             [28,10],[28,30],[28,29],
                             [29,28],[29,30],[29,35],
                             [30,31],[30,32],[30,34],[30,35],[30,29],
                             [31,30],[31,28],[31,32],
                             [32,31],[32,33],[32,30],
                             [33,32],[33,34],
                             [34,33],[34,35],[34,30],
                             [35,30],[35,34],[34,29]],
                              dtype=torch.long,
                              device=device)


# Transpose from (num_edges, 2) to (2, num_edges)
edge_index = edge_index.T

# Set up community weights and community structure:
# one weight per community, in the same order as `communities`
weights = [1.0,0.6,0.25,0.75]

# Node indices of the four communities; node 10 is the only node that is
# listed in all of them
communities = [[0,1,2,3,4,5,6,7,8,9,10],
               [10,11,12,13,14,15,16,17,18],
               [10,19,20,21,22,23,24,25,26,27],
               [10,28,29,30,31,32,33,34,35]]
community_names = ["blue","red","green","orange"]

# Set node features as normally distributed.
# The number of nodes is taken as the largest index in the first row of
# edge_index plus one.

length = 16 # Feature size
features = torch.randn(int(torch.max(edge_index[0]).item())+1,
                       length, device=device)

# Node names are the node indices written as strings ("0", "1", ...)
node_names = ["{}".format(i) for i in range(features.shape[0])]

# Obtain training labels (mean community weight per node) and move them to
# the same device as the features
Y = manual_graph_labels(features.shape[0],
                        communities,weights)
Y = Y.to(device)

# Define train and test mask
# One uniform random draw per node: draws below test_size become 0 (test),
# draws above it become 1 (train), then the result is cast to bool.
# Central node is in the test set
test_size = 0.2
train_mask = torch.rand(Y.shape, device=device)
train_mask[train_mask < test_size] = 0
train_mask[train_mask > test_size] = 1
train_mask = train_mask.bool()
train_mask[10] = False
# Every node that is not used for training is a test node
test_mask = ~train_mask

# Model for training artificial graph

In [None]:
class GCN_homo(torch.nn.Module): # Homogeneous GCN
    """
    Graph convolutional network for node-level regression on a
    homogeneous graph.

    The network stacks GCNConv layers, each followed by a ReLU, and
    then fully-connected layers with ReLU in between. The last
    fully-connected layer has no activation, so the output is an
    unbounded score per node.

    Params
    ------
    node_features : int
        Size of the input feature vector of every node
    """

    def __init__(self, node_features):
        super().__init__()

        # Load hyperparameters
        seed = 0
        out_neurons = 1
        conv_layers = [16, 8, 8] # Output size of every convolution
        fc_layers = [8, 8, 16] # Input size of every fully-connected layer

        seed,out_neurons = int(seed),int(out_neurons)

        # Seed the weight initialization so the model is reproducible
        torch.manual_seed(seed)

        # Setup convolutional backbone: [GCNConv, ReLU] * len(conv_layers)
        conv_list = []
        in_channels = node_features
        for conv_layer in conv_layers:
            assert isinstance(conv_layer,(int,float)),"Size of convolutional layer is not numeric"
            out_channels = abs(int(conv_layer))

            conv_list.append(GCNConv(in_channels,out_channels))
            conv_list.append(nn.ReLU())

            # The next convolution consumes what this one produces
            in_channels = out_channels

        self.conv = nn.ModuleList(conv_list)

        # Setup linear backbone: Linear layers separated by ReLU, without an
        # activation after the last layer
        for fc_layer in fc_layers:
            assert isinstance(fc_layer,(int,float)),"Size of fully-connected layer is not numeric"

        fc_sizes = list(fc_layers) + [out_neurons]
        fc_list = []
        for enum_fc_layer,(fc_in,fc_out) in enumerate(zip(fc_sizes[:-1],fc_sizes[1:])):
            if enum_fc_layer > 0:
                fc_list.append(nn.ReLU())
            fc_list.append(Linear(fc_in,fc_out))

        self.fc = nn.ModuleList(fc_list)


    def forward(self,x,edge_index):
        """
        Compute one output value per node.

        Params
        ------
        x : torch.tensor
            Node features, one row per node
        edge_index : torch.tensor
            Graph connectivity, passed on to every GCNConv layer

        Returns
        -------
        x : torch.tensor
            Output of the last fully-connected layer, one row per node
        """

        for layer in self.conv: # Convolutional backbone
            if isinstance(layer,GCNConv): # Convolutions also need the graph
                x = layer(x, edge_index)
            else: # Activation function
                x = layer(x)

        for layer in self.fc: # Fully-connected backbone
            x = layer(x)

        return x

# Model training

In [None]:
# Call architecture
model = GCN_homo(features.shape[-1]).to(device)
print(model)

# Load optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

model.train()

for epoch in range(500):
    optimizer.zero_grad()

    # Forward pass on the whole graph: message passing needs every node,
    # including the held-out ones
    H = model(features,
              edge_index)

    # Use MSE as loss for weight prediction.
    # Only the training nodes contribute to the loss; otherwise the held-out
    # nodes (among them the central node 10) would leak into training and the
    # validation score below would not measure generalization.
    loss = F.mse_loss(H[train_mask].flatten(),
                      Y[train_mask].flatten())
    loss.backward()
    optimizer.step()

    # Validation score on the held-out nodes, from the same forward pass
    r2_val = r2_score(Y[test_mask].cpu().detach().numpy(),
                      H[test_mask].cpu().detach().numpy())

    print("Epoch:", epoch+1, "// Train loss:", round(loss.item(),4),"// Validation R2:", round(r2_val,4))

# Set up hyperparameters for explanation pipeline

In [None]:
# Settings handed to the Explainer. The seed is the one fixed at the top of
# the notebook; how the remaining keys are interpreted is defined by
# pathway_explanations.explainer.
explanation_params = {
    "seed": seed,
    "interpret_samples": 20,
    "epochs": 50,
    "optimizer": "adam",
    "lr": 0.01,
    "lr_patience": 10,
    "l1_lambda": 1e-4,
}

# Explanation pipeline execution

In [None]:
query_node = "10" # The central node is the node in position "10"
repeats = 10 # Number of initializations to be averaged for the pipeline
query_type = None

# Use a monotonic, high-resolution clock for the timing: unlike time.time()
# it is not affected by adjustments of the system clock
t1 = time.perf_counter()

# Define Explainer object from the graph data, the trained model, the
# hyperparameters and the community structure
pipeline = Explainer(
        features,
        edge_index,
        model.float(),
        explanation_params,
        node_names,
        communities,
        community_names,
        query_type,
        problem="node_prediction"
    )


# Run Explainer object: returns one table for the nodes and one for the
# communities
node_df, community_df = pipeline.run(query_node, repeats)

print("Time ellapsed for explanation for node {}: {}sec".format(query_node,round(time.perf_counter() - t1, 4)))

# Result checking

In [None]:
# Community ranking: show the community table returned by pipeline.run
# for the query node
community_df

In [None]:
# Node ranking: show the node table returned by pipeline.run
# for the query node
node_df