In [1]:
import numpy as np
import pandas as pd
import os
import torch_geometric.transforms as T 
from torch_geometric.datasets import Planetoid
import matplotlib.pyplot as plt 

In [2]:
import torch
import torch.nn as nn 
import torch.nn.functional as F 
from torch_geometric.nn import GCNConv, GraphSAGE, SAGEConv, GATConv
import networkx as nx 
import torch_geometric
from torch.nn import Parameter
from torch_geometric.utils import to_networkx
from torch_geometric.loader import NeighborLoader
from tqdm import tqdm

In [3]:
#importing pubmed dataset
dataset = Planetoid(root= '.', name = 'Pubmed')
data = dataset[0]

In [4]:
data

Data(x=[19717, 500], edge_index=[2, 88648], y=[19717], train_mask=[19717], val_mask=[19717], test_mask=[19717])

In [5]:
# view the dataset details
# Print information about the dataset
print(f'Dataset: {dataset}')
print('-------------------')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of nodes: {data.x.shape[0]}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

Dataset: Pubmed()
-------------------
Number of graphs: 1
Number of nodes: 19717
Number of features: 500
Number of classes: 3


In [6]:
# Print information about the graph
print(f'\nGraph:')
print('------')
print(f'Training nodes: {sum(data.train_mask).item()}')
print(f'Evaluation nodes: {sum(data.val_mask).item()}')
print(f'Test nodes: {sum(data.test_mask).item()}')
print(f'Edges are directed: {data.is_directed()}')
print(f'Graph has isolated nodes: {data.has_isolated_nodes()}')
print(f'Graph has loops: {data.has_self_loops()}')


Graph:
------
Training nodes: 60
Evaluation nodes: 500
Test nodes: 1000
Edges are directed: False
Graph has isolated nodes: False
Graph has loops: False


In [7]:
#set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
# Create batches with neighbor sampling
#A NeighborLoader is a data loader that performs neighbor sampling for GNN's
#Allows for mini-batch training of GNNs on large-scale graphs where full-batch training is not feasible.
#num_neighbors denotes how many neighbors are sampled for each node in each iteration.
#https://pytorch-geometric.readthedocs.io/en/latest/modules/loader.html#torch_geometric.loader.NeighborLoader
train_loader = NeighborLoader(
    data,
    num_neighbors=[5, 10],
    batch_size=16,
    input_nodes=data.train_mask,
)

In [9]:
train_loader

NeighborLoader()

### Let's build a graph convolutional network

In [10]:
class GCN(nn.Module):
    '''
    Graph Convolutional Network
    GCN takes graphs as an input and applies convolution operations over the graph
    '''
    def __init__(self) -> None:
        super().__init__()

        self.conv1 = GCNConv(dataset.num_node_features, 16)
        self.conv2 = GCNConv(16, dataset.num_classes)
        self.optim = torch.optim.Adam(self.parameters(), lr = 0.005, weight_decay=5e-4)


    def forward(self, x, edge_index):
        # x: Node feature matrix 
        # edge_index: Graph connectivity matrix 
        #x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training= self.training)
        x = self.conv2(x, edge_index)

        return x, F.log_softmax(x, dim = 1)

In [11]:
def accuracy(pred_y, y):
    """Calculate accuracy."""
    return ((pred_y == y).sum() / len(y)).item()

def train(model, data, epochs):
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = model.optim

    model.train()
    for epoch in range(epochs+1):
      total_loss = 0
      acc = 0
      val_loss = 0
      val_acc = 0

      # Train on batches
      for batch in train_loader :
        optimizer.zero_grad()
        
        _, out = model(batch.x, batch.edge_index)
        loss = criterion(out[batch.train_mask], batch.y[batch.train_mask])
        total_loss += loss
        acc += accuracy(out[batch.train_mask].argmax(dim=1), 
                        batch.y[batch.train_mask])
        loss.backward()
        optimizer.step()

        # Validation
        val_loss += criterion(out[batch.val_mask], batch.y[batch.val_mask])
        val_acc += accuracy(out[batch.val_mask].argmax(dim=1), 
                            batch.y[batch.val_mask])

      # Print metrics every 10 epochs
      if(epoch % 10 == 0):
          print(f'Epoch {epoch:>3} | Train Loss: {total_loss/len(train_loader):.3f} '
                f'| Train Acc: {acc/len(train_loader)*100:>6.2f}% | Val Loss: '
                f'{val_loss/len(train_loader):.2f} | Val Acc: '
                f'{val_acc/len(train_loader)*100:.2f}%')
          
def test(model, data):
    """Evaluate the model on test set and print the accuracy score."""
    model.eval()
    _, out = model(data.x, data.edge_index)
    acc = accuracy(out.argmax(dim=1)[data.test_mask], data.y[data.test_mask])
    return acc

In [12]:
# Create GCN
gcn = GCN().to(device)
print(gcn)

GCN(
  (conv1): GCNConv(500, 16)
  (conv2): GCNConv(16, 3)
)


In [13]:
# Train GCN
train(gcn, dataset, 200)

Epoch   0 | Train Loss: 1.111 | Train Acc:  22.47% | Val Loss: 1.11 | Val Acc: 30.42%
Epoch  10 | Train Loss: 0.834 | Train Acc:  89.73% | Val Loss: 0.84 | Val Acc: 76.01%
Epoch  20 | Train Loss: 0.520 | Train Acc:  93.41% | Val Loss: 0.61 | Val Acc: 79.34%
Epoch  30 | Train Loss: 0.335 | Train Acc: 100.00% | Val Loss: 0.55 | Val Acc: 72.68%
Epoch  40 | Train Loss: 0.293 | Train Acc:  96.45% | Val Loss: 0.49 | Val Acc: 72.22%
Epoch  50 | Train Loss: 0.161 | Train Acc: 100.00% | Val Loss: 0.37 | Val Acc: 81.25%
Epoch  60 | Train Loss: 0.164 | Train Acc:  97.06% | Val Loss: 0.49 | Val Acc: 81.25%
Epoch  70 | Train Loss: 0.125 | Train Acc: 100.00% | Val Loss: 0.51 | Val Acc: 79.46%
Epoch  80 | Train Loss: 0.087 | Train Acc: 100.00% | Val Loss: 0.39 | Val Acc: 92.71%
Epoch  90 | Train Loss: 0.096 | Train Acc:  98.53% | Val Loss: 0.38 | Val Acc: 84.52%
Epoch 100 | Train Loss: 0.094 | Train Acc: 100.00% | Val Loss: 0.32 | Val Acc: 89.90%
Epoch 110 | Train Loss: 0.102 | Train Acc:  98.53% | V

In [14]:
_, predictions = gcn(data.x, data.edge_index)

In [15]:
print(predictions.argmax(dim = 1))

tensor([1, 1, 0,  ..., 2, 0, 2])


In [16]:
for batch in train_loader:
    print(batch.y.shape)

torch.Size([381])
torch.Size([256])
torch.Size([283])
torch.Size([196])


In [17]:
data.edge_index.shape

torch.Size([2, 88648])

In [18]:
data

Data(x=[19717, 500], edge_index=[2, 88648], y=[19717], train_mask=[19717], val_mask=[19717], test_mask=[19717])

### Build custom CFGNN- EXPLAINER

imported and modified from github

In [19]:
from utils.utils import get_degree_matrix
from gcn_perturb import GCNSyntheticPerturb
from utils.utils import normalize_adj
from gcn import GCNSynthetic
from cf_explainer import CFExplainer
from utils.utils import normalize_adj, get_neighbourhood, safe_open
from torch_geometric.utils import dense_to_sparse
import time
import argparse
import pickle


In [20]:
# Get CF examples in test set
_, predictions = gcn(data.x, data.edge_index)
output = predictions.argmax(dim = 1)
test_cf_examples = []
start = time.time()
for i in range(data.x[data.val_mask].shape[0]):
	sub_adj, sub_feat, sub_labels, node_dict = get_neighbourhood(int(i), data.edge_index, 3 + 1, data.x, data.y)
	new_idx = node_dict[int(i)]
	with torch.no_grad():
		print("Output original model, full adj: {}".format(predictions[i]))
		# print(sub_feat.shape, sub_adj.shape)


	# Need to instantitate new cf model every time because size of P changes based on size of sub_adj
	explainer = CFExplainer(model=gcn,
							sub_adj=sub_adj,
							sub_feat=sub_feat,
							n_hid=3,
							dropout=0.0,
							sub_labels=sub_labels,
							y_pred_orig=output[i],
							num_classes = len([0,1,2]),
							beta=5e-4,
							device='cpu')
	if device == 'cuda':
		gcn.cuda()
		explainer.cf_model.cuda()
		adj = adj.cuda()
		norm_adj = norm_adj.cuda()
		features = features.cuda()
		labels = labels.cuda()
		idx_train = idx_train.cuda()
		idx_test = idx_test.cuda()

	cf_example = explainer.explain(node_idx=i, cf_optimizer='SGD', new_idx=new_idx, lr=1e-1,
	                               n_momentum=0.0, num_epochs=10)
	test_cf_examples.append(cf_example)
	print("Time for {} epochs of one example: {:.4f}min".format(10, (time.time() - start)/60))
print("Total time elapsed: {:.4f}s".format((time.time() - start)/60))
print("Number of CF examples found: {}/{}".format(len(test_cf_examples), len(idx_test)))


Output original model, full adj: tensor([-2.3145, -0.1846, -2.6627], requires_grad=True)
orig model requires_grad:  conv1.bias True
orig model requires_grad:  conv1.lin.weight True
orig model requires_grad:  conv2.bias True
orig model requires_grad:  conv2.lin.weight True
cf model requires_grad:  P_vec True
cf model requires_grad:  gc1.weight False
cf model requires_grad:  gc1.bias False
cf model requires_grad:  gc2.weight False
cf model requires_grad:  gc2.bias False
cf model requires_grad:  gc3.weight False
cf model requires_grad:  gc3.bias False
cf model requires_grad:  lin.weight False
cf model requires_grad:  lin.bias False


  clip_grad_norm(self.cf_model.parameters(), 2.0)


Node idx: 0 New idx: 0 Epoch: 0001 loss: 0.0000 pred loss: -1.3756 graph loss: 0.0000
Output: tensor([-1.0602, -1.3756, -0.9140])
 Output nondiff: tensor([-1.0602, -1.3756, -0.9140])
 orig pred: 1, new pred: 2, new pred nondiff: 2
 
Node idx: 0 New idx: 0 Epoch: 0002 loss: 0.0000 pred loss: -1.3756 graph loss: 0.0000
Output: tensor([-1.0602, -1.3756, -0.9140])
 Output nondiff: tensor([-1.0602, -1.3756, -0.9140])
 orig pred: 1, new pred: 2, new pred nondiff: 2
 
Node idx: 0 New idx: 0 Epoch: 0003 loss: 0.0000 pred loss: -1.3756 graph loss: 0.0000
Output: tensor([-1.0602, -1.3756, -0.9140])
 Output nondiff: tensor([-1.0602, -1.3756, -0.9140])
 orig pred: 1, new pred: 2, new pred nondiff: 2
 
Node idx: 0 New idx: 0 Epoch: 0004 loss: 0.0000 pred loss: -1.3756 graph loss: 0.0000
Output: tensor([-1.0602, -1.3756, -0.9140])
 Output nondiff: tensor([-1.0602, -1.3756, -0.9140])
 orig pred: 1, new pred: 2, new pred nondiff: 2
 
Node idx: 0 New idx: 0 Epoch: 0005 loss: 0.0000 pred loss: -1.3756 g

: 