## GNN for graph classification

In [5]:

import numpy as np
import networkx as nx
from random import randint
from gensim.models import Word2Vec

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt


from scipy.sparse.linalg import eigs
from scipy.sparse import diags, eye

from sklearn.linear_model import LogisticRegression
from sklearn.manifold import SpectralEmbedding
from sklearn.metrics import accuracy_score

import scipy.sparse as sp
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder


import time
import torch.optim as optim


In [9]:
def normalize_adjacency(A):
    """Symmetrically normalize an adjacency matrix after adding self-loops.

    Computes ``D^{-1/2} (A + I) D^{-1/2}`` where ``D`` is the diagonal matrix
    holding the row sums of ``A + I``.

    Parameters
    ----------
    A : scipy.sparse matrix
        Square adjacency matrix of shape ``(n, n)``.

    Returns
    -------
    scipy.sparse matrix
        The normalized adjacency matrix, shape ``(n, n)``.
    """
    # Ã = A + I (add a self-loop to every node)
    A_tilde = A + sp.identity(A.shape[0])

    # Row sums of Ã, flattened to a 1-D vector. `ravel` (rather than
    # `squeeze`) keeps the vector 1-D even for a single-node graph, where
    # squeezing would yield a 0-d array that `sp.diags` cannot consume.
    degrees = np.asarray(A_tilde.sum(axis=1)).ravel()
    D_inv_sqrt = 1 / np.sqrt(degrees)
    D_inv_sqrt_matrix = sp.diags(D_inv_sqrt, format='csc')

    # Normalized adjacency matrix: D^{-1/2} Ã D^{-1/2}
    A_normalized = D_inv_sqrt_matrix @ A_tilde @ D_inv_sqrt_matrix

    return A_normalized


def load_cora(data_dir="./data"):
    """Load the Cora dataset from ``cora.content`` and ``cora.cites``.

    Each row of ``cora.content`` is read as: node id, feature columns, class
    label. Each row of ``cora.cites`` is read as a pair of node ids.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing ``cora.content`` and ``cora.cites``.
        Defaults to ``"./data"``.

    Returns
    -------
    features : numpy.matrix
        Row-normalized float32 feature matrix, one row per node. Rows whose
        features are all zero are left as zeros.
    adj : scipy.sparse matrix
        Symmetric ``(n_nodes, n_nodes)`` float32 adjacency matrix.
    class_labels : numpy.ndarray
        Integer class code per node; codes follow the sorted order of the
        distinct label strings.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    idx_features_labels = np.genfromtxt(os.path.join(data_dir, "cora.content"), dtype=np.dtype(str))

    # Cast the string feature columns to float before building the sparse
    # matrix, so that "0" entries become real zeros rather than stored values.
    raw_features = idx_features_labels[:, 1:-1].astype(np.float32)
    features = sp.csr_matrix(raw_features).todense()

    # Row-normalize. A row that sums to zero is divided by 1 instead of 0,
    # which leaves it all-zero rather than turning it into NaNs.
    row_sums = np.asarray(features.sum(1))
    row_sums[row_sums == 0] = 1
    features /= row_sums

    # Encode the label strings as integers (index into the sorted unique labels).
    _, class_labels = np.unique(idx_features_labels[:, -1], return_inverse=True)

    # build graph: map the node ids of the content file to row indices
    idx = np.array(idx_features_labels[:, 0], dtype=np.int32)
    idx_map = {j: i for i, j in enumerate(idx)}
    edges_unordered = np.genfromtxt(os.path.join(data_dir, "cora.cites"), dtype=np.int32)
    edges = np.array(list(map(idx_map.get, edges_unordered.flatten())), dtype=np.int32).reshape(edges_unordered.shape)
    adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])), shape=(class_labels.size, class_labels.size), dtype=np.float32)

    # build symmetric adjacency matrix: wherever adj.T exceeds adj, take the
    # adj.T entry (i.e. the element-wise maximum of adj and adj.T)
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)

    # The reported edge count is the number of rows in the cites file.
    print('Dataset has {} nodes, {} edges, {} features.'.format(adj.shape[0], edges.shape[0], features.shape[1]))

    return features, adj, class_labels


def sparse_to_torch_sparse(M):
    """Convert a SciPy sparse matrix to a sparse COO PyTorch tensor.

    Parameters
    ----------
    M : scipy.sparse matrix
        Matrix to convert; any SciPy sparse format offering ``tocoo()``.

    Returns
    -------
    torch.Tensor
        Sparse COO tensor of dtype ``torch.float32`` with the same shape as ``M``.
    """
    M = M.tocoo().astype(np.float32)
    # Sparse COO indices must be int64: one row of row indices, one of column indices.
    indices = torch.from_numpy(np.vstack((M.row, M.col)).astype(np.int64))
    values = torch.from_numpy(M.data)
    shape = torch.Size(M.shape)
    # torch.sparse_coo_tensor replaces the deprecated legacy constructor
    # torch.sparse.FloatTensor(indices, values, shape).
    return torch.sparse_coo_tensor(indices, values, shape)

In [None]:
class GNN(nn.Module):
    """Two message-passing layers followed by a linear output layer.

    Taken from Lab 5 (graph neural networks) of the ALTEGRAD course, MVA 2023.
    Lecture: Prof. Michalis Vazirgiannis
    Lab: Dr. Giannis Nikolentzos & Dr. Johannes Lutzeyer
    """

    def __init__(self, n_feat, n_hidden_1, n_hidden_2, n_class, dropout):
        """Create the three linear layers, the dropout module and the ReLU.

        n_feat, n_hidden_1, n_hidden_2 and n_class are the input, first hidden,
        second hidden and output widths; dropout is the probability handed to
        nn.Dropout.
        """
        super().__init__()

        # Linear maps: n_feat -> n_hidden_1 -> n_hidden_2 -> n_class
        self.fc1 = nn.Linear(n_feat, n_hidden_1)
        self.fc2 = nn.Linear(n_hidden_1, n_hidden_2)
        self.fc3 = nn.Linear(n_hidden_2, n_class)

        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x_in, adj):
        """Return ``(log_probs, embeddings)`` for the input features.

        ``adj`` is multiplied from the left onto the output of each of the two
        hidden linear layers. ``log_probs`` is the log-softmax over dim 1 of
        the output layer; ``embeddings`` is the activation of the second
        hidden layer.
        """
        # Layer 1: linear map, multiply by adj, ReLU, then dropout
        hidden = self.fc1(x_in)
        hidden = torch.mm(adj, hidden)
        hidden = self.dropout(self.relu(hidden))

        # Layer 2: linear map, multiply by adj, ReLU (no dropout here)
        embeddings = self.fc2(hidden)
        embeddings = self.relu(torch.mm(adj, embeddings))

        # Output layer: class scores turned into log-probabilities
        logits = self.fc3(embeddings)
        log_probs = F.log_softmax(logits, dim=1)

        return log_probs, embeddings


# 

In [11]:
# Initialize device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Seed the random number generators so that the random split below and the
# model's weight initialization are the same on every Restart & Run All
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Hyperparameters
epochs = 100
n_hidden_1 = 64
n_hidden_2 = 32
learning_rate = 0.01
dropout_rate = 0.5

# Read data
features, adj, class_labels = load_cora()
n = adj.shape[0] # Number of nodes
n_class = np.unique(class_labels).size # Number of classes

adj = normalize_adjacency(adj) # Normalize adjacency matrix

# Yields indices to split data into training, validation and test sets
# (60% / 20% / 20% of a random permutation of the node indices)
idx = np.random.permutation(n)
idx_train = idx[:int(0.6*n)]
idx_val = idx[int(0.6*n):int(0.8*n)]
idx_test = idx[int(0.8*n):]

# Transform the numpy matrices/vectors to torch tensors
features = torch.FloatTensor(features).to(device)
y = torch.LongTensor(class_labels).to(device)
adj = sparse_to_torch_sparse(adj).to(device)
idx_train = torch.LongTensor(idx_train).to(device)
idx_val = torch.LongTensor(idx_val).to(device)
idx_test = torch.LongTensor(idx_test).to(device)

# Creates the model and specifies the optimizer
model = GNN(features.shape[1], n_hidden_1, n_hidden_2, n_class, dropout_rate).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

Dataset has 2708 nodes, 5429 edges, 1433 features.


**Training**

In [12]:
def train(epoch):
    """Run one full-batch training step, then report train/validation metrics.

    Uses the module-level ``model``, ``optimizer``, ``features``, ``adj``,
    ``y``, ``idx_train`` and ``idx_val``.

    Parameters
    ----------
    epoch : int
        Zero-based epoch index; it is printed as ``epoch + 1``.
    """
    t = time.time()

    # Training step. The training loss and accuracy come from this
    # training-mode forward pass, i.e. before the weights are updated.
    model.train()
    optimizer.zero_grad()
    output, _ = model(features, adj)
    loss_train = F.nll_loss(output[idx_train], y[idx_train])
    acc_train = accuracy_score(y[idx_train].cpu().numpy(),
                               torch.argmax(output[idx_train], dim=1).detach().cpu().numpy())
    loss_train.backward()
    optimizer.step()

    # Validation with the updated weights. No gradients are needed here, so
    # the forward pass runs under no_grad to avoid building an autograd graph.
    model.eval()
    with torch.no_grad():
        output, _ = model(features, adj)
        loss_val = F.nll_loss(output[idx_val], y[idx_val])
    acc_val = accuracy_score(y[idx_val].cpu().numpy(),
                             torch.argmax(output[idx_val], dim=1).cpu().numpy())

    print('Epoch: {:03d}'.format(epoch+1),
          'loss_train: {:.4f}'.format(loss_train.item()),
          'acc_train: {:.4f}'.format(acc_train),
          'loss_val: {:.4f}'.format(loss_val.item()),
          'acc_val: {:.4f}'.format(acc_val),
          'time: {:.4f}s'.format(time.time() - t))


def test():
    """Evaluate the model on the test split and return the test-node embeddings.

    Uses the module-level ``model``, ``features``, ``adj``, ``y`` and
    ``idx_test``. Prints the test loss and accuracy.

    Returns
    -------
    torch.Tensor
        Rows ``idx_test`` of the second value returned by the model.
    """
    model.eval()
    # Evaluation only: run the forward pass without building an autograd graph.
    with torch.no_grad():
        output, embeddings = model(features, adj)
        loss_test = F.nll_loss(output[idx_test], y[idx_test])
    acc_test = accuracy_score(y[idx_test].cpu().numpy(),
                              torch.argmax(output[idx_test], dim=1).cpu().numpy())

    print("Test set results:",
          "loss= {:.4f}".format(loss_test.item()),
          "accuracy= {:.4f}".format(acc_test))

    return embeddings[idx_test]

In [13]:
# Train model: one call to train() per epoch, timed end to end
t_total = time.time()
for epoch in range(epochs):
    train(epoch)

# Summary lines followed by one blank line
print("Optimization Finished!",
      "Total time elapsed: {:.4f}s".format(time.time() - t_total),
      "",
      sep="\n")

# Testing: keep the embeddings of the test nodes
embeddings_test = test()


Epoch: 001 loss_train: 1.9475 acc_train: 0.0764 loss_val: 1.9420 acc_val: 0.0978 time: 0.5946s
Epoch: 002 loss_train: 1.9385 acc_train: 0.0770 loss_val: 1.9350 acc_val: 0.0996 time: 0.5192s
Epoch: 003 loss_train: 1.9303 acc_train: 0.0850 loss_val: 1.9287 acc_val: 0.3007 time: 0.5021s
Epoch: 004 loss_train: 1.9226 acc_train: 0.3073 loss_val: 1.9210 acc_val: 0.2288 time: 0.5750s
Epoch: 005 loss_train: 1.9142 acc_train: 0.2248 loss_val: 1.9120 acc_val: 0.2454 time: 0.5025s
Epoch: 006 loss_train: 1.9034 acc_train: 0.2580 loss_val: 1.9020 acc_val: 0.3247 time: 0.5190s
Epoch: 007 loss_train: 1.8925 acc_train: 0.3140 loss_val: 1.8910 acc_val: 0.3063 time: 0.5035s
Epoch: 008 loss_train: 1.8791 acc_train: 0.3528 loss_val: 1.8794 acc_val: 0.2768 time: 0.5326s
Epoch: 009 loss_train: 1.8647 acc_train: 0.3140 loss_val: 1.8669 acc_val: 0.2768 time: 0.4788s
Epoch: 010 loss_train: 1.8484 acc_train: 0.3073 loss_val: 1.8542 acc_val: 0.2731 time: 0.4933s
Epoch: 011 loss_train: 1.8325 acc_train: 0.3067 lo