# Karate Club node embeddings
* Code from https://towardsdatascience.com/a-beginners-guide-to-graph-neural-networks-using-pytorch-geometric-part-1-d98dc93e7742
* The code uses PyTorch Geometric (PyG)

In [116]:
import networkx as nx
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler

# load graph from networkx library
G = nx.karate_club_graph()
print("nb edges: ", G.number_of_edges())

# Binary node labels: 0 for nodes whose 'club' attribute is 'Mr. Hi',
# 1 for every other node.
labels = np.asarray([G.nodes[i]['club'] != 'Mr. Hi' for i in G.nodes]).astype(np.int64)

# Build the adjacency matrix in COO form to obtain the edge list.
# `nx.to_scipy_sparse_matrix` was removed in NetworkX 3.0; prefer its
# replacement `to_scipy_sparse_array` and only fall back to the old name on
# releases that predate it.
_to_sparse = getattr(nx, "to_scipy_sparse_array", None)
if _to_sparse is None:
    _to_sparse = nx.to_scipy_sparse_matrix
adj = _to_sparse(G).tocoo()

# There are E edges and 2*E entries in the adjacency matrix (each edge is
# stored in both directions), so `row` and `col` have 2*E entries each.
# They are only used to assemble `edge_index`, of shape (2, 2*E).
row = torch.from_numpy(adj.row.astype(np.int64))
col = torch.from_numpy(adj.col.astype(np.int64))
edge_index = torch.stack([row, col], dim=0)

# Use the node degree as the initial embedding: one scalar feature per node.
embeddings = np.array([degree for _, degree in G.degree()])

# Normalize the degree values.
# reshape(-1, 1) only turns the 1-D vector into the 2-D
# (n_samples, n_features) layout that StandardScaler requires; it is
# fit_transform that actually standardizes the values.
scale = StandardScaler()
embeddings = scale.fit_transform(embeddings.reshape(-1, 1))

nb edges:  78


## The Custom Dataset

In [118]:
import torch
import pandas as pd
from torch_geometric.data import InMemoryDataset, Data
from sklearn.model_selection import train_test_split
import torch_geometric.transforms as T

# custom dataset
class KarateDataset(InMemoryDataset):
    """Single-graph PyG dataset built from the karate club graph.

    The graph structure and the labels are read from the notebook-level
    globals ``G``, ``edge_index`` and ``labels``; only the node features are
    passed in.

    Args:
        transform: Optional transform forwarded to ``InMemoryDataset``.
        embeddings: NumPy array of initial node features (the notebook passes
            the standardized node degrees). Required, despite the ``None``
            default kept for signature compatibility.

    Raises:
        TypeError: If ``embeddings`` is ``None``.
    """

    def __init__(self, transform=None, embeddings=None):
        # Fail fast with a readable message instead of the opaque error that
        # torch.from_numpy(None) would raise further down.
        if embeddings is None:
            raise TypeError(
                'KarateDataset requires `embeddings` '
                '(a NumPy array of node features); got None')

        super().__init__('.', transform, None, None)

        n_nodes = G.number_of_nodes()

        data = Data(edge_index=edge_index)    # Torch Geometric class instance
        data.num_nodes = n_nodes

        # Initial node features.
        data.x = torch.from_numpy(embeddings).type(torch.float32)

        # Labels. torch.from_numpy shares memory with the NumPy array, so
        # clone to give the dataset its own copy of the global `labels`.
        data.y = torch.from_numpy(labels).type(torch.long).clone()

        # NOTE(review): kept hard-coded. Raising this value reportedly does
        # not crash -- presumably the model just gets extra output logits
        # that no label ever targets; confirm.
        data.num_classes = 2

        # Split the nodes into train and test sets (shuffled by default).
        # Only the node split is needed; the label halves are discarded.
        nodes_train, nodes_test, _, _ = train_test_split(
            pd.Series(G.nodes()),
            pd.Series(labels),
            test_size=0.30,   # fraction of data used for tests
            random_state=42)  # for reproducible output

        print(labels)
        print(G.nodes())

        # Boolean masks over the nodes. The Series index holds the row
        # positions of the selected nodes in G.nodes() order.
        train_mask = torch.zeros(n_nodes, dtype=torch.bool)
        test_mask = torch.zeros(n_nodes, dtype=torch.bool)
        train_mask[torch.as_tensor(nodes_train.index.to_numpy())] = True
        test_mask[torch.as_tensor(nodes_test.index.to_numpy())] = True
        data['train_mask'] = train_mask
        data['test_mask'] = test_mask

        # collate returns a (data, slices) pair for the one-graph list.
        self.data, self.slices = self.collate([data])

    def _download(self):
        """No-op: everything is built in memory, nothing to download."""
        return

    def _process(self):
        """No-op: the graph is assembled in ``__init__``."""
        return

    def __repr__(self):
        return '{}()'.format(self.__class__.__name__)
    
# Build the dataset from the standardized degree features and take the one
# graph it holds.
dataset = KarateDataset(embeddings=embeddings)
data = dataset[0]


[0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]


In [120]:
# Class count as reported by the dataset object (displayed below).
dataset.num_classes

2

# Graph Convolutional Network

In [81]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

# Two-layer GCN: input features -> 16 hidden units -> one output per class
class Net(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    Both the layer sizes and the graph itself come from the notebook-level
    global ``data``, so ``data`` must exist before the model is built and
    ``forward`` takes no arguments.
    """

    def __init__(self):
        super().__init__()
        hidden_dim = 16
        n_out = int(data.num_classes)
        self.conv1 = GCNConv(data.num_features, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, n_out)

    def forward(self):
        """Return per-node log-probabilities over the classes."""
        hidden = self.conv1(data.x, data.edge_index)
        hidden = F.relu(hidden)
        # Dropout is active only while the module is in training mode.
        hidden = F.dropout(hidden, training=self.training)
        scores = self.conv2(hidden, data.edge_index)
        return F.log_softmax(scores, dim=1)

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Net reads the global `data` in both __init__ and forward, so the graph and
# the model are placed on the same device.
data = data.to(device)

model = Net()
model = model.to(device)

In [82]:
# Only the last expression of a notebook cell is displayed, so the mask counts
# on the first line are computed and discarded; the output below is `data.x`.
data.train_mask.sum(), data.test_mask.sum()
data.x

tensor([[ 2.9871],
        [ 1.1548],
        [ 1.4166],
        [ 0.3695],
        [-0.4157],
        [-0.1540],
        [-0.1540],
        [-0.1540],
        [ 0.1078],
        [-0.6775],
        [-0.4157],
        [-0.9392],
        [-0.6775],
        [ 0.1078],
        [-0.6775],
        [-0.6775],
        [-0.6775],
        [-0.6775],
        [-0.6775],
        [-0.4157],
        [-0.6775],
        [-0.6775],
        [-0.6775],
        [ 0.1078],
        [-0.4157],
        [-0.4157],
        [-0.6775],
        [-0.1540],
        [-0.4157],
        [-0.1540],
        [-0.1540],
        [ 0.3695],
        [ 1.9401],
        [ 3.2488]])

# Train the GCN model

In [85]:
# Seed PyTorch's RNG.
# NOTE(review): `model` was already constructed in an earlier cell, so this
# seed presumably does not cover its weight initialisation -- confirm.
torch.manual_seed(42)

# Hyper-parameters.
lr = 1e-1
epochs = 200

# The optimiser class is looked up by name on torch.optim.
optimizer_name = "Adam"
optimizer_cls = getattr(torch.optim, optimizer_name)
optimizer = optimizer_cls(model.parameters(), lr=lr)

def train():
    """Run one full-graph optimisation step using only the training nodes."""
    model.train()
    optimizer.zero_grad()

    mask = data.train_mask
    log_probs = model()
    loss = F.nll_loss(log_probs[mask], data.y[mask])
    loss.backward()

    optimizer.step()

@torch.no_grad()
def test():
    """Evaluate the model and return ``(train_accuracy, test_accuracy)``."""
    model.eval()
    logits = model()

    def accuracy(mask):
        # Predicted class = index of the largest log-probability per node.
        predicted = logits[mask].max(dim=1).indices
        n_correct = predicted.eq(data.y[mask]).sum().item()
        return n_correct / mask.sum().item()

    return accuracy(data['train_mask']), accuracy(data['test_mask'])

# Run exactly `epochs` optimisation steps. The earlier `range(1, epochs)`
# stopped one short (epochs - 1 iterations); `epoch` stays 1-based.
for epoch in range(1, epochs + 1):
    train()

# Evaluate once, after training has finished.
train_acc, test_acc = test()

print('#' * 70)
print('Train Accuracy: %s' % train_acc)
print('Test Accuracy: %s' % test_acc)
print('#' * 70)

######################################################################
Train Accuracy: 0.9130434782608695
Test Accuracy: 0.7272727272727273
######################################################################
