In [1]:
import os.path as osp

from collections import namedtuple
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

import pyg_lib
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.loader import ClusterData, ClusterLoader
from torch_geometric.datasets import Planetoid
from torch_geometric.nn.models import GraphSAGE

The Cora dataset consists of __2708 scientific publications__, each classified into one of __seven classes (y vector)__. The citation network consists of 5429 links. Each publication in the dataset is described by a 0/1-valued word vector indicating the absence/presence of the corresponding word from the dictionary. The dictionary consists of __1433 unique words (x matrix)__.
![cora](static/Cora.png) 

In [2]:
# Base directory for datasets; the Cora files go in a 'Cora' sub-directory of it.
dataset_directory = '/notebooks/data'
cora_root = osp.join(dataset_directory, 'Cora')
# Load the Planetoid 'Cora' dataset rooted at that path.
dataset = Planetoid(cora_root, 'Cora')
# Take the dataset's first entry; the cells below use it as the graph
# (they read its x, y, edge_index, train_mask and test_mask attributes).
data = dataset[0]

In [None]:
# Size the network from the dataset instead of hard-coding the feature count.
# `out_channels` is set to the number of classes: without it the model emits
# `hidden_channels` (128) values per node, so the cross-entropy loss and the
# argmax prediction would range over 128 "classes" although only 7 exist.
model = GraphSAGE(in_channels=dataset.num_features,
                  hidden_channels=128,
                  num_layers=3,
                  out_channels=dataset.num_classes)
print(model)
# Multi-class classification loss applied to the per-node outputs.
criterion = torch.nn.CrossEntropyLoss()
# Adam with L2 regularisation (weight decay) over all model parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)

In [None]:
def visualize(h, color, random_state=0):
    """Project node embeddings to 2-D with t-SNE and show them as a scatter plot.

    Args:
        h: Tensor of per-node values to embed, one row per node. It is
            detached, moved to the CPU and converted to NumPy before the
            projection.
        color: Per-node values used to colour the points (e.g. class labels).
            A tensor is detached, moved to the CPU and converted to NumPy
            first, so labels that require grad or live on another device can
            be passed directly.
        random_state: Seed forwarded to t-SNE so that repeated calls on the
            same input produce the same layout, which keeps "before" and
            "after" plots comparable across runs. Pass ``None`` for an
            unseeded projection.
    """
    if isinstance(color, torch.Tensor):
        color = color.detach().cpu().numpy()

    z = TSNE(n_components=2, random_state=random_state).fit_transform(
        h.detach().cpu().numpy())

    plt.figure(figsize=(10, 10))
    # The t-SNE axes carry no interpretable units, so hide the ticks.
    plt.xticks([])
    plt.yticks([])

    plt.scatter(z[:, 0], z[:, 1], s=70, c=color, cmap="Set2")
    plt.show()
    
# Plot the outputs of the freshly created model as a "before training" baseline.
model.eval()
# Inference only: skip autograd bookkeeping for this forward pass.
with torch.no_grad():
    out = model(data.x, data.edge_index)
visualize(out, color=data.y)

In [None]:
def train():
    """Run one full-graph optimisation step on the notebook-level `model`.

    Reads the notebook globals `model`, `optimizer`, `criterion` and `data`.

    Returns:
        The loss tensor computed on the training nodes for this step.
    """
    model.train()
    optimizer.zero_grad()  # Start from clean gradients.

    # Forward pass over the whole graph.
    logits = model(data.x, data.edge_index)

    # Only the nodes selected by the training mask contribute to the loss.
    mask = data.train_mask
    loss = criterion(logits[mask], data.y[mask])

    loss.backward()  # Back-propagate.
    optimizer.step()  # Apply the parameter update.
    return loss

def test():
    """Return the accuracy of the notebook-level `model` on the test nodes.

    Reads the notebook globals `model` and `data`. The model is put into
    evaluation mode and the forward pass runs over the whole graph.

    Returns:
        float: fraction of nodes selected by `data.test_mask` whose predicted
        class equals the label in `data.y`.
    """
    model.eval()
    # Evaluation needs no gradients; disabling autograd avoids building a
    # graph for the full forward pass.
    with torch.no_grad():
        out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)  # Index of the highest-scoring output per node.
    test_correct = pred[data.test_mask] == data.y[data.test_mask]  # Check against ground-truth labels.
    test_acc = int(test_correct.sum()) / int(data.test_mask.sum())  # Derive ratio of correct predictions.
    return test_acc


# Full-graph training: 100 epochs (numbered 1..100), one optimisation step
# per epoch, logging the training loss returned by train() each time.
for epoch in range(1, 101):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

In [None]:
# Accuracy of the model trained on the full graph, measured on the test nodes.
test_acc = test()
print(f'Test Accuracy: {test_acc:.4f}')

In [None]:
# Plot the model outputs again, now after training on the full graph.
# Set evaluation mode here rather than relying on an earlier cell having
# done so, and skip autograd bookkeeping for this inference-only pass.
model.eval()
with torch.no_grad():
    out = model(data.x, data.edge_index)
visualize(out, color=data.y)

In [1]:
# Number of parts the graph is split into.
total_num_clusters = 150

# Partition the first graph of the dataset into `total_num_clusters` parts;
# iterating over `cluster_data` below yields one part at a time.
cluster_data = ClusterData(
    dataset[0], num_parts=total_num_clusters)

In [2]:
# Record, for every cluster of the partition, how many nodes and edges it holds.
num_nodes_per_cluster, num_edges_per_cluster = [], []

for cluster in cluster_data:
    # Length of the label vector is used as the node count.
    node_count = cluster.y.size(0)
    # Second dimension of `edge_index` is used as the edge count.
    edge_count = cluster.edge_index.size(1)
    num_nodes_per_cluster.append(node_count)
    num_edges_per_cluster.append(edge_count)

In [None]:
# Distribution of cluster sizes measured in nodes (20 bins).
fig, ax = plt.subplots()
ax.hist(np.asarray(num_nodes_per_cluster), bins=20)
ax.set_xlabel("Number of nodes per cluster")
ax.set_ylabel("Counts")
ax.set_title("Histogram of nodes in each cluster")
plt.show()

In [None]:
# Distribution of cluster sizes measured in edges (20 bins).
fig, ax = plt.subplots()
ax.hist(np.asarray(num_edges_per_cluster), bins=20)
ax.set_xlabel("Number of edges per cluster")
ax.set_ylabel("Counts")
ax.set_title("Histogram of edges in each cluster")
plt.show()

In [None]:
# Number of clusters merged into a single mini-batch.
clusters_per_batch = 10

# Each batch combines `clusters_per_batch` clusters into one sub-graph.
# `shuffle=True` regroups the clusters every epoch; without it the same
# clusters are always batched together, so edges between clusters that land
# in different batches would never be seen during training.
dynamic_size_dataloader = ClusterLoader(
    cluster_data,
    batch_size=clusters_per_batch,
    shuffle=True,
)

# Bare expression: shows the loader's repr as the cell output.
dynamic_size_dataloader

In [None]:
# Fresh model for the cluster-based training run, sized from the dataset
# instead of a hard-coded feature count. `out_channels` is set to the number
# of classes: without it the model emits `hidden_channels` (128) values per
# node, so the loss and the argmax prediction would range over 128 "classes"
# although only 7 exist.
model = GraphSAGE(in_channels=dataset.num_features,
                  hidden_channels=128,
                  num_layers=3,
                  out_channels=dataset.num_classes)
print(model)
# Multi-class classification loss applied to the per-node outputs.
criterion = torch.nn.CrossEntropyLoss()
# New optimizer bound to the parameters of the new model.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)

In [None]:
def epoch_train(model):
    """Train `model` for one epoch over the cluster mini-batches.

    Iterates over the notebook-level `dynamic_size_dataloader` and performs
    one optimisation step per batch using the notebook-level `optimizer` and
    `criterion`. Batches without any training node are skipped: the loss over
    an empty selection is NaN and the step would carry no training signal.

    Args:
        model: Module called as ``model(batch.x, batch.edge_index)``. The
            notebook-level `optimizer` is expected to hold its parameters.

    Returns:
        0-dim tensor with the mean training loss of the epoch, where every
        batch loss is weighted by its number of training nodes (this assumes
        `criterion` averages over nodes). NaN if no batch contained a
        training node.
    """
    model.train()
    weighted_loss_sum = 0.0
    num_train_nodes = 0
    for batch in dynamic_size_dataloader:
        mask = batch.train_mask
        batch_train_nodes = int(mask.sum())
        if batch_train_nodes == 0:
            continue  # Nothing to learn from in this batch.

        optimizer.zero_grad()  # Clear gradients.
        out = model(batch.x, batch.edge_index)  # Forward pass on the sub-graph.
        loss = criterion(out[mask], batch.y[mask])  # Loss on the training nodes only.
        loss.backward()  # Derive gradients.
        optimizer.step()  # Update parameters based on gradients.

        weighted_loss_sum += loss.item() * batch_train_nodes
        num_train_nodes += batch_train_nodes

    if num_train_nodes == 0:
        return torch.tensor(float('nan'))
    return torch.tensor(weighted_loss_sum / num_train_nodes)


# Cluster-based training: 34 epochs (numbered 1..34), each a full pass over
# the cluster mini-batches, logging the loss returned by epoch_train().
for epoch in range(1, 35):
    loss = epoch_train(model)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

In [None]:
# Accuracy of the cluster-trained model; test() evaluates the current global
# `model` on the full graph's test nodes.
test_acc = test()
print(f'Test Accuracy: {test_acc:.4f}')

In [None]:
# Plot the outputs of the cluster-trained model on the full graph.
# Set evaluation mode here rather than relying on an earlier cell having
# done so, and skip autograd bookkeeping for this inference-only pass.
model.eval()
with torch.no_grad():
    out = model(data.x, data.edge_index)
visualize(out, color=data.y)