# Классификация графов с использованием графовых нейронных сетей

__Автор задач: Блохин Н.В. (NVBlokhin@fa.ru)__

Материалы:
* Макрушин С.В. "Машинное обучение на графах", Лекции 4-5 "Графовые нейронные сети"
* Документация:
    * https://pytorch-geometric.readthedocs.io/en/latest/modules/utils.html#torch_geometric.utils.from_networkx
    * https://pytorch-geometric.readthedocs.io/en/latest/tutorial/create_dataset.html
    * https://pytorch-geometric.readthedocs.io/en/latest/modules/utils.html?highlight=scatter#torch_geometric.utils.scatter

## Вопросы для совместного обсуждения

1\. Обсудите задачу предсказания характеристик графа, подходы к получению векторного представления графа и способы объединения графов в пакеты.

In [1]:
from torch.utils.data import Dataset
import torch
from torch_geometric.utils.convert import from_networkx
import networkx as nx

class GraphsDataset(Dataset):
    """Toy dataset of random geometric graphs paired with random integer labels.

    Args:
        num_graphs: how many graphs to generate (default 10).
        num_nodes: number of nodes in every generated graph (default 10).
        radius: ``radius`` argument forwarded to ``nx.random_geometric_graph``
            (default 1).
        num_classes: labels are drawn with ``torch.randint(0, num_classes)``
            (default 7).

    Calling ``GraphsDataset()`` with no arguments reproduces the original
    hard-coded configuration.
    """

    def __init__(self, num_graphs=10, num_nodes=10, radius=1, num_classes=7):
        self.graphs = [
            nx.random_geometric_graph(n=num_nodes, radius=radius)
            for _ in range(num_graphs)
        ]
        # One label per graph: the size is derived from num_graphs so the two
        # containers cannot get out of sync.
        self.ys = torch.randint(0, num_classes, size=(num_graphs,))

    def __getitem__(self, idx):
        """Return ``(data, label)``: the graph converted by ``from_networkx`` and its label."""
        return from_networkx(self.graphs[idx]), self.ys[idx]

    def __len__(self):
        """Return the number of stored graphs."""
        return len(self.graphs)

In [2]:
# Sample one random geometric graph with 10 nodes and show how many edges it has.
g = nx.random_geometric_graph(n=10, radius=1)
g.number_of_edges()

43

In [3]:
from torch_geometric.utils.convert import from_networkx

# Convert the networkx graph to a PyG Data object. In the output below the
# 43 edges become 86 edge_index columns, i.e. each edge is stored in both directions.
from_networkx(g)

Data(edge_index=[2, 86], pos=[10, 2])

In [4]:
# Build the toy dataset and inspect its first item: a (Data, label) pair.
dataset = GraphsDataset()
dataset[0]

(Data(edge_index=[2, 90], pos=[10, 2]), tensor(5))

In [5]:
import torch_geometric.nn as gnn

# Run one graph convolution over the first graph, using the 2-D node
# positions as input features and producing 7 output channels per node.
layer = gnn.GraphConv(in_channels=2, out_channels=7)
first_graph, _ = dataset[0]
out = layer(first_graph.pos, first_graph.edge_index)
out

tensor([[ 4.4806, -1.2960, -4.2970, -2.3146, -3.1377, -0.4409, -0.6727],
        [ 5.3717, -0.6066, -4.7660, -2.4637, -2.9488, -0.3697, -1.1930],
        [ 4.8880, -0.9162, -4.3527, -2.2160, -2.9421, -0.1346, -1.1084],
        [ 4.5506, -1.4182, -4.7673, -2.7818, -3.4211, -1.1830, -0.1732],
        [ 5.2902, -0.6685, -4.7201, -2.4469, -2.9640, -0.3710, -1.1491],
        [ 4.4941, -1.4822, -4.7877, -2.8250, -3.4676, -1.2738, -0.0778],
        [ 4.8297, -1.0146, -4.4531, -2.3440, -3.0447, -0.3653, -0.9110],
        [ 4.7457, -1.1706, -4.6323, -2.5647, -3.2162, -0.7574, -0.5834],
        [ 5.3028, -0.6039, -4.5918, -2.3073, -2.8686, -0.1374, -1.3245],
        [ 4.8797, -1.0191, -4.5854, -2.4637, -3.1070, -0.5442, -0.8080]],
       grad_fn=<AddBackward0>)

In [6]:
from torch_geometric.utils import scatter

In [8]:
from torch_geometric.loader import DataLoader

# PyG loader that merges 4 graphs at a time into one batched graph.
loader = DataLoader(dataset, batch_size=4)

In [9]:
# Take the first mini-batch: the batched graph and the labels of its graphs.
batch, labels = next(iter(loader))

In [10]:
# Show the batched graph object produced by the loader.
batch

DataBatch(edge_index=[2, 346], pos=[40, 2], batch=[40], ptr=[5])

In [11]:
# Show the labels of the graphs in this batch.
labels

tensor([5, 1, 6, 6])

In [12]:
# Per-node assignment vector: entry i is the index of the graph node i belongs to.
batch.batch

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3])

In [13]:
# Apply the same convolution layer to all nodes of the batched graph at once.
out = layer(batch.pos, batch.edge_index)

In [14]:
# Readout: aggregate node embeddings per graph using the assignment vector.
# The result has one row per graph; the grad_fn in the output
# (ScatterAddBackward0) shows that the aggregation used here is a sum.
scatter(out, index=batch.batch, dim=0)

tensor([[ 48.8332, -10.1960, -45.9534, -24.7277, -31.1179,  -5.5775,  -8.0010],
        [ 43.2900,  -0.7076, -42.8172, -10.6113, -19.5073,  10.6804, -12.2846],
        [ 45.4441, -11.4678, -41.6517, -24.3746, -30.7382,  -7.6443,  -6.4531],
        [ 42.0447,  -4.5642, -39.6649, -14.8974, -22.6703,   3.1852,  -9.4935]],
       grad_fn=<ScatterAddBackward0>)

In [15]:
# Labels again, to put next to the per-graph readout rows above.
labels

tensor([5, 1, 6, 6])

## Задачи для самостоятельного решения

In [1]:
import os
import networkx as nx
from torch.utils.data import Dataset
import torch as th
from torch_geometric.utils import from_networkx
from torch_geometric.nn import global_mean_pool, SAGEConv, global_add_pool, global_max_pool
from torch_geometric.loader import DataLoader
import torch.nn.functional as F
from torchmetrics import MeanMetric, Accuracy, ConfusionMatrix
from torch.utils.data import random_split
from torch_scatter import scatter_min
import pandas as pd

<p class="task" id="1"></p>

1\. Реализуйте все описанные методы класса `GraphsDataset`. Используя данный класс, создайте датасет на основе файлов архива `graphs.zip` (можно разархивировать вручную или программно). Выведите на экран количество объектов в датасете. Выведите на экран значения признаков узлов для графа с индексом 0.

- [x] Проверено на семинаре

In [46]:
class GraphsDataset(Dataset):
    """Graph-classification dataset read from GML files grouped in per-class folders.

    ``root_dir`` is expected to contain one sub-folder per graph family. The
    folder name selects the integer class label (see ``f2l`` in ``__init__``);
    a folder name that is not in that mapping raises ``KeyError``.

    Attributes:
        graphs: list of networkx graphs, in folder order and then in the order
            of the number embedded in each file name.
        ys: ``th.long`` tensor with one class label per graph.
        node_attrs: sorted list of every node-attribute name found in any
            graph; this is the column order of ``data.x``.
    """

    def __init__(self, root_dir):
        super().__init__()
        f2l = {
            'barabasi_albert_graph': 0,
            'random_partition_graph': 1,
            'watts_strogatz_graph': 2
        }
        self.graphs = []
        labels = []
        for folder in sorted(os.listdir(root_dir)):
            folder_path = os.path.join(root_dir, folder)
            for fname in sorted(os.listdir(folder_path), key=self._file_index):
                self.graphs.append(nx.read_gml(os.path.join(folder_path, fname)))
                labels.append(f2l[folder])
        self.node_attrs = self._collect_node_attrs(self.graphs)
        # Fill gaps once here so that __getitem__ has no side effects on the
        # stored graphs and every node exposes the same attribute set.
        self._fill_missing_attrs(self.graphs, self.node_attrs)
        self.ys = th.tensor(labels, dtype=th.long)

    @staticmethod
    def _file_index(fname):
        """Return the integer formed by all digit characters of ``fname``."""
        return int(''.join(filter(str.isdigit, fname)))

    @staticmethod
    def _collect_node_attrs(graphs):
        """Return the sorted union of node-attribute names over all graphs.

        Sorting matters: iterating a plain ``set`` of strings yields an order
        that can differ between interpreter runs, which would silently permute
        the feature columns.
        """
        names = set()
        for graph in graphs:
            for _, attrs in graph.nodes(data=True):
                names.update(attrs)
        return sorted(names)

    @staticmethod
    def _fill_missing_attrs(graphs, node_attrs, fill_value=0.0):
        """Set every attribute in ``node_attrs`` that a node lacks to ``fill_value`` (in place)."""
        for graph in graphs:
            for _, attrs in graph.nodes(data=True):
                for key in node_attrs:
                    attrs.setdefault(key, fill_value)

    def __len__(self):
        """Return the number of graphs in the dataset."""
        return len(self.graphs)

    def __getitem__(self, idx):
        """Return ``(data, label)`` for graph ``idx``.

        The node attributes listed in ``node_attrs`` are grouped by
        ``from_networkx`` into the feature matrix ``data.x``.
        """
        data = from_networkx(self.graphs[idx], group_node_attrs=self.node_attrs)
        return data, self.ys[idx]

In [47]:
# Load every graph found under data/graphs and report how many there are.
dataset = GraphsDataset(root_dir='data/graphs')
len(dataset)

600

In [48]:
# Unpack the first sample and show its node-feature matrix together with its shape.
data0, y0 = dataset[0]
data0.x.shape, data0.x

(torch.Size([13, 3]),
 tensor([[0.6667, 0.0000, 0.3929],
         [0.2500, 0.0000, 0.6667],
         [0.3333, 0.0000, 0.1667],
         [0.5000, 0.0000, 0.4667],
         [0.5833, 0.0000, 0.4762],
         [0.6667, 0.0000, 0.3929],
         [0.4167, 0.0000, 0.4000],
         [0.5000, 0.0000, 0.3333],
         [0.5000, 0.0000, 0.6000],
         [0.5000, 0.0000, 0.4000],
         [0.4167, 0.0000, 0.4000],
         [0.3333, 0.0000, 0.3333],
         [0.3333, 0.0000, 0.5000]]))

<p class="task" id="2"></p>

2\. Используя датасет из предыдущего задания, создайте объект `torch_geometric.loader.DataLoader`. Получите один батч размера 128 при помощи этого объекта и выведите на экран (используйте соответствующие атрибуты и методы):

* количество узлов в графе-батче;
* количество связей в графе-батче;
* количество графов в батче;
* количество узлов в каждом графе батча;

Выполните readout для графа на основе атрибута `X`. Выведите размерность полученного тензора на экран.

- [ ] Проверено на семинаре

In [34]:
# Take the first 128 graphs (no shuffling) as a single batched graph.
loader = DataLoader(dataset, batch_size=128, shuffle=False)
batch, batch_labels = next(iter(loader))

# Counting how often each graph index occurs in the assignment vector
# gives the number of nodes of every graph in the batch.
nodes_per_graph = th.bincount(batch.batch)
(batch.num_nodes, batch.num_edges, batch.num_graphs, nodes_per_graph)

(4518,
 21016,
 128,
 tensor([29, 15, 35, 27, 42, 22, 52, 37, 33, 36, 26, 23, 48, 21, 46, 24, 35, 32,
         13, 55, 13, 41, 16, 47, 51, 24, 28, 51, 48, 21, 27, 60, 15, 26, 37, 22,
         47, 35, 30, 40, 30, 33, 50, 48, 45, 19, 37, 38, 34, 56, 25, 18, 46, 56,
         18, 55, 57, 40, 31, 23, 45, 47, 54, 40, 37, 34, 46, 29, 13, 35, 39, 29,
         37, 45, 15, 29, 43, 15, 52, 43, 30, 45, 24, 18, 52, 22, 41, 36, 29, 11,
         35, 50, 25, 45, 21, 36, 39, 35, 44, 29, 24, 45, 37, 33, 50, 38, 30, 22,
         42, 39, 45, 27, 29, 42, 11, 54, 38, 41, 41, 40, 36, 45, 44, 32, 17, 60,
         33, 40]))

In [35]:
# Mean readout over the raw node features: one row per graph in the batch.
global_mean_pool(batch.x, batch.batch).shape

torch.Size([128, 3])

<p class="task" id="3"></p>

3\. Решите задачу классификации графа, используя слои `SAGEConv` и операцию усреднения для процедуры readout. Для обучения используйте стохастический градиентный спуск с размером батча 128. Во время обучения выводите значение функции потерь по эпохам (используйте `torchmetrics`). Вычислите матрицу несоответствий прогнозов и точность обученной модели (используйте `torchmetrics`).  

- [ ] Проверено на семинаре

In [None]:
# 80/20 train/test split. The generator is seeded so that the split, and
# therefore the metrics reported below, is the same on every run.
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
split_generator = th.Generator().manual_seed(42)
train_dataset, test_dataset = random_split(
    dataset, [train_size, test_size], generator=split_generator
)

# Only the training batches are shuffled; evaluation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

# Use the GPU when one is available.
device = th.device('cuda' if th.cuda.is_available() else 'cpu')

class GraphSAGE(th.nn.Module):
    """Graph classifier: two SAGEConv layers, a readout, and a linear head.

    Args:
        in_channels: number of input features per node.
        hidden_channels: width of both convolution layers.
        num_classes: number of output logits per graph.
        readout: callable ``readout(x, batch)`` that reduces node embeddings
            to one vector per graph (mean pooling by default).
    """

    def __init__(self, in_channels, hidden_channels, num_classes, readout=global_mean_pool):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        self.lin = th.nn.Linear(hidden_channels, num_classes)
        self.readout = readout

    def forward(self, x, edge_index, batch):
        """Return per-graph class logits for a batched graph."""
        hidden = x
        # Both convolutions are followed by the same ReLU non-linearity.
        for conv in (self.conv1, self.conv2):
            hidden = F.relu(conv(hidden, edge_index))
        graph_embedding = self.readout(hidden, batch)
        return self.lin(graph_embedding)

# Model dimensions are derived from the data: the feature width of the first
# graph and the number of distinct labels.
in_channels = dataset[0][0].x.shape[1]
hidden_channels = 64
num_classes = dataset.ys.unique().numel()
model = GraphSAGE(in_channels, hidden_channels, num_classes)
model = model.to(device)

# Stochastic gradient descent with momentum.
optimizer = th.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Metrics are moved to the same device as the model outputs.
loss_metric = MeanMetric().to(device)
acc_metric = Accuracy(task='multiclass', num_classes=num_classes, top_k=1).to(device)
confmat = ConfusionMatrix(task='multiclass', num_classes=num_classes).to(device)

In [94]:
# Train for a fixed number of epochs, printing the mean training loss per epoch.
num_epochs = 50
for epoch in range(1, num_epochs + 1):
    model.train()
    loss_metric.reset()
    for data, label in train_loader:
        data, label = data.to(device), label.to(device)
        optimizer.zero_grad()
        out = model(data.x, data.edge_index, data.batch)
        loss = F.cross_entropy(out, label)
        loss.backward()
        optimizer.step()
        # `loss` is already a per-batch mean; weighting by the batch size
        # makes the epoch value a true per-sample mean even when the last
        # batch is smaller than the others.
        loss_metric.update(loss.detach(), weight=label.size(0))
    print(f'Эпоха {epoch:02d}, loss: {loss_metric.compute():.4f}')

# Evaluate on the held-out split: accumulate accuracy and the confusion matrix.
model.eval()
acc_metric.reset()
confmat.reset()
with th.no_grad():
    for data, label in test_loader:
        data, label = data.to(device), label.to(device)
        out = model(data.x, data.edge_index, data.batch)
        preds = out.argmax(dim=1)
        acc_metric.update(preds, label)
        confmat.update(preds, label)

Эпоха 01, loss: 1.0903
Эпоха 02, loss: 1.0457
Эпоха 03, loss: 0.9760
Эпоха 04, loss: 0.8987
Эпоха 05, loss: 0.8253
Эпоха 06, loss: 0.7570
Эпоха 07, loss: 0.6873
Эпоха 08, loss: 0.6302
Эпоха 09, loss: 0.5808
Эпоха 10, loss: 0.5413
Эпоха 11, loss: 0.5116
Эпоха 12, loss: 0.4875
Эпоха 13, loss: 0.4603
Эпоха 14, loss: 0.4442
Эпоха 15, loss: 0.4262
Эпоха 16, loss: 0.4012
Эпоха 17, loss: 0.3900
Эпоха 18, loss: 0.3762
Эпоха 19, loss: 0.3655
Эпоха 20, loss: 0.3515
Эпоха 21, loss: 0.3395
Эпоха 22, loss: 0.3315
Эпоха 23, loss: 0.3196
Эпоха 24, loss: 0.3138
Эпоха 25, loss: 0.3080
Эпоха 26, loss: 0.3029
Эпоха 27, loss: 0.2920
Эпоха 28, loss: 0.2901
Эпоха 29, loss: 0.2873
Эпоха 30, loss: 0.2820
Эпоха 31, loss: 0.2769
Эпоха 32, loss: 0.2810
Эпоха 33, loss: 0.2789
Эпоха 34, loss: 0.2768
Эпоха 35, loss: 0.2711
Эпоха 36, loss: 0.2706
Эпоха 37, loss: 0.2679
Эпоха 38, loss: 0.2695
Эпоха 39, loss: 0.2672
Эпоха 40, loss: 0.2682
Эпоха 41, loss: 0.2617
Эпоха 42, loss: 0.2600
Эпоха 43, loss: 0.2615
Эпоха 44, l

In [95]:
# Finalize the metrics accumulated over the test loader and print them.
test_accuracy = acc_metric.compute()
confusion = confmat.compute()
print(f'Точность на тесте: {test_accuracy:.4f}')
print(f'Матрица ошибок: \n{confusion}')

Точность на тесте: 0.8667
Матрица ошибок: 
tensor([[37,  0, 11],
        [ 0, 38,  0],
        [ 5,  0, 29]], device='cuda:0')


<p class="task" id="4"></p>

4\. Повторите решение задачи 3, сравнив разные функции агрегации для проведения операции readout. Выведите результаты в виде таблицы:

| Readout op    | Loss | Acc |
|-----------|------------------|-----------------|
| sum |                  |                 |
| mean  |                  |                 |
| max   |                  |                 |
| min   |                  |                 |

- [ ] Проверено на семинаре

In [96]:
def global_min_pool(x, batch, dim=0):
    """Min readout: reduce ``x`` with ``scatter_min`` over entries sharing a ``batch`` index.

    ``scatter_min`` returns a pair; only its first element (the reduced
    values) is returned, the second one is discarded.
    """
    return scatter_min(x, batch, dim=dim)[0]

# Readout functions to compare, keyed by the name shown in the results table.
readout_ops = {
    'sum': global_add_pool,
    'mean': global_mean_pool,
    'max': global_max_pool,
    'min': global_min_pool
}

results = []
num_epochs = 50

for name, readout in readout_ops.items():
    # Every readout gets a freshly initialized model, optimizer and metrics,
    # but all of them share the same train/test loaders.
    model = GraphSAGE(in_channels, hidden_channels, num_classes, readout).to(device)
    optimizer = th.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    loss_metric = MeanMetric().to(device)
    acc_metric = Accuracy(task='multiclass', num_classes=num_classes, top_k=1).to(device)

    model.train()
    for epoch in range(1, num_epochs + 1):
        for data, label in train_loader:
            data, label = data.to(device), label.to(device)
            optimizer.zero_grad()
            out = model(data.x, data.edge_index, data.batch)
            loss = F.cross_entropy(out, label)
            loss.backward()
            optimizer.step()

    model.eval()
    with th.no_grad():
        for data, label in test_loader:
            data, label = data.to(device), label.to(device)
            out = model(data.x, data.edge_index, data.batch)
            loss = F.cross_entropy(out, label)
            # Weight each batch mean by its size so the reported test loss is
            # a per-sample mean regardless of how the test set is batched.
            loss_metric.update(loss, weight=label.size(0))
            acc_metric.update(out.argmax(dim=1), label)

    test_loss = loss_metric.compute().item()
    test_acc = acc_metric.compute().item()
    results.append((name, test_loss, test_acc))

In [97]:
# One row per readout operation, best test accuracy first.
df = pd.DataFrame(results, columns=['Readout op', 'Loss', 'Acc'])
df = df.sort_values(by='Acc', ascending=False)
df

Unnamed: 0,Readout op,Loss,Acc
0,sum,0.196431,0.916667
2,max,0.221148,0.9
3,min,0.229243,0.9
1,mean,0.278025,0.858333
