# Advanced PyTorch Geometric Tutorial
## Tutorial 1
#### Antonio Longa, 15 Nov 2021

# Open Graph Benchmark and PyG
Original [code](https://github.com/snap-stanford/ogb/tree/master/examples/nodeproppred/arxiv) written by Matthias Fey.

In [None]:
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

In [1]:
# import libraries 
import torch
import torch.nn.functional as F

import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, SAGEConv


In [2]:
# define GCN
class GCN(torch.nn.Module):
    """Stack of ``GCNConv`` layers that outputs per-node log-probabilities.

    Every convolution except the last one is followed by batch
    normalisation, ReLU and dropout; the last convolution feeds a
    log-softmax over the class dimension.

    Args:
        in_channels: Size of the input node features.
        hidden_channels: Width of every hidden layer.
        out_channels: Number of output classes.
        num_layers: Requested number of convolutions. An input and an
            output convolution are always created, so values below 2
            still produce two convolutions.
        dropout: Dropout probability applied after each hidden layer.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, num_layers,
                 dropout):
        super().__init__()

        # One hidden representation is always present, even when fewer than
        # two layers are requested.
        n_hidden = max(num_layers - 1, 1)
        widths = [in_channels] + [hidden_channels] * n_hidden + [out_channels]

        # Consecutive width pairs give the (input, output) size of each conv.
        self.convs = torch.nn.ModuleList([
            GCNConv(fan_in, fan_out, cached=True)
            for fan_in, fan_out in zip(widths[:-1], widths[1:])
        ])
        # One normalisation layer per hidden representation (none on the
        # output convolution).
        self.bns = torch.nn.ModuleList([
            torch.nn.BatchNorm1d(hidden_channels) for _ in range(n_hidden)
        ])

        self.dropout = dropout

    def reset_parameters(self):
        """Re-initialise every convolution, then every batch-norm layer."""
        for layer in [*self.convs, *self.bns]:
            layer.reset_parameters()

    def forward(self, x, adj_t):
        """Return log-softmax class scores for the nodes in ``x``.

        Args:
            x: Node feature matrix handed to the first convolution.
            adj_t: Adjacency structure handed unchanged to every convolution.
        """
        *hidden_convs, output_conv = self.convs
        for conv, norm in zip(hidden_convs, self.bns):
            x = F.relu(norm(conv(x, adj_t)))
            # Dropout is only active while the module is in training mode.
            x = F.dropout(x, p=self.dropout, training=self.training)
        logits = output_conv(x, adj_t)
        return logits.log_softmax(dim=-1)

In [3]:
#Define train and test
def train(model, data, train_idx, optimizer):
    """Perform one optimisation step on the training nodes.

    The model is run on the whole graph (``data.x`` and ``data.adj_t``);
    only the rows selected by ``train_idx`` contribute to the negative
    log-likelihood loss.

    Args:
        model: Module called as ``model(x, adj_t)`` that returns
            per-node log-probabilities.
        data: Object exposing ``x``, ``adj_t`` and ``y``; ``y`` is squeezed
            along dimension 1 before being used as the target.
        train_idx: Index selecting the training nodes.
        optimizer: Optimizer that updates the parameters of ``model``.

    Returns:
        The loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    log_probs = model(data.x, data.adj_t)
    targets = data.y.squeeze(1)
    loss = F.nll_loss(log_probs[train_idx], targets[train_idx])

    loss.backward()
    optimizer.step()
    return loss.item()




@torch.no_grad()
def test(model, data, split_idx, evaluator):
    """Compute the accuracy on the train, validation and test splits.

    The model is switched to evaluation mode and run once on the whole
    graph with gradient tracking disabled; the predicted class of each node
    is the arg-max of the model output.

    Args:
        model: Module called as ``model(x, adj_t)``.
        data: Object exposing ``x``, ``adj_t`` and ``y``.
        split_idx: Mapping with the keys ``'train'``, ``'valid'`` and
            ``'test'``, each giving the node indices of that split.
        evaluator: Object whose ``eval`` method takes a dictionary with
            ``'y_true'`` and ``'y_pred'`` and returns a mapping that
            contains ``'acc'``.

    Returns:
        Tuple ``(train_acc, valid_acc, test_acc)``.
    """
    model.eval()

    # keepdim=True keeps a trailing dimension on the predictions, so they are
    # indexed and passed to the evaluator the same way as data.y.
    y_pred = model(data.x, data.adj_t).argmax(dim=-1, keepdim=True)

    def split_accuracy(split):
        # Restrict labels and predictions to one split and score them.
        nodes = split_idx[split]
        scores = evaluator.eval({
            'y_true': data.y[nodes],
            'y_pred': y_pred[nodes],
        })
        return scores['acc']

    train_acc = split_accuracy('train')
    valid_acc = split_accuracy('valid')
    test_acc = split_accuracy('test')
    return train_acc, valid_acc, test_acc

In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Get the dataset from OGB

In [5]:
#From node property prediction import :
from ogb.nodeproppred import PygNodePropPredDataset, Evaluator


# Load ogbn-arxiv: the files are downloaded on the first run and read from the
# local copy afterwards. The ToSparseTensor transform is applied to the graph,
# and the resulting Data object exposes the adjacency as `adj_t`.
dataset = PygNodePropPredDataset(
    name='ogbn-arxiv',
    transform=T.ToSparseTensor(),
)

In [6]:
# Take the first graph stored in the dataset.
data = dataset[0]
# NOTE(review): to_symmetric() presumably adds the reverse of every edge so the
# graph is treated as undirected — confirm against the torch_sparse docs.
# This runs before the transfer below, i.e. on the device the data was loaded on.
data.adj_t = data.adj_t.to_symmetric()
data = data.to(device) # move the data (features, labels, adjacency) to the device

In [7]:
# `data` is a PyG Data object; evaluating it as the last expression of a
# notebook cell prints its attributes and their shapes.
data

Data(x=[169343, 128], node_year=[169343, 1], y=[169343, 1], adj_t=[169343, 169343, nnz=2315598])

### PygNodePropPredDataset gives access to the OGB datasets for node property prediction.
### Which other node property prediction datasets are available?

In [8]:
# Deliberately pass a name that is not a registered dataset: the resulting
# ValueError lists every available node property prediction dataset.
dataset2 = PygNodePropPredDataset(name='ogbn')

ValueError: Invalid dataset name ogbn.
Available datasets are as follows:
ogbn-proteins
ogbn-products
ogbn-arxiv
ogbn-mag
ogbn-papers100M

# Instantiate the GNN

In [9]:
# Model hyper-parameters.
hidden_channels = 64
num_layers = 2
dropout = 0.5

# Training-run settings: number of epochs and how often (in epochs) to print.
epochs = 50
print_steps = 1

# Node indices of the train/valid/test splits; only the training indices are
# moved to the device here because they are used to index the model output.
split_idx = dataset.get_idx_split()
train_idx = split_idx['train'].to(device)

model = GCN(
    in_channels=data.num_features,
    hidden_channels=hidden_channels,
    out_channels=dataset.num_classes,
    num_layers=num_layers,
    dropout=dropout,
)
model = model.to(device)

# Evaluator

In [10]:
# Build the OGB evaluator for ogbn-arxiv; test() calls its eval() method and
# reads the 'acc' entry of the result.
evaluator = Evaluator(name='ogbn-arxiv')

# Display the object in the notebook.
evaluator

<ogb.nodeproppred.evaluate.Evaluator at 0x7f410e6ef430>

In [11]:
# Human-readable description of the dictionary that evaluator.eval() expects.
evaluator.expected_input_format

"==== Expected input format of Evaluator for ogbn-arxiv\n{'y_true': y_true, 'y_pred': y_pred}\n- y_true: numpy ndarray or torch tensor of shape (num_node, num_task)\n- y_pred: numpy ndarray or torch tensor of shape (num_node, num_task)\nwhere y_pred stores predicted class label (integer),\nnum_task is 1, and each row corresponds to one node.\n"

### NOTE:
We have different evaluators for node property prediction, graph property prediction and link property prediction.

In [12]:
# link property prediction
# Imported under the alias `ev` so the node-level `Evaluator` name used for
# `evaluator` is not overwritten.
from ogb.linkproppred import Evaluator as ev
ev(name='ogbl-ppa').expected_input_format

"==== Expected input format of Evaluator for ogbl-ppa\n{'y_pred_pos': y_pred_pos, 'y_pred_neg': y_pred_neg}\n- y_pred_pos: numpy ndarray or torch tensor of shape (num_edge, ). Torch tensor on GPU is recommended for efficiency.\n- y_pred_neg: numpy ndarray or torch tensor of shape (num_edge, ). Torch tensor on GPU is recommended for efficiency.\ny_pred_pos is the predicted scores for positive edges.\ny_pred_neg is the predicted scores for negative edges.\nNote: As the evaluation metric is ranking-based, the predicted scores need to be different for different edges."

In [13]:
# graph property prediction
# The alias `ev` is rebound here to the graph-level Evaluator class.
from ogb.graphproppred import Evaluator as ev
ev(name='ogbg-molesol').expected_input_format

"==== Expected input format of Evaluator for ogbg-molesol\n{'y_true': y_true, 'y_pred': y_pred}\n- y_true: numpy ndarray or torch tensor of shape (num_graph, num_task)\n- y_pred: numpy ndarray or torch tensor of shape (num_graph, num_task)\nwhere num_task is 1, and each row corresponds to one graph.\nnan values in y_true are ignored during evaluation.\n"

### END NOTE.

## Train as usual 

In [14]:
# Adam over all parameters of the model, with a fixed learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(1, epochs + 1):
    loss = train(model, data, train_idx, optimizer)
    # Accuracies are recomputed after every update, even on epochs that are
    # not printed.
    result = test(model, data, split_idx, evaluator)

    # Only report every `print_steps`-th epoch.
    if epoch % print_steps:
        continue

    train_acc, valid_acc, test_acc = result
    print(
        f'Epoch: {epoch:02d}, '
        f'Loss: {loss:.4f}, '
        f'Train: {100 * train_acc:.2f}%, '
        f'Valid: {100 * valid_acc:.2f}% '
        f'Test: {100 * test_acc:.2f}%'
    )


Epoch: 01, Loss: 4.0065, Train: 1.10%, Valid: 1.27% Test: 1.28%
Epoch: 02, Loss: 3.9284, Train: 1.17%, Valid: 1.25% Test: 1.28%
Epoch: 03, Loss: 3.8547, Train: 1.33%, Valid: 1.30% Test: 1.33%
Epoch: 04, Loss: 3.7865, Train: 1.70%, Valid: 1.59% Test: 1.44%
Epoch: 05, Loss: 3.7158, Train: 2.80%, Valid: 2.93% Test: 2.33%
Epoch: 06, Loss: 3.6484, Train: 6.31%, Valid: 8.72% Test: 7.39%
Epoch: 07, Loss: 3.5848, Train: 12.43%, Valid: 18.81% Test: 17.81%
Epoch: 08, Loss: 3.5141, Train: 18.55%, Valid: 24.67% Test: 23.28%
Epoch: 09, Loss: 3.4507, Train: 23.94%, Valid: 27.29% Test: 25.27%
Epoch: 10, Loss: 3.3944, Train: 27.48%, Valid: 29.07% Test: 26.61%
Epoch: 11, Loss: 3.3339, Train: 29.01%, Valid: 29.79% Test: 27.28%
Epoch: 12, Loss: 3.2731, Train: 29.39%, Valid: 29.95% Test: 27.55%
Epoch: 13, Loss: 3.2197, Train: 29.20%, Valid: 29.75% Test: 27.58%
Epoch: 14, Loss: 3.1723, Train: 28.79%, Valid: 29.42% Test: 27.40%
Epoch: 15, Loss: 3.1161, Train: 28.38%, Valid: 29.22% Test: 27.28%
Epoch: 16, Lo