# Including Node Features with Vanilla Neural Networks

In this chapter, we’ll cover the following topics:
- Introducing graph datasets **Cora** $\cdot$ **Facebook Page-Page**
- Classifying nodes with vanilla neural networks
- Classifying nodes with vanilla graph neural networks

Visualization tools:
- yEd Live
- Gephi

## Datasets
**Cora** for node classification
It represents a network of 2,708 publications, where each connection is a reference. Each publication is described as a binary vector of 1,433 unique words, where 0 and 1 indicate the absence or presence of the corresponding word, respectively.  
This representation is also called a binary bag of words in natural language processing. Our goal is to classify each node into one of seven categories.

<img src="images/cora.png" width="500">

In [21]:
from torch_geometric.datasets import Planetoid
# Load the Cora dataset through the Planetoid loader, using the current directory as its root.
dataset = Planetoid(root=".", name="Cora")

In [22]:
# Take the first graph of the dataset and print its summary statistics.
data = dataset[0]
print(f'Dataset: {dataset}')
print('---------------')
print(f'Number of graphs: {len(dataset)}')
# One row of data.x per node, so the first dimension is the node count.
print(f'Number of nodes: {data.x.shape[0]}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

Dataset: Cora()
---------------
Number of graphs: 1
Number of nodes: 2708
Number of features: 1433
Number of classes: 7


In [23]:
# Report structural properties of the graph: directedness, isolated nodes, self-loops.
print(f'Graph:')
print('------')
print(f'Edges are directed: {data.is_directed()}')
print(f'Graph has isolated nodes: {data.has_isolated_nodes()}')
print(f'Graph has loops: {data.has_self_loops()}')

Graph:
------
Edges are directed: False
Graph has isolated nodes: False
Graph has loops: False


In [24]:
# Tabular view of the Cora dataset: one row per node, one column per feature,
# plus the class label. The graph topology (edges) is not included.
import pandas as pd 
df_x = pd.DataFrame(data.x.numpy())
# Append the node labels as the last column.
df_x['label'] = pd.DataFrame(data.y)
df_x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1424,1425,1426,1427,1428,1429,1430,1431,1432,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2703,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
2704,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
2705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
2706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3


**Facebook Page-Page**  
In this dataset, each of the 22,470 nodes represents an official Facebook page. Pages are connected when there are mutual likes between them. Node features (128-dim vectors) are created from textual descriptions written by the owners of these pages.  
Our goal is to classify each node into one of four categories: politicians, companies, television shows, and governmental organizations.

<img src="images/facebook-page-page.png" width="500">

### Three major differences with Cora:

- The number of nodes is much higher (2,708 versus 22,470)
- The dimensionality of the node features decreased dramatically (from 1,433 to 128)
- The goal is to classify each node into four categories instead of seven (which is easier since there are fewer options)

In [25]:
from torch_geometric.datasets import FacebookPagePage
# Load the Facebook Page-Page dataset (current directory as root) and take its first graph.
dataset = FacebookPagePage(root='.')
data = dataset[0]

  if osp.exists(f) and torch.load(f) != _repr(self.pre_transform):
  if osp.exists(f) and torch.load(f) != _repr(self.pre_filter):
  return torch.load(f, map_location)


In [26]:
# Print summary statistics for the currently loaded dataset and graph.
print(f'Dataset: {dataset}')
print('-----------------------')
print(f'Number of graphs: {len(dataset)}')
# One row of data.x per node, so the first dimension is the node count.
print(f'Number of nodes: {data.x.shape[0]}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

Dataset: FacebookPagePage()
-----------------------
Number of graphs: 1
Number of nodes: 22470
Number of features: 128
Number of classes: 4


In [27]:
# Report structural properties of the graph: directedness, isolated nodes, self-loops.
print(f'\nGraph:')
print('------')
print(f'Edges are directed: {data.is_directed()}')
print(f'Graph has isolated nodes: {data.has_isolated_nodes()}')
print(f'Graph has loops: {data.has_self_loops()}')


Graph:
------
Edges are directed: True
Graph has isolated nodes: False
Graph has loops: True


In [28]:
# Tabular view of the current graph: one row per node, one column per feature,
# plus the class label. The graph topology (edges) is not included.
import pandas as pd 
df_x = pd.DataFrame(data.x.numpy())
# Append the node labels as the last column.
df_x['label'] = pd.DataFrame(data.y)
df_x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,119,120,121,122,123,124,125,126,127,label
0,-0.262576,-0.276483,-0.262350,-0.299327,-0.299159,-0.270681,-0.307757,-0.269733,-0.25101,-0.308343,...,-0.273229,-0.223700,-0.284379,-0.224216,-0.209509,-0.255755,-0.215140,-0.375903,-0.223836,0
1,-0.262576,-0.276483,-0.262350,-0.299327,-0.299159,-0.270681,-0.307757,-0.269733,-0.25101,-0.308343,...,-0.234818,-0.223700,-0.284379,-0.197935,-0.147256,-0.255755,-0.215140,-0.364134,-0.128634,2
2,-0.262576,-0.265053,-0.262350,-0.299327,-0.299159,-0.270681,-0.307757,-0.210461,-0.25101,3.222161,...,-0.273229,-0.223700,-0.284379,-0.224216,-0.209509,-0.255755,-0.215140,-0.375903,-0.223836,1
3,-0.246378,-0.276483,-0.241991,-0.299327,-0.299159,-0.270681,-0.307051,-0.269733,-0.25101,-0.308343,...,-0.273229,-0.223700,-0.265534,-0.080353,-0.209509,-0.250560,-0.180260,-0.375903,-0.223836,2
4,-0.262576,-0.276483,-0.262350,-0.299327,-0.299159,-0.270681,-0.307757,-0.269733,-0.25101,-0.308343,...,-0.273229,-0.175312,-0.272613,-0.224216,-0.181153,-0.255755,-0.215140,-0.370639,-0.223836,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22465,-0.262576,-0.276483,-0.262350,-0.296955,-0.299159,-0.270681,-0.307757,-0.269733,-0.25101,-0.308343,...,-0.273229,-0.223700,-0.284379,-0.224216,-0.209509,-0.255755,-0.196685,-0.370115,-0.223836,3
22466,-0.262576,-0.276483,-0.262350,-0.299327,-0.299159,-0.270681,-0.307757,-0.269733,-0.25101,-0.308343,...,-0.273229,-0.221643,-0.284379,-0.224216,-0.209509,-0.255755,-0.215140,-0.375903,-0.223836,1
22467,-0.262576,-0.276483,-0.262350,-0.299327,-0.299159,-0.270681,-0.307757,-0.269733,-0.25101,-0.308343,...,-0.273229,-0.223700,-0.284379,-0.224216,-0.146793,-0.255755,-0.180389,-0.372097,-0.222613,2
22468,-0.262576,-0.276483,-0.262350,-0.299327,-0.299159,-0.270681,-0.307668,-0.269733,-0.25101,-0.308343,...,-0.273229,-0.223700,-0.284379,-0.224216,-0.209509,-0.252456,-0.215140,-0.375903,-0.218148,1


Unlike Cora, Facebook Page-Page doesn’t have training, validation, and test masks by default. We can either:
- create masks arbitrarily with the `range()` function, as done below; or
- use `torch_geometric.transforms` (imported as `T`) to compute random masks when the dataset is loaded.

In [29]:
# Split nodes by index: 0-17999 for training, 18001-19999 for validation,
# 20001-22469 for testing.
# NOTE(review): nodes 18000 and 20000 fall in no split — confirm the gaps are intended.
data.train_mask = range(18000)
data.val_mask = range(18001, 20000)
data.test_mask = range(20001, 22470)

## Vanilla MLP for node classification

In [30]:
import torch
from torch.nn import Linear
import torch.nn.functional as F
from tqdm.auto import tqdm

class MLP(torch.nn.Module):
    """Two-layer perceptron that classifies nodes from their features alone.

    The graph structure is ignored: every node is treated as an independent
    sample described by its feature vector.

    Args:
        dim_in: Number of input features per node.
        dim_h: Size of the hidden layer.
        dim_out: Number of output classes.
    """

    def __init__(self, dim_in, dim_h, dim_out):
        super().__init__()
        self.linear1 = torch.nn.Linear(dim_in, dim_h)
        self.linear2 = torch.nn.Linear(dim_h, dim_out)

    def forward(self, x):
        """Return per-class log-probabilities for each row of ``x``."""
        x = self.linear1(x)
        x = torch.relu(x)
        x = self.linear2(x)
        return F.log_softmax(x, dim=1)

    def fit(self, data, epochs):
        """Train full-batch with Adam and print metrics every 20 epochs.

        Args:
            data: Object exposing ``x``, ``y``, ``train_mask`` and
                ``val_mask``; the masks are used to index the model output
                and the labels.
            epochs: Index of the last epoch; the loop runs ``epochs + 1``
                iterations so that the final epoch is reported as well.
        """
        # forward() already returns log-probabilities. CrossEntropyLoss
        # applies log-softmax once more, which leaves log-probabilities
        # unchanged, so this is equivalent to a negative log-likelihood loss.
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.parameters(), lr=0.01, weight_decay=5e-4)

        self.train()
        for epoch in tqdm(range(epochs + 1), desc='Epoch'):
            optimizer.zero_grad()
            out = self(data.x)
            loss = criterion(out[data.train_mask], data.y[data.train_mask])
            acc = accuracy(out[data.train_mask].argmax(dim=1), data.y[data.train_mask])
            loss.backward()
            optimizer.step()

            if epoch % 20 == 0:
                # Validation metrics reuse this epoch's forward pass (taken
                # before the optimizer step); they are only reported, so no
                # autograd graph is needed for them.
                with torch.no_grad():
                    val_loss = criterion(out[data.val_mask], data.y[data.val_mask])
                    val_acc = accuracy(out[data.val_mask].argmax(dim=1), data.y[data.val_mask])
                print(f'Epoch {epoch:>3} | Train Loss: {loss:.3f} | Train Acc: {acc*100:>5.2f}% | Val Loss: {val_loss:.2f} | Val Acc: {val_acc*100:.2f}%')

    @torch.no_grad()
    def test(self, data):
        """Return the accuracy on ``data.test_mask``.

        Switches the model to evaluation mode and runs without gradient
        tracking, since the result is never back-propagated.
        """
        self.eval()
        out = self(data.x)
        acc = accuracy(out.argmax(dim=1)[data.test_mask], data.y[data.test_mask])
        return acc
    
def accuracy(y_pred, y_true):
    """Return the fraction of predictions that equal the true labels.

    Both arguments are tensors compared element-wise; the number of matches
    is divided by ``len(y_true)``.
    """
    n_correct = (y_pred == y_true).sum()
    return n_correct / len(y_true)

In [31]:
# Facebook Page-Page: reload the dataset and attach index-range splits.
from torch_geometric.datasets import FacebookPagePage
dataset = FacebookPagePage(root='.')
data = dataset[0]
# 0-17999 train, 18001-19999 validation, 20001-22469 test.
data.train_mask = range(18000)
data.val_mask = range(18001, 20000)
data.test_mask = range(20001, 22470)

# Build an MLP with a 16-unit hidden layer and train it for 100 epochs.
mlp = MLP(dataset.num_features, 16, dataset.num_classes)
print(mlp)
mlp.fit(data, epochs=100)
# Evaluate on the test split.
acc = mlp.test(data)
print(f'\nMLP test accuracy: {acc*100:.2f}%')

MLP(
  (linear1): Linear(in_features=128, out_features=16, bias=True)
  (linear2): Linear(in_features=16, out_features=4, bias=True)
)


  if osp.exists(f) and torch.load(f) != _repr(self.pre_transform):
  if osp.exists(f) and torch.load(f) != _repr(self.pre_filter):
  return torch.load(f, map_location)


Epoch:   0%|          | 0/101 [00:00<?, ?it/s]

Epoch   0 | Train Loss: 1.393 | Train Acc: 23.78% | Val Loss: 1.39 | Val Acc: 24.66%
Epoch  20 | Train Loss: 0.657 | Train Acc: 73.77% | Val Loss: 0.67 | Val Acc: 72.84%
Epoch  40 | Train Loss: 0.576 | Train Acc: 77.16% | Val Loss: 0.62 | Val Acc: 74.89%
Epoch  60 | Train Loss: 0.548 | Train Acc: 78.47% | Val Loss: 0.60 | Val Acc: 75.49%
Epoch  80 | Train Loss: 0.531 | Train Acc: 79.02% | Val Loss: 0.59 | Val Acc: 76.24%
Epoch 100 | Train Loss: 0.518 | Train Acc: 79.50% | Val Loss: 0.59 | Val Acc: 75.84%

MLP test accuracy: 75.17%


In [32]:
# Cora: reload the dataset; its train/val/test masks are used as provided.
from torch_geometric.datasets import Planetoid
dataset = Planetoid(root=".", name="Cora")
data = dataset[0]

# Build an MLP with a 16-unit hidden layer and train it for 100 epochs.
mlp = MLP(dataset.num_features, 16, dataset.num_classes)
print(mlp)
mlp.fit(data, epochs=100)
# Evaluate on the test split.
acc = mlp.test(data)
print(f'\nMLP test accuracy: {acc*100:.2f}%')

MLP(
  (linear1): Linear(in_features=1433, out_features=16, bias=True)
  (linear2): Linear(in_features=16, out_features=7, bias=True)
)


Epoch:   0%|          | 0/101 [00:00<?, ?it/s]

Epoch   0 | Train Loss: 1.960 | Train Acc: 14.29% | Val Loss: 1.96 | Val Acc: 7.20%
Epoch  20 | Train Loss: 0.107 | Train Acc: 100.00% | Val Loss: 1.42 | Val Acc: 50.80%
Epoch  40 | Train Loss: 0.013 | Train Acc: 100.00% | Val Loss: 1.43 | Val Acc: 51.20%
Epoch  60 | Train Loss: 0.008 | Train Acc: 100.00% | Val Loss: 1.40 | Val Acc: 52.60%
Epoch  80 | Train Loss: 0.009 | Train Acc: 100.00% | Val Loss: 1.35 | Val Acc: 53.60%
Epoch 100 | Train Loss: 0.009 | Train Acc: 100.00% | Val Loss: 1.33 | Val Acc: 53.00%

MLP test accuracy: 54.70%


---
## Classifying nodes with vanilla graph neural networks

Instead of directly introducing well-known GNN architectures, let's try to build our own model to understand the thought process behind GNNs. First, we need to go back to the definition of a simple linear layer.

A basic neural network layer corresponds to a linear transformation:

$$
h_A = x_A W^T
$$

where $x_A$ is the input vector of node $A$, and $W$ is the weight matrix. In PyTorch, this equation can be implemented with the `torch.mm()` function, or with the `nn.Linear` class that adds other parameters such as biases.

With our graph datasets, the input vectors are node features. It means that nodes are completely separate from each other. This is not enough to capture a good understanding of the graph: like a pixel in an image, the context of a node is essential to understand it. If you look at a group of pixels instead of a single one, you can recognize edges, patterns, and so on. Likewise, to understand a node, you need to look at its neighborhood.

Let's call $\mathcal{N}_A$ the set of neighbors of node $A$. Our graph linear layer can be written as follows:

$$
h_A = \sum_{i \in \mathcal{N}_A} x_i W^T
$$

You can imagine several variants of this equation. For instance, we could have a weight matrix $W_1$ dedicated to the central node, and another one $W_2$ for the neighbors. Note that we cannot have a weight matrix per neighbor, as this number can change from node to node.

We're talking about neural networks, so in practice we don't apply the previous equation node by node. Instead, we perform matrix multiplications, which are much more efficient. For instance, the equation of the linear layer can be rewritten as:

$$
H = X W^T
$$

where $X$ is the input matrix.

In our case, the adjacency matrix $A$ contains the connections between every node in the graph. Multiplying the input matrix by this adjacency matrix will directly sum up the neighboring node features. We can add **self loops** to the adjacency matrix so that the central node is also considered in this operation. We call this updated adjacency matrix $\tilde{A} = A + I$. Our graph linear layer can be rewritten as follows:

$$
H = \tilde{A}^T X W^T
$$

Let's test this layer by implementing it in PyTorch (PyTorch Geometric is only used to load the datasets and build the adjacency matrix). We'll then be able to use it as a regular layer to build a GNN.


In [33]:
class VanillaGNNLayer(torch.nn.Module):
    """Graph linear layer: project node features, then sum them over neighbours.

    Args:
        dim_in: Number of input features per node.
        dim_out: Number of output features per node.
    """

    def __init__(self, dim_in, dim_out):
        super().__init__()
        # Bias-free linear projection applied to every node independently.
        self.linear = Linear(dim_in, dim_out, bias=False)

    def forward(self, x, adjacency):
        """Return ``adjacency @ linear(x)``.

        Row ``i`` of the result is the sum of the projected features of the
        nodes ``j`` with a non-zero ``adjacency[i, j]``.

        NOTE(review): the adjacency matrix is used as given, without a
        transpose; for a non-symmetric matrix this differs from a formula
        written with the transposed adjacency — confirm which is intended.
        """
        projected = self.linear(x)
        return torch.sparse.mm(adjacency, projected)

In [34]:
# Show the currently loaded dataset and the shape of its edge index.
dataset, data.edge_index.shape

(Cora(), torch.Size([2, 10556]))

In [35]:
from torch_geometric.utils import to_dense_adj
# Keep the first entry of the result: there is a single graph and no batch
# dimension here (with batches, the identity would have to be added to each one).
adjacency = to_dense_adj(data.edge_index)[0]
# Add the identity matrix so that each node is also connected to itself (self-loops).
adjacency += torch.eye(len(adjacency))
adjacency

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 1.,  ..., 0., 0., 0.],
        [0., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 1.],
        [0., 0., 0.,  ..., 0., 1., 1.]])

In [40]:
from tqdm.auto import tqdm
class VanillaGNN(torch.nn.Module):
    """Two-layer graph neural network built from ``VanillaGNNLayer``.

    Args:
        dim_in: Number of input features per node.
        dim_h: Size of the hidden representation.
        dim_out: Number of output classes.
    """

    def __init__(self, dim_in, dim_h, dim_out):
        super().__init__()
        self.gnn1 = VanillaGNNLayer(dim_in, dim_h)
        self.gnn2 = VanillaGNNLayer(dim_h, dim_out)

    def forward(self, x, adjacency):
        """Return per-class log-probabilities for every node."""
        h = self.gnn1(x, adjacency)
        h = torch.relu(h)
        h = self.gnn2(h, adjacency)
        return F.log_softmax(h, dim=1)

    def fit(self, data, epochs, adj=None):
        """Train full-batch with Adam and print metrics every 20 epochs.

        Args:
            data: Object exposing ``x``, ``y``, ``train_mask`` and
                ``val_mask``; the masks are used to index the model output
                and the labels.
            epochs: Index of the last epoch; the loop runs ``epochs + 1``
                iterations so that the final epoch is reported as well.
            adj: Adjacency matrix passed to ``forward``. When omitted, the
                module-level ``adjacency`` variable is used, which keeps
                existing ``fit(data, epochs)`` calls working.
        """
        # Resolve the adjacency once; the global is only read when no matrix
        # was passed explicitly.
        adj = adjacency if adj is None else adj
        # forward() already returns log-probabilities. CrossEntropyLoss
        # applies log-softmax once more, which leaves log-probabilities
        # unchanged, so this is equivalent to a negative log-likelihood loss.
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.parameters(), lr=0.01, weight_decay=5e-4)
        self.train()
        for epoch in tqdm(range(epochs+1), desc='Epochs'):
            optimizer.zero_grad()
            out = self(data.x, adj)
            loss = criterion(out[data.train_mask], data.y[data.train_mask])
            acc = accuracy(out[data.train_mask].argmax(dim=1), data.y[data.train_mask])
            loss.backward()
            optimizer.step()
            if epoch % 20 == 0:
                # Validation metrics reuse this epoch's forward pass (taken
                # before the optimizer step); they are only reported, so no
                # autograd graph is needed for them.
                with torch.no_grad():
                    val_loss = criterion(out[data.val_mask], data.y[data.val_mask])
                    val_acc = accuracy(out[data.val_mask].argmax(dim=1), data.y[data.val_mask])
                print(f'Epoch {epoch:>3} | Train Loss: {loss:.3f} | Train Acc: {acc*100:>5.2f}% | Val Loss: {val_loss:.2f} | Val Acc: {val_acc*100:.2f}%')

    @torch.no_grad()
    def test(self, data, adj=None):
        """Return the accuracy on ``data.test_mask``.

        Args:
            data: Object exposing ``x``, ``y`` and ``test_mask``.
            adj: Adjacency matrix passed to ``forward``. When omitted, the
                module-level ``adjacency`` variable is used.

        Runs in evaluation mode and without gradient tracking, since the
        result is never back-propagated.
        """
        adj = adjacency if adj is None else adj
        self.eval()
        out = self(data.x, adj)
        acc = accuracy(out.argmax(dim=1)[data.test_mask], data.y[data.test_mask])
        return acc

In [41]:
# Cora: reload the dataset; its train/val/test masks are used as provided.
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import to_dense_adj

dataset = Planetoid(root=".", name="Cora")
data = dataset[0]

# Dense adjacency matrix with the identity added (self-loops). VanillaGNN.fit
# and VanillaGNN.test read this module-level `adjacency` variable.
adjacency = to_dense_adj(data.edge_index)[0]
adjacency += torch.eye(len(adjacency))

# Build a GNN with a 16-unit hidden layer, train it for 100 epochs, then test it.
gnn = VanillaGNN(dataset.num_features, 16, dataset.num_classes)
print(gnn)
gnn.fit(data, epochs=100)
print(f'\nGNN test accuracy: {gnn.test(data)*100:.2f}%')

  if osp.exists(f) and torch.load(f) != _repr(self.pre_transform):
  if osp.exists(f) and torch.load(f) != _repr(self.pre_filter):
  return torch.load(f, map_location)


VanillaGNN(
  (gnn1): VanillaGNNLayer(
    (linear): Linear(in_features=1433, out_features=16, bias=False)
  )
  (gnn2): VanillaGNNLayer(
    (linear): Linear(in_features=16, out_features=7, bias=False)
  )
)


Epochs:   0%|          | 0/101 [00:00<?, ?it/s]

Epoch   0 | Train Loss: 2.035 | Train Acc: 21.43% | Val Loss: 2.08 | Val Acc: 18.20%
Epoch  20 | Train Loss: 0.060 | Train Acc: 100.00% | Val Loss: 1.75 | Val Acc: 72.20%
Epoch  40 | Train Loss: 0.004 | Train Acc: 100.00% | Val Loss: 2.56 | Val Acc: 73.60%
Epoch  60 | Train Loss: 0.001 | Train Acc: 100.00% | Val Loss: 2.82 | Val Acc: 73.60%
Epoch  80 | Train Loss: 0.001 | Train Acc: 100.00% | Val Loss: 2.70 | Val Acc: 74.00%
Epoch 100 | Train Loss: 0.001 | Train Acc: 100.00% | Val Loss: 2.56 | Val Acc: 74.40%

GNN test accuracy: 75.20%


In [42]:
# Facebook Page-Page: reload the dataset and attach index-range splits.
from torch_geometric.datasets import FacebookPagePage
dataset = FacebookPagePage(root='.')
data = dataset[0]
# 0-17999 train, 18001-19999 validation, 20001-22469 test.
data.train_mask = range(18000)
data.val_mask = range(18001, 20000)
data.test_mask = range(20001, 22470)

# Dense adjacency matrix with the identity added (self-loops). VanillaGNN.fit
# and VanillaGNN.test read this module-level `adjacency` variable.
adjacency = to_dense_adj(data.edge_index)[0]
adjacency += torch.eye(len(adjacency))

# Build a GNN with a 16-unit hidden layer, train it for 100 epochs, then test it.
gnn = VanillaGNN(dataset.num_features, 16, dataset.num_classes)
print(gnn)
gnn.fit(data, epochs=100)
print(f'\nGNN test accuracy: {gnn.test(data)*100:.2f}%')

  if osp.exists(f) and torch.load(f) != _repr(self.pre_transform):
  if osp.exists(f) and torch.load(f) != _repr(self.pre_filter):
  return torch.load(f, map_location)


VanillaGNN(
  (gnn1): VanillaGNNLayer(
    (linear): Linear(in_features=128, out_features=16, bias=False)
  )
  (gnn2): VanillaGNNLayer(
    (linear): Linear(in_features=16, out_features=4, bias=False)
  )
)


Epochs:   0%|          | 0/101 [00:00<?, ?it/s]

Epoch   0 | Train Loss: 97.520 | Train Acc: 24.02% | Val Loss: 89.24 | Val Acc: 25.81%
Epoch  20 | Train Loss: 5.117 | Train Acc: 80.79% | Val Loss: 3.49 | Val Acc: 81.64%
Epoch  40 | Train Loss: 2.818 | Train Acc: 81.01% | Val Loss: 1.93 | Val Acc: 82.04%
Epoch  60 | Train Loss: 1.531 | Train Acc: 82.76% | Val Loss: 1.32 | Val Acc: 83.49%
Epoch  80 | Train Loss: 0.860 | Train Acc: 82.36% | Val Loss: 0.78 | Val Acc: 82.64%
Epoch 100 | Train Loss: 0.672 | Train Acc: 84.00% | Val Loss: 0.65 | Val Acc: 83.64%

GNN test accuracy: 83.80%


In [None]:
# Dataset: Facebook Page-Page with index-range splits
# (0-17999 train, 18001-19999 validation, 20001-22469 test).
dataset = FacebookPagePage(root=".")
data = dataset[0]
data.train_mask = range(18000)
data.val_mask = range(18001, 20000)
data.test_mask = range(20001, 22470)

# Adjacency matrix: dense, with the identity added (self-loops). VanillaGNN.fit
# and VanillaGNN.test read this module-level `adjacency` variable.
adjacency = to_dense_adj(data.edge_index)[0]
adjacency += torch.eye(len(adjacency))
adjacency

# MLP baseline: node features only.
mlp = MLP(dataset.num_features, 16, dataset.num_classes)
print(mlp)
mlp.fit(data, epochs=100)
acc = mlp.test(data)
print(f'\nMLP test accuracy: {acc*100:.2f}%\n')

# Vanilla GNN: node features aggregated over the adjacency matrix.
gnn = VanillaGNN(dataset.num_features, 16, dataset.num_classes)
print(gnn)
gnn.fit(data, epochs=100)
acc = gnn.test(data)
print(f'\nGNN test accuracy: {acc*100:.2f}%')