# **Graph Learning Project - ZINC exp 3**

By Shahar Cohen 205669260 & Alexander petrunin 205782568

# Installation

In [1]:
!pip install -q torch-geometric

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m66.0 MB/s[0m eta [36m0:00:00[0m
[?25h

# SETUP

In [2]:
import torch
import torch.nn as nn
from torch_geometric.nn import GPSConv, GatedGraphConv, TransformerConv, GINEConv
from torch_geometric.data import Data
import torch.nn.functional as F
from torch_geometric.nn import global_mean_pool, global_add_pool

from torch_geometric.transforms import AddLaplacianEigenvectorPE
import torch_geometric

import torch.optim as optim

from torch_geometric.datasets import ZINC
from torch_geometric.loader import DataLoader

import torch.optim as optim
from torch_geometric.data import DataLoader
from sklearn.metrics import mean_squared_error

from torch_geometric.transforms import AddRandomWalkPE
from torch_geometric.datasets import ZINC
from torch_geometric.data import DataLoader

from torch_geometric.typing import Tensor
from torch_geometric.typing import Adj
from typing import Any, Dict, Optional
from torch_geometric.utils import to_dense_batch
from torch_geometric.nn.attention import PerformerAttention


# MODEL:

In [3]:
class MLPBlock(nn.Module):
    def __init__(self, in_channels, hidden_channels):
        super(MLPBlock, self).__init__()
        self.fc1 = nn.Linear(in_channels, hidden_channels)
        self.fc2 = nn.Linear(hidden_channels, hidden_channels)  # This should output 'hidden_channels'

    def forward(self, x):
        x = x.float()  # Ensure the input is float before passing it to the linear layer
        x = F.relu(self.fc1(x))  # Apply ReLU activation after the first linear layer
        x = self.fc2(x)  # The second layer keeps the number of features as hidden_channels
        return x


In [4]:
class GraphGPSModel(nn.Module):

    def __init__(self, input_dim, hidden_dim, output_dim, pe_in_dim, pe_out_dim, num_layers):
        super(GraphGPSModel, self).__init__()

        # MLP layers
        self.mlp1 = MLPBlock(input_dim + pe_out_dim, hidden_dim)

        # Create MLP layers for GINEConv GPSConv layers
        self.mlps = nn.ModuleList([
            nn.Sequential(
                nn.Linear(hidden_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, hidden_dim)
            )
            for _ in range(num_layers)
        ])

        # Create GPSConv layers
        self.gps_layers = nn.ModuleList([
            CustomGPSConv(
                hidden_dim,
                conv=GINEConv(self.mlps[i], eps=0.0, train_eps=False, edge_dim=3),
                heads=4,
                attn_kwargs={'dropout': 0.5}
            )
            for i in range(num_layers)
        ])

        # Final fully connected layer
        self.fc = nn.Linear(hidden_dim, output_dim)

        # PE layers
        self.bn_pe = nn.BatchNorm1d(pe_in_dim)
        self.fc_pe = nn.Linear(pe_in_dim, pe_out_dim)



    def forward(self, data):

        x, edge_index, batch, pe, edge_attr = (
            data.x,
            data.edge_index,
            data.batch,
            data.random_walk_pe,
            data.edge_attr
        )

        # Transform edge attributes
        dummy_tensor = torch.zeros(edge_attr.size(0), 3, device=edge_attr.device)  # Change size to 3 for the new dummy
        dummy_tensor[edge_attr == 1, 0] = 1  # Keep the condition for edge_attr == 1
        dummy_tensor[edge_attr == 2, 1] = 1  # Keep the condition for edge_attr == 2
        dummy_tensor[edge_attr == 3, 2] = 1  # New condition for edge_attr == 3
        edge_attr = dummy_tensor


        # Process positional encodings (PE)
        pe = self.bn_pe(pe)
        pe = self.fc_pe(pe)

        # Concatenate PE to node features
        x = torch.cat([x, pe], dim=1)

        # Initial MLP processing
        x = self.mlp1(x)

        # Sequentially apply GPSConv layers
        for gps_layer in self.gps_layers:
            x = gps_layer(x, edge_index, batch=batch, edge_attr=edge_attr)

        # Global pooling to aggregate node features into graph features
        x = global_add_pool(x, batch)

        # Final classification layer
        x = self.fc(x)
        return x


GPS layer

In [5]:
class CustomGPSConv(GPSConv):
    def __init__(self,hidden_dim, *args, **kwargs):
        super().__init__(hidden_dim, *args, **kwargs)
        self.MLP_combine = MLPBlock(hidden_dim*2, hidden_dim)
        self.cross_attn1 = torch.nn.MultiheadAttention(embed_dim=hidden_dim, num_heads=2)
        self.cross_attn2 = torch.nn.MultiheadAttention(embed_dim=hidden_dim, num_heads=2)

    def forward(
        self,
        x: Tensor,
        edge_index: Adj,
        batch: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Tensor:
        r"""Runs the forward pass of the module."""
        hs = []
        if self.conv is not None:  # Local MPNN.
            h = self.conv(x, edge_index, **kwargs)
            h = F.dropout(h, p=self.dropout, training=self.training)
            h = h + x
            if self.norm1 is not None:
                if self.norm_with_batch:
                    h = self.norm1(h, batch=batch)
                else:
                    h = self.norm1(h)
            hs.append(h)

        # Global attention transformer-style model.
        h, mask = to_dense_batch(x, batch)

        if isinstance(self.attn, torch.nn.MultiheadAttention):
            h, _ = self.attn(h, h, h, key_padding_mask=~mask,
                             need_weights=False)
        elif isinstance(self.attn, PerformerAttention):
            h = self.attn(h, mask=mask)

        h = h[mask]
        h = F.dropout(h, p=self.dropout, training=self.training)
        h = h + x  # Residual connection.
        if self.norm2 is not None:
            if self.norm_with_batch:
                h = self.norm2(h, batch=batch)
            else:
                h = self.norm2(h)
        hs.append(h)


        ##################

        # Original Code:
        #out = sum(hs)  # Combine local and global outputs.

        # Our code:

        # Combine the two tensors in hs with cross attention layer
        cross_attn_output1, _ = self.cross_attn1(
            query=hs[0].unsqueeze(1),
            key=hs[1].unsqueeze(1),
            value=hs[1].unsqueeze(1)
        )

        cross_attn_output2, _ = self.cross_attn2(
            query=hs[1].unsqueeze(1),
            key=hs[0].unsqueeze(1),
            value=hs[0].unsqueeze(1)
        )

        cross_attn_output1 = cross_attn_output1.squeeze(1)  # Back to (N, hidden_dim)
        cross_attn_output1 = 0.1 * cross_attn_output1 + hs[1]
        cross_attn_output2 = cross_attn_output2.squeeze(1)
        cross_attn_output1 = 0.1 * cross_attn_output2 + hs[0]

        out = torch.cat([cross_attn_output1, cross_attn_output2], dim=1)

        # Pass the combined tensor through the MLP
        out = self.MLP_combine(out) + sum(hs)
        #+ sum(hs)

        # Original Code:
        #out = out + self.mlp(out)

        #################


        if self.norm3 is not None:
            if self.norm_with_batch:
                out = self.norm3(out, batch=batch)
            else:
                out = self.norm3(out)

        return out

# Load ZINC and add PE:

In [6]:
# Load the ZINC dataset
transform = AddRandomWalkPE(walk_length=20)

# Load the ZINC dataset with predefined splits
train_dataset = ZINC(root='./data', subset=True, split='train', transform=transform)
val_dataset = ZINC(root='./data', subset=True, split='val', transform=transform)
test_dataset = ZINC(root='./data', subset=True, split='test', transform=transform)

# Create DataLoaders for batching
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

Downloading https://www.dropbox.com/s/feo9qle74kg48gy/molecules.zip?dl=1
Extracting data/molecules.zip
Downloading https://raw.githubusercontent.com/graphdeeplearning/benchmarking-gnns/master/data/molecules/train.index
Downloading https://raw.githubusercontent.com/graphdeeplearning/benchmarking-gnns/master/data/molecules/val.index
Downloading https://raw.githubusercontent.com/graphdeeplearning/benchmarking-gnns/master/data/molecules/test.index
Processing...
Processing train dataset: 100%|██████████| 10000/10000 [00:00<00:00, 12232.95it/s]
Processing val dataset: 100%|██████████| 1000/1000 [00:00<00:00, 3901.33it/s]
Processing test dataset: 100%|██████████| 1000/1000 [00:00<00:00, 8784.43it/s]
Done!


# Training

In [7]:
# Training loop
def train():
    model.train()
    total_loss = 0
    for data in train_loader:
        optimizer.zero_grad()

        # Move data to the same device as the model
        data = data.to(device)

        # Forward pass
        output = model(data)

        # Get the target values (penalized logP)
        y = data.y.view(-1, 1).to(device)  # Ensure target is on the same device as the model

        # Compute the loss
        loss = criterion(output, y)
        loss.backward()

        # Optimization step
        optimizer.step()

        total_loss += loss.item()


    return total_loss / len(train_loader)


# Define a function to evaluate the model on a given dataset
def evaluate(loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():  # Disable gradient computation for evaluation
        for data in loader:
            data = data.to(device)
            output = model(data)
            y = data.y.view(-1, 1).to(device)
            loss = criterion(output, y)
            total_loss += loss.item()
    return total_loss / len(loader)


In [8]:
num_layers = 10
input_dim = train_dataset.num_features
hidden_dim = 64
output_dim = 1
pe_in_dim = 20
pe_out_dim = 28

weight_decay = 1e-5
lr = 0.001
epochs_num = 250

In [9]:
# Define the model

model = GraphGPSModel(input_dim=input_dim, hidden_dim=hidden_dim,  output_dim=output_dim, pe_in_dim=pe_in_dim, pe_out_dim=pe_out_dim, num_layers=num_layers)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Loss function
criterion = nn.MSELoss()

# Initialize variables to track the best model
best_val_loss = float('inf')
best_model = None

# Training the model for epochs_num:
for epoch in range(epochs_num):
    # Train the model for one epoch
    train_loss = train()

    # Evaluate the model on the validation set
    val_loss = evaluate(val_loader)

    print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')

    # Check if this is the best validation loss we've seen
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # Save a copy of the best model
        best_model = model.state_dict()  # No need for deepcopy
        print(f'New best model saved at epoch {epoch+1} with Validation Loss: {val_loss:.4f}')

# After training, you can save the best model to disk
torch.save(best_model, 'best_model_ZINC.pth')
print("Best model saved to 'best_model_ZINC.pth'.")

Epoch 1, Train Loss: 3.6225, Validation Loss: 1.7673
New best model saved at epoch 1 with Validation Loss: 1.7673
Epoch 2, Train Loss: 1.4057, Validation Loss: 1.6800
New best model saved at epoch 2 with Validation Loss: 1.6800
Epoch 3, Train Loss: 1.2610, Validation Loss: 1.1061
New best model saved at epoch 3 with Validation Loss: 1.1061
Epoch 4, Train Loss: 1.1169, Validation Loss: 1.0309
New best model saved at epoch 4 with Validation Loss: 1.0309
Epoch 5, Train Loss: 0.9928, Validation Loss: 1.1098
Epoch 6, Train Loss: 1.0057, Validation Loss: 0.9016
New best model saved at epoch 6 with Validation Loss: 0.9016
Epoch 7, Train Loss: 0.9660, Validation Loss: 1.2187
Epoch 8, Train Loss: 0.8776, Validation Loss: 1.1113
Epoch 9, Train Loss: 0.9408, Validation Loss: 0.9151
Epoch 10, Train Loss: 0.8528, Validation Loss: 0.9013
New best model saved at epoch 10 with Validation Loss: 0.9013
Epoch 11, Train Loss: 0.8113, Validation Loss: 1.0602
Epoch 12, Train Loss: 0.8714, Validation Loss: 0

#Test Score:

In [10]:
def test_score():
    model.eval()  # Set the model to evaluation mode
    total_mae = 0.0
    num_batches = 0

    with torch.no_grad():  # Disable gradient computation during evaluation
        for batch in test_loader:  # Assuming you have a DataLoader for your test set
            # Move batch data to the same device as the model
            data = batch.to(device)

            # Forward pass (prediction)
            output = model(data)

            # Ensure target is the correct shape
            target = data.y.view(-1, 1).to(device)  # Match output shape: [batch_size, 1]

            # Compute Mean Absolute Error (MAE)
            mae_loss = F.l1_loss(output, target)

            total_mae += mae_loss.item()
            num_batches += 1

    # Return average MAE over all batches in the test set
    avg_mae = total_mae / num_batches
    return avg_mae


# Load the best model's state dictionary
model.load_state_dict(best_model)
model.to(device)  # Ensure the model is on the correct device (GPU or CPU)

# Now you can evaluate the model on the test set
test_mae = test_score()
print(f"Test MAE: {test_mae:.4f}")



Test MAE: 1.0756
