In [37]:
import os

In [38]:
# Define names of datasets to select
dataset_target_name_list = ["ENZYMES", "DD"]

# Define datasets destination
datasets_folder = "datasets/"

# List of all datasets names
dataset_name_list = os.listdir(datasets_folder)

# List of dataset files and label files
dataset_file_list = []
dataset_label_file_list = []

# File paths of dataset edge indexes and labels
for dataset_name in dataset_name_list:
    if(dataset_name in dataset_target_name_list):
        dataset_file_list.append(os.path.join(datasets_folder, dataset_name, f"{dataset_name}.pth"))
        dataset_label_file_list.append(os.path.join(datasets_folder, dataset_name, f"{dataset_name}.global_cc"))

In [39]:
import torch

In [40]:
# Enable gpu for training, validation and test if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [41]:
# List of datasets
dataset_list = []

# List of dataset labels list
dataset_labels_list = []

for dataset_file, dataset_label_file in zip(dataset_file_list, dataset_label_file_list):
    # Load dataset
    dataset = torch.load(dataset_file, weights_only=True)

    # Load labels from file .global_cc
    with open(dataset_label_file, 'r') as f:
        dataset_labels = [float(line.strip()) for line in f.readlines()]

    # Prepare dataset list to manage datasets easily
    dataset_list.append(dataset)

    # Prepare dataset labels
    dataset_labels_list.append(dataset_labels)

In [42]:
from torch_geometric.data import Data
from torch_geometric.utils import degree

In [57]:
# Data structure for storing Data object of graphs
data_list = []

# Create pytorch geometric Data objects from graphs
for dataset, dataset_labels in zip(dataset_list, dataset_labels_list):
    for i, (key, tensor) in enumerate(dataset.items()):
        edge_index = tensor.coalesce().indices()
        label = torch.tensor([dataset_labels[i]])
        data = Data(edge_index=edge_index, y=label)

        # Add node features to current graph's Data object
        x = degree(edge_index[0], data.num_nodes, dtype=torch.float).view(-1, 1)
        data.x = x

        data_list.append(data)

In [44]:
from torch_geometric.loader import DataLoader
from torch.utils.data import random_split

In [45]:
# DataLoader split in train, validation and test
total_len = len(data_list)
train_len = int(0.8 * total_len)
validation_len = int(0.1 * total_len)
test_len = total_len - train_len - validation_len

train_data, validation_data, test_data = random_split(data_list, [train_len, validation_len, test_len])

In [46]:
# DataLoader structures for train, valdation and test
train_data_loader = DataLoader(train_data, batch_size=32, shuffle=True)
validation_data_loader = DataLoader(validation_data, batch_size=32, shuffle=True)
test_data_loader = DataLoader(test_data, batch_size=32, shuffle=True)

In [47]:
import torch.nn.functional as func
from torch_geometric.nn import GCNConv, global_mean_pool

In [48]:
# GNN model representation
class ClusteringCoefficientGNN(torch.nn.Module):
    # Model architecture
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(ClusteringCoefficientGNN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.linear = torch.nn.Linear(hidden_channels, out_channels)

    # Forward pass (inference)
    def forward(self, x, edge_index, batch):
        # Graph convolutional layers
        x = self.conv1(x, edge_index)  
        x = func.relu(x)
        x = self.conv2(x, edge_index)
        x = func.relu(x)

        # Global mean pool (graph-level features)
        x = global_mean_pool(x, batch)

        # Fully connected layer
        x = self.linear(x)
        x = func.sigmoid(x)

        return x

In [49]:
# Training function
def train(model, train_data_loader, optimizer, loss_function):
    # Set GNN model to training mode
    model.train()
    train_total_loss = 0

    # Train over all graphs in training_data_loader
    for data in train_data_loader:
        # Clear the gradient
        optimizer.zero_grad()

        # Move data to device (gpu if available)
        data = data.to(device)

        # Forward pass
        out = model(data.x, data.edge_index, data.batch)

        # Compute loss
        loss = loss_function(out.squeeze(dim=1), data.y)
        optimizer.step()

        train_total_loss += loss.item()

    return train_total_loss / len(train_data_loader)

In [50]:
# Evaluation function (for validation and test)
@torch.no_grad()
def evaluate(model, evaluation_data_loader, loss_function):
    # Set model to evaluation mode
    model.eval()
    evaluation_total_loss = 0

    # Evaluation over all graphs in evaluation_data_loader
    for data in evaluation_data_loader:
        # Move data to device (gpu if available)
        data = data.to(device)

        # Forward pass
        out = model(data.x, data.edge_index, data.batch)

        # Compute loss
        loss = loss_function(out.squeeze(dim=1), data.y)

        evaluation_total_loss += loss.item()

    return evaluation_total_loss / len(evaluation_data_loader) 

In [51]:
from torch.optim import Adam

In [52]:
# Number of node features
in_channels = 1

# Output dimension (regression task over scalar numbers)
out_channels = 1

# Hyperparameters for GNN
hidden_channels = 64

# Training settings
learning_rate = 0.001
num_epochs = 10
patience = 3

In [53]:
# Initialize model
model = ClusteringCoefficientGNN(in_channels, hidden_channels, out_channels)
model.to(device)

# Initialize optimizer and loss function
optimizer = Adam(model.parameters(), lr=learning_rate)
loss_function = torch.nn.MSELoss()

In [54]:
# Number of epochs the GNN has not obtained better validation loss
patience_counter = 0

# Best validation loss up to now
best_validation_loss = float('inf')

# Training and validation loop
for epoch in range(1, num_epochs + 1):
    # Training step
    train_loss = train(model, train_data_loader, optimizer, loss_function)

    # Validation step
    val_loss = evaluate(model, validation_data_loader, loss_function)

    # Training status in current epoch
    print(f'Epoch {epoch:03d}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

    # Early stopping condition using patience
    if(val_loss < best_validation_loss):
        best_validation_loss = val_loss
        patience_counter = 0
    else:
        patience_counter += 1
        if(patience_counter == patience):
            print("Patience finished: stopping training")
            break

# Final test phase
test_loss = evaluate(model, test_data_loader, loss_function)
print(f'Test Loss: {test_loss:.4f}')

Epoch 001, Train Loss: 0.2374, Val Loss: 0.2353
Epoch 002, Train Loss: 0.2373, Val Loss: 0.2358
Epoch 003, Train Loss: 0.2374, Val Loss: 0.2355
Epoch 004, Train Loss: 0.2374, Val Loss: 0.2356
Patience finished: stopping training
Test Loss: 0.2365


In [55]:
# Save location in 'model/'
model_folder = "model/"
os.makedirs(model_folder, exist_ok = True)

# Save GNN model
torch.save(model.state_dict(), os.path.join(model_folder, 'graph_gcc_net.pth'))

In [56]:
index = 4
data_point = data_list[index].to(device)
out = model(data_point.x, data_point.edge_index, data.batch)
print(data_list[index].y)
print(out)

tensor([0.0164], device='cuda:0')
tensor([[0.4954]], device='cuda:0', grad_fn=<SigmoidBackward0>)
