## 4.2.3 Questions for Node Classification

In [216]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

# Load the datasets.
# SECURITY: torch.load unpickles arbitrary Python objects, so only point it at
# files from a trusted source.
# The printed reprs (CiteSeer(), BZR(405)) show the files hold whole dataset
# objects rather than plain tensors, so they need the full unpickler.
# weights_only=False is passed explicitly so the cell does not depend on the
# library default (and does not emit the default-change warning).
node_data = torch.load('node_data.pt', weights_only=False)  # Assuming node_data is a PyG dataset
graph_data = torch.load('graph_data.pt', weights_only=False)

print("Node Data:", node_data)
print("Graph Data:", graph_data)

Node Data: CiteSeer()
Graph Data: BZR(405)


  node_data = torch.load('node_data.pt')  # Assuming node_data is a PyG dataset
  graph_data = torch.load('graph_data.pt')


In [217]:
# Use the node-classification dataset under the short name `data`; the
# later cells read `data.x`, `data.y`, `data.edge_index` and the split masks.
data = node_data

# Report how many entries each split mask selects.
num_train_nodes = data.train_mask.sum().item()
print(f"Training set size: {num_train_nodes}")
num_val_nodes = data.val_mask.sum().item()
print(f"Validation set size: {num_val_nodes}")
num_test_nodes = data.test_mask.sum().item()
print(f"Test set size: {num_test_nodes}")

Training set size: 120
Validation set size: 500
Test set size: 1000


In [218]:
# Hyperparameter candidates considered during tuning.
# NOTE(review): no visible cell iterates over these lists; the tuned values
# are hard-coded where each model is built, so the search itself cannot be
# reproduced from the cells shown here.
hidden_dims = [16, 32, 64]  # candidate hidden layer dimensions
learning_rates = [0.01, 0.005]  # candidate learning rates

In [219]:
# Define GCN
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    Args:
        input_dim: Size of each input node feature vector.
        hidden_dim: Width of the hidden representation after the first layer.
        output_dim: Number of output classes.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, x, edge_index):
        """Return per-node log-probabilities over the classes.

        Args:
            x: Node feature matrix.
            edge_index: Graph connectivity passed to both convolutions.
        """
        hidden = F.relu(self.conv1(x, edge_index))
        # Dropout is only active while the module is in training mode.
        hidden = F.dropout(hidden, training=self.training)
        logits = self.conv2(hidden, edge_index)
        return F.log_softmax(logits, dim=1)

# Define MLP
class MLP(torch.nn.Module):
    """Two-layer perceptron baseline that classifies nodes from features alone.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of output classes.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = torch.nn.Linear(input_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return per-row log-probabilities over the classes."""
        hidden = self.fc1(x).relu()
        logits = self.fc2(hidden)
        return logits.log_softmax(dim=1)

# Training
def train(model, data, optimizer, criterion):
    """Run one full-batch optimisation step and return the training loss.

    Args:
        model: Network to update; a ``GCN`` is also given ``data.edge_index``,
            any other model is called with ``data.x`` only.
        data: Object exposing ``x``, ``edge_index``, ``y`` and ``train_mask``.
        optimizer: Optimizer bound to ``model``'s parameters.
        criterion: Loss callable taking ``(predictions, targets)``.

    Returns:
        The loss on the training-mask nodes as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    # Only the graph model consumes the edge list.
    if isinstance(model, GCN):
        out = model(data.x, data.edge_index)
    else:
        out = model(data.x)

    train_mask = data.train_mask
    loss = criterion(out[train_mask], data.y[train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

# Validation
def validate(model, data, criterion):
    """Compute loss and accuracy on the validation-mask nodes.

    Args:
        model: Network to evaluate; a ``GCN`` is also given ``data.edge_index``,
            any other model is called with ``data.x`` only.
        data: Object exposing ``x``, ``edge_index``, ``y`` and ``val_mask``.
        criterion: Loss callable taking ``(predictions, targets)``.

    Returns:
        ``(loss, accuracy)`` where ``loss`` is a Python float and ``accuracy``
        is the fraction of validation-mask nodes predicted correctly.
    """
    model.eval()
    with torch.no_grad():
        if isinstance(model, GCN):
            out = model(data.x, data.edge_index)
        else:
            out = model(data.x)

        val_mask = data.val_mask
        targets = data.y[val_mask]
        loss = criterion(out[val_mask], targets)

        predictions = out.argmax(dim=1)
        num_correct = (predictions[val_mask] == targets).sum().item()
        acc = num_correct / val_mask.sum().item()
    return loss.item(), acc

# Testing
def test(model, data):
    """Return the accuracy of ``model`` on the test-mask nodes.

    Args:
        model: Network to evaluate; a ``GCN`` is also given ``data.edge_index``,
            any other model is called with ``data.x`` only.
        data: Object exposing ``x``, ``edge_index``, ``y`` and ``test_mask``.

    Returns:
        Fraction of test-mask nodes whose arg-max prediction equals the label.
    """
    model.eval()
    with torch.no_grad():
        if isinstance(model, GCN):
            out = model(data.x, data.edge_index)
        else:
            out = model(data.x)

        test_mask = data.test_mask
        predictions = out.argmax(dim=1)
        num_correct = (predictions[test_mask] == data.y[test_mask]).sum().item()
        acc = num_correct / test_mask.sum().item()
    return acc

# Train and evaluate the model
def train_and_evaluate(model, data, epochs=100, lr=0.01):
    """Train ``model``, restore its best-validation weights, and test them.

    Each epoch runs one ``train`` step followed by ``validate``. The weights
    from the epoch with the highest validation accuracy (first one wins on
    ties) are restored before the test-set evaluation.

    Args:
        model: Network to train in place.
        data: Dataset object forwarded to ``train`` / ``validate`` / ``test``.
        epochs: Number of training epochs.
        lr: Learning rate for the Adam optimizer.

    Returns:
        Test accuracy of the restored best-validation weights.
    """
    # Function-scope import so the cell does not depend on the import cell.
    import copy

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # NOTE: the models already emit log-probabilities; CrossEntropyLoss applies
    # log-softmax again, which leaves log-probabilities unchanged.
    criterion = torch.nn.CrossEntropyLoss()
    # Start below any reachable accuracy so the first epoch is always recorded.
    best_val_acc = float('-inf')
    best_model = None
    best_epoch_info = None

    for epoch in range(epochs):
        train_loss = train(model, data, optimizer, criterion)
        val_loss, val_acc = validate(model, data, criterion)
        print(f'Epoch {epoch + 1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            # state_dict() returns references to the live parameter tensors,
            # which the optimizer keeps updating in place. Deep-copy so the
            # snapshot really holds this epoch's weights.
            best_model = copy.deepcopy(model.state_dict())
            # record the best epoch information for model hypertuning
            best_epoch_info = {
                'epoch': epoch + 1,
                'train_loss': train_loss,
                'val_loss': val_loss,
                'val_acc': val_acc
            }

    # Load the best model and evaluate on the test set. With epochs <= 0 there
    # is no snapshot, so the model is tested as-is.
    if best_model is not None:
        model.load_state_dict(best_model)
    test_acc = test(model, data)
    # output the best epoch information for model hypertuning
    if best_epoch_info is not None:
        print(f"Best Epoch: {best_epoch_info['epoch']}, "
              f"Train Loss: {best_epoch_info['train_loss']:.4f}, "
              f"Val Loss: {best_epoch_info['val_loss']:.4f}, "
              f"Val Acc: {best_epoch_info['val_acc']:.4f}")
    print(f'Test Accuracy: {test_acc:.4f}')
    return test_acc

# initialize the model: sizes are inferred from the dataset
input_dim = data.num_features

# number of classes = number of distinct label values present in data.y
output_dim = torch.unique(data.y).size(0)

# train and evaluate 2-layer GCN
print("Training 2-layer GCN...")
# after hyperparameter tuning, the best hidden_dim and learning rate are 64 and 0.005 respectively
hidden_dim = 64
gcn = GCN(input_dim, hidden_dim, output_dim)
gcn_test_acc = train_and_evaluate(gcn, data, lr=0.005)

# train and evaluate 2-layer MLP
print("Training 2-layer MLP...")
# after hyperparameter tuning, the best hidden_dim and learning rate are 32 and 0.005 respectively
hidden_dim = 32
mlp = MLP(input_dim, hidden_dim, output_dim)
mlp_test_acc = train_and_evaluate(mlp, data, lr=0.005)

# Compare the results
print(f"2-layer GCN Test Accuracy: {gcn_test_acc:.4f}")
print(f"2-layer MLP Test Accuracy: {mlp_test_acc:.4f}")

# A tie gets its own branch so equal accuracies are not reported as an MLP win.
if gcn_test_acc > mlp_test_acc:
    print("GCN outperforms MLP because it utilizes graph structure information by aggregating neighbor nodes, capturing local relationships. In contrast, MLP only relies on node features, ignoring the graph's connectivity, which limits its ability to model interactions between nodes")
elif gcn_test_acc < mlp_test_acc:
    print("MLP performs better than GCN, this may due to the graph architecture is not helpful for the current task.")
else:
    print("GCN and MLP reach the same test accuracy, so this run shows no benefit from using the graph structure.")

Training 2-layer GCN...
Epoch 1, Train Loss: 1.7802, Val Loss: 1.7117, Val Acc: 0.4720
Epoch 2, Train Loss: 1.5492, Val Loss: 1.6132, Val Acc: 0.5660
Epoch 3, Train Loss: 1.3065, Val Loss: 1.5025, Val Acc: 0.5960
Epoch 4, Train Loss: 1.0738, Val Loss: 1.4020, Val Acc: 0.6200
Epoch 5, Train Loss: 0.8400, Val Loss: 1.3153, Val Acc: 0.6400
Epoch 6, Train Loss: 0.6616, Val Loss: 1.2406, Val Acc: 0.6500
Epoch 7, Train Loss: 0.5233, Val Loss: 1.1768, Val Acc: 0.6600
Epoch 8, Train Loss: 0.4251, Val Loss: 1.1235, Val Acc: 0.6700
Epoch 9, Train Loss: 0.3106, Val Loss: 1.0830, Val Acc: 0.6860
Epoch 10, Train Loss: 0.2442, Val Loss: 1.0533, Val Acc: 0.6780
Epoch 11, Train Loss: 0.1713, Val Loss: 1.0373, Val Acc: 0.6880
Epoch 12, Train Loss: 0.1445, Val Loss: 1.0304, Val Acc: 0.6820
Epoch 13, Train Loss: 0.1145, Val Loss: 1.0301, Val Acc: 0.6720
Epoch 14, Train Loss: 0.1042, Val Loss: 1.0354, Val Acc: 0.6620
Epoch 15, Train Loss: 0.0817, Val Loss: 1.0457, Val Acc: 0.6580
Epoch 16, Train Loss: 0.0

In [220]:
# define GCN with multiple layers
class GCN(torch.nn.Module):
    """Graph convolutional network with a configurable number of layers.

    Args:
        input_dim: Size of each input node feature vector.
        hidden_dim: Width of every hidden representation.
        output_dim: Number of output classes.
        num_layers: Total number of convolution layers.
            NOTE(review): values below 2 still build two layers
            (input -> hidden -> output), because the input and output
            layers are always created.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        # Layer widths: input, hidden x (num_layers - 1), output. Multiplying
        # the list by a non-positive count yields no extra hidden layers.
        widths = [input_dim, hidden_dim] + [hidden_dim] * (num_layers - 2) + [output_dim]
        self.convs = torch.nn.ModuleList(
            GCNConv(in_width, out_width)
            for in_width, out_width in zip(widths[:-1], widths[1:])
        )

    def forward(self, x, edge_index):
        """Return per-node log-probabilities over the classes."""
        *hidden_convs, final_conv = self.convs
        for hidden_conv in hidden_convs:
            x = F.relu(hidden_conv(x, edge_index))
            # Dropout is only active while the module is in training mode.
            x = F.dropout(x, training=self.training)
        logits = final_conv(x, edge_index)
        return F.log_softmax(logits, dim=1)

# train and evaluate GCN with multiple layers
def train_and_evaluate_gcn(data, num_layers, epochs=100, lr=0.01):
    """Build a ``num_layers``-deep GCN, train it, and test its best weights.

    The model sizes come from the module-level ``input_dim``, ``hidden_dim``
    and ``output_dim``, which must be set before calling. The weights from the
    epoch with the highest validation accuracy (first one wins on ties) are
    restored before the test-set evaluation.

    Args:
        data: Dataset object forwarded to ``train`` / ``validate`` / ``test``.
        num_layers: Depth passed to ``GCN``.
        epochs: Number of training epochs.
        lr: Learning rate for the Adam optimizer.

    Returns:
        Test accuracy of the restored best-validation weights.
    """
    # Function-scope import so the cell does not depend on the import cell.
    import copy

    model = GCN(input_dim, hidden_dim, output_dim, num_layers)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss()
    # Start below any reachable accuracy so the first epoch is always recorded.
    best_val_acc = float('-inf')
    best_model = None
    best_epoch_info = None

    for epoch in range(epochs):
        train_loss = train(model, data, optimizer, criterion)
        val_loss, val_acc = validate(model, data, criterion)
        print(f'Epoch {epoch + 1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            # state_dict() returns references to the live parameter tensors,
            # which the optimizer keeps updating in place. Deep-copy so the
            # snapshot really holds this epoch's weights.
            best_model = copy.deepcopy(model.state_dict())
            # record the best epoch information for model hypertuning
            best_epoch_info = {
                'epoch': epoch + 1,
                'train_loss': train_loss,
                'val_loss': val_loss,
                'val_acc': val_acc
            }

    # load the best model and evaluate on the test set. With epochs <= 0 there
    # is no snapshot, so the model is tested as-is.
    if best_model is not None:
        model.load_state_dict(best_model)
    test_acc = test(model, data)
    # output the best epoch information for model hypertuning
    if best_epoch_info is not None:
        print(f"Best Epoch: {best_epoch_info['epoch']}, "
              f"Train Loss: {best_epoch_info['train_loss']:.4f}, "
              f"Val Loss: {best_epoch_info['val_loss']:.4f}, "
              f"Val Acc: {best_epoch_info['val_acc']:.4f}")
    print(f'{num_layers}-layer GCN Test Accuracy: {test_acc:.4f}')
    return test_acc

# Train GCNs with 2, 4 and 6 layers.
# train_and_evaluate_gcn reads the module-level hidden_dim, so it is set
# before every run.
print("Training 2-layer GCN...")
# after hyperparameter tuning, the best hidden_dim and learning rate are 64 and 0.005 respectively
hidden_dim = 64
gcn_2layer_test_acc = train_and_evaluate_gcn(data, num_layers=2, lr=0.005)

print("Training 4-layer GCN...")
# after hyperparameter tuning, the best hidden_dim and learning rate are 64 and 0.005 respectively
hidden_dim = 64
gcn_4layer_test_acc = train_and_evaluate_gcn(data, num_layers=4, lr=0.005)

print("Training 6-layer GCN...")
# after hyperparameter tuning, the best hidden_dim and learning rate are 64 and 0.01 respectively
hidden_dim = 64
gcn_6layer_test_acc = train_and_evaluate_gcn(data, num_layers=6, lr=0.01)

# Compare the results
print(f"2-layer GCN Test Accuracy: {gcn_2layer_test_acc:.4f}")
print(f"4-layer GCN Test Accuracy: {gcn_4layer_test_acc:.4f}")
print(f"6-layer GCN Test Accuracy: {gcn_6layer_test_acc:.4f}")

# Interpret the winner. Ties for the top accuracy are reported explicitly
# instead of silently printing nothing.
depth_test_acc = {
    2: gcn_2layer_test_acc,
    4: gcn_4layer_test_acc,
    6: gcn_6layer_test_acc,
}
top_test_acc = max(depth_test_acc.values())
best_depths = [depth for depth, depth_acc in depth_test_acc.items() if depth_acc == top_test_acc]

if len(best_depths) > 1:
    tied_depths = ", ".join(str(depth) for depth in best_depths)
    print(f"No single depth performs best: the {tied_depths}-layer GCNs tie at test accuracy {top_test_acc:.4f}.")
elif best_depths[0] == 2:
    print("2 layer GCN performs the best. This is due to the fact that as layers increase, the model will become over-smoothing, where node representations are indistinguishable. Additionally, a 2-layer GCN effectively captures local neighborhood information without excessive complexity, making it more robust and efficient for the given task.")
elif best_depths[0] == 4:
    print("4 layer GCN performs the best. This suggests that the dataset requires capturing higher-order neighborhood information (i.e., nodes that are further away in the graph). A 4-layer GCN strikes a good balance between capturing deeper relationships and avoiding over-smoothing or excessive computational cost.")
else:
    print("6 layer GCN performs the best. This indicates that the dataset has complex structural dependencies that require capturing information from distant nodes in the graph. However, this may also suggest that the model is less prone to over-smoothing for this specific dataset, possibly due to regularization techniques or the inherent properties of the graph.")

Training 2-layer GCN...
Epoch 1, Train Loss: 1.7934, Val Loss: 1.7161, Val Acc: 0.5160
Epoch 2, Train Loss: 1.5660, Val Loss: 1.6204, Val Acc: 0.6080
Epoch 3, Train Loss: 1.3657, Val Loss: 1.5102, Val Acc: 0.6420
Epoch 4, Train Loss: 1.0990, Val Loss: 1.3999, Val Acc: 0.6560
Epoch 5, Train Loss: 0.8452, Val Loss: 1.2960, Val Acc: 0.7000
Epoch 6, Train Loss: 0.6777, Val Loss: 1.2022, Val Acc: 0.7080
Epoch 7, Train Loss: 0.5318, Val Loss: 1.1252, Val Acc: 0.7080
Epoch 8, Train Loss: 0.3818, Val Loss: 1.0677, Val Acc: 0.7100
Epoch 9, Train Loss: 0.2747, Val Loss: 1.0275, Val Acc: 0.7100
Epoch 10, Train Loss: 0.2191, Val Loss: 1.0007, Val Acc: 0.7040
Epoch 11, Train Loss: 0.1792, Val Loss: 0.9845, Val Acc: 0.6900
Epoch 12, Train Loss: 0.1405, Val Loss: 0.9801, Val Acc: 0.6760
Epoch 13, Train Loss: 0.1143, Val Loss: 0.9815, Val Acc: 0.6740
Epoch 14, Train Loss: 0.0821, Val Loss: 0.9907, Val Acc: 0.6660
Epoch 15, Train Loss: 0.0741, Val Loss: 1.0056, Val Acc: 0.6640
Epoch 16, Train Loss: 0.0

## 4.2.4 Questions for Graph Classification

In [221]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool, global_max_pool, global_add_pool
from torch_geometric.data import DataLoader

In [222]:
# load
# SECURITY: torch.load unpickles arbitrary Python objects, so only point it at
# a trusted file. The file holds a whole dataset object rather than plain
# tensors, so weights_only=False is passed explicitly instead of relying on
# the library default.
graph_data = torch.load('graph_data.pt', weights_only=False)

# split (by position, no shuffling)
train_data = graph_data[:100]  # first 100 as training set
val_data = graph_data[100:150]  # 101 to 150 as validation set
test_data = graph_data[150:]  # rest as test set

# GCN + READOUT + MLP
class GCNWithReadout(torch.nn.Module):
    """Graph classifier: two GCN layers, a graph-level readout, then an MLP.

    Args:
        input_dim: Size of each input node feature vector.
        hidden_dim: Width of the GCN layers and of the MLP hidden layer.
        output_dim: Number of graph classes.
        readout_type: Pooling used to collapse node embeddings into one
            vector per graph: ``'sum'``, ``'mean'`` or ``'max'``.

    Raises:
        ValueError: If ``readout_type`` is not one of the supported names.
    """

    # Supported readout names mapped to their pooling functions.
    _READOUTS = {
        'sum': global_add_pool,
        'mean': global_mean_pool,
        'max': global_max_pool,
    }

    def __init__(self, input_dim, hidden_dim, output_dim, readout_type='sum'):
        super(GCNWithReadout, self).__init__()
        # Fail fast: an unknown name would otherwise skip pooling entirely
        # and feed node-level rows to the graph-level MLP.
        if readout_type not in self._READOUTS:
            raise ValueError(
                f"Unknown readout_type {readout_type!r}; "
                f"expected one of {sorted(self._READOUTS)}"
            )
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.readout_type = readout_type
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(hidden_dim, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x, edge_index, batch):
        """Return per-graph log-probabilities over the classes.

        Args:
            x: Node feature matrix for all graphs in the mini-batch.
            edge_index: Connectivity passed to both convolutions.
            batch: Assignment vector handed to the pooling function to group
                nodes by graph.
        """
        # GCN
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)

        # READOUT (looked up at call time so a later change to
        # self.readout_type is still honoured)
        pool = self._READOUTS.get(self.readout_type)
        if pool is None:
            raise ValueError(f"Unknown readout_type {self.readout_type!r}")
        x = pool(x, batch)

        # MLP
        x = self.mlp(x)
        return F.log_softmax(x, dim=1)

# train function
def train(model, loader, optimizer, criterion):
    """Train ``model`` for one pass over ``loader``.

    Args:
        model: Network called as ``model(x, edge_index, batch)``.
        loader: Sized iterable of mini-batches exposing ``x``, ``edge_index``,
            ``batch`` and ``y``.
        optimizer: Optimizer bound to ``model``'s parameters.
        criterion: Loss callable taking ``(predictions, targets)``.

    Returns:
        Mean of the per-batch losses (sum of batch losses / number of batches).
    """
    model.train()
    running_loss = 0
    for batch in loader:
        optimizer.zero_grad()
        predictions = model(batch.x, batch.edge_index, batch.batch)
        batch_loss = criterion(predictions, batch.y)
        batch_loss.backward()
        optimizer.step()
        running_loss = running_loss + batch_loss.item()
    num_batches = len(loader)
    return running_loss / num_batches

# validate function
def validate(model, loader, criterion):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    Args:
        model: Network called as ``model(x, edge_index, batch)``.
        loader: Sized iterable of mini-batches exposing ``x``, ``edge_index``,
            ``batch`` and ``y``, with a sized ``dataset`` attribute.
        criterion: Loss callable taking ``(predictions, targets)``.

    Returns:
        ``(mean_loss, accuracy)``: the mean of the per-batch losses, and the
        number of correct predictions divided by ``len(loader.dataset)``.
    """
    model.eval()
    running_loss = 0
    num_correct = 0
    with torch.no_grad():
        for batch in loader:
            predictions = model(batch.x, batch.edge_index, batch.batch)
            running_loss = running_loss + criterion(predictions, batch.y).item()
            hits = predictions.argmax(dim=1) == batch.y
            num_correct = num_correct + int(hits.sum())
    mean_loss = running_loss / len(loader)
    accuracy = num_correct / len(loader.dataset)
    return mean_loss, accuracy

# test function
def test(model, loader):
    model.eval()
    correct = 0
    # print("Testing...")
    with torch.no_grad():
        for data in loader:
            out = model(data.x, data.edge_index, data.batch)
            pred = out.argmax(dim=1)
            correct += int((pred == data.y).sum())
    return correct / len(loader.dataset)

# train and evaluate function
def train_and_evaluate(model, train_data, val_data, test_data, epochs=100, lr=0.01):
    """Train a graph classifier, restore its best-validation weights, test them.

    The three splits are wrapped in loaders with batch size 32; only the
    training loader shuffles. The weights from the epoch with the highest
    validation accuracy (first one wins on ties) are restored before the
    test-set evaluation.

    Args:
        model: Network to train in place.
        train_data: Graphs used for optimisation.
        val_data: Graphs used for model selection.
        test_data: Graphs used for the final accuracy.
        epochs: Number of training epochs.
        lr: Learning rate for the Adam optimizer.

    Returns:
        Test accuracy of the restored best-validation weights.
    """
    # Function-scope import so the cell does not depend on the import cell.
    import copy

    train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss()
    # Start below any reachable accuracy so the first epoch is always recorded.
    best_val_acc = float('-inf')
    best_model = None

    for epoch in range(epochs):
        train_loss = train(model, train_loader, optimizer, criterion)
        val_loss, val_acc = validate(model, val_loader, criterion)
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            # state_dict() returns references to the live parameter tensors,
            # which the optimizer keeps updating in place. Deep-copy so the
            # snapshot really holds this epoch's weights.
            best_model = copy.deepcopy(model.state_dict())
        print(f'Epoch {epoch + 1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

    # load the best model and evaluate on the test set. With epochs <= 0 there
    # is no snapshot, so the model is tested as-is.
    if best_model is not None:
        model.load_state_dict(best_model)
    test_acc = test(model, test_loader)
    print(f'Test Accuracy: {test_acc:.4f}')
    return test_acc

# Model sizes for the graph-classification experiment.
# NOTE: this rebinds the module-level input_dim / hidden_dim / output_dim
# that the node-classification cells above also use.
input_dim = train_data[0].num_features  # node feature dim
hidden_dim = 32  # hidden dim
all_graph_labels = torch.cat([graph.y for graph in graph_data])
output_dim = torch.unique(all_graph_labels).size(0)  # num of classes

# Train one model per readout and collect its test accuracy.
readout_types = ['sum', 'mean', 'max']
results = {}

for readout_type in readout_types:
    print(f"Training GCN with {readout_type} readout...")
    model = GCNWithReadout(input_dim, hidden_dim, output_dim, readout_type=readout_type)
    test_acc = train_and_evaluate(model, train_data, val_data, test_data)
    results[readout_type] = test_acc

for readout_type, acc in results.items():
    print(f"{readout_type} readout Test Accuracy: {acc:.4f}")

# max() keeps the first key on ties, i.e. the earliest readout in readout_types.
best_readout = max(results, key=results.get)
print(f"The best readout function is {best_readout} with accuracy {results[best_readout]:.4f}.")

# One explanation per readout; only the winner's is printed.
readout_explanations = {
    'sum': "Sum keeps the information of all nodes, it avoids information loss. Compared to Average and Max, Sum is better at capturing global features.",
    'mean': "Mean smooths out the differences between node features and reduces the impact of noise",
    'max': "Max focuses on capturing the most salient features of the graph (i.e. the node information that stands out the most). This method is very effective when the key information of the graph is concentrated in a few nodes, and interference from irrelevant or secondary nodes can be ignored",
}
if best_readout in readout_explanations:
    print(readout_explanations[best_readout])

Training GCN with sum readout...
Epoch 1, Train Loss: 0.6292, Val Loss: 0.2783, Val Acc: 0.9000
Epoch 2, Train Loss: 0.3666, Val Loss: 0.3068, Val Acc: 0.9000
Epoch 3, Train Loss: 0.4461, Val Loss: 0.2769, Val Acc: 0.9000
Epoch 4, Train Loss: 0.3404, Val Loss: 0.2781, Val Acc: 0.9000
Epoch 5, Train Loss: 0.5640, Val Loss: 0.2905, Val Acc: 0.9000
Epoch 6, Train Loss: 0.4210, Val Loss: 0.3498, Val Acc: 0.9000
Epoch 7, Train Loss: 0.3899, Val Loss: 0.3125, Val Acc: 0.9000
Epoch 8, Train Loss: 0.4313, Val Loss: 0.2673, Val Acc: 0.9000
Epoch 9, Train Loss: 0.3304, Val Loss: 0.2646, Val Acc: 0.9000
Epoch 10, Train Loss: 0.3252, Val Loss: 0.2647, Val Acc: 0.9000
Epoch 11, Train Loss: 0.4074, Val Loss: 0.2640, Val Acc: 0.9000
Epoch 12, Train Loss: 0.3293, Val Loss: 0.2602, Val Acc: 0.9000
Epoch 13, Train Loss: 0.2989, Val Loss: 0.2519, Val Acc: 0.9000
Epoch 14, Train Loss: 0.3068, Val Loss: 0.2471, Val Acc: 0.9000
Epoch 15, Train Loss: 0.4085, Val Loss: 0.2536, Val Acc: 0.9000


  graph_data = torch.load('graph_data.pt')


Epoch 16, Train Loss: 0.3243, Val Loss: 0.2858, Val Acc: 0.9000
Epoch 17, Train Loss: 0.3213, Val Loss: 0.2399, Val Acc: 0.9000
Epoch 18, Train Loss: 0.3129, Val Loss: 0.2463, Val Acc: 0.9000
Epoch 19, Train Loss: 0.5838, Val Loss: 0.2507, Val Acc: 0.9000
Epoch 20, Train Loss: 0.4015, Val Loss: 0.4050, Val Acc: 0.9000
Epoch 21, Train Loss: 0.4745, Val Loss: 0.3956, Val Acc: 0.9000
Epoch 22, Train Loss: 0.3535, Val Loss: 0.2938, Val Acc: 0.9000
Epoch 23, Train Loss: 0.4186, Val Loss: 0.2491, Val Acc: 0.9000
Epoch 24, Train Loss: 0.5962, Val Loss: 0.2493, Val Acc: 0.9000
Epoch 25, Train Loss: 0.3231, Val Loss: 0.3271, Val Acc: 0.9000
Epoch 26, Train Loss: 0.3933, Val Loss: 0.3280, Val Acc: 0.9000
Epoch 27, Train Loss: 0.3411, Val Loss: 0.2760, Val Acc: 0.9000
Epoch 28, Train Loss: 0.3637, Val Loss: 0.2474, Val Acc: 0.9000
Epoch 29, Train Loss: 0.5053, Val Loss: 0.2474, Val Acc: 0.9000
Epoch 30, Train Loss: 0.4750, Val Loss: 0.2764, Val Acc: 0.9000
Epoch 31, Train Loss: 0.3431, Val Loss: 