# Imports

In [None]:
!pip install torch_geometric
!pip install torch torchvision
!pip install networkx matplotlib
!pip install torch-geometric

Collecting torch_geometric
  Downloading torch_geometric-2.5.2-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch_geometric
Successfully installed torch_geometric-2.5.2
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m59.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m43.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     

In [None]:
import numpy as np
import pandas as pd
import networkx as nx

In [None]:
import torch
import os
import typing
import torch_geometric

import torch.nn as nn
import torch.nn.functional as F
import torch_geometric.datasets as datasets
from torch_geometric.data import DataLoader


from torch_geometric.nn import GCNConv, ChebConv
from torch_geometric.nn import global_mean_pool

# Utilities

In [None]:
def data_load_sep(dataset, train_ratio=0.8, batch_size=32):
    """Shuffle a multi-graph dataset and split it into train/test loaders.

    Parameters:
    - dataset: Dataset object that supports ``shuffle()``, ``len()`` and
      slicing, and whose slices expose ``num_features`` / ``num_classes``
      (presumably a PyTorch Geometric dataset -- confirm against callers).
    - train_ratio: Fraction of the shuffled graphs used for training
      (default 0.8); the remainder becomes the test set.
    - batch_size: Number of graphs per batch for both loaders (default 32).

    Returns:
    - (num_features, num_classes, train_loader, test_loader)

    Raises:
    - ValueError: if ``train_ratio`` is outside [0, 1].
    """
    # `torch_geometric.data.DataLoader` is deprecated; the loader now lives in
    # `torch_geometric.loader`. Imported locally so this function does not
    # depend on which DataLoader a previous cell bound globally.
    from torch_geometric.loader import DataLoader

    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be in [0, 1], got {train_ratio}")

    shuffled = dataset.shuffle()
    train_size = int(len(shuffled) * train_ratio)

    train_dataset, test_dataset = shuffled[:train_size], shuffled[train_size:]

    # Only the training loader reshuffles between epochs.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    num_features = train_dataset.num_features
    num_classes = train_dataset.num_classes

    return num_features, num_classes, train_loader, test_loader



In [None]:
def data_stats_node(dataset):
    """Print basic statistics of the first graph in ``dataset``.

    Node, edge and feature counts are read from ``dataset[0]``; the class
    count is read from the dataset object itself. Nothing is returned.
    """
    first_graph = dataset[0]

    # Collect every value first, then print them in a fixed order.
    report = (
        ("Number of nodes:", first_graph.num_nodes),
        ("Number of edges:", first_graph.num_edges),
        ("Number of node features:", first_graph.num_features),
        ("Number of classes:", dataset.num_classes),
    )
    for caption, value in report:
        print(caption, value)

def data_stats_graph(dataset):
    """Print basic statistics of a dataset made of several graphs.

    The graph count is ``len(dataset)``, the feature count is read from
    ``dataset[0]`` and the class count from the dataset object. Nothing is
    returned.
    """
    sample_graph = dataset[0]

    # Collect every value first, then print them in a fixed order.
    report = (
        ("Number of graphs:", len(dataset)),
        ("Number of node features:", sample_graph.num_features),
        ("Number of classes:", dataset.num_classes),
    )
    for caption, value in report:
        print(caption, value)

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
from torch_geometric.utils import to_networkx

def _plot_graph_row(dataset, num_examples, figsize, node_size):
    """Draw the first graphs of ``dataset`` side by side in one figure row."""
    num_plots = min(num_examples, len(dataset))
    plt.figure(figsize=figsize)

    for i in range(num_plots):
        nx_graph = to_networkx(dataset[i], to_undirected=True)

        # Size the grid by the number of graphs actually drawn so a short
        # dataset does not leave empty subplot columns.
        plt.subplot(1, num_plots, i + 1)
        nx.draw(nx_graph, with_labels=True, node_color='skyblue', edge_color='k', node_size=node_size, font_size=10)
        plt.title(f'Graph {i+1}')

    plt.show()


def plot_graphs_from_dataset(dataset, num_examples=3):
    """Plot up to ``num_examples`` graphs from ``dataset`` in a 15x5 figure.

    Each graph is converted with ``to_networkx(..., to_undirected=True)`` and
    drawn with node labels. Intended for small graphs (node size 700).
    """
    _plot_graph_row(dataset, num_examples, figsize=(15, 5), node_size=700)


def plot_graphs_from_dataset_large(dataset, num_examples=3):
    """Plot up to ``num_examples`` graphs from ``dataset`` in a 30x10 figure.

    Same drawing as ``plot_graphs_from_dataset`` but with a larger canvas and
    smaller nodes (node size 200).
    """
    _plot_graph_row(dataset, num_examples, figsize=(30, 10), node_size=200)


# Dataset

In [None]:
#@title Load Dataset
from torch_geometric.datasets import MNISTSuperpixels, CitationFull, WebKB, Actor, WikipediaNetwork
import numpy as np

In [None]:
# Build the five dataset objects used in the experiments below. Each one keeps
# its files under its own relative ``dataset/<name>/`` directory.
# WebKB graphs
dataset_cornell = WebKB(root='dataset/cornell/', name='Cornell')
dataset_texas = WebKB(root='dataset/texas/', name='Texas')
dataset_Wisconsin = WebKB(root='dataset/Wisconsin/', name='Wisconsin')
# Actor graph
dataset_actor = Actor(root='dataset/actor/')
# WikipediaNetwork graph
dataset_chameleon = WikipediaNetwork(root='dataset/chameleon/', name='chameleon')

Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/master/new_data/texas/out1_node_feature_label.txt
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/master/new_data/texas/out1_graph_edges.txt
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/master/splits/texas_split_0.6_0.2_0.npz
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/master/splits/texas_split_0.6_0.2_1.npz
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/master/splits/texas_split_0.6_0.2_2.npz
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/master/splits/texas_split_0.6_0.2_3.npz
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/master/splits/texas_split_0.6_0.2_4.npz
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/master/splits/texas_split_0.6_0.2_5.npz
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/master/spl

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCNModel_semi(torch.nn.Module):
    """Two-layer GCN for node classification.

    Parameters:
    - num_features: Input feature size given to the first ``GCNConv``.
    - num_classes: Output size of the second ``GCNConv``.
    - hidden_channels: Width of the hidden layer (default 16).
    - dropout: Dropout probability applied after the first layer while the
      module is in training mode (default 0.5, the ``F.dropout`` default).
    """

    def __init__(self, num_features, num_classes, hidden_channels=16, dropout=0.5):
        super().__init__()
        self.dropout = dropout
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, data):
        """Return log-probabilities (log-softmax over dim 1) for ``data``.

        ``data`` must expose ``x`` and ``edge_index``.
        """
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        # Dropout is only active when self.training is True.
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)


In [None]:
from torch_geometric.nn import ChebConv

class ChebNetModel_semi(torch.nn.Module):
    """Two-layer ChebConv network for node classification.

    Parameters:
    - num_features: Input feature size given to the first ``ChebConv``.
    - num_classes: Output size of the second ``ChebConv``.
    - K: Value passed as the third argument of both ``ChebConv`` layers
      (default 3).
    - hidden_channels: Width of the hidden layer (default 16).
    - dropout: Dropout probability applied after the first layer while the
      module is in training mode (default 0.5, the ``F.dropout`` default).
    """

    def __init__(self, num_features, num_classes, K=3, hidden_channels=16, dropout=0.5):
        super().__init__()
        self.dropout = dropout
        self.cheb1 = ChebConv(num_features, hidden_channels, K)
        self.cheb2 = ChebConv(hidden_channels, num_classes, K)

    def forward(self, data):
        """Return log-probabilities (log-softmax over dim 1) for ``data``.

        ``data`` must expose ``x`` and ``edge_index``.
        """
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.cheb1(x, edge_index))
        # Dropout is only active when self.training is True.
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.cheb2(x, edge_index)
        return F.log_softmax(x, dim=1)


In [None]:
def train(model, optimizer, data, criterion, device='cuda'):
    """Run one optimization step on the training nodes and return the loss.

    Parameters:
    - model: Module called as ``model(data)``.
    - optimizer: Optimizer over the model's parameters.
    - data: Object exposing ``y`` and a ``train_mask`` used to index both the
      model output and the labels.
    - criterion: Loss called as ``criterion(output, target)``.
    - device: Accepted but not used by this function.

    Returns:
    - The loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    predictions = model(data)
    selected = data.train_mask
    step_loss = criterion(predictions[selected], data.y[selected])

    step_loss.backward()
    optimizer.step()
    return step_loss.item()

def test(model, data, device='cuda'):
    model.eval()
    logits, accs = model(data), []
    for _, mask in data('train_mask', 'val_mask', 'test_mask'):
        pred = logits[mask].max(1)[1]
        acc = pred.eq(data.y[mask]).sum().item() / mask.sum().item()
        accs.append(acc)
    return accs


In [None]:
def run_experiment_semi_supervised(model_class, optimizer_class, dataset, criterion, device="cuda", num_runs=5, num_epochs=200, lr=0.01):
    """Train and test a freshly initialised model several times on one graph.

    Parameters:
    - model_class: Callable invoked as ``model_class(num_features, num_classes)``.
    - optimizer_class: Callable invoked as ``optimizer_class(params, lr=lr)``.
    - dataset: Dataset whose first item is the graph to use; that graph must
      carry ``train_mask`` and ``test_mask``. ``dataset.num_features`` and
      ``dataset.num_classes`` size the model.
    - criterion: Loss called as ``criterion(output, target)``.
    - device: Device to run on (default "cuda"). If a CUDA device is requested
      but CUDA is unavailable, the run falls back to the CPU.
    - num_runs: Number of independent train/test repetitions (default 5).
    - num_epochs: Full-graph training steps per run (default 200).
    - lr: Learning rate handed to the optimizer (default 0.01).

    Returns:
    - The mean test accuracy over all runs.

    Raises:
    - ValueError: if ``num_runs`` is below 1 or the test mask selects no nodes.
    """
    if num_runs < 1:
        raise ValueError(f"num_runs must be at least 1, got {num_runs}")

    # Fall back to the CPU instead of crashing when CUDA is requested but absent.
    device = torch.device(device)
    if device.type == "cuda" and not torch.cuda.is_available():
        device = torch.device("cpu")

    data = dataset[0].to(device)  # Assuming a single-graph dataset

    # Masks are assumed to be part of the dataset
    train_mask = data.train_mask
    test_mask = data.test_mask

    num_test = test_mask.sum().item()
    if num_test == 0:
        raise ValueError("test_mask does not select any nodes")

    accuracies = []
    for run in range(num_runs):
        model = model_class(dataset.num_features, dataset.num_classes).to(device)
        optimizer = optimizer_class(model.parameters(), lr=lr)

        # Train the model on the whole graph, with the loss restricted to train nodes.
        model.train()
        for epoch in range(num_epochs):
            optimizer.zero_grad()
            out = model(data)
            loss = criterion(out[train_mask], data.y[train_mask])
            loss.backward()
            optimizer.step()

        # Test the model; no gradients are needed for evaluation.
        model.eval()
        with torch.no_grad():
            _, pred = model(data).max(dim=1)
        correct = float(pred[test_mask].eq(data.y[test_mask]).sum().item())
        acc = correct / num_test
        accuracies.append(acc)

        print(f'Run {run + 1}: Model Test Accuracy: {acc:.4f}')

    avg_accuracy = sum(accuracies) / num_runs
    print(f'Average Test Accuracy over {num_runs} runs: {avg_accuracy:.4f}')
    return avg_accuracy


In [None]:
import torch

def add_train_val_test_masks_to_dataset(dataset, train_percent=0.7, val_percent=0.1, seed=None):
    """
    Add train, validation, and test masks to the dataset for semi-supervised learning.

    The split is done per label: the nodes of each label are shuffled and then
    divided, so every label contributes to each split in the same proportions.
    The masks are written to ``dataset.data`` in place, replacing any existing
    ``train_mask`` / ``val_mask`` / ``test_mask`` attributes.

    Parameters:
    - dataset: The dataset object, assumed to contain a single graph.
    - train_percent: The fraction (0-1) of each label's nodes used for training.
    - val_percent: The fraction (0-1) of each label's nodes used for validation.
      Whatever remains after training and validation becomes the test split.
    - seed: Optional integer seed for the shuffles. When None (the default) the
      global torch random state is used, so the split differs between calls.

    Returns:
    - The same ``dataset`` object that was passed in.

    Raises:
    - ValueError: if a fraction is negative or the two fractions sum to more than 1.
    """
    if train_percent < 0 or val_percent < 0 or train_percent + val_percent > 1 + 1e-9:
        raise ValueError(
            "train_percent and val_percent must be non-negative and sum to at most 1, "
            f"got {train_percent} and {val_percent}"
        )

    data = dataset.data
    num_nodes = data.num_nodes
    y = data.y

    # A dedicated generator keeps a seeded split independent of the global RNG.
    generator = None
    if seed is not None:
        generator = torch.Generator().manual_seed(seed)

    # Initialize masks
    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    val_mask = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(num_nodes, dtype=torch.bool)

    for label in torch.unique(y):
        label_indices = torch.where(y == label)[0]
        # Shuffle indices of the current label
        label_indices = label_indices[torch.randperm(len(label_indices), generator=generator)]

        # Calculate split sizes for the current label
        num_label_train = int(train_percent * len(label_indices))
        num_label_val = int(val_percent * len(label_indices))

        # Assign splits for the current label
        train_mask[label_indices[:num_label_train]] = True
        val_mask[label_indices[num_label_train:num_label_train + num_label_val]] = True
        test_mask[label_indices[num_label_train + num_label_val:]] = True

    # Add masks to the dataset
    data.train_mask = train_mask
    data.val_mask = val_mask
    data.test_mask = test_mask

    return dataset


In [None]:
from torch_geometric.data import DataLoader

def data_load_sep_for_semi_supervised(dataset):
    """
    Prepare data loaders and masks for a semi-supervised node classification task.

    This function assumes the dataset has a single graph with added 'train_mask',
    'val_mask', and 'test_mask' attributes for semi-supervised learning.

    Parameters:
    - dataset: The dataset containing the graph.

    Returns:
    - num_features: The number of features per node.
    - num_classes: The number of classes for classification.
    - loader: DataLoader for the entire graph.
    - train_mask: Boolean mask for training nodes.
    - val_mask: Boolean mask for validation nodes.
    - test_mask: Boolean mask for testing nodes.

    Raises:
    - AttributeError: if ``dataset.data`` lacks one of the three masks.
    """
    # `torch_geometric.data.DataLoader` is deprecated; the loader now lives in
    # `torch_geometric.loader`. Imported locally so this function does not
    # depend on which DataLoader a previous cell bound globally.
    from torch_geometric.loader import DataLoader

    # Access .data for single-graph datasets
    data = dataset.data

    # Fail early with a clear message if any mask is missing.
    for mask_name in ('train_mask', 'val_mask', 'test_mask'):
        if not hasattr(data, mask_name):
            raise AttributeError(f"Dataset does not have '{mask_name}'. Please add it before calling this function.")

    # DataLoader setup for the whole graph: one graph per batch, fixed order.
    loader = DataLoader(dataset, batch_size=1, shuffle=False)

    return (
        dataset.num_features,
        dataset.num_classes,
        loader,
        data.train_mask,
        data.val_mask,
        data.test_mask,
    )


In [None]:
# Shared settings for the experiment cells below.
# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Loss handed to every experiment run.
criterion = torch.nn.CrossEntropyLoss()


# Experiments

## Cornell

In [None]:
# Attach per-label train/val/test masks to the Cornell graph. The helper
# writes the masks onto `dataset_cornell` in place and returns that same
# object, so `dataset_cornell1` is an alias, not a copy.
dataset_cornell1 = add_train_val_test_masks_to_dataset(dataset_cornell)




In [None]:
# Unpack the feature/class counts, the whole-graph loader and the three
# split masks for Cornell.
(
    num_features_cornell,
    num_classes_cornell,
    loader_cornell,
    train_mask_cornell,
    val_mask_cornell,
    test_mask_cornell,
) = data_load_sep_for_semi_supervised(dataset_cornell1)



In [None]:
# GCN on Cornell. The detected `device` is passed explicitly so the run does
# not rely on the function's "cuda" default when no GPU is present.
avg_accuracy = run_experiment_semi_supervised(
    GCNModel_semi, torch.optim.Adam, dataset_cornell1, criterion, device=device
)


Run 1: Model Test Accuracy: 0.4146
Run 2: Model Test Accuracy: 0.4390
Run 3: Model Test Accuracy: 0.3659
Run 4: Model Test Accuracy: 0.3659
Run 5: Model Test Accuracy: 0.4634
Average Test Accuracy over 5 runs: 0.4098


In [None]:
# ChebNet on Cornell. The detected `device` is passed explicitly so the run
# does not rely on the function's "cuda" default when no GPU is present.
avg_accuracy = run_experiment_semi_supervised(
    ChebNetModel_semi, torch.optim.Adam, dataset_cornell1, criterion, device=device
)


Run 1: Model Test Accuracy: 0.6098
Run 2: Model Test Accuracy: 0.6098
Run 3: Model Test Accuracy: 0.5854
Run 4: Model Test Accuracy: 0.6098
Run 5: Model Test Accuracy: 0.6098
Average Test Accuracy over 5 runs: 0.6049


## Texas

In [None]:
# Attach per-label train/val/test masks to the Texas graph. The helper writes
# the masks onto `dataset_texas` in place and returns that same object, so
# `dataset_texax1` is an alias, not a copy.
dataset_texax1 = add_train_val_test_masks_to_dataset(dataset_texas)




In [None]:
# Unpack the feature/class counts, the whole-graph loader and the three
# split masks for Texas.
(
    num_features_texax,
    num_classes_texax,
    loader_texax,
    train_mask_texax,
    val_mask_texax,
    test_mask_texax,
) = data_load_sep_for_semi_supervised(dataset_texax1)



In [None]:
# GCN on Texas. The detected `device` is passed explicitly so the run does
# not rely on the function's "cuda" default when no GPU is present.
avg_accuracy = run_experiment_semi_supervised(
    GCNModel_semi, torch.optim.Adam, dataset_texax1, criterion, device=device
)


Run 1: Model Test Accuracy: 0.4750
Run 2: Model Test Accuracy: 0.4750
Run 3: Model Test Accuracy: 0.4750
Run 4: Model Test Accuracy: 0.4750
Run 5: Model Test Accuracy: 0.4750
Average Test Accuracy over 5 runs: 0.4750


In [None]:
# ChebNet on Texas. The detected `device` is passed explicitly so the run
# does not rely on the function's "cuda" default when no GPU is present.
avg_accuracy = run_experiment_semi_supervised(
    ChebNetModel_semi, torch.optim.Adam, dataset_texax1, criterion, device=device
)


Run 1: Model Test Accuracy: 0.6500
Run 2: Model Test Accuracy: 0.6000
Run 3: Model Test Accuracy: 0.7000
Run 4: Model Test Accuracy: 0.6750
Run 5: Model Test Accuracy: 0.6750
Average Test Accuracy over 5 runs: 0.6600


## Wisconsin

In [None]:
# Attach per-label train/val/test masks to the Wisconsin graph. The helper
# writes the masks onto `dataset_Wisconsin` in place and returns that same
# object, so `dataset_Wisconsin1` is an alias, not a copy.
dataset_Wisconsin1 = add_train_val_test_masks_to_dataset(dataset_Wisconsin)




In [None]:
# Unpack the feature/class counts, the whole-graph loader and the three
# split masks for Wisconsin.
(
    num_features_Wisconsin,
    num_classes_Wisconsin,
    loader_Wisconsin,
    train_mask_Wisconsin,
    val_mask_Wisconsin,
    test_mask_Wisconsin,
) = data_load_sep_for_semi_supervised(dataset_Wisconsin1)



In [None]:
# GCN on Wisconsin. The detected `device` is passed explicitly so the run
# does not rely on the function's "cuda" default when no GPU is present.
avg_accuracy = run_experiment_semi_supervised(
    GCNModel_semi, torch.optim.Adam, dataset_Wisconsin1, criterion, device=device
)


Run 1: Model Test Accuracy: 0.4717
Run 2: Model Test Accuracy: 0.4528
Run 3: Model Test Accuracy: 0.4528
Run 4: Model Test Accuracy: 0.4340
Run 5: Model Test Accuracy: 0.4906
Average Test Accuracy over 5 runs: 0.4604


In [None]:
# ChebNet on Wisconsin. The detected `device` is passed explicitly so the run
# does not rely on the function's "cuda" default when no GPU is present.
avg_accuracy = run_experiment_semi_supervised(
    ChebNetModel_semi, torch.optim.Adam, dataset_Wisconsin1, criterion, device=device
)


Run 1: Model Test Accuracy: 0.7925
Run 2: Model Test Accuracy: 0.6981
Run 3: Model Test Accuracy: 0.6415
Run 4: Model Test Accuracy: 0.7358
Run 5: Model Test Accuracy: 0.6981
Average Test Accuracy over 5 runs: 0.7132


## Actor

In [None]:
# Attach per-label train/val/test masks to the Actor graph. The helper writes
# the masks onto `dataset_actor` in place and returns that same object, so
# `dataset_actor1` is an alias, not a copy.
dataset_actor1 = add_train_val_test_masks_to_dataset(dataset_actor)




In [None]:
# Unpack the feature/class counts, the whole-graph loader and the three
# split masks for Actor. This cell previously re-ran the helper on the
# Wisconsin dataset and overwrote the Wisconsin variables.
(
    num_features_actor,
    num_classes_actor,
    loader_actor,
    train_mask_actor,
    val_mask_actor,
    test_mask_actor,
) = data_load_sep_for_semi_supervised(dataset_actor1)



In [None]:
# GCN on Actor. The detected `device` is passed explicitly so the run does
# not rely on the function's "cuda" default when no GPU is present.
avg_accuracy = run_experiment_semi_supervised(
    GCNModel_semi, torch.optim.Adam, dataset_actor1, criterion, device=device
)


Run 1: Model Test Accuracy: 0.2690
Run 2: Model Test Accuracy: 0.2756
Run 3: Model Test Accuracy: 0.2631
Run 4: Model Test Accuracy: 0.2671
Run 5: Model Test Accuracy: 0.2651
Average Test Accuracy over 5 runs: 0.2680


In [None]:
# ChebNet on Actor. The detected `device` is passed explicitly so the run
# does not rely on the function's "cuda" default when no GPU is present.
avg_accuracy = run_experiment_semi_supervised(
    ChebNetModel_semi, torch.optim.Adam, dataset_actor1, criterion, device=device
)


Run 1: Model Test Accuracy: 0.3340
Run 2: Model Test Accuracy: 0.3274
Run 3: Model Test Accuracy: 0.3379
Run 4: Model Test Accuracy: 0.3320
Run 5: Model Test Accuracy: 0.3340
Average Test Accuracy over 5 runs: 0.3331


In [None]:
# Show the chameleon dataset object's repr as this cell's output.
dataset_chameleon

## Chameleon

In [None]:
# Attach per-label train/val/test masks to the chameleon graph. The helper
# writes the masks onto `dataset_chameleon` in place and returns that same
# object, so `dataset_chameleon1` is an alias, not a copy.
dataset_chameleon1 = add_train_val_test_masks_to_dataset(dataset_chameleon)




In [None]:
# GCN on chameleon. The detected `device` is passed explicitly so the run
# does not rely on the function's "cuda" default when no GPU is present.
avg_accuracy = run_experiment_semi_supervised(
    GCNModel_semi, torch.optim.Adam, dataset_chameleon1, criterion, device=device
)


Run 1: Model Test Accuracy: 0.3072
Run 2: Model Test Accuracy: 0.3333
Run 3: Model Test Accuracy: 0.3050
Run 4: Model Test Accuracy: 0.3159
Run 5: Model Test Accuracy: 0.3246
Average Test Accuracy over 5 runs: 0.3172


In [None]:
# ChebNet on chameleon. The detected `device` is passed explicitly so the run
# does not rely on the function's "cuda" default when no GPU is present.
avg_accuracy = run_experiment_semi_supervised(
    ChebNetModel_semi, torch.optim.Adam, dataset_chameleon1, criterion, device=device
)


Run 1: Model Test Accuracy: 0.3878
Run 2: Model Test Accuracy: 0.4118
Run 3: Model Test Accuracy: 0.4379
Run 4: Model Test Accuracy: 0.4052
Run 5: Model Test Accuracy: 0.3813
Average Test Accuracy over 5 runs: 0.4048
