In [5]:
# suppress warnings
import warnings

# Suppress specific UserWarning related to InMemoryDataset
warnings.simplefilter("ignore", category=UserWarning)

In [6]:
import torch
import os
import torch_geometric
import torch_geometric.datasets as tgdatasets

**Example**: We will use `torch_geometric.datasets` to load the `Cora` dataset for node classification, which consists of 2,708 nodes, 10,556 edges, 1,433 features, and 7 classes.

`dataset.data.x` : input node features

`dataset.data.edge_index` : the sparse adjacency matrix stored as an edge list (COO format) of shape *2 x n_edge*.

`dataset.data.y` : target labels

`dataset.data.train_mask` : boolean mask indicating whether an index belongs to training data

`dataset.data.val_mask` : boolean mask indicating whether an index belongs to validation data

`dataset.data.test_mask` : boolean mask indicating whether an index belongs to test data

In [7]:
DATASET_NAMES = ['Cora', 'CiteSeer', 'PubMed']
DATA_DIR = "./data/"

# Eagerly build every Planetoid benchmark (public split), keyed by its name.
# A fresh GCNNorm transform instance is attached to each dataset.
# NOTE(review): PyG applies `transform` when items are fetched by index
# (e.g. `dataset[0]`); code that reads `dataset.data` directly presumably sees
# the un-normalized graph -- confirm that this is intended.
DATASETS = dict(
    (
        dataset_name,
        tgdatasets.Planetoid(
            root=DATA_DIR,
            name=dataset_name,
            split="public",
            transform=torch_geometric.transforms.GCNNorm(),
        ),
    )
    for dataset_name in DATASET_NAMES
)

In [10]:
def mask_size(mask):
    """Return how many entries of ``mask`` are non-zero, as a plain Python number."""
    nonzero_total = torch.count_nonzero(mask)
    return nonzero_total.item()

In [20]:
def print_datasets_info():
    """Print size statistics and split sizes for every entry in ``DATASETS``.

    For each dataset this writes the node count, edge count, feature
    dimension, class count and the number of selected entries in the
    train/validation/test masks, followed by one blank line.
    """
    for name, dataset in DATASETS.items():
        graph = dataset.data
        report = [
            f"Info for {name}:",
            f"num_nodes: {graph.x.shape[0]}",
            f"num_edges: {graph.edge_index.shape[1]}",
            f"num_node_features = {dataset.num_node_features}",
            f"num_classes = {dataset.num_classes}",
            f"Training set size: {mask_size(graph.train_mask)}",
            f"Validation set size: {mask_size(graph.val_mask)}",
            f"Test set size: {mask_size(graph.test_mask)}",
        ]
        # The extra newline reproduces the blank separator line between datasets.
        print("\n".join(report), end="\n\n")
        
# print_datasets_info()

Info for Cora:
num_nodes: 2708
num_edges: 10556
num_node_features = 1433
num_classes = 7
Training set size: 140
Validation set size: 500
Test set size: 1000

Info for CiteSeer:
num_nodes: 3327
num_edges: 9104
num_node_features = 3703
num_classes = 6
Training set size: 120
Validation set size: 500
Test set size: 1000

Info for PubMed:
num_nodes: 19717
num_edges: 88648
num_node_features = 500
num_classes = 3
Training set size: 60
Validation set size: 500
Test set size: 1000



In [25]:
def load_data(dataset_name, data_only=False):
    """
    Look up a preloaded dataset and move its graph to the available device.

    Args:
        dataset_name: Key into ``DATASETS``.
        data_only: If True, return only the graph object; otherwise return
            the graph together with its dataset.

    Returns:
        ``data`` when ``data_only`` is True, else the tuple ``(data, dataset)``
        (graph first, dataset second). ``data`` is the result of
        ``dataset.data.to(device)``, where ``device`` is ``"cuda"`` if
        ``torch.cuda.is_available()`` and ``"cpu"`` otherwise.

    Raises:
        KeyError: If ``dataset_name`` is not a key of ``DATASETS``; the
            message lists the available names.
    """
    try:
        dataset = DATASETS[dataset_name]
    except KeyError:
        raise KeyError(
            f"Unknown dataset {dataset_name!r}; available: {sorted(DATASETS)}"
        ) from None

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # NOTE(review): reading `dataset.data` presumably bypasses the `transform`
    # the dataset was constructed with (PyG applies transforms on indexed
    # access such as `dataset[0]`) -- confirm whether callers expect the
    # transformed graph before changing this.
    data = dataset.data.to(device)
    return data if data_only else (data, dataset)