# GraphSSL Demonstration

I recommend running the main pipeline through the command-line interface or the shell script `run_examples.sh`. This Jupyter notebook is provided for completeness.

Train and evaluate graph representation learning models on OGBN-MAG dataset.

**Estimated Runtime**: ~10 hours on RTX 3060 (100 epochs)

## Setup

In [26]:
%pip install -e ..

Note: you may need to restart the kernel to use updated packages.


c:\Users\gabri\GTFO_Onedrive\DTU_Code\GraphSSL\.venv\Scripts\python.exe: No module named pip


In [27]:
# Setup cell: imports, W&B configuration, import path, and a short device report.
# Later cells rely on the names defined here (Path, datetime, torch, plt, project_root, run_pipeline).
import sys
import os
from pathlib import Path
from datetime import datetime
import torch
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Set WANDB_MODE before wandb is imported further down, so the value is already in the environment at import time.
os.environ["WANDB_MODE"] = "disabled"
# Add the project's src/ directory to sys.path as a fallback. Installing the project as a pip package
# (the `%pip install -e ..` cell above) is the recommended route.
# project_root is the parent directory when the working directory is named 'scripts', otherwise the working directory itself.
project_root = Path.cwd().parent if Path.cwd().name == 'scripts' else Path.cwd()
sys.path.insert(0, str(project_root / 'src'))

from graphssl.main import run_pipeline
from graphssl.utils.args_utils import parse_args
import wandb
# Explicitly initialise wandb in disabled mode as well.
# NOTE(review): presumably this keeps the pipeline from logging to Weights & Biases — confirm.
wandb.init(mode="disabled")

# Report the PyTorch version, whether CUDA is available, and the name of GPU 0 if it is.
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

PyTorch: 2.8.0+cu126
CUDA: True
GPU: NVIDIA GeForce RTX 3060


## Configuration

In [None]:
# Select the experiment to run by leaving exactly one EXPERIMENT line uncommented.
EXPERIMENT = "supervised_node"
# EXPERIMENT = "ssl_node_sce"
# EXPERIMENT = "ssl_node_mse"
# EXPERIMENT = "ssl_edge"
# EXPERIMENT = "ssl_tarpfp"

# Base configuration shared by every experiment. Experiment-specific values are
# layered on top further down. Requires `project_root` and `datetime` from the Setup cell.
CONFIG = {
    "data_root": str(project_root / "data"),
    # A timestamped directory per run; re-running this cell therefore changes results_root.
    "results_root": str(project_root / "results" / f"demo_{EXPERIMENT}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"),
    "target_node": "paper",
    "target_edge_type": "paper,has_topic,field_of_study",
    "hidden_channels": 128,
    "num_layers": 2,
    "num_neighbors": [30, 30],
    "batch_size": 1024,
    "epochs": 10,
    "lr": 0.001,
    "dropout": 0.5,
    "patience": 5,
    "num_workers": 4,
    "weight_decay": 0,
    "log_interval": 10,
    "seed": 42,
    "edge_msg_pass_prop": [0, 0, 0],
    "extract_embeddings": True,
    "downstream_eval": True,
    "downstream_task": "both",
    "downstream_n_runs": 2,
    "downstream_hidden_dim": 128,
    "downstream_num_layers": 2,
    "downstream_dropout": 0.5,
    "multiclass_batch_size": 256,
    "downstream_node_epochs": 10,
    "downstream_link_epochs": 1,
    "downstream_patience": 4,
    "downstream_lr": 0.0001,
    # Defaults from parse_args
    "model_path": None,
    "preprocess": "metapath2vec",
    "metapath2vec_embeddings_path": "embedding.pt",
    "loss_fn": "mse",
    "mer_weight": 1.0,
    "tar_weight": 1.0,
    "pfp_weight": 1.0,
    "tar_temperature": 0.5,
    "mask_ratio": 0.5,
    "neg_sampling_ratio": 1.0,
    "aggr": "mean",
    "aggr_rel": "sum",
    "use_batchnorm": True,
    "node_inductive": True,
    "dependent_node_edge_data_split": True,
    "lambda_tar": 1.0,
    "lambda_pfp": 0.0,
    "disable_tqdm": False,
    "log_level": "INFO",
    "downstream_batch_size": 1024,
    "downstream_weight_decay": 0,
    "downstream_neg_samples": 1,
    "test_mode": True,
    "test_max_nodes": 5000,
    # Both decoders are enabled for every experiment in this notebook, so they live in the base config.
    "use_feature_decoder": True,
    "use_edge_decoder": True,
}

# Experiment-Specific Parameters: one override dict per experiment name.
# Some entries repeat the base value on purpose, to show which knobs matter for that objective.
EXPERIMENT_OVERRIDES = {
    "supervised_node": {
        "objective_type": "supervised_node_classification",
    },
    "ssl_node_sce": {
        "objective_type": "self_supervised_node",
        "loss_fn": "sce",
        "mask_ratio": 0.5,
        "patience": 20,
        "downstream_link_epochs": 3,
        "downstream_patience": 20,
    },
    "ssl_node_mse": {
        "objective_type": "self_supervised_node",
        "loss_fn": "mse",
        "mask_ratio": 0.5,
    },
    "ssl_edge": {
        "objective_type": "self_supervised_edge",
        "neg_sampling_ratio": 1.0,
    },
    "ssl_tarpfp": {
        "objective_type": "self_supervised_tarpfp",
        "lambda_tar": 1.0,
        "lambda_pfp": 1.0,
        "mask_ratio": 0.5,
        "neg_sampling_ratio": 1.0,
        "tar_temperature": 0.5,
        "metapath2vec_embeddings_path": "pos_embedding.pt",
    },
}

# Fail fast on a misspelled experiment name; otherwise 'objective_type' would be
# missing from CONFIG and the error would only show up later as a KeyError.
if EXPERIMENT not in EXPERIMENT_OVERRIDES:
    raise ValueError(
        f"Unknown EXPERIMENT {EXPERIMENT!r}; expected one of {sorted(EXPERIMENT_OVERRIDES)}"
    )
CONFIG.update(EXPERIMENT_OVERRIDES[EXPERIMENT])

print(f"Experiment: {EXPERIMENT}")
print(f"Objective: {CONFIG['objective_type']}")
print(f"Results will be saved to: {CONFIG['results_root']}")
# The ~10 hour figure in the notebook header is quoted for 100 epochs, so it is reported
# as a reference value next to the settings actually configured here.
print("\nReference runtime on RTX 3060: ~10 hours for 100 epochs (with downstream evaluation)")
print(
    f"This configuration: {CONFIG['epochs']} epochs, "
    f"test_mode={CONFIG['test_mode']} (test_max_nodes={CONFIG['test_max_nodes']})"
)

Experiment: supervised_node
Objective: supervised_node_classification
Results will be saved to: c:\Users\gabri\GTFO_Onedrive\DTU_Code\GraphSSL\results\demo_supervised_node_20251208_171952

Estimated runtime on RTX 3060: ~10 hours for 10 epochs (with downstream evaluation)


: 

## Training
Run the main experiment pipeline

In [None]:
import argparse

# Wrap the CONFIG dict in an argparse.Namespace so each key becomes an attribute
# (e.g. args.epochs), then hand it to run_pipeline.
# Requires CONFIG from the Configuration cell and run_pipeline from the Setup cell.
args = argparse.Namespace(**CONFIG)
run_pipeline(args)

GraphSSL - Supervised Learning Pipeline
Task: Venue Prediction on OGB_MAG Dataset

Using device: cuda
GPU: NVIDIA GeForce RTX 3060
CUDA Version: 12.6

Step 1: Loading Dataset

Step 2: Creating Data Loaders

Step 2: Creating Data Loaders

Step 3: Creating Training Objective

Step 4: Creating Model

Step 5: Setting up Optimizer

Step 6: Training Model

Step 3: Creating Training Objective

Step 4: Creating Model

Step 5: Setting up Optimizer

Step 6: Training Model


                                                         

Epoch   1/10 | Train Loss: 5.9513 | Train Acc: 0.0056 | Val Loss: 5.8363 | Val Acc: 0.0158 | Time: 16.93s


                                                         

Epoch   2/10 | Train Loss: 5.7476 | Train Acc: 0.0131 | Val Loss: 5.7919 | Val Acc: 0.0181 | Time: 14.99s


                                                         

Epoch   3/10 | Train Loss: 5.5700 | Train Acc: 0.0310 | Val Loss: 5.7266 | Val Acc: 0.0226 | Time: 14.34s


                                                         

Epoch   4/10 | Train Loss: 5.4421 | Train Acc: 0.0387 | Val Loss: 5.6384 | Val Acc: 0.0249 | Time: 15.50s


                                                         

Epoch   5/10 | Train Loss: 5.3128 | Train Acc: 0.0507 | Val Loss: 5.5265 | Val Acc: 0.0430 | Time: 15.83s


                                                         

Epoch   6/10 | Train Loss: 5.2214 | Train Acc: 0.0549 | Val Loss: 5.3901 | Val Acc: 0.0566 | Time: 14.79s


                                                         

Epoch   7/10 | Train Loss: 5.1137 | Train Acc: 0.0657 | Val Loss: 5.2513 | Val Acc: 0.0701 | Time: 13.99s


                                                         

Epoch   8/10 | Train Loss: 5.0499 | Train Acc: 0.0737 | Val Loss: 5.1310 | Val Acc: 0.0747 | Time: 14.49s


                                                         

Epoch   9/10 | Train Loss: 4.9685 | Train Acc: 0.0855 | Val Loss: 5.0213 | Val Acc: 0.0769 | Time: 15.04s


                                                         

Epoch  10/10 | Train Loss: 4.9118 | Train Acc: 0.0902 | Val Loss: 4.9368 | Val Acc: 0.0747 | Time: 14.20s

Step 7: Testing Model

Step 7: Testing Model


                                                         


Step 8: Saving Results

Step 9: Extracting Embeddings


                                                                    


Step 10: Downstream Evaluation


Epoch 1:  91%|█████████ | 39/43 [00:07<00:00, 30.04it/s]

## Results

In [1]:
# Load the training history from the results directory, then print and plot it.
# Requires the Setup and Configuration cells to have run in this kernel: they provide
# Path, torch, plt and CONFIG (running this cell in a fresh kernel raises NameError).
results_dir = Path(CONFIG['results_root'])
history = torch.load(results_dir / "training_history.pt")

# Each series is checked on its own: a key may be missing or hold an empty sequence,
# and indexing [-1] or calling max() on an empty sequence would raise.
has_train_loss = 'train_loss' in history and len(history['train_loss']) > 0
has_val_loss = 'val_loss' in history and len(history['val_loss']) > 0
has_val_acc = 'val_acc' in history and len(history['val_acc']) > 0

if has_train_loss:
    print(f"Train Loss: {history['train_loss'][-1]:.4f}")
if has_val_loss:
    print(f"Val Loss: {history['val_loss'][-1]:.4f}")
if has_val_acc:
    print(f"Best Val Acc: {max(history['val_acc']):.4f}")

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
loss_ax, acc_ax = axes

# Left panel: training and validation loss per epoch.
if has_train_loss:
    loss_ax.plot(history['train_loss'], label='Train')
if has_val_loss:
    loss_ax.plot(history['val_loss'], label='Val')
if has_train_loss or has_val_loss:
    loss_ax.set_title('Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.legend()
    loss_ax.grid(alpha=0.3)
else:
    # Nothing to draw: hide the panel instead of showing an empty frame.
    loss_ax.set_visible(False)

# Right panel: validation accuracy per epoch.
if has_val_acc:
    acc_ax.plot(history['val_acc'])
    acc_ax.set_title('Validation Accuracy')
    acc_ax.set_xlabel('Epoch')
    acc_ax.set_ylabel('Accuracy')
    acc_ax.grid(alpha=0.3)
else:
    acc_ax.set_visible(False)

plt.tight_layout()
plt.show()

NameError: name 'Path' is not defined

## Downstream Evaluation

In [None]:
# Load the saved downstream-evaluation results, print a summary per task, and collect
# the headline metric(s) of each task for the bar chart that follows.
# Requires `results_dir` from the Results cell.
# NOTE(review): only "downstream_node_results.pt" is read here, yet link-prediction keys are
# also looked up below — confirm the pipeline stores those results in this same file.
downstream_results = torch.load(results_dir / "downstream_node_results.pt")

# Parallel lists: bar label, mean score, standard deviation (used as the error bar).
metrics, values, errors = [], [], []

if 'node_classification' in downstream_results:
    node_results = downstream_results['node_classification']
    print("Node Classification:")
    print(f"  Test Acc: {node_results['test_acc_mean']:.4f} ± {node_results['test_acc_std']:.4f}")
    print(f"  Test F1:  {node_results['test_f1_mean']:.4f} ± {node_results['test_f1_std']:.4f}")
    metrics.append('Node Acc')
    values.append(node_results['test_acc_mean'])
    errors.append(node_results['test_acc_std'])

if 'binary_link_prediction' in downstream_results:
    link_results = downstream_results['binary_link_prediction']
    print("\nBinary Link Prediction:")
    print(f"  Test AUC: {link_results['test_auc_mean']:.4f} ± {link_results['test_auc_std']:.4f}")
    print(f"  Test AP:  {link_results['test_ap_mean']:.4f} ± {link_results['test_ap_std']:.4f}")
    metrics.extend(['Link AUC', 'Link AP'])
    values.extend([link_results['test_auc_mean'], link_results['test_ap_mean']])
    errors.extend([link_results['test_auc_std'], link_results['test_ap_std']])

if 'multiclass_link_prediction' in downstream_results:
    multiclass_results = downstream_results['multiclass_link_prediction']
    print("\nMulti-Label Prediction:")
    print(f"  Test F1 (micro): {multiclass_results['test_f1_micro_mean']:.4f} ± {multiclass_results['test_f1_micro_std']:.4f}")
    print(f"  Test F1 (macro): {multiclass_results['test_f1_macro_mean']:.4f} ± {multiclass_results['test_f1_macro_std']:.4f}")
    # Only the micro-averaged F1 goes into the chart; the macro value is printed above.
    metrics.append('Multi-Label F1')
    values.append(multiclass_results['test_f1_micro_mean'])
    errors.append(multiclass_results['test_f1_micro_std'])

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(metrics, values, yerr=errors, capsize=5, alpha=0.8)
ax.set_ylabel('Score')
ax.set_ylim(0, 1)