In [None]:
# Install PyTorch Geometric and its compiled extension packages. The extension
# wheels are pulled from the PyG index for torch 2.8.0 + CUDA 12.6.
# NOTE(review): confirm the runtime's torch/CUDA build matches this tag.
!pip install torch-scatter -f https://data.pyg.org/whl/torch-2.8.0+cu126.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-2.8.0+cu126.html
!pip install torch-cluster -f https://data.pyg.org/whl/torch-2.8.0+cu126.html
!pip install torch-spline-conv -f https://data.pyg.org/whl/torch-2.8.0+cu126.html
!pip install torch-geometric

Looking in links: https://data.pyg.org/whl/torch-2.8.0+cu126.html
Looking in links: https://data.pyg.org/whl/torch-2.8.0+cu126.html
Looking in links: https://data.pyg.org/whl/torch-2.8.0+cu126.html
Looking in links: https://data.pyg.org/whl/torch-2.8.0+cu126.html


In [None]:
# Extra dependencies: opacus supplies the DP optimizer and noise-calibration
# helpers imported further down; torchmetrics is installed alongside it.
!pip install torchmetrics
!pip install opacus

Collecting opacus
  Downloading opacus-1.5.4-py3-none-any.whl.metadata (8.7 kB)
Downloading opacus-1.5.4-py3-none-any.whl (254 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m254.4/254.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: opacus
Successfully installed opacus-1.5.4


In [None]:
import torch
from torch_geometric.nn import GCNConv, SAGEConv, GATConv
import torch_geometric
from torch_geometric.nn import global_mean_pool, global_max_pool


class FingerprintsModel(torch.nn.Module):
    """Graph-level classifier: three graph convolutions, max-pool readout, MLP head.

    Args:
        hidden_channels: Output width of the first convolution; the second and
            third convolutions use 2x and 4x this width.
        dataset: Object exposing ``num_node_features`` (input width) and
            ``num_tasks`` (number of output logits).
        model_type: Convolution family, one of ``"GCN"``, ``"GraphSAGE"`` or
            ``"GAT"``.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """

    def __init__(self, hidden_channels, dataset, model_type:str):
        super(FingerprintsModel, self).__init__()
        self.model_type = model_type
        self.hidden_channels = hidden_channels

        # Pick the convolution class once; the three layers share the same
        # width schedule regardless of the family.
        if model_type == "GCN":
            conv_cls = GCNConv
        elif model_type == "GraphSAGE":
            conv_cls = SAGEConv
        elif model_type == "GAT":
            conv_cls = GATConv
        else:
            # Fail at construction time instead of with a missing-attribute
            # error on the first forward pass.
            raise ValueError(
                f"Unsupported model_type {model_type!r}; expected 'GCN', 'GraphSAGE' or 'GAT'"
            )

        self.conv1 = conv_cls(dataset.num_node_features, hidden_channels)
        self.conv2 = conv_cls(hidden_channels, hidden_channels*2)
        self.conv3 = conv_cls(hidden_channels*2, hidden_channels*4)

        self.lin = torch.nn.Sequential(torch.nn.Linear(hidden_channels*4, 256), torch.nn.ReLU(), torch.nn.Linear(256, dataset.num_tasks))
        # NOTE(review): InstanceNorm is built with in_channels=1 regardless of
        # dataset.num_node_features - confirm this is intended.
        self.norm = torch_geometric.nn.InstanceNorm(1, affine=True)

    def forward(self, x, edge_index, batch):
        """Return one row of ``dataset.num_tasks`` logits per graph in the batch.

        Args:
            x: Node feature matrix.
            edge_index: Edge list passed to every convolution.
            batch: Node-to-graph assignment used for normalisation and pooling.
        """
        x = self.norm(x, batch)

        # 1. Obtain node embeddings
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)
        x = x.relu()

        # 2. Readout layer: one max-pooled vector of width hidden_channels*4 per graph
        x = global_max_pool(x, batch)

        # 3. Apply a final classifier
        x = self.lin(x)

        return x

In [None]:
import numpy as np
import torch
from tqdm import tqdm
from functorch import jacrev
import torch_geometric
from functorch import make_functional_with_buffers
from opacus.accountants.utils import get_noise_multiplier
from opacus.optimizers import DPOptimizer
from opacus.utils.batch_memory_manager import wrap_data_loader


def train(model, train_loader, optimizer, criterion, device):
    """Run one non-private training epoch.

    Args:
        model: Module called as ``model(x, edge_index, batch)``.
        train_loader: Iterable of batches exposing ``x``, ``edge_index``,
            ``batch``, ``y`` and ``to(device)``; must support ``len()`` and
            have a ``dataset`` attribute.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss taking ``(logits, targets)`` and returning a scalar.
        device: Device each batch is moved to.

    Returns:
        ``(mean loss per batch, accuracy over len(train_loader.dataset))``.
    """
    model.train()

    correct = 0
    epoch_loss = 0
    for data in tqdm(train_loader):
        optimizer.zero_grad()
        data = data.to(device)
        # reshape(-1) instead of squeeze(): squeeze() turns the labels of a
        # single-graph batch into a 0-dim tensor, which no longer lines up
        # with the [1, num_classes] logits.
        targets = data.y.reshape(-1)
        out = model(data.x, data.edge_index, data.batch)
        loss = criterion(out, targets)
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()
        pred = out.cpu().argmax(dim=1)
        correct += int((pred == targets.cpu()).sum())

    return epoch_loss/len(train_loader), correct/len(train_loader.dataset)


def compute_loss(params, buffers, data_x, data_edge_index, data_batch, targets, fmodel, loss_fn):
    """Evaluate ``loss_fn`` on the functional model's output for one batch.

    ``fmodel`` is called as ``fmodel(params, buffers, x, edge_index, batch)``;
    its output and ``targets`` are both squeezed before being handed to
    ``loss_fn``, whose result is returned unchanged.
    """
    logits = fmodel(params, buffers, data_x, data_edge_index, data_batch)
    return loss_fn(logits.squeeze(), targets.squeeze())


# Per-sample gradients needed for DP, via functorch: jacrev differentiates
# compute_loss with respect to its first argument (`params`). When `loss_fn`
# returns one loss value per graph (e.g. reduction="none"), each resulting
# Jacobian has a leading per-sample dimension; train_dp stores it on the
# parameter as `grad_sample`.
compute_per_sample_grads = jacrev(compute_loss)


def train_dp(fmodel, params, buffers, train_loader, device, optimizer, criterion, scheduler=None):
    """Run one pass of DP training over ``train_loader`` with a functional model.

    For every batch the per-sample gradients are computed with
    ``compute_per_sample_grads`` and attached to each parameter as
    ``grad_sample`` (plus their mean as ``grad``) before ``optimizer.step()``.

    Args:
        fmodel: Functional model called as ``fmodel(params, buffers, x, edge_index, batch)``.
        params: Parameters updated in place by ``optimizer``.
        buffers: Buffers passed through to ``fmodel``.
        train_loader: Batch iterable with a ``dataset`` attribute.
        device: Device each batch is moved to.
        optimizer: Optimizer stepping over ``params`` (expected to consume
            ``grad_sample``, e.g. an Opacus ``DPOptimizer``).
        criterion: Loss returning one value per graph.
        scheduler: Optional LR scheduler, stepped once per batch.

    Returns:
        ``(mean of the per-batch mean losses, training accuracy, params)``.
    """
    epoch_losses = []
    correct = 0

    for data in tqdm(train_loader, desc="Iteration"):
        optimizer.zero_grad(True)
        data = data.to(device)

        # Build inputs and targets once so the logged forward pass and the
        # per-sample-gradient pass see exactly the same tensors and dtypes.
        x = data.x.float()
        labels = data.y.squeeze()
        if isinstance(criterion, torch.nn.CrossEntropyLoss):
            targets = labels
        else:
            targets = labels.float().to(device)

        out = fmodel(params, buffers, x, data.edge_index, data.batch)
        pred = out.cpu().argmax(dim=1)
        correct += int((pred == labels.cpu()).sum())
        loss = criterion(out.squeeze(), targets)

        per_sample_grads = compute_per_sample_grads(
            params,
            buffers,
            x,
            data.edge_index,
            data.batch,
            targets,
            fmodel,
            criterion,
        )

        for param, grad_sample in zip(params, per_sample_grads):
            param.grad_sample = grad_sample
            param.grad = grad_sample.mean(0)

        optimizer.step()
        # Store plain floats so np.mean below does not depend on
        # tensor-to-ndarray coercion.
        epoch_losses.append(loss.detach().mean().item())

        if scheduler is not None:
            scheduler.step()

    acc = correct/len(train_loader.dataset)
    return np.mean(epoch_losses), acc, params


def test(model, test_loader, criterion, device):
    model.eval()
    epoch_losses = []

    correct = 0
    for data in tqdm(test_loader):
        data = data.to(device)
        y = data.y.squeeze()
        out = model(data.x, data.edge_index, data.batch)
        pred = out.cpu().argmax(dim=1)
        correct += int((pred == data.y.squeeze().cpu()).sum())
        loss = criterion(out.squeeze(), data.y.squeeze())

    epoch_losses.append(torch.mean(loss.detach().cpu()))

    return correct/len(test_loader.dataset) , np.mean(epoch_losses)#, f1_score, roc_auc, specificity, sensitivity


def set_up_train_environment(dp:bool,
                             model:torch.nn.Module,
                             nr_train_samples:int,
                             epochs:int,
                             train_loader:torch_geometric.loader.DataLoader,
                             clip:float,
                             learning_rate:float,
                             batch_size:int,
                             max_epsilon:float=None):
    """Build the model handle, optimizer, loss and loader for one training run.

    Args:
        dp: If True, set up DP-SGD; otherwise a plain NAdam setup.
        model: Module to train.
        nr_train_samples: Sample count; the DP target delta is ``1/nr_train_samples``.
        epochs: Number of epochs the noise multiplier is calibrated for.
        train_loader: Training loader; ``1/len(train_loader)`` is used as the
            DP sample rate.
        clip: Per-sample gradient norm bound (``max_grad_norm``).
        learning_rate: Optimizer learning rate.
        batch_size: Expected batch size and maximum physical batch size.
        max_epsilon: Target epsilon; required when ``dp`` is True.

    Returns:
        ``(model_handle, params, buffers, optimizer, criterion, train_loader)``.
        With ``dp=True`` the handle is the functional model and the optimizer
        steps over ``params``; otherwise the handle is ``model`` itself and the
        optimizer steps over ``model.parameters()``.

    Raises:
        ValueError: If ``dp`` is True and ``max_epsilon`` is None.

    Note:
        Autograd is toggled globally via ``torch.set_grad_enabled``: off for
        DP, on otherwise. functorch's ``make_functional_with_buffers`` works
        on a deep copy of ``model``, so DP training updates the returned
        ``params`` rather than ``model``'s own parameters.
    """
    if dp and max_epsilon is None:
        # Without a target epsilon the noise multiplier cannot be calibrated.
        raise ValueError("max_epsilon must be provided when dp=True")

    fmodel, params, buffers = make_functional_with_buffers(model)

    if dp:
        optimizer = torch.optim.SGD(params, lr=learning_rate)
        # reduction="none" keeps one loss value per graph for per-sample gradients.
        criterion = torch.nn.CrossEntropyLoss(reduction="none")
        noise_multiplier = get_noise_multiplier(target_epsilon=max_epsilon, target_delta=1/nr_train_samples, sample_rate=1/len(train_loader), epochs=epochs)
        optimizer = DPOptimizer(
                            optimizer,
                            noise_multiplier=noise_multiplier,
                            max_grad_norm=clip,
                            expected_batch_size=batch_size,
                            loss_reduction="mean",
                        )
        train_loader = wrap_data_loader(data_loader=train_loader, max_batch_size=batch_size, optimizer=optimizer)
        torch.set_grad_enabled(False)
        return fmodel, params, buffers, optimizer, criterion, train_loader
    else:
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.NAdam(model.parameters(), lr=learning_rate)
        torch.set_grad_enabled(True)

        return model, params, buffers, optimizer, criterion, train_loader

In [None]:
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

dataset = TUDataset(root='data/TUD', name='MUTAG')  # 188 graphs

# FingerprintsModel reads `dataset.num_tasks` to size its output layer, and the
# dataset does not provide that attribute, so attach it here (MUTAG is a
# binary classification task).
dataset.num_tasks = 2

# Random 70/30 split over graph indices.
num_graphs = len(dataset)
n_train = int(0.7*num_graphs)
random_idx = torch.randperm(num_graphs)
train_idx = random_idx[:n_train]
test_idx = random_idx[n_train:]

# Keep the subsets under their own names: binding them to `train` would shadow
# the train() function defined earlier, and `tester` should only ever be the
# test DataLoader.
train_set = dataset[train_idx]
test_set = dataset[test_idx]

# Make batches of data
trainer = DataLoader(train_set, batch_size=32,shuffle = True)
tester = DataLoader(test_set, batch_size=32,shuffle = True)

model = FingerprintsModel(16, dataset,"GCN").to(device)

In [None]:
# Build the DP training environment for a single run with target epsilon = 5.
fmodel, params, buffers, optimizer, criterion, train_loader = set_up_train_environment(
    dp=True,
    model=model,
    nr_train_samples=len(dataset),
    epochs=50,
    train_loader=trainer,
    clip=1.0,
    learning_rate=.01,
    batch_size=32,
    max_epsilon=5,
)

In [None]:
# One pass of DP training over train_loader; returns the mean batch loss, the
# training accuracy and the updated functional parameters.
mean_poch_loss, acc, params = train_dp(
    fmodel=fmodel,
    params=params,
    buffers=buffers,
    optimizer=optimizer,
    criterion=criterion,
    train_loader=train_loader,
    device=device,
)

In [None]:
# Mean of the per-batch losses returned by the train_dp call above.
mean_poch_loss

In [None]:
# Training accuracy accumulated by train_dp over the batches of that pass.
acc

# Sweep for values of epsilon

In [None]:
eps = [0.1 ,0.5, 1, 2, 5, 7, 9] #Values of epsilon to test
models_trained = dict()

for e in eps:
  #Initalize Fingerprint model
  model = FingerprintsModel(16, dataset,"GCN").to(device)
  #Set up environment
  fmodel, params, buffers, optimizer, criterion, train_loader = set_up_train_environment(dp=True, model = model,
                                                                                         nr_train_samples=len(dataset), epochs = 50, train_loader=trainer,
                                                                                         clip = 1.0, learning_rate = .01, batch_size = 32, max_epsilon= e)
  #Train
  # NOTE(review): train_dp makes a single pass over the loader, while the noise
  # multiplier above is calibrated for epochs=50 - confirm one pass is intended.
  mean_epoch_loss, acc, params = train_dp(fmodel=fmodel, params=params, buffers=buffers, optimizer=optimizer,
                                         criterion= criterion, train_loader = train_loader, device = device)

  print("Model with epsilon", e, "obtained accuracy", acc, "and mean loss", mean_epoch_loss)

  # make_functional_with_buffers trains a deep copy of `model`, so the trained
  # weights live in `params`/`buffers`. Copy them back before storing the
  # module; otherwise models_trained would hold untrained networks.
  with torch.no_grad():
    for module_param, trained_param in zip(model.parameters(), params):
      module_param.copy_(trained_param)
    for module_buffer, trained_buffer in zip(model.buffers(), buffers):
      module_buffer.copy_(trained_buffer)

  models_trained[e] = model


# Epsilon = 2 seems to give the best training accuracy

### Try DP-GNN on Test Data
* Not necessary since we will use the test data for the shadow model but kept for reference.

In [None]:
# Evaluate the model stored for epsilon = 2 on the held-out loader. `criterion`
# is the one left over from the last set_up_train_environment call
# (CrossEntropyLoss with reduction="none" in DP mode).
test_acc, test_mean_loss = test(models_trained[2], tester, criterion, device)

In [None]:
# Fraction of correctly classified graphs in the test loader's dataset.
test_acc

In [None]:
# Loss value reported by test() for the held-out loader.
test_mean_loss

# How does the model hold up to different attacks?


## Membership Inference Attack

In [None]:
# The graphs held out from the target model become the shadow model's data.
test_data = dataset[test_idx]

# 70/30 split of the held-out graphs. The indices get their own shadow_* names
# so that `random_idx`, `train_idx` and `test_idx` from the target-model split
# are not overwritten; otherwise re-running this cell would evaluate
# `dataset[test_idx]` with the shadow indices and silently change the data.
s_num_graphs = len(test_data)
train_s = int(0.7*s_num_graphs)
shadow_perm = torch.randperm(s_num_graphs)
shadow_train_idx = shadow_perm[:train_s]
shadow_test_idx = shadow_perm[train_s:]

shadow_train = test_data[shadow_train_idx]
shadow_tester = test_data[shadow_test_idx]

#Dataloaders
s_trainer = DataLoader(shadow_train, batch_size=32,shuffle = True)
s_tester = DataLoader(shadow_tester, batch_size=32,shuffle = True)

In [None]:
# Shadow model: same architecture and width as the target model, sized from
# the held-out subset (which must expose num_node_features and num_tasks).
s_model = FingerprintsModel(16, test_data,"GCN").to(device)

In [None]:
# DP training environment for the shadow model. With dp=True the first return
# value is the functional model, which is called as
# shadow_model(params, buffers, x, edge_index, batch).
shadow_model, params, buffers, optimizer, criterion, train_loader = set_up_train_environment(
    dp=True,
    model=s_model,
    nr_train_samples=len(test_data),
    epochs=50,
    train_loader=s_trainer,
    clip=1.0,
    learning_rate=.01,
    batch_size=32,
    max_epsilon=5,
)
shadow_model

In [None]:
def get_attack_feats(model, dataloader, device):
    """Collect per-graph attack features from ``model`` over all batches.

    Args:
        model: Module called as ``model(x, edge_index, batch)`` returning logits.
        dataloader: Iterable of batches exposing ``x``, ``edge_index``,
            ``batch``, ``y`` and ``to(device)``.
        device: Device each batch is moved to.

    Returns:
        ``(feats, labels)`` where each row of ``feats`` is
        ``[max softmax confidence, cross-entropy loss]`` for one graph and
        ``labels`` holds the matching ``data.y`` entry.
        NOTE(review): these are class labels, not member/non-member flags -
        confirm that is what the attack model should be trained on.
    """
    model.eval()
    feats = []
    labels = []

    with torch.no_grad():  # inference only
        for data in dataloader:
            data = data.to(device)
            # reshape(-1) keeps a 1-D target even for a single-graph batch.
            targets = data.y.reshape(-1)
            out = model(data.x, data.edge_index, data.batch)
            max_conf = torch.softmax(out, dim=1).max(dim=1).values
            # Fully qualified call: the notebook only binds `F` in a later cell.
            loss = torch.nn.functional.cross_entropy(out, targets, reduction="none")

            feats.extend([conf, los] for conf, los in zip(max_conf.cpu().tolist(), loss.cpu().tolist()))
            labels.extend(targets.cpu().tolist())

    # Return only after every batch has been processed.
    return torch.tensor(feats), torch.tensor(labels)



In [None]:
# `shadow_model` is the functional model returned by set_up_train_environment;
# it has to be called as shadow_model(params, buffers, ...), which
# get_attack_feats does not do. Copy the functional parameters into the regular
# module and query that instead.
# NOTE(review): no train_dp call for the shadow model is visible between its
# setup and this cell - confirm it is trained before features are extracted.
with torch.no_grad():
    for module_param, functional_param in zip(s_model.parameters(), params):
        module_param.copy_(functional_param)

get_attack_feats(s_model, train_loader, device)

# LinkTeller Attack (Failed Attempt)

### Follow the pre-processing procedure for one graph from GitHub

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.utils import to_networkx, dense_to_sparse
from torch_geometric.nn import GCNConv
import networkx as nx
from sklearn.cluster import KMeans
import torch, fsspec, torch_geometric
from torch_geometric.datasets import TUDataset
import numpy as np
from sklearn.model_selection import train_test_split
import math, random
# Use the GPU when one is available, otherwise fall back to the CPU; the bare
# expression displays the chosen device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device


In [None]:
dataset = TUDataset(root='data/TUD', name='MUTAG')  # 188 graphs
sizes = [graph.num_nodes for graph in dataset]

# Pick the largest graph among those with at least 25 nodes; graphs below the
# threshold score 0, so index 0 is chosen if none qualifies.
idx = int(np.where(np.asarray(sizes) >= 25, sizes, 0).argmax())
data = dataset[idx]
print(f"Graph index {idx}: nodes={data.num_nodes}, edges={data.num_edges // 2} (undirected)")
data.num_tasks = 2
data

In [None]:
X = data.x.float()  # [N, D]
N, D = X.shape

# Synthetic node labels: cluster the node feature rows with k-means. The
# cluster count is capped at 3 and falls back to 2 when there is at most one
# distinct feature row.
n_distinct_rows = len(torch.unique(X, dim=0))
K = 2 if n_distinct_rows <= 1 else min(3, n_distinct_rows)
km = KMeans(n_clusters=K, n_init=10, random_state=0).fit(X.numpy())
y_node = torch.from_numpy(km.labels_).long()

# Train/val/test node splits (60/20/20), stratified by cluster label
idx_all = np.arange(N)
node_labels = y_node.numpy()
idx_train, idx_tmp = train_test_split(idx_all, test_size=0.4, random_state=42, stratify=node_labels)
idx_val, idx_test = train_test_split(idx_tmp, test_size=0.5, random_state=42, stratify=node_labels[idx_tmp])

# Boolean membership masks over the N nodes, one per split
train_mask = torch.zeros(N, dtype=torch.bool)
train_mask[idx_train] = True
val_mask = torch.zeros(N, dtype=torch.bool)
val_mask[idx_val] = True
test_mask = torch.zeros(N, dtype=torch.bool)
test_mask[idx_test] = True

print(f"Splits: train {train_mask.sum().item()}, val {val_mask.sum().item()}, test {test_mask.sum().item()}")

# Attach the masks to the graph object and display it
data.train_mask = train_mask
data.val_mask = val_mask
data.test_mask = test_mask
data

In [None]:
# The edge index is used exactly as stored on the graph
edge_index = data.edge_index  # [2, E]

# Dense boolean adjacency for evaluation: mark every listed edge, symmetrise,
# then drop self loops.
A = torch.zeros((N, N), dtype=torch.bool)
A[edge_index[0], edge_index[1]] = True
A = A | A.T
A.fill_diagonal_(False)

# Each undirected edge once, taken from the strict upper triangle
true_edges_undirected = torch.triu(A, diagonal=1).nonzero(as_tuple=False)  # [M, 2]
M_true = true_edges_undirected.shape[0]
n_possible_pairs = N*(N-1)/2
density = M_true / n_possible_pairs
print(f"True undirected edges: {M_true} | density={density:.4f}")

In [None]:
from functorch import make_functional_with_buffers

class GCN(nn.Module):
    """Two-layer GCN producing per-node logits.

    Args:
        in_channels: Input feature width.
        hidden: Width of the hidden layer.
        out_channels: Number of output logits per node.
        dropout: Dropout probability applied between the two layers.
    """

    def __init__(self, in_channels, hidden, out_channels, dropout=0.2):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden)
        self.conv2 = GCNConv(hidden, out_channels)
        self.dropout = dropout

    def forward(self, x, edge_index, training = True):
        """Return per-node logits.

        Dropout is controlled by the ``training`` argument, not by the
        module's train/eval mode.
        NOTE(review): with the default ``training=True`` dropout stays active
        even after ``model.eval()`` - confirm this is intended.
        """
        hidden = F.relu(self.conv1(x, edge_index))
        hidden = F.dropout(hidden, p=self.dropout, training=training)
        return self.conv2(hidden, edge_index)  # logits (N, K)

model = GCN(D, hidden=32, out_channels=K, dropout=0.2).to(device)

# Move features, edges, labels and split masks to the compute device
X_dev, edge_index_dev, y_dev = X.to(device), edge_index.to(device), y_node.to(device)
train_mask_dev, val_mask_dev, test_mask_dev = (
    mask.to(device) for mask in (train_mask, val_mask, test_mask)
)

# For manual application of DP: switch to eval mode, then split the module
# into a functional model plus its parameters and buffers
model.eval()
fmodel, params, buffers = make_functional_with_buffers(model)
