In [None]:
"""
GNNFingers - Graph Classification on PROTEINS Dataset
=====================================================


PIPELINE:
1. Load PROTEINS dataset (multiple graphs, each graph = one sample)
2. Train target GCN model for graph classification
3. Generate 2 positive models (fine-tuned clones)
4. Generate 2 negative models (fresh GCN and GraphSAGE)
5. Create 5 synthetic fingerprints (random small graphs)
6. Collect model responses (graph-level predictions)
7. Train verifier (binary classifier)
8. Evaluate: TP, TN, Accuracy

"""

# ============================================================================
# CELL 1: Setup and Install Dependencies
# ============================================================================
import subprocess
import sys

print("=" * 70)
print("CELL 1: Installing Dependencies for Graph Classification")
print("=" * 70)

# Install through the interpreter that runs this kernel (sys.executable) so the
# packages land in the kernel's own environment.
required_packages = ["torch", "torch_geometric", "torch_scatter", "torch_sparse"]
pip_install = [sys.executable, "-m", "pip", "install", "-q"] + required_packages

print("Installing packages...")
# check_call raises CalledProcessError if pip exits with a non-zero status.
subprocess.check_call(pip_install)

import os
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime
from sklearn.metrics import accuracy_score

# Create base directory for graph classification.
# parents=True lets this cell run where /content does not exist yet (i.e.
# outside Colab); without it mkdir raises FileNotFoundError in that case.
# exist_ok=True keeps the cell idempotent on re-runs.
base_dir = Path("/content/gnnfingers_graph_classification")
base_dir.mkdir(parents=True, exist_ok=True)

# Create subdirectories (one per pipeline artefact).
for subdir in (
    "data",
    "models/target",
    "models/positive",
    "models/negative",
    "fingerprints",
    "verifier",
    "results",
):
    (base_dir / subdir).mkdir(parents=True, exist_ok=True)

print("\n✓ Directory structure created:")
print(f"  {base_dir}/")
print("    ├── data/            (PROTEIN dataset)")
print("    ├── models/")
print("    │   ├── target/      (target GCN)")
print("    │   ├── positive/    (fine-tuned clones)")
print("    │   └── negative/    (independent models)")
print("    ├── fingerprints/    (synthetic graphs)")
print("    ├── verifier/        (binary classifier)")
print("    └── results/         (TP/TN/accuracy)\n")

# Set random seeds for both RNGs used below: torch (model init, node features)
# and NumPy (fingerprint edge sampling).
torch.manual_seed(42)
np.random.seed(42)

print("✓ Dependencies installed and directories ready\n")

CELL 1: Installing Dependencies for Graph Classification
Installing packages...

✓ Directory structure created:
  /content/gnnfingers_graph_classification/
    ├── data/            (PROTEIN dataset)
    ├── models/
    │   ├── target/      (target GCN)
    │   ├── positive/    (fine-tuned clones)
    │   └── negative/    (independent models)
    ├── fingerprints/    (synthetic graphs)
    ├── verifier/        (binary classifier)
    └── results/         (TP/TN/accuracy)

✓ Dependencies installed and directories ready



In [None]:
# ============================================================================
# CELL 2: Define Graph Classification Models
# ============================================================================
print("=" * 70)
print("CELL 2: Define Graph Classification Models")
print("=" * 70)

from torch_geometric.nn import GCNConv, SAGEConv, global_mean_pool, global_add_pool
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader

class GCNGraphClassifier(nn.Module):
    """Graph classifier: three GCN layers, mean pooling, then a linear head.

    Args:
        num_features: Length of each input node feature vector.
        hidden_channels: Output width of every GCN layer.
        num_classes: Number of logits produced per graph.
    """

    def __init__(self, num_features, hidden_channels=64, num_classes=2):
        super().__init__()
        # Attribute names and creation order are kept fixed: they determine the
        # state_dict keys of saved checkpoints and the seeded initial weights.
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = nn.Linear(hidden_channels, num_classes)

    def forward(self, x, edge_index, batch):
        """Return raw (un-normalised) class logits for the graphs indexed by ``batch``."""
        hidden = F.relu(self.conv1(x, edge_index))
        hidden = F.relu(self.conv2(hidden, edge_index))
        # No activation after the last convolution.
        node_embeddings = self.conv3(hidden, edge_index)

        # Average node embeddings per graph, then classify the pooled vector.
        graph_embeddings = global_mean_pool(node_embeddings, batch)
        return self.lin(graph_embeddings)

class GraphSAGEClassifier(nn.Module):
    """Graph classifier: three GraphSAGE layers, mean pooling, then a linear head.

    Args:
        num_features: Length of each input node feature vector.
        hidden_channels: Output width of every SAGE layer.
        num_classes: Number of logits produced per graph.
    """

    def __init__(self, num_features, hidden_channels=64, num_classes=2):
        super().__init__()
        # Attribute names and creation order are kept fixed: they determine the
        # state_dict keys of saved checkpoints and the seeded initial weights.
        self.sage1 = SAGEConv(num_features, hidden_channels)
        self.sage2 = SAGEConv(hidden_channels, hidden_channels)
        self.sage3 = SAGEConv(hidden_channels, hidden_channels)
        self.lin = nn.Linear(hidden_channels, num_classes)

    def forward(self, x, edge_index, batch):
        """Return raw (un-normalised) class logits for the graphs indexed by ``batch``."""
        hidden = F.relu(self.sage1(x, edge_index))
        hidden = F.relu(self.sage2(hidden, edge_index))
        # No activation after the last convolution.
        node_embeddings = self.sage3(hidden, edge_index)

        # Average node embeddings per graph, then classify the pooled vector.
        graph_embeddings = global_mean_pool(node_embeddings, batch)
        return self.lin(graph_embeddings)

class Verifier(nn.Module):
    """Binary classifier verifier.

    A small MLP (input_dim -> hidden_dim -> 16 -> 1) followed by a sigmoid. It
    maps each response vector to a value in [0, 1]; in this notebook label 1
    marks the target model and its fine-tuned clones, label 0 the
    independently trained models.

    Args:
        input_dim: Length of one response vector.
        hidden_dim: Width of the first hidden layer.
    """

    def __init__(self, input_dim, hidden_dim=32):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 16)
        self.fc3 = nn.Linear(16, 1)

    def forward(self, x):
        """Return one probability per input row.

        Args:
            x: Tensor whose last dimension is ``input_dim``.

        Returns:
            Tensor with the trailing singleton dimension removed, i.e. shape
            ``[N]`` for input of shape ``[N, input_dim]`` (including ``N == 1``).
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        # Drop only the trailing feature axis. A bare squeeze() would also drop
        # a batch axis of size 1 and return a 0-d tensor, which no longer
        # matches a [1]-shaped target in nn.BCELoss.
        return x.squeeze(-1)

def load_protein_dataset():
    """Load the PROTEINS TUDataset and cut it into 80/10/10 train/val/test parts.

    Side effect: resets torch's global RNG seed to 42 right before shuffling,
    so the split is the same on every call.

    Returns:
        Tuple ``(dataset, train_dataset, val_dataset, test_dataset)``.
        ``dataset`` is the shuffled full dataset; the three splits are
        consecutive slices of it, with the test split taking whatever is left
        after the rounded-down train and validation sizes.
    """
    full_dataset = TUDataset(root=str(base_dir / "data"), name='PROTEINS')

    torch.manual_seed(42)
    shuffled = full_dataset.shuffle()

    n_graphs = len(shuffled)
    train_end = int(0.8 * n_graphs)
    val_end = train_end + int(0.1 * n_graphs)

    return (
        shuffled,
        shuffled[:train_end],
        shuffled[train_end:val_end],
        shuffled[val_end:],
    )

def _graph_accuracy(model, loader):
    """Return the fraction of correctly classified graphs in ``loader``, or None if it yields no samples."""
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data in loader:
            out = model(data.x, data.edge_index, data.batch)
            pred = out.argmax(dim=1)
            correct += (pred == data.y).sum().item()
            total += data.y.size(0)
    return correct / total if total else None


def train_graph_classifier(model, train_loader, val_loader, epochs=50, lr=0.001, verbose=True):
    """Train a graph classification model with Adam and cross-entropy loss.

    Args:
        model: Module called as ``model(x, edge_index, batch)`` that returns
            one row of class logits per graph.
        train_loader: Iterable of batches exposing ``x``, ``edge_index``,
            ``batch`` and ``y``.
        val_loader: Iterable of batches with the same attributes; only read
            when ``verbose`` is true.
        epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate.
        verbose: If true, print mean training loss and validation accuracy
            every 10th epoch.

    Returns:
        The same ``model`` object, trained in place. It is left in eval mode
        when the final epoch ran a validation pass, otherwise in train mode.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        # Training
        model.train()
        total_loss = 0.0
        num_batches = 0  # counted here so loaders without __len__ also work
        for data in train_loader:
            optimizer.zero_grad()
            out = model(data.x, data.edge_index, data.batch)
            loss = F.cross_entropy(out, data.y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1

        # Validation
        if verbose and (epoch + 1) % 10 == 0:
            val_acc = _graph_accuracy(model, val_loader)
            # Empty loaders used to raise ZeroDivisionError; report a
            # placeholder instead of aborting the run.
            mean_loss = total_loss / num_batches if num_batches else float("nan")
            acc_text = f"{val_acc:.3f}" if val_acc is not None else "n/a"
            print(f"    Epoch {epoch+1}/{epochs} | Loss: {mean_loss:.4f} | Val Acc: {acc_text}")

    return model

print("✓ Graph Classification models defined\n")


CELL 2: Define Graph Classification Models
✓ Graph Classification models defined



In [None]:
# ============================================================================
# CELL 3: Load PROTEIN Dataset and Train Target Model
# ============================================================================
print("=" * 70)
print("CELL 3: Load PROTEIN Dataset and Train Target Model")
print("=" * 70)

# Shuffled full dataset plus its train/val/test slices.
dataset, train_dataset, val_dataset, test_dataset = load_protein_dataset()

split_sizes = "/".join(
    str(len(split)) for split in (train_dataset, val_dataset, test_dataset)
)
print("\nDataset: PROTEINS (Graph Classification)")
print(f"  Total Graphs: {len(dataset)}")
print(f"  Num Classes: {dataset.num_classes}")
print(f"  Num Features: {dataset.num_features}")
print(f"  Train/Val/Test: {split_sizes}")

# Create data loaders: only the training loader shuffles; validation and test
# keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader, test_loader = (
    DataLoader(split, batch_size=32, shuffle=False)
    for split in (val_dataset, test_dataset)
)

# Train target model (the model whose ownership the verifier will later check).
print("\nTraining TARGET model (GCN for Graph Classification)...")
target_model = train_graph_classifier(
    GCNGraphClassifier(
        num_features=dataset.num_features,
        hidden_channels=64,
        num_classes=dataset.num_classes,
    ),
    train_loader,
    val_loader,
    epochs=50,
)

# Save target model (weights only).
target_path = base_dir / "models" / "target" / "gcn_graph_target.pt"
torch.save(target_model.state_dict(), target_path)
print(f"\n✓ Target model saved to {target_path}\n")

CELL 3: Load PROTEIN Dataset and Train Target Model


Downloading https://www.chrsmrrs.com/graphkerneldatasets/PROTEINS.zip
Processing...
Done!



Dataset: PROTEINS (Graph Classification)
  Total Graphs: 1113
  Num Classes: 2
  Num Features: 3
  Train/Val/Test: 890/111/112

Training TARGET model (GCN for Graph Classification)...
    Epoch 10/50 | Loss: 0.6248 | Val Acc: 0.676
    Epoch 20/50 | Loss: 0.6122 | Val Acc: 0.649
    Epoch 30/50 | Loss: 0.6112 | Val Acc: 0.694
    Epoch 40/50 | Loss: 0.6128 | Val Acc: 0.703
    Epoch 50/50 | Loss: 0.6056 | Val Acc: 0.649

✓ Target model saved to /content/gnnfingers_graph_classification/models/target/gcn_graph_target.pt



In [None]:
# ============================================================================
# CELL 4: Generate 2 Positive Models (Fine-tuned)
# ============================================================================
print("=" * 70)
print("CELL 4: Generate Positive Models (Fine-tuned Clones)")
print("=" * 70)

def clone_and_finetune_graph(model, train_loader, seed, finetune_epochs=10, lr=0.0001):
    """Clone a graph classifier and fine-tune the copy.

    The copy is made with ``copy.deepcopy`` so it has the same architecture and
    weights as ``model`` whatever its class or layer sizes are. The original
    object is not modified.

    Args:
        model: Trained module called as ``model(x, edge_index, batch)``.
        train_loader: Iterable of batches exposing ``x``, ``edge_index``,
            ``batch`` and ``y``.
        seed: Seed applied to torch's global RNG before cloning, so each clone
            sees its own reproducible random stream during fine-tuning.
        finetune_epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate used for fine-tuning.

    Returns:
        The fine-tuned copy, in train mode.
    """
    import copy  # stdlib; imported locally so this cell needs no extra top-level import

    torch.manual_seed(seed)
    cloned = copy.deepcopy(model)
    # deepcopy also copies the train/eval flag, so set train mode explicitly.
    cloned.train()

    optimizer = torch.optim.Adam(cloned.parameters(), lr=lr)

    for _ in range(finetune_epochs):
        for data in train_loader:
            optimizer.zero_grad()
            out = cloned(data.x, data.edge_index, data.batch)
            loss = F.cross_entropy(out, data.y)
            loss.backward()
            optimizer.step()

    return cloned

positive_models = []
positive_paths = []
positive_dir = base_dir / "models" / "positive"

# Two clones of the target, fine-tuned under seeds 100 and 101.
for i, finetune_seed in enumerate((100, 101)):
    print(f"\nCreating POSITIVE model {i+1} (fine-tuned clone)...")
    pos_model = clone_and_finetune_graph(
        target_model, train_loader, seed=finetune_seed, finetune_epochs=10
    )
    positive_models.append(pos_model)

    # Save model (weights only) and remember where it went.
    pos_path = positive_dir / f"gcn_graph_pos_{i}.pt"
    torch.save(pos_model.state_dict(), pos_path)
    positive_paths.append(pos_path)
    print(f"  ✓ Saved to {pos_path}")

print("\n")


CELL 4: Generate Positive Models (Fine-tuned Clones)

Creating POSITIVE model 1 (fine-tuned clone)...
  ✓ Saved to /content/gnnfingers_graph_classification/models/positive/gcn_graph_pos_0.pt

Creating POSITIVE model 2 (fine-tuned clone)...
  ✓ Saved to /content/gnnfingers_graph_classification/models/positive/gcn_graph_pos_1.pt




In [None]:
# ============================================================================
# CELL 5: Generate 2 Negative Models (Independent)
# ============================================================================
print("=" * 70)
print("CELL 5: Generate Negative Models (Independent Training)")
print("=" * 70)

negative_models = []
negative_paths = []

# One entry per independently trained model:
# (announcement, torch seed, model class, checkpoint file name).
negative_specs = [
    (
        "\nCreating NEGATIVE model 1 (fresh GCN, different seed)...",
        200,
        GCNGraphClassifier,
        "gcn_graph_neg_0.pt",
    ),
    (
        "Creating NEGATIVE model 2 (GraphSAGE, different architecture)...",
        201,
        GraphSAGEClassifier,
        "sage_graph_neg_1.pt",
    ),
]

for announcement, neg_seed, model_cls, checkpoint_name in negative_specs:
    print(announcement)
    # Seed before construction so the initial weights are reproducible.
    torch.manual_seed(neg_seed)
    neg_model = model_cls(
        num_features=dataset.num_features,
        hidden_channels=64,
        num_classes=dataset.num_classes,
    )
    neg_model = train_graph_classifier(
        neg_model, train_loader, val_loader, epochs=50, verbose=False
    )
    negative_models.append(neg_model)

    neg_path = base_dir / "models" / "negative" / checkpoint_name
    torch.save(neg_model.state_dict(), neg_path)
    negative_paths.append(neg_path)
    print(f"  ✓ Saved to {neg_path}\n")

# Keep the individual names bound for any code that refers to them directly.
neg_model_1, neg_model_2 = negative_models
neg_path_1, neg_path_2 = negative_paths

CELL 5: Generate Negative Models (Independent Training)

Creating NEGATIVE model 1 (fresh GCN, different seed)...
  ✓ Saved to /content/gnnfingers_graph_classification/models/negative/gcn_graph_neg_0.pt

Creating NEGATIVE model 2 (GraphSAGE, different architecture)...
  ✓ Saved to /content/gnnfingers_graph_classification/models/negative/sage_graph_neg_1.pt



In [None]:
# ============================================================================
# CELL 6: Create Synthetic Fingerprints (Random Graphs)
# ============================================================================
print("=" * 70)
print("CELL 6: Create Synthetic Fingerprints for Graph Classification")
print("=" * 70)

# Number of synthetic query graphs. Each one contributes one block of class
# outputs to a model's response vector.
num_fingerprints = 5
# Number of nodes in every synthetic fingerprint graph.
nodes_per_fp = 20

def create_random_graph_fingerprint(num_nodes, num_features, sparsity=0.3):
    """
    Create random synthetic graph fingerprint.
    For graph classification, each fingerprint is a complete graph object.

    Node features are drawn from torch's global RNG and the edge set from
    NumPy's global RNG, so both must be seeded for reproducible fingerprints.

    Args:
        num_nodes: Number of nodes in the graph.
        num_features: Length of each node's feature vector.
        sparsity: Fraction of the ``num_nodes * (num_nodes - 1) / 2`` possible
            undirected edges to include (despite the name, larger means
            denser). At least one edge is created whenever the graph has room
            for one; the count is capped at the number of possible edges.

    Returns:
        Tuple ``(x, edge_index, batch)``:
        ``x`` is a ``[num_nodes, num_features]`` float tensor of standard-normal
        features; ``edge_index`` is a ``[2, 2 * num_edges]`` long tensor holding
        every sampled edge in both directions, with no self-loops and no
        duplicates; ``batch`` is an all-zero ``[num_nodes]`` long tensor that
        assigns every node to graph 0.
    """
    # Random node features
    x = torch.randn(num_nodes, num_features)

    # Random sparse edges
    num_possible_edges = num_nodes * (num_nodes - 1) // 2
    num_edges = min(num_possible_edges, max(1, int(num_possible_edges * sparsity)))

    if num_edges > 0:
        # Enumerate all unordered pairs (u < v) once and sample without
        # replacement, so exactly num_edges distinct edges are produced.
        # Drawing random endpoints and rejecting collisions without retrying
        # would return fewer edges than requested.
        upper_rows, upper_cols = np.triu_indices(num_nodes, k=1)
        chosen = np.random.choice(num_possible_edges, size=num_edges, replace=False)
        src = torch.as_tensor(upper_rows[chosen], dtype=torch.long)
        dst = torch.as_tensor(upper_cols[chosen], dtype=torch.long)
        # Store both directions to make the graph undirected.
        edge_index = torch.stack(
            [torch.cat([src, dst]), torch.cat([dst, src])], dim=0
        ).contiguous()
    else:
        # Graphs with fewer than two nodes cannot have an edge.
        edge_index = torch.zeros((2, 0), dtype=torch.long)

    # Batch tensor (all nodes belong to graph 0)
    batch = torch.zeros(num_nodes, dtype=torch.long)

    return x, edge_index, batch

print(f"\nCreating {num_fingerprints} random graph fingerprints...")

# Each fingerprint is an (x, edge_index, batch) tuple ready to be passed
# straight to a model's forward().
fingerprints = []
for i in range(num_fingerprints):
    fingerprint = create_random_graph_fingerprint(
        nodes_per_fp, dataset.num_features, sparsity=0.25
    )
    x, edge_index, batch = fingerprint
    fingerprints.append(fingerprint)
    # edge_index stores both directions, hence the division by two.
    print(f"  ✓ FP {i+1}: nodes={x.shape[0]}, edges={edge_index.shape[1]//2}")

# Save fingerprints (the whole list in one file).
fp_path = base_dir / "fingerprints" / "graph_fingerprints.pt"
torch.save(fingerprints, fp_path)
print(f"\n✓ Fingerprints saved to {fp_path}\n")

CELL 6: Create Synthetic Fingerprints for Graph Classification

Creating 5 random graph fingerprints...
  ✓ FP 1: nodes=20, edges=39
  ✓ FP 2: nodes=20, edges=40
  ✓ FP 3: nodes=20, edges=40
  ✓ FP 4: nodes=20, edges=36
  ✓ FP 5: nodes=20, edges=38

✓ Fingerprints saved to /content/gnnfingers_graph_classification/fingerprints/graph_fingerprints.pt



In [None]:

# ============================================================================
# CELL 7: Collect Model Response Vectors (Graph Predictions)
# ============================================================================
print("=" * 70)
print("CELL 7: Collect Model Response Vectors (Graph Predictions)")
print("=" * 70)

def get_graph_response_vector(model, fingerprints):
    """
    Run the model on every fingerprint and join the outputs into one vector.

    Args:
        model: Module called as ``model(x, edge_index, batch)``. It is switched
            to eval mode and left there.
        fingerprints: Iterable of ``(x, edge_index, batch)`` tuples.

    Returns:
        1-D tensor holding each fingerprint's flattened model output, in
        fingerprint order. Computed under ``torch.no_grad()``.
    """
    model.eval()

    with torch.no_grad():
        per_graph_outputs = [
            model(nodes, edges, assignment).reshape(-1)
            for nodes, edges, assignment in fingerprints
        ]

    return torch.cat(per_graph_outputs)

# Collect responses from all models, keyed 'target', 'pos_<i>' and 'neg_<i>'.
print("\nCollecting graph prediction responses from TARGET model...")
all_responses = {'target': get_graph_response_vector(target_model, fingerprints)}

for i, pos_model in enumerate(positive_models):
    print(f"Collecting responses from POSITIVE model {i}...")
    pos_key = f'pos_{i}'
    all_responses[pos_key] = get_graph_response_vector(pos_model, fingerprints)

for i, neg_model in enumerate(negative_models):
    print(f"Collecting responses from NEGATIVE model {i}...")
    neg_key = f'neg_{i}'
    all_responses[neg_key] = get_graph_response_vector(neg_model, fingerprints)

# Every model is queried with the same fingerprints, so the target's vector
# length is the common response dimension.
response_dim = all_responses['target'].shape[0]
print(f"\n✓ Response vector dimension: {response_dim}")
print(f"  (= {num_fingerprints} fingerprints × {dataset.num_classes} classes)\n")


CELL 7: Collect Model Response Vectors (Graph Predictions)

Collecting graph prediction responses from TARGET model...
Collecting responses from POSITIVE model 0...
Collecting responses from POSITIVE model 1...
Collecting responses from NEGATIVE model 0...
Collecting responses from NEGATIVE model 1...

✓ Response vector dimension: 10
  (= 5 fingerprints × 2 classes)



In [None]:
# ============================================================================
# CELL 8: Build Training Data and Train Verifier
# ============================================================================
print("=" * 70)
print("CELL 8: Build Training Data and Train Verifier")
print("=" * 70)

# Build training dataset for verifier.
# One (response key, label, log line) entry per model: label 1 for the target
# and its fine-tuned clones, label 0 for the independently trained models.
verifier_samples = [('target', 1, "\n✓ Target model (label=1)")]
verifier_samples += [
    (f'pos_{i}', 1, f"✓ Positive model {i} (label=1)")
    for i in range(len(positive_models))
]
verifier_samples += [
    (f'neg_{i}', 0, f"✓ Negative model {i} (label=0)")
    for i in range(len(negative_models))
]

response_rows = []
labels = []
for response_key, label, log_line in verifier_samples:
    response_rows.append(all_responses[response_key])
    labels.append(label)
    print(log_line)

# One row per model, one column per response-vector entry.
X_train = torch.stack(response_rows, dim=0)
y_train = torch.tensor(labels, dtype=torch.float32)

print(f"\nTraining data shape: X={X_train.shape}, y={y_train.shape}")
print(f"  Class 1 (positive): {(y_train == 1).sum()} samples")
print(f"  Class 0 (negative): {(y_train == 0).sum()} samples")

# Train verifier: full-batch gradient descent on all response vectors at once.
print("\nTraining VERIFIER for Graph Classification task...")
verifier = Verifier(input_dim=X_train.size(1), hidden_dim=32)
optimizer = torch.optim.Adam(verifier.parameters(), lr=0.01)
# The verifier already applies a sigmoid, so plain BCE is used on its output.
loss_fn = nn.BCELoss()

num_epochs = 200
log_every = 50
verifier.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()
    y_pred = verifier(X_train)
    loss = loss_fn(y_pred, y_train)
    loss.backward()
    optimizer.step()

    completed = epoch + 1
    if completed % log_every == 0:
        print(f"  Epoch {completed}/{num_epochs} | Loss: {loss.item():.4f}")

# Save verifier (weights only).
verifier_path = base_dir / "verifier" / "verifier_graph.pt"
torch.save(verifier.state_dict(), verifier_path)
print(f"\n✓ Verifier saved to {verifier_path}\n")


CELL 8: Build Training Data and Train Verifier

✓ Target model (label=1)
✓ Positive model 0 (label=1)
✓ Positive model 1 (label=1)
✓ Negative model 0 (label=0)
✓ Negative model 1 (label=0)

Training data shape: X=torch.Size([5, 10]), y=torch.Size([5])
  Class 1 (positive): 3 samples
  Class 0 (negative): 2 samples

Training VERIFIER for Graph Classification task...
  Epoch 50/200 | Loss: 0.0120
  Epoch 100/200 | Loss: 0.0004
  Epoch 150/200 | Loss: 0.0002
  Epoch 200/200 | Loss: 0.0002

✓ Verifier saved to /content/gnnfingers_graph_classification/verifier/verifier_graph.pt



In [None]:
# ============================================================================
# CELL 9: Evaluate Verifier and Calculate Metrics
# ============================================================================
print("=" * 70)
print("CELL 9: EVALUATE VERIFIER - Calculate TP/TN/Accuracy")
print("=" * 70)

# NOTE: predictions are made on X_train, i.e. the same samples the verifier
# was fitted on, so these are training-set metrics, not held-out ones.
verifier.eval()
with torch.no_grad():
    y_pred_probs = verifier(X_train)
    y_pred = (y_pred_probs >= 0.5).long()  # threshold probabilities at 0.5
    y_true = y_train.long()

# Calculate confusion matrix from boolean masks (labels are only 0 or 1).
predicted_positive = y_pred == 1
actually_positive = y_true == 1
TP = int((predicted_positive & actually_positive).sum())
TN = int((~predicted_positive & ~actually_positive).sum())
FP = int((predicted_positive & ~actually_positive).sum())
FN = int((~predicted_positive & actually_positive).sum())

total = len(y_true)
accuracy = (TP + TN) / total
# Fall back to 0 when a denominator would be zero.
precision = TP / (TP + FP) if (TP + FP) > 0 else 0
recall = TP / (TP + FN) if (TP + FN) > 0 else 0

print("\n" + "=" * 70)
print("CONFUSION MATRIX")
print("=" * 70)
print(f"  TP (True Positive):   {TP}   ← Positive models correctly identified")
print(f"  TN (True Negative):   {TN}   ← Negative models correctly identified")
print(f"  FP (False Positive):  {FP}   ← Negative incorrectly as positive")
print(f"  FN (False Negative):  {FN}   ← Positive incorrectly as negative")

print("\n" + "=" * 70)
print("METRICS")
print("=" * 70)
print(f"  Accuracy:   {accuracy:.3f}  (TP+TN)/Total = ({TP}+{TN})/{total}")
print(f"  Precision:  {precision:.3f}  TP/(TP+FP) = {TP}/({TP}+{FP})")
print(f"  Recall:     {recall:.3f}   TP/(TP+FN) = {TP}/({TP}+{FN})")
print()


CELL 9: EVALUATE VERIFIER - Calculate TP/TN/Accuracy

CONFUSION MATRIX
  TP (True Positive):   3   ← Positive models correctly identified
  TN (True Negative):   2   ← Negative models correctly identified
  FP (False Positive):  0   ← Negative incorrectly as positive
  FN (False Negative):  0   ← Positive incorrectly as negative

METRICS
  Accuracy:   1.000  (TP+TN)/Total = (3+2)/5
  Precision:  1.000  TP/(TP+FP) = 3/(3+0)
  Recall:     1.000   TP/(TP+FN) = 3/(3+0)

