In [None]:
"""
GNNFingers - Link Prediction on Cora Dataset
=============================================


PIPELINE:
1. Load Cora dataset (for link prediction)
2. Train target GCN model for link prediction
3. Generate 2 positive models (fine-tuned clones)
4. Generate 2 negative models (fresh GCN and GraphSAGE)
5. Create 5 synthetic fingerprints (random small graphs)
6. Collect model responses (edge predictions)
7. Train verifier (binary classifier)
8. Evaluate: TP, TN, Accuracy
"""

# ============================================================================
# CELL 1: Setup and Install Dependencies
# ============================================================================
# Cell banner.
print("=" * 70)
print("CELL 1: Installing Dependencies and Creating Folder Structure")
print("=" * 70)

import subprocess
import sys

print("Installing packages...")
# Run pip through the interpreter that is executing this notebook so the
# packages land in the same environment; check_call raises
# subprocess.CalledProcessError if pip exits with a non-zero status.
# NOTE(review): torch_scatter and torch_sparse are installed here but are not
# imported by any cell visible in this notebook - confirm they are required.
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
                       "torch", "torch_geometric", "torch_scatter", "torch_sparse"])

import os
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime
from sklearn.metrics import accuracy_score, roc_auc_score

# Create base directory for link prediction.
# parents=True lets the notebook run where /content does not exist yet
# (i.e. outside Colab); exist_ok=True keeps re-runs of this cell harmless.
base_dir = Path("/content/gnnfingers_link_prediction")
base_dir.mkdir(parents=True, exist_ok=True)

# Create subdirectories (one place to edit if the layout changes).
for _subdir in (
    "data",
    "models/target",
    "models/positive",
    "models/negative",
    "fingerprints",
    "verifier",
    "results",
):
    (base_dir / _subdir).mkdir(parents=True, exist_ok=True)

print("\n✓ Directory structure created:")
print(f"  {base_dir}/")
print("    ├── data/")
print("    ├── models/")
print("    │   ├── target/")
print("    │   ├── positive/")
print("    │   └── negative/")
print("    ├── fingerprints/")
print("    ├── verifier/")
print("    └── results/\n")

# Set random seeds.
# Python's own `random` module is seeded as well as torch and NumPy.
# NOTE(review): some PyTorch Geometric sampling helpers are believed to draw
# from `random`; confirm for the installed version.
import random

random.seed(42)
torch.manual_seed(42)
np.random.seed(42)

print("✓ Dependencies installed and directories ready\n")

CELL 1: Installing Dependencies and Creating Folder Structure
Installing packages...

✓ Directory structure created:
  /content/gnnfingers_link_prediction/
    ├── data/
    ├── models/
    │   ├── target/
    │   ├── positive/
    │   └── negative/
    ├── fingerprints/
    ├── verifier/
    └── results/

✓ Dependencies installed and directories ready



In [None]:
# ============================================================================
# CELL 2: Define Link Prediction Models and Utilities
# ============================================================================
print("=" * 70)
print("CELL 2: Define Link Prediction Models")
print("=" * 70)

from torch_geometric.nn import GCNConv, SAGEConv
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import negative_sampling, train_test_split_edges

class LinkPredictorGCN(nn.Module):
    """Two-layer GCN encoder paired with an inner-product edge decoder."""

    def __init__(self, in_channels, hidden_channels=64):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)

    def encode(self, x, edge_index):
        """Return node embeddings: conv1 -> ReLU -> conv2."""
        hidden = F.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

    def decode(self, z, edge_index):
        """Score every candidate edge in ``edge_index``.

        The score of edge (u, v) is the dot product z_u · z_v. No sigmoid is
        applied here, so the values are raw scores rather than probabilities.
        """
        heads, tails = edge_index
        return torch.sum(z[heads] * z[tails], dim=1)

    def forward(self, x, edge_index):
        """Return the node embeddings (same as ``encode``)."""
        return self.encode(x, edge_index)

class LinkPredictorSAGE(nn.Module):
    """Two-layer GraphSAGE encoder paired with an inner-product edge decoder."""

    def __init__(self, in_channels, hidden_channels=64):
        super().__init__()
        self.sage1 = SAGEConv(in_channels, hidden_channels)
        self.sage2 = SAGEConv(hidden_channels, hidden_channels)

    def encode(self, x, edge_index):
        """Return node embeddings: sage1 -> ReLU -> sage2."""
        hidden = F.relu(self.sage1(x, edge_index))
        return self.sage2(hidden, edge_index)

    def decode(self, z, edge_index):
        """Return the raw dot-product score z_u · z_v for each edge (u, v).

        No sigmoid is applied; callers squash the scores themselves.
        """
        heads, tails = edge_index
        return torch.sum(z[heads] * z[tails], dim=1)

    def forward(self, x, edge_index):
        """Return the node embeddings (same as ``encode``)."""
        return self.encode(x, edge_index)

class Verifier(nn.Module):
    """Binary classifier over model response vectors.

    A three-layer MLP (input_dim -> hidden_dim -> 16 -> 1) with a final
    sigmoid, so the output is a probability in (0, 1) suitable for
    ``nn.BCELoss``.
    """

    def __init__(self, input_dim, hidden_dim=32):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 16)
        self.fc3 = nn.Linear(16, 1)

    def forward(self, x):
        """Return one probability per input row.

        Args:
            x: tensor whose last dimension is ``input_dim``.

        Returns:
            Tensor with the trailing size-1 dimension removed, i.e. shape
            ``(N,)`` for an ``(N, input_dim)`` batch.
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        # Drop only the trailing feature dimension. A bare squeeze() would
        # also collapse a batch of one to a 0-d tensor, which no longer
        # matches a (1,)-shaped target in BCELoss.
        return x.squeeze(-1)

def load_dataset_link_prediction(dataset_name="Cora"):
    """Fetch a Planetoid dataset and split its edges for link prediction.

    The dataset is stored under ``base_dir / "data"``. The first graph is
    split with 5% of edges held out for validation and 10% for testing.

    Returns:
        (data, dataset): the edge-split graph and the Planetoid dataset object.
    """
    # NOTE(review): the recorded run prints a warning pointing at the
    # train_test_split_edges call; the helper appears to be deprecated in
    # recent PyG releases - confirm before upgrading.
    dataset = Planetoid(name=dataset_name, root=str(base_dir / "data"))
    split_graph = train_test_split_edges(dataset[0], val_ratio=0.05, test_ratio=0.1)
    return split_graph, dataset

def get_link_labels(pos_edge_index, neg_edge_index):
    """Build the 0/1 target vector for a batch of positive and negative edges.

    Args:
        pos_edge_index: ``(2, P)`` tensor of positive edges.
        neg_edge_index: ``(2, Q)`` tensor of negative edges.

    Returns:
        Float tensor of length ``P + Q``: ones for the positive edges followed
        by zeros for the negative edges, on the same device as
        ``pos_edge_index``.
    """
    num_pos = pos_edge_index.size(1)
    num_neg = neg_edge_index.size(1)

    # Allocate on the edges' device so the labels can be combined with
    # predictions computed there without an explicit transfer.
    link_labels = torch.zeros(num_pos + num_neg, device=pos_edge_index.device)
    link_labels[:num_pos] = 1.0
    return link_labels

def train_link_predictor(model, data, epochs=50, lr=0.001, verbose=True):
    """Train a link prediction model in place with full-batch Adam.

    Each epoch encodes the graph using the training edges, scores the training
    edges (positives) and an equal number of freshly sampled non-edges
    (negatives), and minimises binary cross-entropy on those scores.

    Args:
        model: object exposing ``encode(x, edge_index)``,
            ``decode(z, edge_index)`` and ``parameters()``.
        data: edge-split graph providing ``x``, ``num_nodes``,
            ``train_pos_edge_index``, ``test_pos_edge_index`` and
            ``test_neg_edge_index``.
        epochs: number of optimisation steps.
        lr: Adam learning rate.
        verbose: when true, print loss and test AUC every 10 epochs.

    Returns:
        The same ``model`` instance, after training.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()

        # Get node embeddings
        z = model.encode(data.x, data.train_pos_edge_index)

        # Positive edges
        pos_pred = model.decode(z, data.train_pos_edge_index)

        # Negative sampling: one non-edge per training edge, redrawn each epoch
        neg_edge_index = negative_sampling(
            edge_index=data.train_pos_edge_index,
            num_nodes=data.num_nodes,
            num_neg_samples=data.train_pos_edge_index.size(1),
        )
        neg_pred = model.decode(z, neg_edge_index)

        # Binary cross-entropy on raw scores.
        # log(sigmoid(s)) and log(1 - sigmoid(s)) == log(sigmoid(-s)) are taken
        # through logsigmoid, which stays finite and keeps a usable gradient
        # for large |s|; the previous log(sigmoid(s) + 1e-15) form saturates.
        loss = -F.logsigmoid(pos_pred).mean() - F.logsigmoid(-neg_pred).mean()

        loss.backward()
        optimizer.step()

        if verbose and (epoch + 1) % 10 == 0:
            # Progress report only: nothing computed here feeds back into
            # training. model.train() at the top of the loop undoes eval().
            model.eval()
            with torch.no_grad():
                z = model.encode(data.x, data.train_pos_edge_index)

                # Test set evaluation
                pos_pred = torch.sigmoid(model.decode(z, data.test_pos_edge_index))
                neg_pred = torch.sigmoid(model.decode(z, data.test_neg_edge_index))

                preds = torch.cat([pos_pred, neg_pred]).cpu().numpy()
                labels = torch.cat([torch.ones(data.test_pos_edge_index.size(1)),
                                   torch.zeros(data.test_neg_edge_index.size(1))]).cpu().numpy()

                auc = roc_auc_score(labels, preds)

            print(f"    Epoch {epoch+1}/{epochs} | Loss: {loss.item():.4f} | Test AUC: {auc:.3f}")

    return model

print("✓ Link Prediction models and utilities defined\n")

CELL 2: Define Link Prediction Models
✓ Link Prediction models and utilities defined



In [None]:
# ============================================================================
# CELL 3: Load Dataset and Train Target Model
# ============================================================================
# Cell banner.
print("=" * 70)
print("CELL 3: Load Cora for Link Prediction and Train Target Model")
print("=" * 70)

# `data` (edge-split graph) and `dataset` are notebook-level globals that the
# later cells read.
dataset_name = "Cora"
data, dataset = load_dataset_link_prediction(dataset_name)

# Summary of the split. Edge counts are the number of columns in each
# positive edge_index tensor.
# NOTE(review): in the recorded run the train count (8976) is about twice what
# the val/test counts (263/527) imply for an 85% share, which suggests the
# training edges are stored in both directions - confirm.
print(f"\nDataset: {dataset_name} (Link Prediction)")
print(f"  Total Nodes: {data.num_nodes}")
print(f"  Total Features: {data.num_features}")
print(f"  Train Edges: {data.train_pos_edge_index.size(1)}")
print(f"  Val Edges: {data.val_pos_edge_index.size(1)}")
print(f"  Test Edges: {data.test_pos_edge_index.size(1)}")

# Train target model: the model whose ownership the fingerprints should verify
print(f"\nTraining TARGET model (GCN for Link Prediction)...")
target_model = LinkPredictorGCN(data.num_features, hidden_channels=64)
target_model = train_link_predictor(target_model, data, epochs=50)

# Save target model (weights only, via state_dict)
target_path = base_dir / "models" / "target" / "gcn_link_target.pt"
torch.save(target_model.state_dict(), target_path)
print(f"\n✓ Target model saved to {target_path}\n")

CELL 3: Load Cora for Link Prediction and Train Target Model


Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...
Done!
  data = train_test_split_edges(data, val_ratio=0.05, test_ratio=0.1)



Dataset: Cora (Link Prediction)
  Total Nodes: 2708
  Total Features: 1433
  Train Edges: 8976
  Val Edges: 263
  Test Edges: 527

Training TARGET model (GCN for Link Prediction)...
    Epoch 10/50 | Loss: 1.1942 | Test AUC: 0.878
    Epoch 20/50 | Loss: 0.9511 | Test AUC: 0.896
    Epoch 30/50 | Loss: 0.8982 | Test AUC: 0.907
    Epoch 40/50 | Loss: 0.8582 | Test AUC: 0.917
    Epoch 50/50 | Loss: 0.8521 | Test AUC: 0.918

✓ Target model saved to /content/gnnfingers_link_prediction/models/target/gcn_link_target.pt



In [None]:
# ============================================================================
# CELL 4: Generate 2 Positive Models (Fine-tuned)
# ============================================================================
print("=" * 70)
print("CELL 4: Generate Positive Models (Fine-tuned Clones)")
print("=" * 70)

def clone_and_finetune_link(model, data, seed, finetune_epochs=10, lr=0.0001):
    """Clone the target link predictor and fine-tune the copy.

    A fresh ``LinkPredictorGCN`` (64 hidden channels) is created, loaded with
    ``model``'s weights, and trained for a few more epochs with the same
    objective as the target. The original ``model`` is not modified.

    Args:
        model: trained ``LinkPredictorGCN`` whose ``state_dict`` is copied; it
            must have been built with 64 hidden channels for the load to
            succeed.
        data: edge-split graph used for fine-tuning.
        seed: value passed to ``torch.manual_seed`` before cloning.
        finetune_epochs: number of fine-tuning steps.
        lr: Adam learning rate for fine-tuning.

    Returns:
        The fine-tuned clone.
    """
    torch.manual_seed(seed)
    cloned = LinkPredictorGCN(data.num_features, hidden_channels=64)
    cloned.load_state_dict(model.state_dict())

    # Fine-tuning is the same optimisation loop as the initial training, so
    # reuse it rather than keeping a second copy that could drift out of sync.
    return train_link_predictor(
        cloned, data, epochs=finetune_epochs, lr=lr, verbose=False
    )

# Fine-tuned clones of the target ("positive" = derived from the target).
# Both lists are notebook-level globals; positive_models is read by later cells.
positive_models = []
positive_paths = []

for i in range(2):
    print(f"\nCreating POSITIVE model {i+1} (fine-tuned clone)...")
    # A distinct seed per clone (100, 101).
    pos_model = clone_and_finetune_link(target_model, data, seed=100+i, finetune_epochs=10)
    positive_models.append(pos_model)

    # Save model (weights only)
    pos_path = base_dir / "models" / "positive" / f"gcn_link_pos_{i}.pt"
    torch.save(pos_model.state_dict(), pos_path)
    positive_paths.append(pos_path)
    print(f"  ✓ Saved to {pos_path}")

print("\n")

CELL 4: Generate Positive Models (Fine-tuned Clones)

Creating POSITIVE model 1 (fine-tuned clone)...
  ✓ Saved to /content/gnnfingers_link_prediction/models/positive/gcn_link_pos_0.pt

Creating POSITIVE model 2 (fine-tuned clone)...
  ✓ Saved to /content/gnnfingers_link_prediction/models/positive/gcn_link_pos_1.pt




In [None]:
# ============================================================================
# CELL 5: Generate 2 Negative Models (Independent)
# ============================================================================
print("=" * 70)
print("CELL 5: Generate Negative Models (Independent Training)")
print("=" * 70)

# Independently trained models ("negative" = not derived from the target).
# Both lists are notebook-level globals; negative_models is read by later cells.
negative_models = []
negative_paths = []

# Negative 1: Fresh GCN
# Same architecture as the target, but torch is re-seeded (200) before the
# weights are initialised and the model is trained from scratch.
print("\nCreating NEGATIVE model 1 (fresh GCN, different seed)...")
torch.manual_seed(200)
neg_model_1 = LinkPredictorGCN(data.num_features, hidden_channels=64)
neg_model_1 = train_link_predictor(neg_model_1, data, epochs=50, verbose=False)
negative_models.append(neg_model_1)

neg_path_1 = base_dir / "models" / "negative" / "gcn_link_neg_0.pt"
torch.save(neg_model_1.state_dict(), neg_path_1)
negative_paths.append(neg_path_1)
print(f"  ✓ Saved to {neg_path_1}\n")

# Negative 2: GraphSAGE
# Different architecture, also trained from scratch with its own seed (201).
print("Creating NEGATIVE model 2 (GraphSAGE, different architecture)...")
torch.manual_seed(201)
neg_model_2 = LinkPredictorSAGE(data.num_features, hidden_channels=64)
neg_model_2 = train_link_predictor(neg_model_2, data, epochs=50, verbose=False)
negative_models.append(neg_model_2)

neg_path_2 = base_dir / "models" / "negative" / "sage_link_neg_1.pt"
torch.save(neg_model_2.state_dict(), neg_path_2)
negative_paths.append(neg_path_2)
print(f"  ✓ Saved to {neg_path_2}\n")

CELL 5: Generate Negative Models (Independent Training)

Creating NEGATIVE model 1 (fresh GCN, different seed)...
  ✓ Saved to /content/gnnfingers_link_prediction/models/negative/gcn_link_neg_0.pt

Creating NEGATIVE model 2 (GraphSAGE, different architecture)...
  ✓ Saved to /content/gnnfingers_link_prediction/models/negative/sage_link_neg_1.pt



In [None]:
# ============================================================================
# CELL 6: Create Synthetic Fingerprints (Small Graphs)
# ============================================================================
# Cell banner.
print("=" * 70)
print("CELL 6: Create Synthetic Fingerprints for Link Prediction")
print("=" * 70)

# Number of synthetic fingerprint graphs to create, and nodes in each one.
num_fingerprints = 5
nodes_per_fp = 16

def create_random_fingerprint(num_nodes, num_features, sparsity=0.3):
    """Create a random synthetic graph to use as a fingerprint.

    Node features are standard-normal. Edges are distinct node pairs with no
    self-loops, each stored once in a random direction.

    Args:
        num_nodes: number of nodes in the graph.
        num_features: feature dimension of every node.
        sparsity: fraction of the ``num_nodes * (num_nodes - 1) / 2`` possible
            undirected pairs to include (at least one edge when any pair
            exists, never more than all of them).

    Returns:
        ``(x, edge_index)``: a ``(num_nodes, num_features)`` float tensor and
        a ``(2, E)`` long tensor. ``E`` is 0 when the graph has fewer than two
        nodes.
    """
    # Random node features
    x = torch.randn(num_nodes, num_features)

    num_possible_edges = num_nodes * (num_nodes - 1) // 2
    if num_possible_edges <= 0:
        # Fewer than two nodes: no pair can be formed.
        return x, torch.zeros((2, 0), dtype=torch.long)

    num_edges = min(num_possible_edges,
                    max(1, int(num_possible_edges * sparsity)))

    # Enumerate every unordered pair once and draw without replacement, so
    # exactly num_edges distinct edges come back. Drawing endpoints
    # independently and discarding collisions would yield fewer edges than
    # requested.
    cand_u, cand_v = np.triu_indices(num_nodes, k=1)
    chosen = np.random.choice(num_possible_edges, size=num_edges, replace=False)
    u, v = cand_u[chosen], cand_v[chosen]

    # Give each edge a random direction, since enumeration always has u < v.
    flip = np.random.rand(num_edges) < 0.5
    src = np.where(flip, v, u)
    dst = np.where(flip, u, v)

    edge_index = torch.from_numpy(np.stack([src, dst])).long().contiguous()
    return x, edge_index

# List of (node_features, edge_index) tuples, one per fingerprint graph.
# Read by the response-collection cell.
fingerprints = []
print(f"\nCreating {num_fingerprints} random fingerprints...")

for i in range(num_fingerprints):
    # Feature width matches the dataset so the trained models can consume x.
    x, edge_index = create_random_fingerprint(nodes_per_fp, data.num_features, sparsity=0.25)
    fingerprints.append((x, edge_index))
    print(f"  ✓ FP {i+1}: nodes={x.shape[0]}, edges={edge_index.shape[1]}")

# Save fingerprints (the whole list of tensor tuples in one file)
fp_path = base_dir / "fingerprints" / "fingerprints.pt"
torch.save(fingerprints, fp_path)
print(f"\n✓ Fingerprints saved to {fp_path}\n")

CELL 6: Create Synthetic Fingerprints for Link Prediction

Creating 5 random fingerprints...
  ✓ FP 1: nodes=16, edges=26
  ✓ FP 2: nodes=16, edges=27
  ✓ FP 3: nodes=16, edges=23
  ✓ FP 4: nodes=16, edges=25
  ✓ FP 5: nodes=16, edges=25

✓ Fingerprints saved to /content/gnnfingers_link_prediction/fingerprints/fingerprints.pt



In [None]:
# ============================================================================
# CELL 7: Collect Model Response Vectors (Edge Predictions)
# ============================================================================
print("=" * 70)
print("CELL 7: Collect Model Response Vectors (Edge Predictions)")
print("=" * 70)

def get_link_response_vector(model, fingerprints, num_edge_samples=10, seed=0):
    """Query a model on the fingerprints and collect its edge scores.

    For every fingerprint graph the model encodes the nodes and scores a fixed
    set of query node pairs. The pairs come from a private RNG seeded with
    ``seed``, so every model queried with the same fingerprints and seed is
    asked about the same pairs in the same order. This keeps response vectors
    equal in length and comparable column by column.

    Args:
        model: object exposing ``eval()``, ``encode(x, edge_index)`` and
            ``decode(z, edge_index)``.
        fingerprints: iterable of ``(x, edge_index)`` tuples.
        num_edge_samples: upper bound on query pairs per fingerprint; the
            actual count is ``min(num_edge_samples, n * (n - 1) // 4)`` for a
            fingerprint with ``n`` nodes.
        seed: seed for the query-pair RNG. The global NumPy RNG is not touched.

    Returns:
        1-D tensor of raw edge scores for all fingerprints, concatenated in
        order, or an empty tensor when nothing could be queried.
    """
    model.eval()
    rng = np.random.RandomState(seed)
    responses = []

    with torch.no_grad():
        for fp_x, fp_edge in fingerprints:
            # Get node embeddings from fingerprint
            z = model.encode(fp_x, fp_edge)

            num_nodes = fp_x.shape[0]
            num_queries = min(num_edge_samples, num_nodes * (num_nodes - 1) // 4)
            if num_queries <= 0:
                continue

            # Pick a source node, then a non-zero offset modulo num_nodes:
            # the destination always differs from the source, so no draw is
            # discarded and the number of scores is exactly num_queries.
            src = rng.randint(0, num_nodes, size=num_queries)
            offset = rng.randint(1, num_nodes, size=num_queries)
            dst = (src + offset) % num_nodes

            sampled_edge_index = torch.from_numpy(np.stack([src, dst])).long()
            edge_scores = model.decode(z, sampled_edge_index)
            responses.append(edge_scores.flatten())

    # Concatenate all responses into one vector
    response_vector = torch.cat(responses) if responses else torch.tensor([])
    return response_vector

# Collect responses from all models.
# Maps a model key ('target', 'pos_<i>', 'neg_<i>') to its 1-D response
# vector; the verifier cell looks entries up by these exact keys.
all_responses = {}

print("\nCollecting link prediction responses from TARGET model...")
all_responses['target'] = get_link_response_vector(target_model, fingerprints)

for i, pos_model in enumerate(positive_models):
    print(f"Collecting responses from POSITIVE model {i}...")
    all_responses[f'pos_{i}'] = get_link_response_vector(pos_model, fingerprints)

for i, neg_model in enumerate(negative_models):
    print(f"Collecting responses from NEGATIVE model {i}...")
    all_responses[f'neg_{i}'] = get_link_response_vector(neg_model, fingerprints)

# Reports the length of the target's vector only.
print(f"\n✓ Response vector dimension: {all_responses['target'].shape[0]}\n")


CELL 7: Collect Model Response Vectors (Edge Predictions)

Collecting link prediction responses from TARGET model...
Collecting responses from POSITIVE model 0...
Collecting responses from POSITIVE model 1...
Collecting responses from NEGATIVE model 0...
Collecting responses from NEGATIVE model 1...

✓ Response vector dimension: 44



In [None]:
# ============================================================================
# CELL 8: Build Training Data and Train Verifier
# ============================================================================
# Cell banner.
print("=" * 70)
print("CELL 8: Build Training Data and Train Verifier")
print("=" * 70)

# Build training dataset for verifier: one row per model (its response
# vector) and one label per row.
X_train = []
y_train = []

# Positive samples (label = 1): the target itself and its fine-tuned clones
X_train.append(all_responses['target'].unsqueeze(0))
y_train.append(1)
print("\n✓ Target model (label=1)")

for i in range(len(positive_models)):
    X_train.append(all_responses[f'pos_{i}'].unsqueeze(0))
    y_train.append(1)
    print(f"✓ Positive model {i} (label=1)")

# Negative samples (label = 0): independently trained models
for i in range(len(negative_models)):
    X_train.append(all_responses[f'neg_{i}'].unsqueeze(0))
    y_train.append(0)
    print(f"✓ Negative model {i} (label=0)")

# Debug shapes
print("\nChecking tensor shapes before concatenation:")
for i, x in enumerate(X_train):
    print(f"  Tensor {i}: {x.shape}")

# Ensure all same feature dimension by truncating every row to the narrowest
# width. Because min_dim is the minimum width, no row is ever narrower than
# it, so the pad branch only runs with a pad amount of 0 (a no-op).
# NOTE(review): the recorded run shows row widths between 44 and 48, so
# truncation discards trailing scores there; whether column j refers to the
# same query pair in every row should be confirmed.
min_dim = min(x.shape[1] if x.ndim > 1 else x.shape[0] for x in X_train)
X_train = [x[:, :min_dim] if x.shape[1] > min_dim else
           torch.nn.functional.pad(x, (0, min_dim - x.shape[1]))
           for x in X_train]

X_train = torch.cat(X_train, dim=0)
# Float labels, as required by BCELoss.
y_train = torch.tensor(y_train, dtype=torch.float32)

print(f"\nTraining data shape: X={X_train.shape}, y={y_train.shape}")
print(f"  Class 1 (positive): {(y_train == 1).sum()} samples")
print(f"  Class 0 (negative): {(y_train == 0).sum()} samples")

# Train verifier: full-batch Adam on all rows for a fixed number of epochs
print(f"\nTraining VERIFIER for Link Prediction task...")
verifier = Verifier(input_dim=X_train.shape[1], hidden_dim=32)
optimizer = torch.optim.Adam(verifier.parameters(), lr=0.01)
# BCELoss expects probabilities; Verifier.forward ends in a sigmoid.
loss_fn = nn.BCELoss()

num_epochs = 200
for epoch in range(num_epochs):
    verifier.train()
    optimizer.zero_grad()

    y_pred = verifier(X_train)
    loss = loss_fn(y_pred, y_train)
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 50 == 0:
        print(f"  Epoch {epoch+1}/{num_epochs} | Loss: {loss.item():.4f}")

# Save verifier (weights only)
verifier_path = base_dir / "verifier" / "verifier_link.pt"
torch.save(verifier.state_dict(), verifier_path)
print(f"\n✓ Verifier saved to {verifier_path}\n")


CELL 8: Build Training Data and Train Verifier

✓ Target model (label=1)
✓ Positive model 0 (label=1)
✓ Positive model 1 (label=1)
✓ Negative model 0 (label=0)
✓ Negative model 1 (label=0)

Checking tensor shapes before concatenation:
  Tensor 0: torch.Size([1, 44])
  Tensor 1: torch.Size([1, 46])
  Tensor 2: torch.Size([1, 48])
  Tensor 3: torch.Size([1, 47])
  Tensor 4: torch.Size([1, 48])

Training data shape: X=torch.Size([5, 44]), y=torch.Size([5])
  Class 1 (positive): 3 samples
  Class 0 (negative): 2 samples

Training VERIFIER for Link Prediction task...
  Epoch 50/200 | Loss: 0.0000
  Epoch 100/200 | Loss: 0.0000
  Epoch 150/200 | Loss: 0.0000
  Epoch 200/200 | Loss: 0.0000

✓ Verifier saved to /content/gnnfingers_link_prediction/verifier/verifier_link.pt



In [None]:
# ============================================================================
# CELL 9: Evaluate Verifier and Calculate Metrics
# ============================================================================
# Cell banner.
print("=" * 70)
print("CELL 9: EVALUATE VERIFIER - Calculate TP/TN/Accuracy")
print("=" * 70)

# NOTE(review): predictions are made on X_train, the same rows the verifier
# was fitted on, so the metrics below measure fit to the training data rather
# than generalisation to unseen models.
verifier.eval()
with torch.no_grad():
    y_pred_probs = verifier(X_train)
    # Threshold the probabilities at 0.5 to get hard 0/1 predictions.
    y_pred = (y_pred_probs >= 0.5).long()
    y_true = y_train.long()

# Calculate confusion matrix (counts as Python ints)
TP = ((y_pred == 1) & (y_true == 1)).sum().item()
TN = ((y_pred == 0) & (y_true == 0)).sum().item()
FP = ((y_pred == 1) & (y_true == 0)).sum().item()
FN = ((y_pred == 0) & (y_true == 1)).sum().item()

total = len(y_true)
accuracy = (TP + TN) / total
# Precision and recall fall back to 0 when their denominator is zero.
precision = TP / (TP + FP) if (TP + FP) > 0 else 0
recall = TP / (TP + FN) if (TP + FN) > 0 else 0

print("\n" + "="*70)
print("CONFUSION MATRIX")
print("="*70)
print(f"  TP (True Positive):   {TP}   ← Positive models correctly identified")
print(f"  TN (True Negative):   {TN}   ← Negative models correctly identified")
print(f"  FP (False Positive):  {FP}   ← Negative incorrectly as positive")
print(f"  FN (False Negative):  {FN}   ← Positive incorrectly as negative")

print("\n" + "="*70)
print("METRICS")
print("="*70)
print(f"  Accuracy:   {accuracy:.3f}  (TP+TN)/Total = ({TP}+{TN})/{total}")
print(f"  Precision:  {precision:.3f}  TP/(TP+FP) = {TP}/({TP}+{FP})")
print(f"  Recall:     {recall:.3f}   TP/(TP+FN) = {TP}/({TP}+{FN})")
print()


CELL 9: EVALUATE VERIFIER - Calculate TP/TN/Accuracy

CONFUSION MATRIX
  TP (True Positive):   3   ← Positive models correctly identified
  TN (True Negative):   2   ← Negative models correctly identified
  FP (False Positive):  0   ← Negative incorrectly as positive
  FN (False Negative):  0   ← Positive incorrectly as negative

METRICS
  Accuracy:   1.000  (TP+TN)/Total = (3+2)/5
  Precision:  1.000  TP/(TP+FP) = 3/(3+0)
  Recall:     1.000   TP/(TP+FN) = 3/(3+0)

