In [None]:
"""
GNNFingers - Graph Matching on AIDS Dataset
============================================

PIPELINE:
1. Load AIDS dataset (pairs of graphs for similarity computation)
2. Train target GCN model for graph matching
3. Generate 2 positive models (fine-tuned clones)
4. Generate 2 negative models (fresh GCN and SimGNN)
5. Create 5 synthetic fingerprints (pairs of random graphs)
6. Collect model responses (similarity scores between graph pairs)
7. Train verifier (binary classifier)
8. Evaluate: TP, TN, Accuracy

"""

# ============================================================================
# CELL 1: Setup and Install Dependencies
# ============================================================================
# Banner so this cell's output is easy to locate in the run log.
print("=" * 70)
print("CELL 1: Installing Dependencies for Graph Matching")
print("=" * 70)

import subprocess
import sys

# Install through the interpreter that runs this kernel (sys.executable) so pip
# targets the same environment; check_call raises CalledProcessError when pip
# exits with a non-zero status.
# NOTE(review): no versions are pinned, and torch_scatter / torch_sparse are not
# imported by any cell visible here -- confirm they are still required.
print("Installing packages...")
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
                       "torch", "torch_geometric", "torch_scatter", "torch_sparse"])

import os
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime
from sklearn.metrics import mean_squared_error

# Create base directory for graph matching.
# parents=True: without it mkdir raises FileNotFoundError whenever the parent
# directory ("/content") does not already exist.
# NOTE(review): the absolute path looks Colab-specific -- confirm before running
# this notebook elsewhere.
base_dir = Path("/content/gnnfingers_graph_matching")
base_dir.mkdir(parents=True, exist_ok=True)

# Create subdirectories (parents=True everywhere so each line works on its own)
(base_dir / "data").mkdir(parents=True, exist_ok=True)
(base_dir / "models" / "target").mkdir(parents=True, exist_ok=True)
(base_dir / "models" / "positive").mkdir(parents=True, exist_ok=True)
(base_dir / "models" / "negative").mkdir(parents=True, exist_ok=True)
(base_dir / "fingerprints").mkdir(parents=True, exist_ok=True)
(base_dir / "verifier").mkdir(parents=True, exist_ok=True)
(base_dir / "results").mkdir(parents=True, exist_ok=True)

# Show the layout that was just created.
print(f"\n✓ Directory structure created:")
print(f"  {base_dir}/")
print(f"    ├── data/            (AIDS dataset)")
print(f"    ├── models/")
print(f"    │   ├── target/      (target GCN)")
print(f"    │   ├── positive/    (fine-tuned clones)")
print(f"    │   └── negative/    (independent models)")
print(f"    ├── fingerprints/    (synthetic graph pairs)")
print(f"    ├── verifier/        (binary classifier)")
print(f"    └── results/         (TP/TN/accuracy)\n")

# Set random seeds for both RNGs used below: torch (weight init, torch.randn)
# and the global numpy RNG (np.random.* calls).
torch.manual_seed(42)
np.random.seed(42)

print("✓ Dependencies installed and directories ready\n")

CELL 1: Installing Dependencies for Graph Matching
Installing packages...

✓ Directory structure created:
  /content/gnnfingers_graph_matching/
    ├── data/            (AIDS dataset)
    ├── models/
    │   ├── target/      (target GCN)
    │   ├── positive/    (fine-tuned clones)
    │   └── negative/    (independent models)
    ├── fingerprints/    (synthetic graph pairs)
    ├── verifier/        (binary classifier)
    └── results/         (TP/TN/accuracy)

✓ Dependencies installed and directories ready



In [2]:
# ============================================================================
# CELL 2: Define Graph Matching Models
# ============================================================================
# Banner so this cell's output is easy to locate in the run log.
print("=" * 70)
print("CELL 2: Define Graph Matching Models")
print("=" * 70)

# PyTorch Geometric building blocks used by the model classes below.
# NOTE(review): SAGEConv, global_add_pool and DataLoader are not referenced by
# any cell visible here -- confirm whether they are still needed.
from torch_geometric.nn import GCNConv, SAGEConv, global_mean_pool, global_add_pool
from torch_geometric.datasets import GEDDataset
from torch_geometric.loader import DataLoader

class GraphMatcherGCN(nn.Module):
    """GCN for Graph Matching (computes similarity between two graphs).

    Both graphs go through the same three-layer GCN encoder and are mean-pooled
    to one embedding per graph. The two embeddings are then combined by a
    bilinear Neural Tensor Network (NTN) layer followed by a linear read-out.

    Args:
        num_features: Number of input features per node.
        hidden_channels: Width of every GCN layer and of the graph embedding.
        tensor_neurons: Number of bilinear NTN slices. Defaults to 16, the
            value that used to be hard-coded, so parameter shapes and saved
            state dicts are unchanged for existing callers.
    """
    def __init__(self, num_features, hidden_channels=64, tensor_neurons=16):
        super().__init__()
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)

        # NTN (Neural Tensor Network) layer for similarity.
        # Creation order (weight, bias, linear) is kept so that a given torch
        # seed still yields the same initial parameters.
        self.ntn_weight = nn.Parameter(torch.randn(hidden_channels, hidden_channels, tensor_neurons))
        self.ntn_bias = nn.Parameter(torch.randn(tensor_neurons))
        self.linear = nn.Linear(tensor_neurons, 1)

    def encode(self, x, edge_index, batch):
        """Encode graph(s) to one embedding per graph.

        Args:
            x: Node feature matrix.
            edge_index: Edge list passed to each GCNConv layer.
            batch: Per-node graph index handed to global_mean_pool.

        Returns:
            Mean-pooled node embeddings (no ReLU after the last layer).
        """
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = self.conv3(x, edge_index)

        # Graph-level pooling
        x = global_mean_pool(x, batch)
        return x

    def forward(self, data1, data2):
        """
        Compute similarity between two graphs.
        data1, data2: (x, edge_index, batch) tuples

        Returns the read-out with all size-1 dimensions squeezed away: a 0-dim
        tensor for a single pair, shape [batch_size] for several pairs.
        """
        x1, edge_index1, batch1 = data1
        x2, edge_index2, batch2 = data2

        # Get graph embeddings
        h1 = self.encode(x1, edge_index1, batch1)  # [batch_size, hidden]
        h2 = self.encode(x2, edge_index2, batch2)

        # Neural Tensor Network: scores[b, i] = h1[b]^T W[:, :, i] h2[b].
        # One einsum over all slices replaces the per-slice Python loop and
        # follows the actual size of ntn_weight instead of a literal 16.
        scores = torch.einsum("bj,jki,bk->bi", h1, self.ntn_weight, h2) + self.ntn_bias

        # Final similarity score
        similarity = self.linear(F.relu(scores))
        # Bare squeeze() is kept on purpose: callers in this file rely on a
        # 0-dim result for a single pair (they call .unsqueeze(0) on it).
        return similarity.squeeze()

class GraphMatcherSimGNN(nn.Module):
    """Simplified SimGNN-style matcher.

    A two-layer GCN encoder with sigmoid-gated node embeddings produces one
    vector per graph; the two graph vectors are concatenated and passed
    through a small MLP that outputs the similarity score.
    """
    def __init__(self, num_features, hidden_channels=64):
        super().__init__()
        # Layers are created in the same order as before so that a given torch
        # seed produces the same initial weights.
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)

        # Per-node gate ("attention")
        self.attention = nn.Linear(hidden_channels, 1)

        # MLP head over the concatenated pair of graph embeddings
        self.fc1 = nn.Linear(hidden_channels * 2, 32)
        self.fc2 = nn.Linear(32, 1)

    def encode(self, x, edge_index, batch):
        """Return one gated, mean-pooled embedding per graph."""
        node_emb = self.conv2(F.relu(self.conv1(x, edge_index)), edge_index)

        # Scale every node embedding by a sigmoid gate in (0, 1)
        gate = torch.sigmoid(self.attention(node_emb))
        gated = node_emb * gate

        return global_mean_pool(gated, batch)

    def forward(self, data1, data2):
        """Score a pair of graphs, each given as (x, edge_index, batch)."""
        feats_a, edges_a, batch_a = data1
        feats_b, edges_b, batch_b = data2

        emb_a = self.encode(feats_a, edges_a, batch_a)
        emb_b = self.encode(feats_b, edges_b, batch_b)

        # Concatenate along the feature axis, then MLP -> scalar per pair
        hidden = F.relu(self.fc1(torch.cat([emb_a, emb_b], dim=1)))
        score = self.fc2(hidden)

        return score.squeeze()

class Verifier(nn.Module):
    """Binary classifier verifier.

    A three-layer MLP that maps a model's response vector to a probability in
    [0, 1] via a final sigmoid.

    Args:
        input_dim: Length of the response vector.
        hidden_dim: Width of the first hidden layer (the second is fixed at 16).
    """
    def __init__(self, input_dim, hidden_dim=32):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 16)
        self.fc3 = nn.Linear(16, 1)

    def forward(self, x):
        """Return one probability per input row.

        Input of shape [N, input_dim] gives output of shape [N]; a single
        un-batched vector of shape [input_dim] gives a 0-dim tensor.
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        # Drop only the trailing size-1 output dimension. A bare squeeze() would
        # also drop the batch dimension when N == 1, and the resulting 0-dim
        # tensor no longer matches a [1]-shaped target in nn.BCELoss.
        return x.squeeze(-1)

def load_aids_dataset(num_pairs=500, seed=42):
    """Load the AIDS700nef GED dataset and sample random training pairs.

    NOTE(review): CELL 3 below defines another function with this same name,
    which replaces this one once that cell has run.

    Args:
        num_pairs: How many (idx1, idx2, similarity) triples to sample.
        seed: Seed for the pair sampling (and for torch, as before).

    Returns:
        (dataset, pairs) where ``pairs`` is a list of
        ``(idx1, idx2, similarity)`` tuples with plain Python int/float values.
        The similarity is a SIMULATED label, 1 / (1 + ged) with ged drawn
        uniformly from [0, 10); it is not read from the dataset.
    """
    # Download AIDS dataset
    dataset = GEDDataset(root=str(base_dir / "data"), name='AIDS700nef')

    # Kept for backward compatibility: callers may rely on torch being reseeded.
    torch.manual_seed(seed)
    # The sampling below uses numpy, so numpy is what must be seeded. A local
    # RandomState makes the pairs reproducible without touching the global
    # numpy RNG (the old code seeded torch only, so pairs varied between runs).
    rng = np.random.RandomState(seed)

    num_graphs = len(dataset)
    pairs = []

    for _ in range(num_pairs):
        idx1 = int(rng.randint(0, num_graphs))
        idx2 = int(rng.randint(0, num_graphs))

        # Simulated GED for this toy example, converted to a similarity:
        # ged = 0 gives 1.0 and larger ged values approach 0.
        ged = rng.uniform(0, 10)
        # float() keeps the label a Python float rather than np.float64, so
        # torch.tensor([label]) defaults to float32 like the model output.
        normalized_sim = float(1.0 / (1.0 + ged))

        pairs.append((idx1, idx2, normalized_sim))

    return dataset, pairs

def train_graph_matcher(model, dataset, pairs, epochs=50, lr=0.001, verbose=True):
    """Train a graph matching model with MSE loss on labelled graph pairs.

    Args:
        model: Module called as ``model((x, edge_index, batch), (x, edge_index, batch))``
            that returns a single similarity value per pair.
        dataset: Indexable collection whose items expose ``.x`` and ``.edge_index``.
        pairs: Sequence of ``(idx1, idx2, true_sim)`` triples. The first 80% are
            used for training and the rest for validation.
        epochs: Number of passes over the training pairs.
        lr: Adam learning rate.
        verbose: When True, print train/validation loss every 10 epochs.

    Returns:
        The same ``model`` object, trained in place.
    """
    batch_size = 32
    max_val_pairs = 50  # validation is sub-sampled for speed

    def pair_loss(idx1, idx2, true_sim):
        """MSE between the predicted and the target similarity of one pair."""
        g1 = dataset[idx1]
        g2 = dataset[idx2]

        # Single-graph "batch" vectors: every node belongs to graph 0
        batch1 = torch.zeros(g1.x.shape[0], dtype=torch.long)
        batch2 = torch.zeros(g2.x.shape[0], dtype=torch.long)

        pred_sim = model((g1.x, g1.edge_index, batch1),
                         (g2.x, g2.edge_index, batch2))

        # Build the target with the prediction's dtype/device: a numpy float64
        # label would otherwise yield a float64 tensor and a dtype mismatch.
        target = torch.tensor([true_sim], dtype=pred_sim.dtype, device=pred_sim.device)
        # reshape(1) accepts both a 0-dim and a [1]-shaped prediction
        return F.mse_loss(pred_sim.reshape(1), target)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Split pairs into train/val (list() so shuffling never touches the caller's data)
    split = int(0.8 * len(pairs))
    train_pairs = list(pairs[:split])
    val_pairs = list(pairs[split:])

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0

        # Sample mini-batches (uses the global numpy RNG, as before)
        np.random.shuffle(train_pairs)

        for i in range(0, len(train_pairs), batch_size):
            batch_pairs = train_pairs[i:i+batch_size]

            optimizer.zero_grad()
            batch_loss = 0

            for idx1, idx2, true_sim in batch_pairs:
                batch_loss += pair_loss(idx1, idx2, true_sim)

            batch_loss = batch_loss / len(batch_pairs)
            batch_loss.backward()
            optimizer.step()
            total_loss += batch_loss.item()
            num_batches += 1

        if verbose and (epoch + 1) % 10 == 0:
            # Validation
            model.eval()
            val_subset = val_pairs[:max_val_pairs]
            val_loss = 0.0
            with torch.no_grad():
                for idx1, idx2, true_sim in val_subset:
                    val_loss += pair_loss(idx1, idx2, true_sim).item()

            # Average over what was actually processed. The old code divided by
            # len(train_pairs)//batch_size (zero for fewer than 32 pairs, and
            # the floor rather than the count of batches) and by
            # min(50, len(val_pairs)) (zero without validation pairs).
            train_loss = total_loss / num_batches if num_batches else float("nan")
            val_loss = val_loss / len(val_subset) if val_subset else float("nan")
            print(f"    Epoch {epoch+1}/{epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

    return model

# Confirmation only: this cell defines classes and helpers and trains nothing.
print("✓ Graph Matching models defined\n")


CELL 2: Define Graph Matching Models
✓ Graph Matching models defined



In [6]:
# ============================================================================
# CELL 3: Load AIDS Dataset and Train Target Model
# ============================================================================

# torch and Path are already imported in CELL 1; they are repeated here so this
# cell can be re-run on its own. TUDataset is new in this cell.
import torch
from torch_geometric.datasets import TUDataset
from pathlib import Path

# Banner so this cell's output is easy to locate in the run log.
print("=" * 70)
print("CELL 3: Load AIDS Dataset and Train Target Model")
print("=" * 70)

# ----------------------------------------------------------------------------
# Load AIDS Dataset (using TUDataset)
# ----------------------------------------------------------------------------
def load_aids_dataset():
    """Load the TU "AIDS" dataset and pair each graph with its successor.

    Replaces the GEDDataset-based function of the same name from CELL 2.

    Returns:
        (dataset, pairs) where ``pairs`` holds one ``(i, i + 1, 1.0)`` triple
        for every graph index except the last.

    NOTE(review): every pair gets the same placeholder similarity 1.0, so a
    model trained on these pairs is fitted to a constant target -- confirm
    this is intended before interpreting the training losses.
    """
    dataset = TUDataset(root="data/AIDS", name="AIDS")
    num_graphs = len(dataset)

    pairs = []
    for first in range(num_graphs - 1):
        # The modulo never wraps for these indices; kept to mirror the
        # original expression exactly.
        second = (first + 1) % num_graphs
        pairs.append((first, second, 1.0))

    return dataset, pairs


# Load dataset
# This calls the TUDataset-based load_aids_dataset defined just above in this
# cell, which shadows the GEDDataset-based definition from CELL 2.
dataset, pairs = load_aids_dataset()

print(f"\nDataset: AIDS (Graph Matching)")
print(f"  Total Graphs: {len(dataset)}")
print(f"  Num Features: {dataset.num_node_features}")
print(f"  Graph Pairs for Training: {len(pairs)}")

# ----------------------------------------------------------------------------
# Train target model (GCN for Graph Matching)
# ----------------------------------------------------------------------------
print(f"\nTraining TARGET model (GCN for Graph Matching)...")

target_model = GraphMatcherGCN(
    num_features=dataset.num_node_features,
    hidden_channels=64
)

# train_graph_matcher trains in place and returns the same model object.
target_model = train_graph_matcher(target_model, dataset, pairs, epochs=50)

# ----------------------------------------------------------------------------
# Save target model
# ----------------------------------------------------------------------------
# NOTE(review): this rebinds the global base_dir that CELL 1 set to
# /content/gnnfingers_graph_matching. From here on, every path built from
# base_dir (fingerprints in CELL 6, verifier in CELL 8) is relative to the
# current working directory instead -- confirm this is intended.
base_dir = Path(".")
save_dir = base_dir / "models" / "target"
save_dir.mkdir(parents=True, exist_ok=True)

# Only the state dict (the weights) is saved, not the module object.
target_path = save_dir / "gcn_match_target.pt"
torch.save(target_model.state_dict(), target_path)

print(f"\n✓ Target model saved to {target_path}\n")


CELL 3: Load AIDS Dataset and Train Target Model

Dataset: AIDS (Graph Matching)
  Total Graphs: 2000
  Num Features: 38
  Graph Pairs for Training: 1999

Training TARGET model (GCN for Graph Matching)...
    Epoch 10/50 | Train Loss: 0.0001 | Val Loss: 0.0002
    Epoch 20/50 | Train Loss: 0.0000 | Val Loss: 0.0002
    Epoch 30/50 | Train Loss: 0.0000 | Val Loss: 0.0002
    Epoch 40/50 | Train Loss: 0.0000 | Val Loss: 0.0002
    Epoch 50/50 | Train Loss: 0.0000 | Val Loss: 0.0002

✓ Target model saved to models/target/gcn_match_target.pt



In [8]:
# ============================================================================
# CELL 4: Generate 2 Positive Models (Fine-tuned)
# ============================================================================

# All four names are already imported in CELL 1; they are repeated here so this
# cell can be re-run on its own.
import torch
import torch.nn.functional as F
import numpy as np
from pathlib import Path

# Banner so this cell's output is easy to locate in the run log.
print("=" * 70)
print("CELL 4: Generate Positive Models (Fine-tuned Clones)")
print("=" * 70)


# ----------------------------------------------------------------------------
# Clone and Fine-tune Function
# ----------------------------------------------------------------------------
def clone_and_finetune_matcher(model, dataset, pairs, seed, finetune_epochs=10, lr=0.0001):
    """Clone target graph matcher and fine-tune the copy briefly.

    Args:
        model: Trained matcher to copy. It is not modified.
        dataset: Indexable collection whose items expose ``.x`` and ``.edge_index``.
        pairs: Sequence of ``(idx1, idx2, true_sim)`` triples; the first 80% are
            used for fine-tuning.
        seed: Seeds torch and the shuffling of the training pairs, so the same
            seed reproduces the same fine-tuned clone.
        finetune_epochs: Number of fine-tuning epochs.
        lr: Adam learning rate.

    Returns:
        A new, fine-tuned model of the same class and shape as ``model``.
    """
    import copy  # local import: the file's import cells are left untouched

    batch_size = 10
    max_pairs_per_epoch = 100  # only the first 100 shuffled pairs are used per epoch

    torch.manual_seed(seed)
    # Shuffling is done with numpy, so numpy must be seeded too. The old code
    # seeded torch only, so `seed` had no effect on which pairs were used and
    # clones differed only through leftover global numpy state.
    rng = np.random.RandomState(seed)

    # deepcopy reproduces the exact architecture and weights of `model`; the old
    # code rebuilt GraphMatcherGCN(hidden_channels=64) and would fail to load
    # the weights of any model with a different width or class.
    cloned = copy.deepcopy(model)

    optimizer = torch.optim.Adam(cloned.parameters(), lr=lr)
    train_pairs = list(pairs[:int(0.8 * len(pairs))])

    for _ in range(finetune_epochs):
        cloned.train()
        rng.shuffle(train_pairs)

        for i in range(0, min(max_pairs_per_epoch, len(train_pairs)), batch_size):
            batch_pairs = train_pairs[i:i+batch_size]
            optimizer.zero_grad()
            batch_loss = 0

            for idx1, idx2, true_sim in batch_pairs:
                g1 = dataset[idx1]
                g2 = dataset[idx2]
                # Single-graph "batch" vectors: every node belongs to graph 0
                batch1 = torch.zeros(g1.x.shape[0], dtype=torch.long)
                batch2 = torch.zeros(g2.x.shape[0], dtype=torch.long)

                pred_sim = cloned((g1.x, g1.edge_index, batch1),
                                  (g2.x, g2.edge_index, batch2))

                # reshape(1) accepts both a 0-dim and a [1]-shaped prediction
                loss = F.mse_loss(pred_sim.reshape(1), torch.tensor([true_sim], dtype=torch.float))
                batch_loss += loss

            batch_loss = batch_loss / len(batch_pairs)
            batch_loss.backward()
            optimizer.step()

    return cloned


# ----------------------------------------------------------------------------
# Generate Fine-tuned Positive Models
# ----------------------------------------------------------------------------
# Fine-tuned clones and the files they were saved to, in matching order.
positive_models = []
positive_paths = []

# Create the save directory first.
# NOTE(review): this is a path relative to the working directory and is not
# built from base_dir -- confirm that is intended.
pos_dir = Path("models/positive")
pos_dir.mkdir(parents=True, exist_ok=True)

# Two clones of target_model, distinguished by their seed (100 and 101).
for i in range(2):
    print(f"\nCreating POSITIVE model {i+1} (fine-tuned clone)...")
    pos_model = clone_and_finetune_matcher(target_model, dataset, pairs, seed=100+i, finetune_epochs=10)
    positive_models.append(pos_model)

    # Save model (state dict only)
    pos_path = pos_dir / f"gcn_match_pos_{i}.pt"
    torch.save(pos_model.state_dict(), pos_path)
    positive_paths.append(pos_path)
    print(f"  ✓ Saved to {pos_path}")

print("\n✓ All positive models created successfully!\n")


CELL 4: Generate Positive Models (Fine-tuned Clones)

Creating POSITIVE model 1 (fine-tuned clone)...
  ✓ Saved to models/positive/gcn_match_pos_0.pt

Creating POSITIVE model 2 (fine-tuned clone)...
  ✓ Saved to models/positive/gcn_match_pos_1.pt

✓ All positive models created successfully!



In [10]:
# ============================================================================
# CELL 5: Generate 2 Negative Models (Independent)
# ============================================================================

import torch
from pathlib import Path

# Banner so this cell's output is easy to locate in the run log.
print("=" * 70)
print("CELL 5: Generate Negative Models (Independent Training)")
print("=" * 70)

# Independently trained models and the files they were saved to, in matching order.
negative_models = []
negative_paths = []

# ----------------------------------------------------------------------------
# Ensure save directory exists
# NOTE(review): relative to the working directory, not built from base_dir.
# ----------------------------------------------------------------------------
neg_dir = Path("models/negative")
neg_dir.mkdir(parents=True, exist_ok=True)

# ----------------------------------------------------------------------------
# Negative 1: Fresh GCN (trained from scratch)
# ----------------------------------------------------------------------------
print("\nCreating NEGATIVE model 1 (fresh GCN, different seed)...")
# Seeds torch, i.e. the weight initialisation of the model built below.
# NOTE(review): train_graph_matcher shuffles with the global numpy RNG, which
# this call does not reseed.
torch.manual_seed(200)

# Same architecture as the target model, but trained from a fresh initialisation.
neg_model_1 = GraphMatcherGCN(
    num_features=dataset.num_node_features,  # node feature width of the loaded dataset
    hidden_channels=64
)
neg_model_1 = train_graph_matcher(neg_model_1, dataset, pairs, epochs=50, verbose=False)
negative_models.append(neg_model_1)

neg_path_1 = neg_dir / "gcn_match_neg_0.pt"
torch.save(neg_model_1.state_dict(), neg_path_1)
negative_paths.append(neg_path_1)
print(f"  ✓ Saved to {neg_path_1}\n")

# ----------------------------------------------------------------------------
# Negative 2: SimGNN-style model (different architecture)
# ----------------------------------------------------------------------------
print("Creating NEGATIVE model 2 (SimGNN-style, different architecture)...")
torch.manual_seed(201)

neg_model_2 = GraphMatcherSimGNN(
    num_features=dataset.num_node_features,  # node feature width of the loaded dataset
    hidden_channels=64
)
# Trained on the same dataset and pairs as the target model.
neg_model_2 = train_graph_matcher(neg_model_2, dataset, pairs, epochs=50, verbose=False)
negative_models.append(neg_model_2)

neg_path_2 = neg_dir / "simgnn_match_neg_1.pt"
torch.save(neg_model_2.state_dict(), neg_path_2)
negative_paths.append(neg_path_2)
print(f"  ✓ Saved to {neg_path_2}\n")

print("\n✓ All negative models created successfully!\n")


CELL 5: Generate Negative Models (Independent Training)

Creating NEGATIVE model 1 (fresh GCN, different seed)...
  ✓ Saved to models/negative/gcn_match_neg_0.pt

Creating NEGATIVE model 2 (SimGNN-style, different architecture)...
  ✓ Saved to models/negative/simgnn_match_neg_1.pt


✓ All negative models created successfully!



In [12]:
# ============================================================================
# CELL 6: Create Synthetic Fingerprints (Random Graph Pairs)
# ============================================================================
import torch
import numpy as np
from pathlib import Path

# Banner so this cell's output is easy to locate in the run log.
print("=" * 70)
print("CELL 6: Create Synthetic Fingerprints for Graph Matching")
print("=" * 70)

# Number of fingerprint pairs; this is also the length of each model's response
# vector, because every pair yields one similarity score (see CELL 7).
num_fingerprints = 5
# Node count of each of the two random graphs in a fingerprint pair.
nodes_per_graph = 15

def create_random_graph_pair_fingerprint(num_nodes, num_features, sparsity=0.3):
    """
    Build one fingerprint: a pair of independently sampled random graphs.

    Each graph has standard-normal node features (torch RNG) and directed edges
    drawn with the global numpy RNG. ``sparsity`` scales the number of edge
    draws: int(num_nodes * (num_nodes - 1) / 2 * sparsity), at least 1. Draws
    that hit a self-loop are dropped rather than redrawn, so a graph can end up
    with fewer edges than that, and duplicate edges are possible.

    Returns ((x1, edge_index1, batch1), (x2, edge_index2, batch2)), the tuple
    layout the matcher models take as input.
    """
    def sample_graph():
        # Order matters for reproducibility: features first, then edge draws.
        features = torch.randn(num_nodes, num_features)

        attempts = max(1, int(num_nodes * (num_nodes - 1) / 2 * sparsity))
        kept = []
        for _ in range(attempts):
            src = np.random.randint(0, num_nodes)
            dst = np.random.randint(0, num_nodes)
            if src == dst:
                continue  # self-loop: skip this draw
            kept.append([src, dst])

        if kept:
            edge_index = torch.tensor(kept, dtype=torch.long).t()
        else:
            # No usable draw at all: empty edge list with the expected 2 x 0 shape
            edge_index = torch.zeros((2, 0), dtype=torch.long)

        # Every node belongs to graph 0 (a batch containing a single graph)
        membership = torch.zeros(num_nodes, dtype=torch.long)
        return features, edge_index, membership

    first_graph = sample_graph()
    second_graph = sample_graph()
    return first_graph, second_graph

# ------------------------------
# Generate Fingerprints
# ------------------------------
# List of ((x1, edge_index1, batch1), (x2, edge_index2, batch2)) tuples, one per fingerprint.
fingerprints = []
print(f"\nCreating {num_fingerprints} random graph pair fingerprints...")

for i in range(num_fingerprints):
    graph_pair = create_random_graph_pair_fingerprint(
        nodes_per_graph,
        dataset.num_node_features,  # same feature width the models were built with
        sparsity=0.25
    )
    fingerprints.append(graph_pair)
    g1, g2 = graph_pair
    # g[0] is the node feature matrix, g[1] the 2 x E edge index.
    print(f"  ✓ FP {i+1}: G1(nodes={g1[0].shape[0]}, edges={g1[1].shape[1]}) "
          f"<-> G2(nodes={g2[0].shape[0]}, edges={g2[1].shape[1]})")

# ------------------------------
# Save Fingerprints
# ------------------------------
# base_dir is whatever the most recently executed cell bound it to
# (CELL 3 rebinds it to Path(".")).
fp_path = base_dir / "fingerprints" / "match_fingerprints.pt"

# Create directory if missing
fp_path.parent.mkdir(parents=True, exist_ok=True)

# Saves the Python list of tensor tuples as a single file.
torch.save(fingerprints, fp_path)
print(f"\n✓ Fingerprints saved to {fp_path}\n")


CELL 6: Create Synthetic Fingerprints for Graph Matching

Creating 5 random graph pair fingerprints...
  ✓ FP 1: G1(nodes=15, edges=24) <-> G2(nodes=15, edges=24)
  ✓ FP 2: G1(nodes=15, edges=25) <-> G2(nodes=15, edges=24)
  ✓ FP 3: G1(nodes=15, edges=24) <-> G2(nodes=15, edges=25)
  ✓ FP 4: G1(nodes=15, edges=22) <-> G2(nodes=15, edges=24)
  ✓ FP 5: G1(nodes=15, edges=23) <-> G2(nodes=15, edges=23)

✓ Fingerprints saved to fingerprints/match_fingerprints.pt



In [13]:
# ============================================================================
# CELL 7: Collect Model Response Vectors (Similarity Scores)
# ============================================================================
# Banner so this cell's output is easy to locate in the run log.
print("=" * 70)
print("CELL 7: Collect Model Response Vectors (Similarity Scores)")
print("=" * 70)

def get_matching_response_vector(model, fingerprints):
    """
    Query model on fingerprints and collect similarity scores.
    For graph matching, each fingerprint is a pair, producing one similarity score.

    Args:
        model: Matcher called as ``model(data1, data2)``.
        fingerprints: Iterable of ``(data1, data2)`` graph-tuple pairs.

    Returns:
        1-D tensor with one score per fingerprint, in input order; an empty
        1-D tensor when ``fingerprints`` is empty.

    Side effect: the model is switched to eval mode and left that way.
    """
    model.eval()
    responses = []

    with torch.no_grad():
        for data1, data2 in fingerprints:
            # Compute similarity between the two graphs
            similarity = model(data1, data2)
            # reshape(-1) flattens both a 0-dim and a [1]-shaped score to [1].
            # unsqueeze(0) turned a [1]-shaped score into [1, 1], making the
            # concatenated result 2-D.
            responses.append(similarity.reshape(-1))

    # torch.cat raises on an empty list, so return an empty vector instead.
    if not responses:
        return torch.empty(0)

    # Concatenate all responses
    return torch.cat(responses)

# Collect responses from all models
all_responses = {}

print("\nCollecting similarity responses from TARGET model...")
all_responses['target'] = get_matching_response_vector(target_model, fingerprints)

for i, pos_model in enumerate(positive_models):
    print(f"Collecting responses from POSITIVE model {i}...")
    all_responses[f'pos_{i}'] = get_matching_response_vector(pos_model, fingerprints)

for i, neg_model in enumerate(negative_models):
    print(f"Collecting responses from NEGATIVE model {i}...")
    all_responses[f'neg_{i}'] = get_matching_response_vector(neg_model, fingerprints)

print(f"\n✓ Response vector dimension: {all_responses['target'].shape[0]}")
print(f"  (= {num_fingerprints} fingerprint pairs × 1 similarity score each)\n")



CELL 7: Collect Model Response Vectors (Similarity Scores)

Collecting similarity responses from TARGET model...
Collecting responses from POSITIVE model 0...
Collecting responses from POSITIVE model 1...
Collecting responses from NEGATIVE model 0...
Collecting responses from NEGATIVE model 1...

✓ Response vector dimension: 5
  (= 5 fingerprint pairs × 1 similarity score each)



In [15]:
# ============================================================================
# CELL 8: Build Training Data and Train Verifier
# ============================================================================
import torch
import torch.nn as nn
from pathlib import Path

# Banner so this cell's output is easy to locate in the run log.
print("=" * 70)
print("CELL 8: Build Training Data and Train Verifier")
print("=" * 70)

# Build training dataset for verifier: one row per model (its response vector),
# one label per row. Both start as Python lists and are turned into tensors below.
X_train = []
y_train = []

# Positive samples (label = 1): the target model itself and its fine-tuned clones
X_train.append(all_responses['target'].unsqueeze(0))
y_train.append(1)
print("\n✓ Target model (label=1)")

for i in range(len(positive_models)):
    X_train.append(all_responses[f'pos_{i}'].unsqueeze(0))
    y_train.append(1)
    print(f"✓ Positive model {i} (label=1)")

# Negative samples (label = 0): independently trained models
for i in range(len(negative_models)):
    X_train.append(all_responses[f'neg_{i}'].unsqueeze(0))
    y_train.append(0)
    print(f"✓ Negative model {i} (label=0)")

# Each entry was unsqueezed to a single row, so cat along dim 0 stacks them
# into a [num_models, num_fingerprints] matrix.
X_train = torch.cat(X_train, dim=0)
# float32 labels, as required by nn.BCELoss
y_train = torch.tensor(y_train, dtype=torch.float32)

print(f"\nTraining data shape: X={X_train.shape}, y={y_train.shape}")
print(f"  Class 1 (positive): {(y_train == 1).sum()} samples")
print(f"  Class 0 (negative): {(y_train == 0).sum()} samples")

# Train verifier
print(f"\nTraining VERIFIER for Graph Matching task...")
verifier = Verifier(input_dim=X_train.shape[1], hidden_dim=32)
optimizer = torch.optim.Adam(verifier.parameters(), lr=0.01)
# Verifier.forward already applies a sigmoid, hence BCELoss rather than
# BCEWithLogitsLoss.
loss_fn = nn.BCELoss()

# Full-batch training: every step uses all rows of X_train at once.
num_epochs = 200
for epoch in range(num_epochs):
    verifier.train()
    optimizer.zero_grad()

    y_pred = verifier(X_train)
    loss = loss_fn(y_pred, y_train)
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 50 == 0:
        print(f"  Epoch {epoch+1}/{num_epochs} | Loss: {loss.item():.4f}")

# Save verifier
# base_dir is whatever the most recently executed cell bound it to
# (CELL 3 rebinds it to Path(".")).
verifier_path = base_dir / "verifier" / "verifier_match.pt"

# Create the 'verifier' directory if it doesn't exist
verifier_path.parent.mkdir(parents=True, exist_ok=True)

torch.save(verifier.state_dict(), verifier_path)
print(f"\n✓ Verifier saved to {verifier_path}\n")


CELL 8: Build Training Data and Train Verifier

✓ Target model (label=1)
✓ Positive model 0 (label=1)
✓ Positive model 1 (label=1)
✓ Negative model 0 (label=0)
✓ Negative model 1 (label=0)

Training data shape: X=torch.Size([5, 5]), y=torch.Size([5])
  Class 1 (positive): 3 samples
  Class 0 (negative): 2 samples

Training VERIFIER for Graph Matching task...
  Epoch 50/200 | Loss: 0.0107
  Epoch 100/200 | Loss: 0.0012
  Epoch 150/200 | Loss: 0.0007
  Epoch 200/200 | Loss: 0.0004

✓ Verifier saved to verifier/verifier_match.pt



In [16]:

# ============================================================================
# CELL 9: Evaluate Verifier and Calculate Metrics
# ============================================================================
# Banner so this cell's output is easy to locate in the run log.
print("=" * 70)
print("CELL 9: EVALUATE VERIFIER - Calculate TP/TN/Accuracy")
print("=" * 70)

# NOTE(review): the verifier is scored on X_train / y_train, the same rows it
# was trained on in CELL 8, so the metrics below are training-set metrics and
# not an estimate of performance on unseen models.
verifier.eval()
with torch.no_grad():
    y_pred_probs = verifier(X_train)
    # Threshold the sigmoid output at 0.5 to get hard 0/1 predictions
    y_pred = (y_pred_probs >= 0.5).long()
    y_true = y_train.long()

# Calculate confusion matrix
TP = ((y_pred == 1) & (y_true == 1)).sum().item()
TN = ((y_pred == 0) & (y_true == 0)).sum().item()
FP = ((y_pred == 1) & (y_true == 0)).sum().item()
FN = ((y_pred == 0) & (y_true == 1)).sum().item()

total = len(y_true)
accuracy = (TP + TN) / total
# Precision and recall fall back to 0 when their denominator is zero
precision = TP / (TP + FP) if (TP + FP) > 0 else 0
recall = TP / (TP + FN) if (TP + FN) > 0 else 0

print("\n" + "="*70)
print("CONFUSION MATRIX")
print("="*70)
print(f"  TP (True Positive):   {TP}   ← Positive models correctly identified")
print(f"  TN (True Negative):   {TN}   ← Negative models correctly identified")
print(f"  FP (False Positive):  {FP}   ← Negative incorrectly as positive")
print(f"  FN (False Negative):  {FN}   ← Positive incorrectly as negative")

print("\n" + "="*70)
print("METRICS")
print("="*70)
print(f"  Accuracy:   {accuracy:.3f}  (TP+TN)/Total = ({TP}+{TN})/{total}")
print(f"  Precision:  {precision:.3f}  TP/(TP+FP) = {TP}/({TP}+{FP})")
print(f"  Recall:     {recall:.3f}   TP/(TP+FN) = {TP}/({TP}+{FN})")
print()

CELL 9: EVALUATE VERIFIER - Calculate TP/TN/Accuracy

CONFUSION MATRIX
  TP (True Positive):   3   ← Positive models correctly identified
  TN (True Negative):   2   ← Negative models correctly identified
  FP (False Positive):  0   ← Negative incorrectly as positive
  FN (False Negative):  0   ← Positive incorrectly as negative

METRICS
  Accuracy:   1.000  (TP+TN)/Total = (3+2)/5
  Precision:  1.000  TP/(TP+FP) = 3/(3+0)
  Recall:     1.000   TP/(TP+FN) = 3/(3+0)

