In [None]:
!python3.12 --version
!python3.12 -m pip install ipykernel
!python3.12 -m pip install --upgrade pip
!python3.12 -m pip install matplotlib
!python3.12 -m pip uninstall torch torch_scatter -y
!python3.12 -m pip uninstall numpy -y
!python3.12 -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
!export MACOSX_DEPLOYMENT_TARGET=10.14
!python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-2.8.0+cpu.html
!python3 -m pip install torch-geometric
!python3 -m pip install numba
!python3 -m pip install pykeops
!python3 -m pip install numpy==2.0.2
!python3 -m pip install ogb==1.3.6 torchdiffeq==0.2.5
!python3 -m pip install pyvis

In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to the project
# modules imported below (data, GNN, mutual) are picked up without restarting.
%load_ext autoreload
%autoreload 2

In [None]:
import torch
import time
import networkx as nx
from torch_geometric.utils import to_networkx
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
from importlib import reload

import data as dt
from torch_geometric.datasets import Planetoid
from GNN import GNN
from mutual import get_optimizer, train
from mutual import test as test_model
import random

In [None]:
# Select the compute device: CUDA when torch reports it as available, else CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [None]:
# Option dictionary for a Cora run with the 'rewire_attention' block.
# It is handed to trainModel / GNN as `opt`. Every option cell in this notebook
# assigns `opt` wholesale, so the cell executed last is the one in effect.
opt = {
    'self_loop_weight': 1, 'leaky_relu_slope': 0.2, 'K': 10, 'not_lcc': True,
    'dataset': 'Cora', 'force_reload': True,
    'attention_norm_idx': 0, 'simple': True, 'alpha': 0, 'alpha_dim': 'sc', 'beta_dim': 'sc',
    'use_labels': True,
    'hidden_dim': 64, 'block': 'rewire_attention', 'function': 'laplacian',
    'alpha_sigmoid': True, 'augment': False, 'adjoint': False,
    'tol_scale': 70, 'time': 20, 'input_dropout': 0.5, 'dropout': 0.2, 'method': 'dopri5',
    'optimizer': 'adam', 'lr': 0.008, 'use_mlp': True,
    'decay': 0.007, 'epoch': 20,
    'kinetic_energy': None, 'jacobian_norm2': None, 'total_deriv': None,
    'directional_penalty': None, 'beltrami': False,
    'batch_norm': False,
    'heads': 8,  # the original literal said 2 and was immediately overwritten with 8
    'attention_dim': 128,
    'attention_type': 'scaled_dot',
    'label_rate': 0.5,
    'square_plus': True,
    'reweight_attention': False,
    'step_size': 1,
    'max_nfe': 5000,
    'no_alpha_sigmoid': False,
    'add_source': False,
    'fc_out': False,
    'att_samp_pct': 1,
    # trainModel reads opt['no_early'] every epoch; without the key it raises
    # KeyError. True matches the Pubmed and Citeseer option sets below.
    'no_early': True,
}

In [None]:
def construct_graph(edges, attention=None, threshold=0.01):
    """Build an undirected networkx graph from a two-row edge array.

    Args:
        edges: edge endpoints, indexed as ``edges[0]`` (sources) and
            ``edges[1]`` (targets). Torch tensors are converted to NumPy.
        attention: optional per-edge values (tensor or array). When given,
            only edges whose value is strictly greater than ``threshold``
            are kept.
        threshold: cut-off applied to ``attention``.

    Returns:
        nx.Graph built from the (optionally filtered) edge list.
    """
    if isinstance(edges, torch.Tensor):
        edges = edges.detach().cpu().numpy()
    if attention is not None:
        # The edges are a host NumPy array at this point, so the mask has to be
        # one too; a tensor living on another device cannot index it.
        if isinstance(attention, torch.Tensor):
            attention = attention.detach().cpu().numpy()
        edges = edges[:, attention > threshold]
    edge_list = zip(edges[0], edges[1])
    return nx.Graph(edge_list)

In [None]:
def add_noisy_edges(edge_index, num_nodes, noise_ratio=0.2):
    """Append random edges that are absent from the graph.

    ``int(noise_ratio * E)`` node pairs are drawn by rejection sampling, where
    ``E`` is the number of columns of ``edge_index``. A pair is accepted only
    if it is not a self-loop and the undirected pair is neither in
    ``edge_index`` nor already accepted, so no noisy edge is added twice.

    Args:
        edge_index: ``[2, E]`` integer tensor of existing edges.
        num_nodes: node ids are drawn from ``0 .. num_nodes - 1``.
        noise_ratio: number of edges to add, as a fraction of ``E``.

    Returns:
        Tensor ``[2, E + added]`` with the new edges appended after the
        original columns, on the same device as ``edge_index``.

    Raises:
        ValueError: when more edges are requested than free node pairs exist
            (the sampling loop could never finish).
    """
    num_edges = edge_index.shape[1]
    num_add = int(noise_ratio * num_edges)  # Number of noisy edges to add

    # Undirected view of the pairs that are already taken.
    taken = set()
    for u, v in edge_index.t().tolist():
        if u != v and u < num_nodes and v < num_nodes:
            taken.add((u, v) if u < v else (v, u))

    available = num_nodes * (num_nodes - 1) // 2 - len(taken)
    if num_add > available:
        raise ValueError(
            f"Cannot add {num_add} noisy edges: only {available} unused node pairs exist"
        )

    added_edges = []
    while len(added_edges) < num_add:
        # Pick two random nodes
        u, v = random.randint(0, num_nodes - 1), random.randint(0, num_nodes - 1)
        pair = (u, v) if u < v else (v, u)

        # Ensure it's a new edge and not a self-loop
        if u != v and pair not in taken:
            taken.add(pair)  # remember it so the same pair is not drawn again
            added_edges.append((u, v))

    print(f"Edges added: {len(added_edges)}")
    # reshape(-1, 2) keeps the result two-dimensional even when nothing was added.
    new_edges = torch.tensor(
        added_edges, dtype=torch.long, device=edge_index.device
    ).reshape(-1, 2).t()
    return torch.cat([edge_index, new_edges], dim=1)


In [None]:
def add_labels(feat, labels, idx, num_classes, device):
    """Append a one-hot label block to the feature matrix.

    Rows selected by ``idx`` (a boolean mask or an index tensor) get a 1 in the
    column of their label; every other row of the block stays zero.

    Returns:
        ``feat`` concatenated with the ``[feat.shape[0], num_classes]`` block
        along the last dimension.
    """
    label_block = torch.zeros([feat.shape[0], num_classes]).to(device)
    # A boolean mask is turned into row indices first.
    rows = torch.where(idx)[0] if idx.dtype == torch.bool else idx
    label_block[rows, labels.squeeze()[rows]] = 1
    return torch.cat([feat, label_block], dim=-1)

In [None]:
def to_edge_set(edge_index):
    """Return the edges of a ``[2, N]`` tensor as a set of ``(low, high)`` int tuples.

    Sorting each pair makes ``(u, v)`` and ``(v, u)`` compare equal, which is
    what an undirected comparison needs.
    """
    sources = edge_index[0].tolist()
    targets = edge_index[1].tolist()
    return {tuple(sorted((int(u), int(v)))) for u, v in zip(sources, targets)}

In [None]:
def score_edges(z, edge_pairs):
    """Score each node pair by the dot product of its two embedding rows.

    NOTE: a later cell defines ``score_edges`` again using cosine similarity;
    once that cell has run, this definition is shadowed.
    """
    src, dst = z[edge_pairs[0]], z[edge_pairs[1]]
    return torch.sum(src * dst, dim=1)

In [None]:
@torch.no_grad()
def test(model, data, pos_encoding=None, opt=None):  # opt required for runtime polymorphism
  model.eval()
  feat = data.x
  if model.opt['use_labels']:
    feat = add_labels(feat, data.y, data.train_mask, model.num_classes, model.device)
  logits, accs = model(feat, pos_encoding), []
  for _, mask in data('train_mask', 'val_mask', 'test_mask'):
    pred = logits[mask].max(1)[1]
    acc = pred.eq(data.y[mask]).sum().item() / mask.sum().item()
    accs.append(acc)
  return logits,accs


In [None]:
def draw_graph_after_adding_noise(dataset, added_edges):
    """Plot the dataset graph and overlay ``added_edges`` as dashed red lines.

    Args:
        dataset: object whose ``.data`` can be converted with ``to_networkx``
            and exposes ``num_nodes``.
        added_edges: iterable of ``(u, v)`` pairs to highlight.
    """
    # Convert to NetworkX
    G = to_networkx(dataset.data)

    # Add all nodes to ensure disconnected ones are included
    G.add_nodes_from(range(dataset.data.num_nodes))

    # One layout for both drawing calls. Without passing `pos` to nx.draw the
    # base graph gets its own layout and the red overlay no longer lines up
    # with the nodes.
    pos = nx.spring_layout(G, seed=42)
    nx.draw(G, pos, node_color="lightblue", node_size=4, edge_color="gray", width=0.5)

    # Highlight the added (noisy) edges
    added_edge_list = list(added_edges)
    nx.draw_networkx_edges(
        G,
        pos,
        edgelist=added_edge_list,
        edge_color="red",
        width=1.5,
        style="dashed",
        label="Noisy Edges"
    )

    plt.title("Graph with Noisy (Added) Edges in Red")
    plt.legend()
    plt.show()

In [None]:
def prepare_data(dataset_name: str = 'Cora', remove_fraction: float = 0.25):
    """Load a Planetoid dataset and hold out a random share of its edge columns.

    Args:
        dataset_name: name passed to ``Planetoid``.
        remove_fraction: share of the ``edge_index`` columns to remove,
            in ``[0, 1]``. Defaults to the previously hard-coded 0.25.

    Returns:
        ``(dataset, removed_edges, remaining_edges_to_verify)``: the dataset
        whose ``data.edge_index`` now holds only the kept columns, the removed
        columns as a ``[2, R]`` tensor, and a clone of the kept columns.

    Raises:
        ValueError: if ``remove_fraction`` is outside ``[0, 1]``.
    """
    if not 0.0 <= remove_fraction <= 1.0:
        raise ValueError(f"remove_fraction must be within [0, 1], got {remove_fraction}")

    # Step 1: Prepare original edges
    dataset = Planetoid(root='data', name=dataset_name)
    original_edges = dataset.data.edge_index
    num_edges = original_edges.size(1)

    # Step 2: Pick a random subset of columns to remove.
    # NOTE(review): columns are sampled independently. If edge_index stores both
    # directions of an undirected edge, removing (u, v) can leave (v, u) in the
    # training graph - confirm whether that is intended.
    num_edges_to_remove = int(num_edges * remove_fraction)
    edge_indices = list(range(num_edges))
    random.shuffle(edge_indices)
    removed_edges = original_edges[:, edge_indices[:num_edges_to_remove]]

    # Step 3: Keep the remaining edges (a single index selection instead of
    # concatenating one column at a time).
    remaining_edges = original_edges[:, edge_indices[num_edges_to_remove:]]

    # Step 4: Update the graph for training
    remaining_edges_to_verify = remaining_edges.clone()
    dataset.data.edge_index = remaining_edges

    # Report the tensor on the object that was just assigned.
    print("Training edges:", dataset.data.edge_index.shape)
    return dataset, removed_edges, remaining_edges_to_verify


In [None]:
def trainModel(opt, dataset):
    """Train a GNN on ``dataset`` and track the epoch with the best validation accuracy.

    The loop runs for ``range(1, opt['epoch'])``, i.e. ``opt['epoch'] - 1``
    epochs. The notebook-level ``device`` decides where model and data live.

    Args:
        opt: option dictionary. Reads ``optimizer``, ``lr``, ``decay``,
            ``epoch``, ``time`` and, when present, ``no_early``.
        dataset: dataset exposing ``.data``.

    Returns:
        ``(model, dat, val_acc, test_acc, train_acc, loss)``. ``val_acc``,
        ``test_acc`` and ``train_acc`` all belong to the epoch with the best
        validation accuracy. ``loss`` is the value from the last epoch, or
        ``None`` when no epoch ran.
    """
    model, dat = GNN(opt, dataset, device).to(device), dataset.data.to(device)
    parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = get_optimizer(opt['optimizer'], parameters, lr=opt['lr'], weight_decay=opt['decay'])
    best_time = best_epoch = train_acc = val_acc = test_acc = 0
    loss = None  # stays None when opt['epoch'] <= 1 and the loop body never runs

    # Some option sets in this notebook have no 'no_early' key. A missing key is
    # treated as True so the solver-based branch below is skipped.
    # NOTE(review): confirm True is the right default for models built by GNN(...).
    use_solver_early_stop = not opt.get('no_early', True)

    for epoch in range(1, opt['epoch']):
        loss = train(model, optimizer, dat)
        tmp_train_acc, tmp_val_acc, tmp_test_acc = test_model(model, dat)
        best_time = opt['time']
        if tmp_val_acc > val_acc:
            best_epoch = epoch
            train_acc = tmp_train_acc
            val_acc = tmp_val_acc
            test_acc = tmp_test_acc
            best_time = opt['time']
        if use_solver_early_stop and model.odeblock.test_integrator.solver.best_val > val_acc:
            solver = model.odeblock.test_integrator.solver
            best_epoch = epoch
            val_acc = solver.best_val
            test_acc = solver.best_test
            train_acc = solver.best_train
            best_time = solver.best_time

    print('best val accuracy {:03f} with test accuracy {:03f} at epoch {:d} and best time {:03f}'.format(val_acc, test_acc,
                                                                                                     best_epoch,
                                                                                                     best_time))
    # Return the test accuracy of the best-validation epoch. The previous
    # version returned the last epoch's value next to the best val_acc.
    return model, dat, val_acc, test_acc, train_acc, loss

In [None]:
def generate_and_delete_reconstructed_edges(dataset_name: str='Cora'):
    """Prepare a dataset with held-out edges and return those edges as a set.

    Returns:
        ``(dataset, removed_edges_set)`` where the set holds one
        ``(low, high)`` int tuple per removed edge, so both directions of an
        edge map to the same entry.
    """
    dataset, removed_edges, _ = prepare_data(dataset_name=dataset_name)

    removed_edges_set = set()
    for u, v in zip(removed_edges[0], removed_edges[1]):
        a, b = int(u), int(v)
        removed_edges_set.add((a, b) if a <= b else (b, a))

    print("Deleted edges:", len(removed_edges_set))
    return dataset, removed_edges_set

In [None]:
import torch.nn.functional as F
def score_edges(embeddings, edge_index):
    """Return the cosine similarity of the two endpoint embeddings of every edge.

    ``edge_index[0]`` and ``edge_index[1]`` select the rows of ``embeddings``
    that are compared; one score per edge is returned.
    """
    src_idx, dst_idx = edge_index[0], edge_index[1]
    return F.cosine_similarity(embeddings[src_idx], embeddings[dst_idx], dim=1)

In [None]:
def is_close_enough(embeddings, edge_index, epsilon=0.05):
    """Flag edges whose endpoint embeddings are at most ``epsilon`` apart.

    Returns a boolean tensor with one entry per edge, computed from the norm
    of the difference between the two endpoint rows.
    """
    delta = embeddings[edge_index[0]] - embeddings[edge_index[1]]
    return delta.norm(dim=1) <= epsilon

In [None]:
# PUBMED DATASET OPT
# Base option set, followed by the overrides applied for this notebook.
# Each option cell replaces `opt` entirely, so the cell run last is in effect.
opt = dict(
    M_nodes=64, adaptive=False, add_source=True, adjoint=True, adjoint_method='adaptive_heun',
    adjoint_step_size=1, alpha=1.0, alpha_dim='sc', att_samp_pct=1, attention_dim=128,
    attention_norm_idx=0,
    attention_rewiring=False, attention_type='cosine_sim', augment=False, baseline=False,
    batch_norm=False,
    beltrami=False, beta_dim='sc', block='rewire_attention', cpus=1, data_norm='rw',
    dataset='Pubmed',
    decay=0.0018236722171703636, directional_penalty=None, dropout=0.07191100715473969, dt=0.001,
    dt_min=1e-05, epoch=600, exact=False, fc_out=False, feat_hidden_dim=64,
    function='laplacian', gdc_avg_degree=64, gdc_k=64, gdc_method='ppr',
    gdc_sparsification='topk',
    gdc_threshold=0.01, gpus=1.0, grace_period=20, heads=1, heat_time=3.0, hidden_dim=64,
    input_dropout=0.5, jacobian_norm2=None, kinetic_energy=None, label_rate=0.5,
    leaky_relu_slope=0.2,
    lr=0.0095, max_epochs=120, max_iters=100, max_nfe=5000, method='dopri5',
    metric='test_acc', mix_features=False, name=None, new_edges='random', no_alpha_sigmoid=False,
    not_lcc=True, num_init=1, num_samples=400, num_splits=8, ode_blocks=1, optimizer='adamax',
    patience=50, pos_enc_dim='row', pos_enc_hidden_dim=16, ppr_alpha=0.05, reduction_factor=10,
    regularise=False, reweight_attention=False, rewire_KNN=False, rewire_KNN_T='T0',
    rewire_KNN_epoch=10, rewire_KNN_k=64, rewire_KNN_sym=False, rewiring=None, rw_addD=0.02,
    rw_rmvR=0.02, self_loop_weight=1, sparsify='S_hat', square_plus=True, step_size=1,
    threshold_type='addD_rvR', time=12.942327880200853, tol_scale=1991.0688305523001,
    tol_scale_adjoint=16324.368093998313, total_deriv=None, use_cora_defaults=False,
    use_flux=False,
    use_labels=False, use_lcc=True, use_mlp=False, folder='pubmed_linear_att_beltrami_adj2',
    index=0,
    run_with_KNN=False, change_att_sim_type=False, reps=1, max_test_steps=100, no_early=True,
    earlystopxT=5.0, pos_enc_csv=False, pos_enc_type='GDC',
)
# Overrides on top of the base values above.
opt.update(
    lr=0.005,
    optimizer='adam',
    hidden_dim=128,
    feat_hidden_dim=64,
    dropout=0.3,
    input_dropout=0.3,
    decay=0.0005,
    max_epochs=150,
    patience=50,
    method='dopri5',
    max_nfe=100000,
    tol_scale=1e-3,
    tol_scale_adjoint=1e-3,
    att_samp_pct=1.0,
    rw_addD=0.005,
    rw_rmvR=0.005,
    heads=4,
    use_labels=True,
    block='rewire_attention',
)


In [None]:
# CORA DATASET OPT
# Option dictionary for a Cora run with the 'attention' block. Each option cell
# replaces `opt` entirely, so the cell run last is the one in effect.

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
opt = {
    'self_loop_weight': 1, 'leaky_relu_slope': 0.2, 'K': 10, 'not_lcc': True,
    'dataset': 'Cora', 'force_reload': True,
    'attention_norm_idx': 0, 'simple': True, 'alpha': 0, 'alpha_dim': 'sc', 'beta_dim': 'sc',
    'use_labels': True,
    'hidden_dim': 64, 'block': 'attention', 'function': 'laplacian',
    'alpha_sigmoid': True, 'augment': False, 'adjoint': False,
    'tol_scale': 70, 'time': 20, 'input_dropout': 0.5, 'dropout': 0.2, 'method': 'dopri5',
    'optimizer': 'adam', 'lr': 0.009, 'use_mlp': True,
    'decay': 0.007, 'epoch': 20,
    'kinetic_energy': None, 'jacobian_norm2': None, 'total_deriv': None,
    'directional_penalty': None, 'beltrami': False,
    'fc_out': False,
    'batch_norm': False,
    'heads': 8,  # the original literal said 2 and was immediately overwritten with 8
    'attention_dim': 128,
    'attention_type': 'scaled_dot',
    'label_rate': 0.5,
    'square_plus': True,
    'reweight_attention': False,
    'step_size': 1,
    'max_nfe': 5000,
    'no_alpha_sigmoid': False,
    'add_source': False,
    # trainModel reads opt['no_early'] every epoch; without the key it raises
    # KeyError. True matches the Pubmed and Citeseer option sets.
    'no_early': True,
}



In [None]:
# CITESEER DATASET OPT
# Each option cell replaces `opt` entirely, so the cell run last is in effect.
opt = dict(
    M_nodes=64, adaptive=False, add_source=True, adjoint=False, adjoint_method='adaptive_heun',
    adjoint_step_size=1, alpha=1.0, alpha_dim='sc', att_samp_pct=1, attention_dim=32,
    attention_norm_idx=1, attention_rewiring=False, attention_type='exp_kernel', augment=False,
    baseline=False, batch_norm=False, beltrami=False, beta_dim='sc', block='attention', cpus=1,
    data_norm='rw', dataset='Citeseer', decay=0.1, directional_penalty=None,
    dropout=0.7488085003122172,
    dt=0.001, dt_min=1e-05, epoch=200, exact=True, fc_out=False, function='laplacian',
    gdc_avg_degree=64, gdc_k=128, gdc_method='ppr', gdc_sparsification='topk',
    gdc_threshold=0.01, gpus=1.0,
    grace_period=20, heads=8, heat_time=3.0, hidden_dim=80, input_dropout=0.6803233752085334,
    jacobian_norm2=None,
    kinetic_energy=None, label_rate=0.5, leaky_relu_slope=0.5825086997804176,
    lr=0.00863585231323069, max_epochs=1000,
    max_iters=100, max_nfe=3000, method='dopri5', metric='accuracy', mix_features=False,
    name='Citeseer_beltrami_1_KNN',
    new_edges='random', no_alpha_sigmoid=False, not_lcc=True, num_class=6, num_init=2,
    num_nodes=2120, num_samples=400, num_splits=1, ode_blocks=1, optimizer='adam', patience=100,
    pos_enc_dim='row', pos_enc_hidden_dim=16, ppr_alpha=0.05, reduction_factor=4,
    regularise=False,
    reweight_attention=False, rewire_KNN=False, rewire_KNN_epoch=10, rewire_KNN_k=64,
    rewire_KNN_sym=False,
    rewiring=None, rw_addD=0.02, rw_rmvR=0.02, self_loop_weight=1, sparsify='S_hat',
    square_plus=True,
    step_size=1, threshold_type='addD_rvR', time=7.874113442879092,
    tol_scale=2.9010446330432815, tol_scale_adjoint=1.0,
    total_deriv=None, use_cora_defaults=False, use_flux=False, use_labels=True, use_lcc=True,
    use_mlp=False, no_early=True,
)

In [None]:

from collections import defaultdict

# Reference split: supplies the node count for candidate sampling and the graph
# `G` that later visualisation cells read. Every round below draws its own split.
dataset, removed_edges = generate_and_delete_reconstructed_edges(dataset_name=opt["dataset"])
G = to_networkx(dataset.data)
total_tests = 1  # Initial number of test rounds
epsilon = 0.1     # Distance threshold for edge prediction; replaced by a per-round quantile below
# Undirected edges still present in the training graph. This must be built from
# edge_index; the feature matrix `dataset.x` is not an edge list.
known_edges = to_edge_set(dataset.data.edge_index)

reconstruction_counter = defaultdict(int)
reconstructed_edges_list = []
new_predicted_counter = defaultdict(int)

# Sample candidate node pairs once; all rounds score the same candidates.
num_samples = 50000
all_possible = torch.combinations(torch.arange(G.number_of_nodes()), r=2)
perm = torch.randperm(all_possible.size(0))[:num_samples]
sampled_pairs = all_possible[perm]
candidate_edge_index = sampled_pairs.T

best_val_acc = test_acc = best_epoch = 0

for i in range(total_tests):
    dataset, removed_edges = generate_and_delete_reconstructed_edges(dataset_name=opt["dataset"])
    # Edges kept in this round's training graph. A predicted pair that is
    # already one of these is neither "recovered" nor "new".
    known_edges = to_edge_set(dataset.data.edge_index)
    opt['num_feature'] = dataset.data.x.shape
    model, dat, val_acc, tmp_test_acc, train_acc, loss = trainModel(opt, dataset)
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        test_acc = tmp_test_acc
        best_epoch = i
    log = 'Loss {:03f}, forward nfe {:d}, backward nfe {:d}, Train: {:.4f}, Val: {:.4f}, Test: {:.4f}'
    print(log.format(loss, model.fm.sum, model.bm.sum, train_acc, best_val_acc, test_acc))
    print('best val accuracy {:03f} with test accuracy {:03f} at epoch {:d}'.format(best_val_acc, test_acc, best_epoch))
    model.eval()
    with torch.no_grad():
        feat = dat.x
        # Append labels only when the model was configured for them, as test() does.
        if model.opt['use_labels']:
            feat = add_labels(feat, dat.y, dat.train_mask, model.num_classes, model.device)
        node_embeddings = model(feat, None)

        # Keep the candidate index on the same device as the embeddings so the
        # boolean masks derived from the scores can index it.
        candidate_edge_index = candidate_edge_index.to(node_embeddings.device)

        # Score all candidate edges (e.g., using cosine or dot-product)
        scores = score_edges(node_embeddings, candidate_edge_index)

        # Score filtering - top 10% most confident scores
        threshold_value = torch.quantile(scores, 0.90)
        high_score_mask = scores >= threshold_value
        high_score_edges = candidate_edge_index[:, high_score_mask]

        # Distance-based filtering: keep the closest 30% of the high-score pairs
        distances = torch.norm(node_embeddings[high_score_edges[0]] - node_embeddings[high_score_edges[1]], dim=1)
        epsilon = torch.quantile(distances, 0.3).item()
        predicted_edges = high_score_edges[:, distances <= epsilon]

        print("\n\nScore range:", scores.min().item(), "->", scores.max().item())
        print("Distance range:", distances.min().item(), "->", distances.max().item())
        print("Total predicted edges:", predicted_edges.size(1))

        # Compare predicted edges to removed edges
        round_reconstructed = 0
        for u, v in zip(predicted_edges[0].tolist(), predicted_edges[1].tolist()):
            edge = tuple(sorted((u, v)))
            if edge in removed_edges:
                reconstruction_counter[edge] += 1
                reconstructed_edges_list.append(edge)
                round_reconstructed += 1
            elif edge not in known_edges:
                new_predicted_counter[edge] += 1

        if len(reconstruction_counter) and len(new_predicted_counter):
            # The "weak edges" figure uses this round's count. The cumulative
            # list grows across rounds while removed_edges is per round.
            print(f"\nRound {i + 1}, total removed reconstructed: {len(reconstruction_counter.keys())}, "
                  f"total new reconstructed: {len(new_predicted_counter.keys())}"
                  f"\nTotal Weak Edges in round: {len(removed_edges)-round_reconstructed}")
        print("\n ******************************************* \n")

In [None]:
# Number of distinct undirected edges held out in the most recent round.
len(removed_edges)

In [None]:
# Per-edge count of how often a held-out edge appeared among the predictions.
reconstruction_counter

In [None]:
# Re-score the sampled candidate pairs and keep the 100 best-scoring ones.
# This overwrites `scores` and `predicted_edges` from the experiment cell.
top_k = 100
scores = score_edges(node_embeddings, candidate_edge_index)
topk_indices = torch.topk(scores, top_k).indices
predicted_edges = candidate_edge_index[:, topk_indices]

In [None]:
# Grouped bar chart: how often each edge was predicted, split into held-out
# edges that were recovered and predicted edges that were not held out.
import plotly.graph_objects as go

recovered_labels = [f"{edge[0]}-{edge[1]}" for edge in reconstruction_counter]
recovered_counts = list(reconstruction_counter.values())
new_labels = [f"{edge[0]}-{edge[1]}" for edge in new_predicted_counter]
new_counts = list(new_predicted_counter.values())

# Plotly bar chart
fig = go.Figure(data=[
    go.Bar(
        x=recovered_labels,
        y=recovered_counts,
        name='Recovered (Originally Removed)',
        marker_color='cornflowerblue',
    ),
    go.Bar(
        x=new_labels,
        y=new_counts,
        name='New Predicted (Not Originally Present)',
        marker_color='orange',
    ),
])

fig.update_layout(
    title="Edge Reconstruction vs New Predictions",
    xaxis_title="Edge (u-v)",
    yaxis_title="Reconstruction Count",
    barmode='group',
    xaxis_tickangle=-45,
    template='plotly_white',
    margin=dict(l=40, r=40, t=60, b=100),
)

fig.show()

In [None]:
import pandas as pd

# One row per edge: its label, how often it was predicted, and whether it was a
# recovered (held-out) edge or a newly predicted one.
data = {
    'Edge': [*recovered_labels, *new_labels],
    'Count': [*recovered_counts, *new_counts],
    'Type': ['Recovered' for _ in recovered_labels] + ['New Predicted' for _ in new_labels],
}

df = pd.DataFrame(data)

# Persist the table next to the notebook.
df.to_csv('edge_reconstruction_data.csv', index=False)

print("Data saved to edge_reconstruction_data.csv")

In [None]:
from pyvis.network import Network

# Create PyVis network
net = Network(notebook=False, height="750px", width="100%", bgcolor="#ffffff", font_color="black")

# Add graph from networkx
net.from_nx(G)

# Highlight reconstructed edges in red. The keys of `reconstruction_counter`
# are the (u, v) tuples of recovered edges. The name `reconstructed` used here
# before is not defined when the notebook runs top to bottom, and a later cell
# binds it to a list of plain integers.
for u, v in reconstruction_counter:
    net.add_edge(int(u), int(v), color='red', width=10)

# Optional: change node size or color
for node in G.nodes():
    net.get_node(node)['size'] = 10
    net.get_node(node)['color'] = "#cccccc"

# Save the interactive graph as an HTML file
net.save_graph("graph_reconstruction.html")

In [None]:
# Per-iteration summary figures. The numbers are hard-coded in this cell; they
# are not computed by the cells above.
total_data_to_review = [
    {
        "iteration": it,
        "total_deleted": n_deleted,
        "total_reconstructed": n_reconstructed,
        "new_reconstructed": n_new,
    }
    for it, n_deleted, n_reconstructed, n_new in (
        (1, 300, 180, 50),
        (2, 300, 200, 45),
        (3, 300, 190, 55),
    )
]

# One list per metric, in iteration order
iterations = [row["iteration"] for row in total_data_to_review]
deleted = [row["total_deleted"] for row in total_data_to_review]
reconstructed = [row["total_reconstructed"] for row in total_data_to_review]
new_reconstructed = [row["new_reconstructed"] for row in total_data_to_review]

# Three bars per iteration, side by side
bar_width = 0.25
x = np.arange(len(iterations))

plt.figure(figsize=(12, 6))

plt.bar(x - bar_width, deleted, width=bar_width, label='Total Deleted')
plt.bar(x, reconstructed, width=bar_width, label='Total Reconstructed')
plt.bar(x + bar_width, new_reconstructed, width=bar_width, label='New Reconstructed')

# Axis text, tick labels, legend and a dashed horizontal grid
plt.xlabel("Iteration")
plt.ylabel("Number of Edges")
plt.title("Edge Reconstruction Metrics Per Iteration")
plt.xticks(x, [f"Iter {i}" for i in iterations])
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

In [None]:
# 80% / 20% - accuracy best
# 70% / 30% - accuracy best
# ....
# 85% / 15% - ....


# removed edges -> evaluations
# delete_reconstructed....


In [None]:
def get_node_embeddings(model, data):
    """Run the model in eval mode without gradients and return its node output.

    When ``model.opt['use_labels']`` is set, one-hot training labels are
    appended to ``data.x`` before the forward pass. If the model returns a pair
    rather than a single tensor, the first element is returned.
    """
    model.eval()
    with torch.no_grad():
        x = data.x
        if model.opt['use_labels']:
            x = add_labels(data.x, data.y, data.train_mask, model.num_classes, model.device)
        output = model(x, None)

    # A bare tensor is already the embedding matrix.
    if isinstance(output, torch.Tensor):
        return output
    embeddings, _ = output
    return embeddings

def calculate_confidence(embeddings, edge_index):
    """Calculate cosine similarity between the two endpoints of every edge.

    Args:
        embeddings: tensor with one row per node.
        edge_index: two-row tensor; row 0 holds sources, row 1 holds targets.

    Returns:
        ``(edges, confidence)`` as NumPy arrays: the edge index unchanged and
        one similarity value per edge.
    """
    src_emb = embeddings[edge_index[0]]
    dst_emb = embeddings[edge_index[1]]
    # detach() and cpu() are needed before numpy(): a tensor that tracks
    # gradients or lives on another device cannot be converted directly.
    confidence = torch.cosine_similarity(src_emb, dst_emb).detach().cpu().numpy()
    return (edge_index.detach().cpu().numpy(), confidence)

def draw_confidence_graph(edges, confidence, threshold=0.5):
    """Plot the edges whose confidence exceeds ``threshold``, coloured by confidence.

    ``edges`` is indexed as ``edges[0, i]`` / ``edges[1, i]`` and
    ``confidence[i]`` is the value for edge ``i``.
    """
    G = nx.Graph()
    for i in range(edges.shape[1]):
        score = confidence[i]
        if score > threshold:
            G.add_edge(edges[0, i], edges[1, i], confidence=score)

    pos = nx.spring_layout(G)
    # One colour value per edge, in the graph's own edge order.
    edge_colors = [attrs['confidence'] for *_, attrs in G.edges(data=True)]

    plt.figure(figsize=(10,8))
    nx.draw(
        G,
        pos,
        node_size=20,
        edge_color=edge_colors,
        edge_cmap=plt.cm.Reds,
        width=1.5,
        arrows=False,
    )
    plt.title("Reconstructed Edges with Confidence Scores")
    plt.show()

In [None]:
# Uses the trained `model` and the `dataset` left behind by the experiment cell.
embeddings = get_node_embeddings(model=model, data=dataset.data)
edges, confidence = calculate_confidence(embeddings=embeddings, edge_index=dataset.data.edge_index)

# Show only the edges whose confidence is above 0.7
draw_confidence_graph(edges, confidence, threshold=0.7)

# Draw the dataset graph again with the same edges passed as the highlighted set
draw_graph_after_adding_noise(dataset=dataset, added_edges=edges.T)  # Existing visualization

In [None]:

def get_node_embeddings(model, data):
    """Return the model's node output, computed in eval mode without gradients.

    Labels are appended to the features first when ``model.opt['use_labels']``
    is set. A pair returned by the model is reduced to its first element.
    """
    model.eval()
    with torch.no_grad():
        features = (
            add_labels(data.x, data.y, data.train_mask, model.num_classes, model.device)
            if model.opt['use_labels']
            else data.x
        )
        result = model(features, None)

        if not isinstance(result, torch.Tensor):
            # The model returned more than one value; keep the first.
            result, _ = result
    return result

def calculate_confidence(embeddings, edge_index, top_k=100):
    """Score the given edges by cosine similarity and keep the best ``top_k``.

    The previous version ignored both arguments, scored notebook-level globals
    and returned every edge next to only 100 confidence values. Here the
    returned edges and confidences always have the same length.

    Args:
        embeddings: tensor with one row per node.
        edge_index: two-row tensor of node pairs to score.
        top_k: how many of the best-scoring edges to keep. ``None`` keeps all
            edges in their original order. Values above the number of edges
            are clamped.

    Returns:
        ``(edges, confidence)`` as NumPy arrays of shape ``[2, k]`` and
        ``[k]``. With ``top_k`` set they are ordered from highest to lowest
        score.
    """
    src_emb = embeddings[edge_index[0]]
    dst_emb = embeddings[edge_index[1]]
    scores = torch.cosine_similarity(src_emb, dst_emb).detach()

    if top_k is not None:
        k = min(top_k, scores.numel())
        scores, keep = scores.topk(k)
        # Scores follow the embeddings' device; the index may live elsewhere.
        edge_index = edge_index[:, keep.to(edge_index.device)]

    return (edge_index.detach().cpu().numpy(), scores.cpu().numpy())

def _edges_to_numpy(edge_array):
    """Return ``edge_array`` as NumPy, moving torch tensors to host memory first."""
    if isinstance(edge_array, torch.Tensor):
        return edge_array.detach().cpu().numpy()
    return edge_array


def draw_confidence_graph(edges, confidence, original_edge_index=None, threshold=None, figsize=(12, 16)):
    """
    Draw graph with confidence-colored edges and show a matching histogram

    Edges found in ``original_edge_index`` (in either direction) are drawn in
    black. All other edges count as reconstructed and are coloured by their
    confidence value. No edge is filtered out by ``threshold``.

    Parameters:
    -----------
    edges : numpy.ndarray or torch.Tensor
        Edge list with shape (2, num_edges)
    confidence : numpy.ndarray
        Confidence scores; ``confidence[i]`` is read for every edge ``i`` that
        is not an original edge
    original_edge_index : torch.Tensor or numpy.ndarray, optional
        The original edge indices from the dataset
    threshold : float or None
        If provided, shows a line at this threshold in the histogram
        and will be used as the minimum value for the colormap
    figsize : tuple
        Figure size for the plot (width, height)

    Returns:
    --------
    (fig, (ax1, ax2)) : the figure, the graph axes and the histogram axes
    """
    # Stacked subplots: graph on top (3 parts), histogram below (1 part)
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=figsize,
                                  gridspec_kw={'height_ratios': [3, 1]})

    # Undirected lookup of the original edges, stored as (low, high) int tuples
    original_edges_set = set()
    if original_edge_index is not None:
        original_edge_index = _edges_to_numpy(original_edge_index)
        for i in range(original_edge_index.shape[1]):
            a, b = int(original_edge_index[0, i]), int(original_edge_index[1, i])
            original_edges_set.add((a, b) if a <= b else (b, a))

    # Separate edges into original (existing) and reconstructed
    edges = _edges_to_numpy(edges)
    original_edge_tuples = []
    reconstructed_edge_tuples = []
    reconstructed_confidence = []
    for i in range(edges.shape[1]):
        u, v = int(edges[0, i]), int(edges[1, i])
        key = (u, v) if u <= v else (v, u)
        if key in original_edges_set:
            original_edge_tuples.append((u, v))
        else:
            reconstructed_edge_tuples.append((u, v))
            reconstructed_confidence.append(confidence[i])

    # Build the graph from all unique edges, then attach the attributes
    G = nx.Graph()
    G.add_edges_from(list(set(original_edge_tuples + reconstructed_edge_tuples)))

    for (u, v), score in zip(reconstructed_edge_tuples, reconstructed_confidence):
        G[u][v]['confidence'] = score
        G[u][v]['original'] = False  # Explicitly mark as not original

    for u, v in original_edge_tuples:
        if G.has_edge(u, v):
            G[u][v]['original'] = True

    # Color mapping for reconstructed edges
    cmap = plt.cm.viridis
    if reconstructed_confidence:
        vmin = threshold if threshold is not None else min(reconstructed_confidence)
        vmax = max(reconstructed_confidence)
    else:
        vmin, vmax = 0, 1  # Default if no reconstructed edges
    norm = plt.Normalize(vmin=vmin, vmax=vmax)

    # Draw the graph on the top subplot
    pos = nx.spring_layout(G, seed=42)  # Fixed seed for reproducibility
    nx.draw_networkx_nodes(G, pos, node_size=2, node_color='gray', ax=ax1)

    # Original edges in black
    original_edges_to_draw = [(u, v) for u, v, d in G.edges(data=True) if d.get('original', False)]
    if original_edges_to_draw:
        nx.draw_networkx_edges(G, pos,
                             edgelist=original_edges_to_draw,
                             edge_color='black',
                             width=0.2,
                             alpha=0.9,
                             ax=ax1,
                             label='Original Edges')

    # Reconstructed edges with confidence colors
    reconstructed_edges_to_draw = [(u, v) for u, v, d in G.edges(data=True)
                                   if not d.get('original', True) and 'confidence' in d]
    if reconstructed_edges_to_draw:
        edge_colors = [G[u][v]['confidence'] for u, v in reconstructed_edges_to_draw]
        nx.draw_networkx_edges(G, pos,
                             edgelist=reconstructed_edges_to_draw,
                             edge_color=edge_colors,
                             edge_cmap=cmap,
                             edge_vmin=vmin,
                             edge_vmax=vmax,
                             width=3.5,
                             alpha=0.7,
                             ax=ax1,
                             label='Reconstructed Edges')

        # Colorbar for edge confidence
        sm = plt.cm.ScalarMappable(norm=norm, cmap=cmap)
        sm.set_array([])
        fig.colorbar(sm, ax=ax1, label='Confidence Score', orientation='vertical',
                     pad=0.01, fraction=0.05)

    ax1.set_title("Graph with Original (Black) and Reconstructed (Colored) Edges", fontsize=14)
    ax1.axis('off')

    # Legend entries only for the edge types that were actually drawn
    if original_edges_to_draw or reconstructed_edges_to_draw:
        from matplotlib.lines import Line2D
        legend_elements = []
        if original_edges_to_draw:
            legend_elements.append(Line2D([0], [0], color='black', lw=4, label='Original Edges'))
        if reconstructed_edges_to_draw:
            # cmap(0.5) gives an intermediate color from the colormap for the legend
            legend_elements.append(Line2D([0], [0], color=cmap(0.5 if reconstructed_confidence else 0),
                                          lw=3.5, label='Reconstructed Edges'))
        if legend_elements:
            ax1.legend(handles=legend_elements, loc='upper right')

    # Histogram of reconstructed edge confidence on the bottom subplot
    if reconstructed_confidence:
        bins = np.linspace(min(reconstructed_confidence), max(reconstructed_confidence), 20)
        ax2.hist(reconstructed_confidence, bins=bins, color='skyblue', edgecolor='black', alpha=0.7)
        ax2.set_title("Distribution of Reconstructed Edge Confidence Scores", fontsize=12)
        ax2.set_xlabel("Confidence Score")
        ax2.set_ylabel("Frequency")
        ax2.grid(alpha=0.3)

        # Vertical line for threshold if provided
        if threshold is not None:
            ax2.axvline(x=threshold, color='red', linestyle='--', label=f'Threshold: {threshold:.2f}')
            ax2.legend()

        # Summary statistics shown inside the histogram (computed once)
        avg_confidence = np.mean(reconstructed_confidence)
        median_confidence = np.median(reconstructed_confidence)

        stats_text = (f"Statistics (Reconstructed Edges):\n"
                     f"Mean: {avg_confidence:.3f}\n"
                     f"Median: {median_confidence:.3f}\n"
                     f"Min: {min(reconstructed_confidence):.3f}\n"
                     f"Max: {max(reconstructed_confidence):.3f}\n"
                     f"Edges: {len(reconstructed_confidence)}")

        ax2.text(0.05, 0.95, stats_text, transform=ax2.transAxes,
                 verticalalignment='top', bbox=dict(boxstyle='round', alpha=0.1))
    else:
        ax2.text(0.5, 0.5, "No reconstructed edges to display",
                 transform=ax2.transAxes, ha='center', fontsize=12)

    plt.tight_layout()
    return fig, (ax1, ax2)

In [None]:
# Uses the trained `model` and the `dataset` left behind by the experiment cell.
embeddings = get_node_embeddings(model=model, data=dataset.data)
edges, confidence = calculate_confidence(embeddings=embeddings, edge_index=dataset.data.edge_index)

# Graph plus histogram, with the dataset's own edges supplied as the originals.
# NOTE(review): the edges passed in come from the same edge_index that is given
# as `original_edge_index`, so every edge is classified as original - confirm
# whether candidate edges were meant to be scored here instead.
fig, (graph_ax, hist_ax) = draw_confidence_graph(
    edges=edges,
    confidence=confidence,
    original_edge_index=dataset.data.edge_index,
    threshold=0.1,
)
plt.show()
