# Basic Model Preparation

In [1]:
import numpy as np
import random
import torch

def set_seed(seed):
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# 设定种子
set_seed(18)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import argparse
from types import SimpleNamespace
import sys
sys.path.append('/data/lige/HKN')# Please change accordingly!

from __future__ import division
from __future__ import print_function

from geoopt import ManifoldParameter as geoopt_ManifoldParameter
from manifolds.base import ManifoldParameter as base_ManifoldParameter

import datetime
import json
import logging
from optim import RiemannianAdam, RiemannianSGD
import os
import pickle
import time

import numpy as np
import torch
from config import parser
from models.base_models import NCModel, LPModel, GCModel
from utils.data_utils import load_data, get_nei, GCDataset, split_batch
from utils.train_utils import get_dir_name, format_metrics
from utils.eval_utils import acc_f1

from geoopt import ManifoldParameter as geoopt_ManifoldParameter
from manifolds.base import ManifoldParameter as base_ManifoldParameter

#import torch.nn.functional as F


config_args = {
    'training_config': {
        'lr': (1e-4, 'learning rate'),
        'dropout': (0.25, 'dropout probability'),
        'cuda': (0, 'which cuda device to use (-1 for cpu training)'),
        'epochs': (1000, 'maximum number of epochs to train for'),
        'weight_decay': (1e-4, 'l2 regularization strength'),
        'optimizer': ('radam', 'which optimizer to use, can be any of [rsgd, radam]'),
        'momentum': (0.999, 'momentum in optimizer'),
        'patience': (15, 'patience for early stopping'),
        'seed': (18, 'seed for training'),
        'log_freq': (1, 'how often to compute print train/val metrics (in epochs)'),
        'eval_freq': (1, 'how often to compute val metrics (in epochs)'),
        'save': (0, '1 to save model and logs and 0 otherwise'),
        'save_dir': (None, 'path to save training logs and model weights (defaults to logs/task/date/run/)'),
        'sweep_c': (0, ''),
        'lr_reduce_freq': (None, 'reduce lr every lr-reduce-freq or None to keep lr constant'),
        'gamma': (0.5, 'gamma for lr scheduler'),
        'print_epoch': (True, ''),
        'grad_clip': (None, 'max norm for gradient clipping, or None for no gradient clipping'),
        'min_epochs': (300, 'do not early stop before min-epochs')
    },
    'model_config': {
        'use_geoopt': (False, "which manifold class to use, if false then use basd.manifold"),
        'AggKlein':(False, "if false, then use hyperboloid centorid for aggregation"),
        'corr': (0,'0: d(x_i ominus x, x_k), 1: d(x_ik,x_k)'),
        'task': ('nc', 'which tasks to train on, can be any of [lp, nc]'),
        'model': ('BKNet', 'which encoder to use, can be any of [Shallow, MLP, HNN, GCN, GAT, HyperGCN, HyboNet,BKNet,BMLP]'),
        'dim': (32, 'embedding dimension'),
        'manifold': ('PoincareBall', 'which manifold to use, can be any of [Euclidean, Hyperboloid, PoincareBall, Lorentz]'),
        'c': (1.0, 'hyperbolic radius, set to None for trainable curvature'),
        'r': (2., 'fermi-dirac decoder parameter for lp'),
        't': (1., 'fermi-dirac decoder parameter for lp'),
        'margin': (2., 'margin of MarginLoss'),
        'pretrained_embeddings': (None, 'path to pretrained embeddings (.npy file) for Shallow node classification'),
        'pos_weight': (0, 'whether to upweight positive class in node classification tasks'),
        'num_layers': (2, 'number of hidden layers in encoder'),
        'bias': (1, 'whether to use bias (1) or not (0)'),
        'act': ('relu', 'which activation function to use (or None for no activation)'),
        'n_heads': (4, 'number of attention heads for graph attention networks, must be a divisor dim'),
        'alpha': (0.2, 'alpha for leakyrelu in graph attention networks'),
        'double_precision': ('1', 'whether to use double precision'),
        'use_att': (0, 'whether to use hyperbolic attention or not'),
        'local_agg': (0, 'whether to local tangent space aggregation or not'),
        'kernel_size': (6, 'number of kernels'),
        'KP_extent': (0.66, 'influence radius of each kernel point'),
        'radius': (1, 'radius used for kernel point init'),
        'deformable': (False, 'deformable kernel'),
        'linear_before': (64, 'dim of linear before gcn')#64
    },
    'data_config': {
        'dataset': ('wisconsin', 'which dataset to use(cornell,wisconsin,squirrel,cora)'),
        'batch_size': (32, 'batch size for gc'),
        'val_prop': (0.05, 'proportion of validation edges for link prediction'),
        'test_prop': (0.1, 'proportion of test edges for link prediction'),
        'use_feats': (1, 'whether to use node features or not'),
        'normalize_feats': (1, 'whether to normalize input node features'),
        'normalize_adj': (1, 'whether to row-normalize the adjacency matrix'),
        'split_seed': (1234, 'seed for data splits (train/test/val)'),
        'split_graph': (False, 'whether to split the graph')
    }
}

# 将所有参数转换为 SimpleNamespace
args = SimpleNamespace(
    **{k: v[0] for config in config_args.values() for k, v in config.items()}
)

#choose which manifold class to follow 
if args.use_geoopt == False:
    ManifoldParameter = base_ManifoldParameter
else:
    ManifoldParameter = geoopt_ManifoldParameter
np.random.seed(args.seed)#args.seed
torch.manual_seed(args.seed)#args.seed
if int(args.cuda):#args.double_precision
    torch.set_default_dtype(torch.float64)
if int(args.cuda) >= 0:#args.cuda
    torch.cuda.manual_seed(args.seed)#args.seed
args.device = 'cuda:' + str(args.cuda) if int(args.cuda) >= 0 else 'cpu' #args.device actually,<-args.cuda
args.patience = args.epochs if not args.patience else args.patience #args.patience<-args.epochs|args.patience

print(f'Using: {args.device}')
print("Using seed {}.".format(args.seed))
print(f"Dataset: {args.dataset}")

# Load data
data = load_data(args, os.path.join('data', args.dataset))
if args.task == 'gc':
    args.n_nodes, args.feat_dim = data['features'][0].shape
else:
    args.n_nodes, args.feat_dim = data['features'].shape
if args.task == 'nc':
    Model = NCModel
    args.n_classes = int(data['labels'].max() + 1)
    args.data = data
    print(f'Num classes: {args.n_classes}')
elif args.task == 'gc':
    Model = GCModel
    args.n_classes = int(data['labels'].max() + 1)
    print(f'Num classes: {args.n_classes}')
else:
    args.nb_false_edges = len(data['train_edges_false'])
    args.nb_edges = len(data['train_edges'])
    if args.task == 'lp':
        Model = LPModel
        args.n_classes = 2

if not args.lr_reduce_freq:
    args.lr_reduce_freq = args.epochs


###A simple check on data
print(data.keys())
print(data['adj_train'].todense().shape)
print(data['features'].shape)
###A simple check on data

# Model and optimizer
model = Model(args)
print(str(model))
no_decay = ['bias', 'scale']
optimizer_grouped_parameters = [{
    'params': [
        p for n, p in model.named_parameters()
        if p.requires_grad and not any(
            nd in n
            for nd in no_decay) and not isinstance(p, ManifoldParameter)
    ],
    'weight_decay':
    args.weight_decay
}, {
    'params': [
        p for n, p in model.named_parameters() if p.requires_grad and any(
            nd in n
            for nd in no_decay) or isinstance(p, ManifoldParameter)
    ],
    'weight_decay':
    0.0
}]
if args.optimizer == 'radam':
    optimizer = RiemannianAdam(params=optimizer_grouped_parameters,
                                lr=args.lr,
                                stabilize=10)
elif args.optimizer == 'rsgd':
    optimizer = RiemannianSGD(params=optimizer_grouped_parameters,
                                lr=args.lr,
                                stabilize=10)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=int(
                                                args.lr_reduce_freq),
                                                gamma=float(args.gamma))
tot_params = sum([np.prod(p.size()) for p in model.parameters()])
model = model.to(args.device)
for x, val in data.items():
    if torch.is_tensor(data[x]):
        data[x] = data[x].to(args.device)
print(f"Total number of parameters: {tot_params}")

# Train model for nc:
t_total = time.time()
counter = 0
best_val_metrics = model.init_metric_dict()
best_test_metrics = None
best_emb = None
if args.n_classes > 2:
    f1_average = 'micro'
else:
    f1_average = 'binary'

if args.model == 'HKPNet':
    nei, nei_mask = get_nei(data['adj_train'])
    nei = nei.to(args.device)
    nei_mask = nei_mask.to(args.device)
elif args.model == 'BKNet':
    nei, nei_mask = get_nei(data['adj_train'])
    nei = nei.to(args.device)
    nei_mask = nei_mask.to(args.device) #nei/nei_mask on cuda now


Using: cuda:0
Using seed 18.
Dataset: wisconsin
Num classes: 5
dict_keys(['adj_train', 'features', 'labels', 'idx_train', 'idx_val', 'idx_test', 'adj_train_norm'])
(251, 251)
torch.Size([251, 1703])


  adj = nx.adjacency_matrix(G, sorted(G.nodes()))


NCModel(
  (encoder): BKNet(
    (linear_before): BLinear(
      in_features=1703, out_features=64, c=tensor([1.], device='cuda:0'), use_bias=1, act=None, dropout_rate=0.25
      (dropout): Dropout(p=0.25, inplace=False)
      (E_linear): Linear(in_features=1703, out_features=64, bias=False)
    )
    (layers): Sequential(
      (0): KPGraphConvolution(
        (net): KernelPointAggregation(
          (linears): ModuleList(
            (0): BLinear(
              in_features=64, out_features=32, c=tensor([1.], device='cuda:0'), use_bias=1, act=None, dropout_rate=0.25
              (dropout): Dropout(p=0.25, inplace=False)
              (E_linear): Linear(in_features=64, out_features=32, bias=False)
            )
            (1): BLinear(
              in_features=64, out_features=32, c=tensor([1.], device='cuda:0'), use_bias=1, act=None, dropout_rate=0.25
              (dropout): Dropout(p=0.25, inplace=False)
              (E_linear): Linear(in_features=64, out_features=32, bias=False

## Check whether nei,nei_mask,x_nei are correct

In [3]:
"""
Basic datas:
    Parameters:
    adj (scipy.sparse matrix): A sparse adjacency matrix representing the graph.

    Returns:
    nei (torch.Tensor): A tensor of shape (n, num_nei) where each row contains the indices of
                        the neighbors of the corresponding node, padded with 0s to the maximum 
                        number of neighbors.
    nei_mask (torch.Tensor): A mask tensor of the same shape as nei, where a value of 1 indicates
                             a valid neighbor index and 0 indicates padding.

    Example:
    Given an adjacency matrix adj:
    [[0, 1, 0, 0],
     [1, 0, 1, 0],
     [0, 1, 0, 1],
     [0, 0, 1, 0]]

    The graph is:
    Node 0 is connected to Node 1
    Node 1 is connected to Node 0 and Node 2
    Node 2 is connected to Node 1 and Node 3
    Node 3 is connected to Node 2

    n = 4  # Number of nodes
    num_nei = 2  # Maximum number of neighbors (Node 1 and Node 2 have the most neighbors, 2)

    adj_list = [[1], [0, 2], [1, 3], [2]]

    The function generates:
    nei = [
        [1, 0],    # Neighbors of Node 0 (padded with 0)
        [0, 2],    # Neighbors of Node 1
        [1, 3],    # Neighbors of Node 2
        [2, 0]     # Neighbors of Node 3 (padded with 0)
    ]

    nei_mask = [
        [1, 0],    # Mask for Node 0 (1 for valid neighbor, 0 for padding)
        [1, 1],    # Mask for Node 1
        [1, 1],    # Mask for Node 2
        [1, 0]     # Mask for Node 3 (1 for valid neighbor, 0 for padding)
    ]
    It tells which are paddings and which are not!
"""
#This construction is different from what we usually have as an adjacency matrix
print(nei.shape,nei_mask.shape)
print("This construction is different from what we usually have as an adjacency matrix.")
print("Checked it's correct")

torch.Size([251, 122]) torch.Size([251, 122])
This construction is different from what we usually have as an adjacency matrix.
Checked it's correct


# Check steps before aggregation

In [4]:
#import func_outof_class as foc
import layers.B_layers as B_layers
print(data['features'].shape)
print(torch.norm(data['features'], p=2, dim=1).max())
print(torch.norm(data['features'], p=2, dim=1).min())

x = data['features'] #(n,d)
x_nei = B_layers.gather(x, nei) #(n,nei_num,d)

x=x.to(torch.float64)
x_tan = model.manifold.proj_tan0(x, model.c)
x_hyp = model.manifold.expmap0(x_tan, c=model.c)
x = model.manifold.proj(x_hyp, c=model.c)

x=model.encoder.linear_before(x)

nei, nei_mask = (nei, nei_mask) #(nei, nei_mask) is adj in the train.py code
print(x.shape,nei.shape,nei_mask.shape)
input = (x, nei, nei_mask)
#model.encoder.layers.forward(input) This is how data flows, but we'll break it done here
#model.encoder.layers[0].net #This is the KernelPointAggregation Module
x_nei = B_layers.gather(x, nei) #(n,nei_num,d')
print(x_nei.shape)

#Let's check the correspondence is correct
print(torch.equal(x[nei[0][0]],x_nei[0][0]),torch.equal(x[nei[-1][1]],x_nei[-1][1]))
#Note: although it seems many rows have numbers, most are paddings that will be deactivated by nei_mask

#if transp:
print("Previous x_nei[1]:\n",x_nei[1])
print("Previous x_nei[1][2] Euclidean norm",torch.norm(x_nei[1],dim=-1)[2])
x, x_nei = model.encoder.layers[0].net.transport_x(x, x_nei)
print("Translated x_nei[1]:\n",x_nei[1])
print("Translated x_nei[1][2] Euclidean norm",torch.norm(x_nei[1],dim=-1)[2])


n, nei_num, d = x_nei.shape

kernels=model.encoder.layers[0].net.get_kernel_pos(x, nei, nei_mask, sample=False, sample_num=16, transp= False)#(n,k,d')
print("Since we are moving neighborhood to the origin kernel, each kernels (k,d') should be identical")
print(torch.equal(kernels[0],kernels[-1]))

#The following function might be incorrect!!!!!

#x_nei_kernel_dis=model.encoder.layers[0].net.get_nei_kernel_dis(kernels,x_nei)# (n, k, nei_num)

torch.Size([251, 1703])
tensor(1., device='cuda:0')
tensor(0.0489, device='cuda:0')
torch.Size([251, 64]) torch.Size([251, 122]) torch.Size([251, 122])
torch.Size([251, 122, 64])
True True
Previous x_nei[1]:
 tensor([[-0.7014,  0.0036, -0.7042,  ...,  0.0049, -0.0241, -0.0178],
        [-0.7014,  0.0036, -0.7042,  ...,  0.0049, -0.0241, -0.0178],
        [-0.7014,  0.0036, -0.7042,  ...,  0.0049, -0.0241, -0.0178],
        ...,
        [-0.7014,  0.0036, -0.7042,  ...,  0.0049, -0.0241, -0.0178],
        [-0.7014,  0.0036, -0.7042,  ...,  0.0049, -0.0241, -0.0178],
        [-0.7014,  0.0036, -0.7042,  ...,  0.0049, -0.0241, -0.0178]],
       device='cuda:0', dtype=torch.float64, grad_fn=<SelectBackward0>)
Previous x_nei[1][2] Euclidean norm tensor(1.0000, device='cuda:0', dtype=torch.float64, grad_fn=<SelectBackward0>)
Translated x_nei[1]:
 tensor([[7.2325e-01, 1.6436e-16, 6.8446e-01,  ..., 1.7814e-03, 5.9243e-03,
         3.1975e-02],
        [7.2325e-01, 1.6436e-16, 6.8446e-01,  ...,

In [5]:
import math as math
print(math.isclose(torch.sqrt(model.manifold.sqdist(x_nei[0][0],kernels[0][1],c=model.c)).item(),\
model.encoder.layers[0].net.get_nei_kernel_dis(kernels,x_nei)[0][1][0].item(),abs_tol=1e-5))
#Note we use math.isclose because they are numerically a bit different
print("So that we've checked this get_nei_kernel_dis func is correct")
x_nei_kernel_dis=model.encoder.layers[0].net.get_nei_kernel_dis(kernels,x_nei)# (n, k, nei_num)


False
So that we've checked this get_nei_kernel_dis func is correct


In [6]:
print("Now, we check whether nei_mask helps to filter out padding element\n")
print(nei_mask.shape)
nei_mask = nei_mask.repeat(1,1,model.encoder.layers[0].net.K).view(n,model.encoder.layers[0].net.K,nei_num)#(n, k, nei_num)
print(nei_mask.shape)
print("Since padding element are not actually connected to center node, we don't need them in outer aggregation,\n\
       so they are also not needed for inner aggregation, we use nei_mask to get rid of them.\n\
       Making use of element wise product of nei_mask.\n\
       I think this step is correct, let's check:")

print("Previous x_nei_kernel_dis(n,k,nei_num)[0][1]: \n",x_nei_kernel_dis[0][1])
x_nei_kernel_dis = x_nei_kernel_dis * nei_mask
print("After mask, x_nei_kernel_dis(n,k,nei_num)[0][1]: \n",x_nei_kernel_dis[0][1])

Now, we check whether nei_mask helps to filter out padding element

torch.Size([251, 122])
torch.Size([251, 6, 122])
Since padding element are not actually connected to center node, we don't need them in outer aggregation,
       so they are also not needed for inner aggregation, we use nei_mask to get rid of them.
       Making use of element wise product of nei_mask.
       I think this step is correct, let's check:
Previous x_nei_kernel_dis(n,k,nei_num)[0][1]: 
 tensor([12.2061, 12.2061, 12.2061, 12.2061, 12.2061,  0.6600,  0.6600,  0.6600,
         0.6600,  0.6600,  0.6600,  0.6600,  0.6600,  0.6600,  0.6600,  0.6600,
         0.6600,  0.6600,  0.6600,  0.6600,  0.6600,  0.6600,  0.6600,  0.6600,
         0.6600,  0.6600,  0.6600,  0.6600,  0.6600,  0.6600,  0.6600,  0.6600,
         0.6600,  0.6600,  0.6600,  0.6600,  0.6600,  0.6600,  0.6600,  0.6600,
         0.6600,  0.6600,  0.6600,  0.6600,  0.6600,  0.6600,  0.6600,  0.6600,
         0.6600,  0.6600,  0.6600,  0.6600,  0.660

In [7]:
print("Now, we check whether apply_kernel_transform is correct\n")
print(x_nei.shape)
x_nei_transform = model.encoder.layers[0].net.apply_kernel_transform(x_nei)
print(x_nei_transform.shape)
print("Seems to be correct")

Now, we check whether apply_kernel_transform is correct

torch.Size([251, 122, 64])
torch.Size([251, 6, 122, 32])
Seems to be correct


# Check steps for aggregation

## Hyperboloid Centroid

In [8]:
print("Using Hyperboloid Centroid for Aggregation")
#model.encoder.layers[0].net.
print(x_nei_transform[0][0].shape)
print(model.manifold.poincare_to_hyperboloid(x_nei_transform[0][0],c=model.c).shape)

print(x_nei_transform.shape)
print(model.manifold.poincare_to_hyperboloid(x_nei_transform,c=model.c).shape)
print("Dimensions seems to be correct, we need to further check whether things are on hyperboloid!")

hyperboloid_x_nei_transform = model.manifold.poincare_to_hyperboloid(x_nei_transform,c=model.c)#(n,K,nei_num,d')
a=torch.allclose(
    model.manifold.hyperboloid_proj(hyperboloid_x_nei_transform, model.c),
    hyperboloid_x_nei_transform,
    rtol=1e-5,  # 相对容差
    atol=1e-5   # 绝对容差
)
print("Whether mapped points are on the hyperboloid: ",a,"\n")
print("We don't use equal, because after testing, it seems they're not absolutely equal,\n\
so I think such projection is neccesary to constrain points on the manifold")

hyperboloid_x_nei_transform = model.manifold.hyperboloid_proj(hyperboloid_x_nei_transform, model.c)
print(hyperboloid_x_nei_transform.shape,x_nei_kernel_dis.shape)
hyperboloid_x_nei_transform=model.encoder.layers[0].net.avg_kernel(hyperboloid_x_nei_transform,x_nei_kernel_dis,False)#inner_agg#(n,nei_num,d') in hyperboloid
print("After inner aggregation,hyperboloid_x_nei_transform: ",hyperboloid_x_nei_transform.shape)
hyperboloid_x_final=model.manifold.hyperboloid_centroid(hyperboloid_x_nei_transform,model.c)#outer_agg#(n,d')# on hyperboloid
print("After outer aggregation,hyperboloid_x_final: ",hyperboloid_x_final.shape)
x_final=model.manifold.hyperboloid_to_poincare(hyperboloid_x_final,c=model.c)#(n,d')# in Poincare
x_final = model.manifold.proj(x_final,c=model.c)

Using Hyperboloid Centroid for Aggregation
torch.Size([122, 32])
torch.Size([122, 33])
torch.Size([251, 6, 122, 32])
torch.Size([251, 6, 122, 33])
Dimensions seems to be correct, we need to further check whether things are on hyperboloid!
Whether mapped points are on the hyperboloid:  True 

We don't use equal, because after testing, it seems they're not absolutely equal,
so I think such projection is neccesary to constrain points on the manifold
torch.Size([251, 6, 122, 33]) torch.Size([251, 6, 122])
After inner aggregation,hyperboloid_x_nei_transform:  torch.Size([251, 122, 33])
After outer aggregation,hyperboloid_x_final:  torch.Size([251, 33])


In [9]:
import torch.nn.functional as F
w=torch.sqrt(model.manifold.sqdist(torch.zeros_like(hyperboloid_x_nei_transform),hyperboloid_x_nei_transform,c=model.c))
print(w.shape,'\n',w)
F.softmax(w, dim=-1)

torch.Size([251, 122]) 
 tensor([[12.2061, 12.2061, 12.2061,  ..., 12.2061, 12.2061, 12.2061],
        [12.2061, 12.2061, 12.2061,  ..., 12.2061, 12.2061, 12.2061],
        [12.2061, 12.2061, 12.2061,  ..., 12.2061, 12.2061, 12.2061],
        ...,
        [12.2061, 12.2061, 12.2061,  ..., 12.2061, 12.2061, 12.2061],
        [12.2061, 12.2061, 12.2061,  ..., 12.2061, 12.2061, 12.2061],
        [12.2061, 12.2061, 12.2061,  ..., 12.2061, 12.2061, 12.2061]],
       device='cuda:0', dtype=torch.float64, grad_fn=<SqrtBackward0>)


tensor([[0.0082, 0.0082, 0.0082,  ..., 0.0082, 0.0082, 0.0082],
        [0.0082, 0.0082, 0.0082,  ..., 0.0082, 0.0082, 0.0082],
        [0.0082, 0.0082, 0.0082,  ..., 0.0082, 0.0082, 0.0082],
        ...,
        [0.0082, 0.0082, 0.0082,  ..., 0.0082, 0.0082, 0.0082],
        [0.0082, 0.0082, 0.0082,  ..., 0.0082, 0.0082, 0.0082],
        [0.0082, 0.0082, 0.0082,  ..., 0.0082, 0.0082, 0.0082]],
       device='cuda:0', dtype=torch.float64, grad_fn=<SoftmaxBackward0>)

In [10]:
torch.norm(x_final,dim=-1).max()

tensor(0.4810, device='cuda:0', dtype=torch.float64, grad_fn=<MaxBackward1>)

## Klein Midpoint

In [13]:
print("Using Klein MidPoint for Aggregation")
#model.encoder.layers[0].net.
print(x_nei_transform.shape)
print(model.manifold.poincare_to_klein(x_nei_transform,c=model.c).shape)
print("Dimensions seems to be correct, we need to further check whether things are on klein disk!")

seed = 42
torch.manual_seed(seed)
np.random.seed(seed)


x_nei_transform = 0.08*(torch.randn(x_nei_transform.shape[0],x_nei_transform.shape[1],
                                x_nei_transform.shape[2],x_nei_transform.shape[3])).to(model.c.device)
x_nei_transform = model.manifold.proj(x_nei_transform, model.c)

klein_x_nei_transform = model.manifold.poincare_to_klein(x_nei_transform,c=model.c)#(n,K,nei_num,d')
#torch.norm(klein_x_nei_transform,dim=-1)#.shape
a=torch.allclose(
    model.manifold.klein_proj(klein_x_nei_transform, model.c),
    klein_x_nei_transform,
    rtol=1e-5,  # 相对容差
    atol=1e-5   # 绝对容差
)
print("Whether mapped points are on the klein disk: ",a,"\n")
print("We don't use equal, because after testing, it seems they're not absolutely equal,\n\
so I think such projection is neccesary to constrain points on the manifold")

klein_x_nei_transform = model.manifold.klein_proj(klein_x_nei_transform, model.c)
print(klein_x_nei_transform.shape,x_nei_kernel_dis.shape)

#klein_x_nei_transform=model.encoder.layers[0].net.avg_kernel(klein_x_nei_transform,x_nei_kernel_dis,False)#inner_agg#(n,nei_num,d') in hyperboloid

Using Klein MidPoint for Aggregation
torch.Size([251, 6, 122, 32])
torch.Size([251, 6, 122, 32])
Dimensions seems to be correct, we need to further check whether things are on klein disk!
Whether mapped points are on the klein disk:  True 

We don't use equal, because after testing, it seems they're not absolutely equal,
so I think such projection is neccesary to constrain points on the manifold
torch.Size([251, 6, 122, 32]) torch.Size([251, 6, 122])


RuntimeError: expected scalar type Double but found Float

In [None]:
hyperboloid_x_nei_transform = model.manifold.hyperboloid_proj(hyperboloid_x_nei_transform, model.c)
print(hyperboloid_x_nei_transform.shape,x_nei_kernel_dis.shape)
hyperboloid_x_nei_transform=model.encoder.layers[0].net.avg_kernel(hyperboloid_x_nei_transform,x_nei_kernel_dis,False)#inner_agg#(n,nei_num,d') in hyperboloid
print("After inner aggregation,hyperboloid_x_nei_transform: ",hyperboloid_x_nei_transform.shape)
hyperboloid_x_final=model.manifold.hyperboloid_centroid(hyperboloid_x_nei_transform,model.c)#outer_agg#(n,d')# on hyperboloid
print("After outer aggregation,hyperboloid_x_final: ",hyperboloid_x_final.shape)
x_final=model.manifold.hyperboloid_to_poincare(hyperboloid_x_final,c=model.c)#(n,d')# in Poincare
x_final = model.manifold.proj(x_final,c=model.c)

# Test Hyperboloid Aggregation

In [2]:
import argparse
from types import SimpleNamespace
import sys
sys.path.append('/data/lige/HKN')# Please change accordingly!

from __future__ import division
from __future__ import print_function

from geoopt import ManifoldParameter as geoopt_ManifoldParameter
from manifolds.base import ManifoldParameter as base_ManifoldParameter

import datetime
import json
import logging
from optim import RiemannianAdam, RiemannianSGD
import os
import pickle
import time

import numpy as np
import torch
from config import parser
from models.base_models import NCModel, LPModel, GCModel
from utils.data_utils import load_data, get_nei, GCDataset, split_batch
from utils.train_utils import get_dir_name, format_metrics
from utils.eval_utils import acc_f1

from geoopt import ManifoldParameter as geoopt_ManifoldParameter
from manifolds.base import ManifoldParameter as base_ManifoldParameter


os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
torch.cuda.empty_cache()

#import torch.nn.functional as F


config_args = {
    'training_config': {
        'lr': (1e-3, 'learning rate'),
        'dropout': (0.5, 'dropout probability'),
        'cuda': (0, 'which cuda device to use (-1 for cpu training)'),
        'epochs': (5000, 'maximum number of epochs to train for'),
        'weight_decay': (1e-3, 'l2 regularization strength'),
        'optimizer': ('radam', 'which optimizer to use, can be any of [rsgd, radam]'),
        'momentum': (0.999, 'momentum in optimizer'),
        'patience': (250, 'patience for early stopping'),
        'seed': (8, 'seed for training'),
        'log_freq': (1, 'how often to compute print train/val metrics (in epochs)'),
        'eval_freq': (1, 'how often to compute val metrics (in epochs)'),
        'save': (0, '1 to save model and logs and 0 otherwise'),
        'save_dir': (None, 'path to save training logs and model weights (defaults to logs/task/date/run/)'),
        'sweep_c': (0, ''),
        'lr_reduce_freq': (None, 'reduce lr every lr-reduce-freq or None to keep lr constant'),
        'gamma': (0.5, 'gamma for lr scheduler'),
        'print_epoch': (True, ''),
        'grad_clip': (None, 'max norm for gradient clipping, or None for no gradient clipping'),
        'min_epochs': (2500, 'do not early stop before min-epochs')
    },
    'model_config': {
        'use_geoopt': (False, "which manifold class to use, if false then use basd.manifold"),
        'AggKlein':(False, "if false, then use hyperboloid centorid for aggregation"),
        'corr': (0,'0: d(x_i ominus x, x_k), 1: d(x_ik,x_k)'),
        'task': ('nc', 'which tasks to train on, can be any of [lp, nc]'),
        'model': ('BKNet', 'which encoder to use, can be any of [Shallow, MLP, HNN, GCN, GAT, HyperGCN, HyboNet,BKNet,BMLP]'),
        #'dim': (64, 'embedding dimension'),
        'dim': (16, 'embedding dimension'),
        'manifold': ('PoincareBall', 'which manifold to use, can be any of [Euclidean, Hyperboloid, PoincareBall, Lorentz]'),
        'c': (1.0, 'hyperbolic radius, set to None for trainable curvature'),
        'r': (2., 'fermi-dirac decoder parameter for lp'),
        't': (1., 'fermi-dirac decoder parameter for lp'),
        'margin': (2., 'margin of MarginLoss'),
        'pretrained_embeddings': (None, 'path to pretrained embeddings (.npy file) for Shallow node classification'),
        'pos_weight': (0, 'whether to upweight positive class in node classification tasks'),
        'num_layers': (2, 'number of hidden layers in encoder'),
        'bias': (1, 'whether to use bias (1) or not (0)'),
        'act': ('relu', 'which activation function to use (or None for no activation)'),
        'n_heads': (4, 'number of attention heads for graph attention networks, must be a divisor dim'),
        'alpha': (0.2, 'alpha for leakyrelu in graph attention networks'),
        'double_precision': ('1', 'whether to use double precision'),
        'use_att': (0, 'whether to use hyperbolic attention or not'),
        'local_agg': (0, 'whether to local tangent space aggregation or not'),
        'kernel_size': (6, 'number of kernels'),
        'KP_extent': (0.75, 'influence radius of each kernel point'),
        'radius': (1, 'radius used for kernel point init'),
        'deformable': (False, 'deformable kernel'),
        #'linear_before': (64, 'dim of linear before gcn')#64
        'linear_before': (32, 'dim of linear before gcn')#64
    },
    'data_config': {
        'dataset': ('wisconsin', 'which dataset to use(cornell,wisconsin,squirrel,cora)'),
        'batch_size': (32, 'batch size for gc'),
        'val_prop': (0.05, 'proportion of validation edges for link prediction'),
        'test_prop': (0.1, 'proportion of test edges for link prediction'),
        'use_feats': (1, 'whether to use node features or not'),
        'normalize_feats': (1, 'whether to normalize input node features'),
        'normalize_adj': (1, 'whether to row-normalize the adjacency matrix'),
        'split_seed': (8, 'seed for data splits (train/test/val)'),
        'split_graph': (False, 'whether to split the graph')
    }
}

# 将所有参数转换为 SimpleNamespace
args = SimpleNamespace(
    **{k: v[0] for config in config_args.values() for k, v in config.items()}
)

#choose which manifold class to follow 
if args.use_geoopt == False:
    ManifoldParameter = base_ManifoldParameter
else:
    ManifoldParameter = geoopt_ManifoldParameter
np.random.seed(args.seed)#args.seed
torch.manual_seed(args.seed)#args.seed
if int(args.cuda):#args.double_precision
    torch.set_default_dtype(torch.float64)
if int(args.cuda) >= 0:#args.cuda
    torch.cuda.manual_seed(args.seed)#args.seed
args.device = 'cuda:' + str(args.cuda) if int(args.cuda) >= 0 else 'cpu' #args.device actually,<-args.cuda
args.patience = args.epochs if not args.patience else args.patience #args.patience<-args.epochs|args.patience

print(f'Using: {args.device}')
print("Using seed {}.".format(args.seed))
print(f"Dataset: {args.dataset}")

# Load data
data = load_data(args, os.path.join('data', args.dataset))
if args.task == 'gc':
    args.n_nodes, args.feat_dim = data['features'][0].shape
else:
    args.n_nodes, args.feat_dim = data['features'].shape
if args.task == 'nc':
    Model = NCModel
    args.n_classes = int(data['labels'].max() + 1)
    args.data = data
    print(f'Num classes: {args.n_classes}')
elif args.task == 'gc':
    Model = GCModel
    args.n_classes = int(data['labels'].max() + 1)
    print(f'Num classes: {args.n_classes}')
else:
    args.nb_false_edges = len(data['train_edges_false'])
    args.nb_edges = len(data['train_edges'])
    if args.task == 'lp':
        Model = LPModel
        args.n_classes = 2

if not args.lr_reduce_freq:
    args.lr_reduce_freq = args.epochs


###A simple check on data
print(data.keys())
print(data['adj_train'].todense().shape)
print(data['features'].shape)
###A simple check on data

# Model and optimizer
model = Model(args)
print(str(model))
no_decay = ['bias', 'scale']
optimizer_grouped_parameters = [{
    'params': [
        p for n, p in model.named_parameters()
        if p.requires_grad and not any(
            nd in n
            for nd in no_decay) and not isinstance(p, ManifoldParameter)
    ],
    'weight_decay':
    args.weight_decay
}, {
    'params': [
        p for n, p in model.named_parameters() if p.requires_grad and any(
            nd in n
            for nd in no_decay) or isinstance(p, ManifoldParameter)
    ],
    'weight_decay':
    0.0
}]
if args.optimizer == 'radam':
    optimizer = RiemannianAdam(params=optimizer_grouped_parameters,
                                lr=args.lr,
                                stabilize=10)
elif args.optimizer == 'rsgd':
    optimizer = RiemannianSGD(params=optimizer_grouped_parameters,
                                lr=args.lr,
                                stabilize=10)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=int(
                                                args.lr_reduce_freq),
                                                gamma=float(args.gamma))
tot_params = sum([np.prod(p.size()) for p in model.parameters()])
model = model.to(args.device)
for x, val in data.items():
    if torch.is_tensor(data[x]):
        data[x] = data[x].to(args.device)
print(f"Total number of parameters: {tot_params}")

# Train model for nc:
t_total = time.time()
counter = 0
best_val_metrics = model.init_metric_dict()
best_test_metrics = None
best_emb = None
if args.n_classes > 2:
    f1_average = 'micro'
else:
    f1_average = 'binary'

if args.model == 'HKPNet':
    nei, nei_mask = get_nei(data['adj_train'])
    nei = nei.to(args.device)
    nei_mask = nei_mask.to(args.device)
elif args.model == 'BKNet':
    nei, nei_mask = get_nei(data['adj_train'])
    nei = nei.to(args.device)
    nei_mask = nei_mask.to(args.device) #nei/nei_mask on cuda now


Using: cuda:0
Using seed 8.
Dataset: wisconsin
Num classes: 5
dict_keys(['adj_train', 'features', 'labels', 'idx_train', 'idx_val', 'idx_test', 'adj_train_norm'])
(251, 251)
torch.Size([251, 1703])
False
NCModel(
  (encoder): BKNet(
    (linear_before): BLinear(
      in_features=1703, out_features=32, c=tensor([1.], device='cuda:0'), use_bias=1, act=<function relu at 0x7fdfb8ae3370>, dropout_rate=0.5
      (E_linear): Linear(in_features=1703, out_features=32, bias=False)
      (dropout): Dropout(p=0.5, inplace=False)
    )
    (layers): Sequential(
      (0): KPGraphConvolution(
        (net): KernelPointAggregation(
          (linears): ModuleList(
            (0): BLinear(
              in_features=32, out_features=16, c=tensor([1.], device='cuda:0'), use_bias=1, act=<function relu at 0x7fdfb8ae3370>, dropout_rate=0.5
              (E_linear): Linear(in_features=32, out_features=16, bias=False)
              (dropout): Dropout(p=0.5, inplace=False)
            )
            (1): BLi

In [2]:
if config_args['model_config']['model'][0] == 'BMLP':
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name, param.data)
    print(model.encode(data['features'], data['adj_train_norm']))#[2]
    for name, param in model.named_parameters():
        print(name, param.requires_grad)  # 确保所有参数的 requires_grad 都是 True
elif config_args['model_config']['model'][0] == 'BKNet':
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name, param.data)
    print(model.encode(data['features'], (nei, nei_mask)))#[2]
    for name, param in model.named_parameters():
        print(name, param.requires_grad)  # 确保所有参数的 requires_grad 都是 True

encoder.linear_before.bias tensor([-0.0175, -0.0021,  0.0009, -0.0206,  0.0214,  0.0212, -0.0153, -0.0087,
        -0.0109, -0.0013,  0.0230,  0.0185, -0.0172,  0.0237, -0.0191,  0.0045,
         0.0109, -0.0075, -0.0007,  0.0086, -0.0053,  0.0080,  0.0082,  0.0031,
        -0.0128,  0.0154, -0.0181,  0.0142,  0.0154, -0.0107,  0.0002, -0.0087],
       device='cuda:0', dtype=torch.float64)
encoder.linear_before.E_linear.weight tensor([[ 0.0467, -0.0755, -0.0434,  ...,  0.0245,  0.0135, -0.0054],
        [ 0.0182,  0.0564,  0.0509,  ...,  0.0562,  0.0532,  0.0816],
        [-0.0103, -0.0821,  0.0264,  ...,  0.0627,  0.0687,  0.0562],
        ...,
        [ 0.0287,  0.0041, -0.0265,  ..., -0.0307, -0.0643, -0.0196],
        [-0.0307, -0.0680, -0.0169,  ...,  0.0403, -0.0816, -0.0240],
        [-0.0406,  0.0603, -0.0753,  ..., -0.0194,  0.0107,  0.0656]],
       device='cuda:0', dtype=torch.float64)
encoder.layers.0.net.linears.0.bias tensor([-0.1205,  0.1094, -0.0615, -0.1218,  0.0937, -

# Test Klein Aggregation

# OLD

In [28]:
"""
import func_outof_class as foc


print(data['features'].shape)
print(torch.norm(data['features'], p=2, dim=1).max())
print(torch.norm(data['features'], p=2, dim=1).min())

x = data['features'] #(n,d)
x_nei = foc.gather(x, nei) #(n,nei_num,d)

x=x.to(torch.float64)
x_tan = model.manifold.proj_tan0(x, model.c)
x_hyp = model.manifold.expmap0(x_tan, c=model.c)
x = model.manifold.proj(x_hyp, c=model.c)

x=model.encoder.linear_before(x)

x_nei = foc.gather(x, nei) #(n,nei_num,d')
print(x.shape)#,x)
print(x_nei.shape)#,x_nei) #Many repeated samples so that n might be different from x_nei

print("If we PT neiborhood back to the origin, we shouldn remain kernel points near at the origin")
x, x_nei = foc.transport_x(model.manifold,x, x_nei,model.c)
#print("x: ",x)
#print("x_nei: ",x_nei[0].shape,x_nei[0])
kernel_tangents = foc.init_KP(model.manifold,KP_extent=0.66,K=6,in_channels=64,c=model.c)
kernels = foc.get_kernel_pos(model.manifold,kernel_tangents,x,model.c,KP_extent=0.66,transp=False)#(n, k, d)
print(torch.equal(kernels[0],kernels[-1]))
print("E_dis between kernel points and origin: ",torch.norm(kernels,dim=-1))

#Note we are currently using Poincare distance
x_nei_kernel_dis=foc.get_nei_kernel_dis(model.manifold,kernels, x_nei, model.c) #(n,k,nei_num)
nei_mask = nei_mask.repeat(1, 1, kernels.shape[1]).view(x_nei_kernel_dis.shape[0],kernels.shape[1],x_nei_kernel_dis.shape[2])
x_nei_kernel_dis = x_nei_kernel_dis * nei_mask  # (n, k, nei_num)
print(x_nei_kernel_dis.shape,'\n',x_nei_kernel_dis)
"""

'\nimport func_outof_class as foc\n\n\nprint(data[\'features\'].shape)\nprint(torch.norm(data[\'features\'], p=2, dim=1).max())\nprint(torch.norm(data[\'features\'], p=2, dim=1).min())\n\nx = data[\'features\'] #(n,d)\nx_nei = foc.gather(x, nei) #(n,nei_num,d)\n\nx=x.to(torch.float64)\nx_tan = model.manifold.proj_tan0(x, model.c)\nx_hyp = model.manifold.expmap0(x_tan, c=model.c)\nx = model.manifold.proj(x_hyp, c=model.c)\n\nx=model.encoder.linear_before(x)\n\nx_nei = foc.gather(x, nei) #(n,nei_num,d\')\nprint(x.shape)#,x)\nprint(x_nei.shape)#,x_nei) #Many repeated samples so that n might be different from x_nei\n\nprint("If we PT neiborhood back to the origin, we shouldn remain kernel points near at the origin")\nx, x_nei = foc.transport_x(model.manifold,x, x_nei,model.c)\n#print("x: ",x)\n#print("x_nei: ",x_nei[0].shape,x_nei[0])\nkernel_tangents = foc.init_KP(model.manifold,KP_extent=0.66,K=6,in_channels=64,c=model.c)\nkernels = foc.get_kernel_pos(model.manifold,kernel_tangents,x,mod