In [1]:
!pip install torch_geometric
!pip install topomodelx



In [None]:
import numpy as np
import torch
import pandas as pd
import pickle

import torch_geometric.datasets as geom_datasets
from sklearn.metrics import accuracy_score
from topomodelx.nn.hypergraph.hmpnn import HMPNN # modelo que voy a utilzzar

## **Código que utiliza el modelo**
### **Este código (copiado-pegado) está en los tutoriales (GitHub TopoModelX) de como utilizar estos modelos. Utilizan este mismo modelo con Cora representado con BoW y hyperedges pero a pares (pairwise). Va a servir como base para utiloizar mis embeddings...**

In [68]:
torch.manual_seed(0)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

dataset = geom_datasets.Planetoid(root="tmp/", name="cora")[0]

dataset["incidence_1"] = torch.sparse_coo_tensor(
    dataset["edge_index"], torch.ones(dataset["edge_index"].shape[1]), dtype=torch.long
)
dataset = dataset.to(device)

x_0s = dataset["x"]
print(x_0s)
y = dataset["y"]
print(set([val.item() for val in y]))
incidence_1 = dataset["incidence_1"]

print(incidence_1)

class Network(torch.nn.Module):
    """Network class that initializes the base model and readout layer.
    Base model parameters:
    ----------
    Reqired:
    in_channels : int
        Dimension of the input features.
    hidden_channels : int
        Dimension of the hidden features.
    Optitional:
    **kwargs : dict
        Additional arguments for the base model.
    Readout layer parameters:
    ----------
    out_channels : int
        Dimension of the output features.
    task_level : str
        Level of the task. Either "graph" or "node".
    """
    def __init__(
        self, in_channels, hidden_channels, out_channels, task_level="graph", **kwargs
    ):
        super().__init__()
        # Define the model
        self.base_model = HMPNN(
            in_channels=in_channels, hidden_channels=hidden_channels, **kwargs
        )
        # Readout
        self.linear = torch.nn.Linear(hidden_channels, out_channels)
        self.out_pool = task_level == "graph"
    def forward(self, x_0, x_1, incidence_1):
        # Base model
        x_0, x_1 = self.base_model(x_0, x_1, incidence_1)
        # Pool over all nodes in the hypergraph
        x = torch.max(x_0, dim=0)[0] if self.out_pool is True else x_0
        return self.linear(x)


# Base model hyperparameters
in_channels = x_0s.shape[1]
hidden_channels = 128
n_layers = 1
# Readout hyperparameters
out_channels = torch.unique(y).shape[0]
task_level = "graph" if out_channels == 1 else "node"
model = Network(
    in_channels=in_channels,
    hidden_channels=hidden_channels,
    out_channels=out_channels,
    n_layers=n_layers,
    task_level=task_level,
).to(device)


optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = torch.nn.CrossEntropyLoss()
train_mask = dataset["train_mask"]
val_mask = dataset["val_mask"]
test_mask = dataset["test_mask"]


torch.manual_seed(0)
test_interval = 5
num_epochs = 50
initial_x_1 = torch.zeros_like(x_0s)
for epoch in range(1, num_epochs + 1):
    model.train()
    optimizer.zero_grad()
    y_hat = model(x_0s, initial_x_1, incidence_1)
    loss = loss_fn(y_hat[train_mask], y[train_mask])
    loss.backward()
    optimizer.step()
    train_loss = loss.item()
    y_pred = y_hat.argmax(dim=-1)
    train_acc = accuracy_score(y[train_mask].cpu(), y_pred[train_mask].cpu())
    if epoch % test_interval == 0:
        model.eval()
        y_hat = model(x_0s, initial_x_1, incidence_1)
        val_loss = loss_fn(y_hat[val_mask], y[val_mask]).item()
        y_pred = y_hat.argmax(dim=-1)
        val_acc = accuracy_score(y[val_mask].cpu(), y_pred[val_mask].cpu())
        test_loss = loss_fn(y_hat[test_mask], y[test_mask]).item()
        y_pred = y_hat.argmax(dim=-1)
        test_acc = accuracy_score(y[test_mask].cpu(), y_pred[test_mask].cpu())
        print(
            f"Epoch: {epoch + 1} train loss: {train_loss:.4f} train acc: {train_acc:.2f} "
            f" val loss: {val_loss:.4f} val acc: {val_acc:.2f}"
            f" test loss: {test_acc:.4f} val acc: {test_acc:.2f}"
        )


cpu
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
{0, 1, 2, 3, 4, 5, 6}
tensor(indices=tensor([[ 633, 1862, 2582,  ...,  598, 1473, 2706],
                       [   0,    0,    0,  ..., 2707, 2707, 2707]]),
       values=tensor([1, 1, 1,  ..., 1, 1, 1]),
       size=(2708, 2708), nnz=10556, layout=torch.sparse_coo)
Epoch: 6 train loss: 1.2624 train acc: 0.82  val loss: 1.9808 val acc: 0.21 test loss: 0.1960 val acc: 0.20
Epoch: 11 train loss: 0.8396 train acc: 0.99  val loss: 1.7650 val acc: 0.35 test loss: 0.3730 val acc: 0.37
Epoch: 16 train loss: 0.5310 train acc: 1.00  val loss: 1.6741 val acc: 0.40 test loss: 0.4140 val acc: 0.41
Epoch: 21 train loss: 0.2944 train acc: 1.00  val loss: 1.6093 val acc: 0.41 test loss: 0.4290 val acc: 0.43
Epoch: 26 train loss: 0.1497 train acc:

## **Los ids de los papers que tengo que quitar en cada Hipergrafo**

In [None]:
## Quitar del hyperedge CORA CLUSTER:
"""
-1 -> [1131300, 1105810]
5 -> [1481, 41714, 44455, 94229]
8 -> [123456, 6786, 17242, 28350, 45052, 74937, 86258, 218666, 248119, 423463, 645571, 1105698]
19 -> [123456, 789012, 137790, 250566, 1102442]
23 -> [6209, 6214, 8581, 28485, 69198, 95594, 189571, 1105433, 1106789]
24 -> [123456, 789012, 23545, 51052, 519353, 1103969, 1120138, 1125909, 1129106, 1134348, 1152858]
28 -> [123456, 789012]
33 -> [123456, 789012, 167670, 272720]
37 -> [1113551, 1122642, 1128208]
39 -> [1102625, 1119295]
50 -> [4553, 63931, 1128990]
51 -> [643069, 1129608, 1138970]
63 -> [12275, 17477, 22875, 30895, 1119180, 1129243]
67 -> [1105231, 1116181]
72 -> [3240, 6163, 6216, 6334, 10793, 20193, 672071, 1111052, 1131466]
73 -> [1107558]
78 -> [631052, 654326]
80 -> [123456, 789012, 1115701, 1152821]
81 -> [2653, 6125]
82 -> [189566, 198866, 200630, 1106236]
84 -> [123456, 789012, 646836, 1128437, 1130634]
95 -> [12350, 1113852, 1114512]
98 -> [123456, 789012]
99 -> [123456, 789012, 1113739, 1114125]
103 -> [643199, 1105603]
105 -> [67246, 1130586]
108 -> [14807, 174425]
112 -> [1105718, 110587]
118 -> [4335, 633721, 643221, 1131266, 1131314]
124 -> [13652, 399339, 568857]
126 -> [123456, 789012]
128 -> [27623, 28267, 49895, 62389, 66794, 80656, 90888, 1022969]
138 -> [1103979, 1152244, 1153724]
147 -> [1109581, 1114629]
149 -> [29708, 6923, 101660, 107251, 1119216]
"""


In [None]:
## Quitar del hyperedge PUBMED CLUSTER:
"""
-1 -> [3277013, 17720018]
0 -> [16921608, 15691219]
1 -> [17988433, 17519307]
2 -> [17476355, 14652300, 1349989]
3 -> [8269790, 7956637]
4 -> [1568757, 2351024]
10 -> [6480821, 8024653]
13 -> [7971976, 8772485]
17 -> [3540010, 7674911, 14598880]
22 -> [2200729, 2204503, 10411548, 2529158]
25 -> [9063410, 7702375, 2491424]
26 -> [3065002, 8281737]
28 -> [16242708, 17212763, 8278373, 10797469]
29 -> [8095192, 7575994]
31 -> [15946965, 8893974]
33 -> [8777718, 18981116]
35 -> [6236118, 143386]
37 -> [16342958, 1899406]
41 -> [8549009, 3338379]
44 -> [7556961, 8886558, 16530770, 17603822]
45 -> [19734534, 7702885]
48 -> [7758877, 8039603, 9096978]
"""

In [None]:
## Quitar del hyperedge CORA CITATION:
"""
35 -> [66563, 486840, 573964, 210872, 97645]
114 -> [1114125]
117 -> [123456, 789012]
910 -> [1104379, 1105344, 1118848, 1120858, 1122460, 1135137]
1365 -> [1105062]
1481 -> [10567]
230879 -> [696346]
2653 -> [102406, 107177]
2658 -> [1135899, 1132948]
2665 -> [123456, 789012, 345678, 987654, 135790]
2696 -> [123456, 789012]
35922 -> [1105116, 1140547]
3191 -> [123456, 789012, 105865, 129896, 129897, 137873, 162664, 308920, 310742, 423463, 561364]
3229 -> [1109392, 1154251]
3231 -> [328370, 636098, 1102761, 1115471]
19621 -> [1128846, 1135082]
20193 -> [112813, 1130653, 1152244]
4330 -> [1103737, 1109439]
4584 -> [976339, 1031453]
39127 -> [123456, 789012]
8224 -> [1132815, 1153148, 1110426, 1128531, 1111788]
24966 -> [27627, 1104449, 1105344, 1131149, 1123576]
8703 -> [10435, 27535, 51866, 51909, 1102751, 1119078]
10169 -> [636098]
12182 -> [12165, 12210, 321861, 429781, 1106418, 1117249]
31353 -> [10531, 31336, 31927, 43698, 194617, 686532, 1123576, 1129442, 1135746, 1152162]
15429 -> [10169, 1107572]

"""

In [None]:
## Quitar del hyperedge PUBMED CITATION:
"""
9742976 -> [18620046]
8366922 -> [18840781, 18364392, 16259490]
11832527 -> [12941712, 16936143]
19479186 -> [15662004, 9217892]
18776148 -> [9259273]
18664617 -> [16122464, 17982429]
17349009 -> [8958223, 18422727, 15161749, 19037920]
16215165 -> [8835919, 7955687]
18561508 -> [12788877, 15983242]
19364331 -> [17517853]
12560454 -> [10867717]
3309680 -> [16075062, 16109069]
19436665 -> [16978370, 16283239]
11333990 -> [17897465, 17877832]
9732337 -> [18697899, 19127292]
3057885 -> [3260201]
8775937 -> [127]
17463246 -> [18544707, 18366806]
18078023 -> [8922349, 16784180]
11303130 -> [123456, 789012]
18423879 -> [16801515, 10761967]
8637860 -> [15184501, 10190896]
1697648 -> [8200974, 8423231]
8232539 -> [8920894, 9053453]
19956106 -> [19956105]
18628530 -> [12843147]
3899825 -> [18442638]
9294791 -> [8404431, 8968014]
18292465 -> [15161785]
17293876 -> [17688680]
18782870 -> [11017071]
16847277 -> [11978678, 10389838]
7694152 -> [8775937, 10430939]
16371630 -> [18070658]
17463248 -> [18366646, 19578398]
17463249 -> [18714373, 18544707]
18729180 -> [12663577, 14657818, 15857233, 1551485, 12401759]
9096977 -> [1234567, 1789012, 9876543, 4567890]
9362527 -> [12975475]
18493227 -> [17437080]
18437223 -> [10218775]

"""

# **1.Cora Clusterwise**

In [22]:
cora_clusters = pd.read_csv("./Cora_clusters.csv")[['cluster_hypergraph']]

print("Número de columnas en clusters:", len(cora_clusters.columns))
print(cora_clusters.head())
print("Filas en clusters:", len(cora_clusters))

with open('cora_with_embeddings.pkl', 'rb') as f:
    datos = pickle.load(f)

datos = datos[['paper_id', 'topic2', 'cls_embedding']]

print("Filas en datos:", len(datos))
print(datos.head())

datos['cluster_hypergraph'] = cora_clusters['cluster_hypergraph']

print(datos.head())

Número de columnas en clusters: 1
   cluster_hypergraph
0                   1
1                  39
2                   9
3                 147
4                  -1
Filas en clusters: 2368
Filas en datos: 2368
   paper_id                  topic2  \
0        35      Genetic_Algorithms   
1        40      Genetic_Algorithms   
2       114  Reinforcement_Learning   
3       117  Reinforcement_Learning   
4       128  Reinforcement_Learning   

                                       cls_embedding  
0  [-0.093049884, 0.8792597, 0.7796652, -0.175753...  
1  [-0.10484472, 0.42675668, 0.74734676, -0.07206...  
2  [0.89663494, 0.08120963, 1.0009942, 0.02328234...  
3  [-0.4162119, -0.17980981, 0.45048836, 0.100121...  
4  [0.44965452, -0.20131657, 0.6979245, 1.5324184...  
   paper_id                  topic2  \
0        35      Genetic_Algorithms   
1        40      Genetic_Algorithms   
2       114  Reinforcement_Learning   
3       117  Reinforcement_Learning   
4       128  Reinforcement_Le

In [None]:
import torch
import pandas as pd
import numpy as np
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from topomodelx.nn.hypergraph.hmpnn import HMPNN

torch.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

df = datos.copy()

df = df.reset_index(drop=True)

# generar mis features X: embeddings CLS
X = torch.tensor(
    np.stack(df["cls_embedding"].values),
    dtype=torch.float32
).to(device)

print("X shape:", X.shape)  # (num_nodes, emb_dim)

le = LabelEncoder()
y_np = le.fit_transform(df["topic2"])
y = torch.tensor(y_np, dtype=torch.long).to(device)

print("Classes:", le.classes_)
print("y shape:", y.shape)


clusters = df["cluster_hypergraph"].unique()
cluster_to_idx = {c: i for i, c in enumerate(clusters)}

num_nodes = len(df)
num_hyperedges = len(clusters)

print("Num nodes:", num_nodes)
print("Num hyperedges:", num_hyperedges)
row_idx = []
col_idx = []

## Crear la matriz de incidencia
for node_idx, cluster in enumerate(df["cluster_hypergraph"]):
    hyperedge_idx = cluster_to_idx[cluster]
    row_idx.append(node_idx)
    col_idx.append(hyperedge_idx)

indices = torch.tensor([row_idx, col_idx], dtype=torch.long)
values = torch.ones(len(row_idx), dtype=torch.float32)

incidence_1 = torch.sparse_coo_tensor(
    indices,
    values,
    size=(num_nodes, num_hyperedges)
).to(device)

print("Incidence matrix:", incidence_1)


x_1 = torch.zeros(
    (num_hyperedges, X.shape[1]),
    dtype=torch.float32
).to(device)


class Network(torch.nn.Module):
    def __init__(
        self, in_channels, hidden_channels, out_channels, task_level="node", **kwargs
    ):
        super().__init__()
        self.base_model = HMPNN(
            in_channels=in_channels,
            hidden_channels=hidden_channels,
            **kwargs
        )
        self.linear = torch.nn.Linear(hidden_channels, out_channels)
        self.out_pool = task_level == "graph"

    def forward(self, x_0, x_1, incidence_1):
        x_0, x_1 = self.base_model(x_0, x_1, incidence_1)
        x = torch.max(x_0, dim=0)[0] if self.out_pool else x_0
        return self.linear(x)


in_channels = X.shape[1]
hidden_channels = 128
n_layers = 1
out_channels = len(le.classes_)
task_level = "node"

model = Network(
    in_channels=in_channels,
    hidden_channels=hidden_channels,
    out_channels=out_channels,
    n_layers=n_layers,
    task_level=task_level,
).to(device)


num_nodes = len(df)
perm = torch.randperm(num_nodes)

train_size = int(0.7 * num_nodes)
val_size = int(0.15 * num_nodes)

train_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)

train_mask[perm[:train_size]] = True
val_mask[perm[train_size:train_size + val_size]] = True
test_mask[perm[train_size + val_size:]] = True

cpu
X shape: torch.Size([2368, 768])
Classes: ['Case_Based' 'Genetic_Algorithms' 'Neural_Networks'
 'Probabilistic_Methods' 'Reinforcement_Learning' 'Rule_Learning' 'Theory']
y shape: torch.Size([2368])
Num nodes: 2368
Num hyperedges: 160
Incidence matrix: tensor(indices=tensor([[   0,    1,    2,  ..., 2365, 2366, 2367],
                       [   0,    1,    2,  ...,    4,  116,    4]]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]),
       size=(2368, 160), nnz=2368, layout=torch.sparse_coo)


In [48]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = torch.nn.CrossEntropyLoss()

num_epochs = 50
test_interval = 5

for epoch in range(1, num_epochs + 1):
    model.train()
    optimizer.zero_grad()

    y_hat = model(X, x_1, incidence_1)
    loss = loss_fn(y_hat[train_mask], y[train_mask])
    loss.backward()
    optimizer.step()

    train_acc = accuracy_score(
        y[train_mask].cpu(),
        y_hat.argmax(dim=-1)[train_mask].cpu()
    )

    if epoch % test_interval == 0:
        model.eval()
        with torch.no_grad():
            y_hat = model(X, x_1, incidence_1)

            val_acc = accuracy_score(
                y[val_mask].cpu(),
                y_hat.argmax(dim=-1)[val_mask].cpu()
            )
            test_acc = accuracy_score(
                y[test_mask].cpu(),
                y_hat.argmax(dim=-1)[test_mask].cpu()
            )

        print(
            f"Epoch {epoch:03d} | "
            f"Loss {loss.item():.4f} | "
            f"Train {train_acc:.2f} | "
            f"Val {val_acc:.2f} | "
            f"Test {test_acc:.2f}"
        )


Epoch 005 | Loss 1.2164 | Train 0.70 | Val 0.76 | Test 0.76
Epoch 010 | Loss 0.9775 | Train 0.76 | Val 0.81 | Test 0.81
Epoch 015 | Loss 0.7971 | Train 0.80 | Val 0.82 | Test 0.80
Epoch 020 | Loss 0.6721 | Train 0.81 | Val 0.82 | Test 0.81
Epoch 025 | Loss 0.5848 | Train 0.84 | Val 0.85 | Test 0.81
Epoch 030 | Loss 0.5231 | Train 0.85 | Val 0.84 | Test 0.82
Epoch 035 | Loss 0.4632 | Train 0.87 | Val 0.85 | Test 0.82
Epoch 040 | Loss 0.4255 | Train 0.88 | Val 0.85 | Test 0.83
Epoch 045 | Loss 0.3912 | Train 0.89 | Val 0.86 | Test 0.84
Epoch 050 | Loss 0.3477 | Train 0.90 | Val 0.85 | Test 0.84
Epoch 055 | Loss 0.3124 | Train 0.92 | Val 0.86 | Test 0.84
Epoch 060 | Loss 0.2839 | Train 0.93 | Val 0.85 | Test 0.83
Epoch 065 | Loss 0.2599 | Train 0.93 | Val 0.85 | Test 0.83
Epoch 070 | Loss 0.2296 | Train 0.95 | Val 0.85 | Test 0.82
Epoch 075 | Loss 0.2041 | Train 0.95 | Val 0.85 | Test 0.81
Epoch 080 | Loss 0.1824 | Train 0.96 | Val 0.86 | Test 0.81
Epoch 085 | Loss 0.1537 | Train 0.97 | V

# **2.Cora Clusterwise LLM**
### **Eliminando de los hyperedges los nodos que me ha dicho el LLM**

In [None]:
nodes_to_remove = {
    -1: [1131300, 1105810],
    5: [1481, 41714, 44455, 94229],
    8: [6786, 17242, 28350, 45052, 74937, 86258, 218666, 248119, 423463, 645571, 1105698],
    19: [137790, 250566, 1102442],
    23: [6209, 6214, 8581, 28485, 69198, 95594, 189571, 1105433, 1106789],
    24: [23545, 51052, 519353, 1103969, 1120138, 1125909, 1129106, 1134348, 1152858],
    33: [167670, 272720],
    37: [1113551, 1122642, 1128208],
    39: [1102625, 1119295],
    50: [4553, 63931, 1128990],
    51: [643069, 1129608, 1138970],
    63: [12275, 17477, 22875, 30895, 1119180, 1129243],
    67: [1105231, 1116181],
    72: [3240, 6163, 6216, 6334, 10793, 20193, 672071, 1111052, 1131466],
    73: [1107558],
    78: [631052, 654326],
    80: [1115701, 1152821],
    81: [2653, 6125],
    82: [189566, 198866, 200630, 1106236],
    84: [646836, 1128437, 1130634],
    95: [12350, 1113852, 1114512],
    99: [1113739, 1114125],
    103: [643199, 1105603],
    105: [67246, 1130586],
    108: [14807, 174425],
    112: [1105718],
    118: [4335, 633721, 643221, 1131266, 1131314],
    124: [13652, 399339, 568857],
    128: [27623, 28267, 49895, 62389, 66794, 80656, 90888, 1022969],
    138: [1103979, 1152244, 1153724],
    147: [1109581, 1114629],
    149: [29708, 6923, 101660, 107251, 1119216],
}

inconsistencies = []

for cluster_id, paper_ids in nodes_to_remove.items():
    for pid in paper_ids:
        row = df[df["paper_id"] == pid]

        if row.empty:
            inconsistencies.append(
                (cluster_id, pid, "paper_id not found in df")
            )
        else:
            real_cluster = row["cluster_hypergraph"].iloc[0]
            if real_cluster != cluster_id:
                inconsistencies.append(
                    (cluster_id, pid, f"belongs to cluster {real_cluster}")
                )

if len(inconsistencies) == 0:
    print("No inconsistencies found. All nodes belong to their specified clusters.")
else:
    print("Inconsistencies found:")
    for cluster_id, pid, reason in inconsistencies:
        print(f"Cluster {cluster_id} | paper_id {pid} | {reason}")

✅ No inconsistencies found. All nodes belong to their specified clusters.


In [None]:
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from topomodelx.nn.hypergraph.hmpnn import HMPNN

torch.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

df = datos.copy()
df = df.reset_index(drop=True)

# X: embeddings CLS
X = torch.tensor(
    np.stack(df["cls_embedding"].values),
    dtype=torch.float32
).to(device)
print("X shape:", X.shape)

# Y: topic2 codificado (es el label)
le = LabelEncoder()
y_np = le.fit_transform(df["topic2"])
y = torch.tensor(y_np, dtype=torch.long).to(device)
print("Classes:", le.classes_)
print("y shape:", y.shape)

# Diccionario de nodos a quitar de cada cluster
# Ejemplo: nodes_to_remove = {5: [1481, 41714, 44455, 94229], ...}
nodes_to_remove_sets = {k: set(v) for k, v in nodes_to_remove.items()}

# Clusters → índices de hyperedges
clusters = df["cluster_hypergraph"].unique()
cluster_to_idx = {c: i for i, c in enumerate(clusters)}

num_nodes = len(df)
num_hyperedges = len(clusters)
print("Num nodes:", num_nodes, "| Num hyperedges:", num_hyperedges)

# Construir incidence matrix COO
row_idx = []
col_idx = []

for node_idx, cluster in enumerate(df["cluster_hypergraph"]):
    paper_id = df["paper_id"].iloc[node_idx]
    # Si el nodo está en la lista de eliminación para este cluster, saltarlo
    if cluster in nodes_to_remove_sets and paper_id in nodes_to_remove_sets[cluster]:
        continue

    hyperedge_idx = cluster_to_idx[cluster]
    row_idx.append(node_idx)
    col_idx.append(hyperedge_idx)

indices = torch.tensor([row_idx, col_idx], dtype=torch.long)
values = torch.ones(len(row_idx), dtype=torch.float32)

incidence_1 = torch.sparse_coo_tensor(
    indices,
    values,
    size=(num_nodes, num_hyperedges)
).to(device)

print("Incidence matrix shape:", incidence_1.shape)


x_1 = torch.zeros(
    (num_hyperedges, X.shape[1]),
    dtype=torch.float32
).to(device)

# Modelo
class Network(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, task_level="node", **kwargs):
        super().__init__()
        self.base_model = HMPNN(
            in_channels=in_channels,
            hidden_channels=hidden_channels,
            **kwargs
        )
        self.linear = torch.nn.Linear(hidden_channels, out_channels)
        self.out_pool = task_level == "graph"

    def forward(self, x_0, x_1, incidence_1):
        x_0, x_1 = self.base_model(x_0, x_1, incidence_1)
        x = torch.max(x_0, dim=0)[0] if self.out_pool else x_0
        return self.linear(x)

# Inicializar modelo
in_channels = X.shape[1]
hidden_channels = 128
n_layers = 1
out_channels = len(le.classes_)
task_level = "node"

model = Network(
    in_channels=in_channels,
    hidden_channels=hidden_channels,
    out_channels=out_channels,
    n_layers=n_layers,
    task_level=task_level,
).to(device)

# Máscaras train/val/test
perm = torch.randperm(num_nodes)
train_size = int(0.7 * num_nodes)
val_size = int(0.15 * num_nodes)

train_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)

train_mask[perm[:train_size]] = True
val_mask[perm[train_size:train_size + val_size]] = True
test_mask[perm[train_size + val_size:]] = True

# Entrenar
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = torch.nn.CrossEntropyLoss()
num_epochs = 50
test_interval = 5

for epoch in range(1, num_epochs + 1):
    model.train()
    optimizer.zero_grad()
    y_hat = model(X, x_1, incidence_1)
    loss = loss_fn(y_hat[train_mask], y[train_mask])
    loss.backward()
    optimizer.step()

    train_acc = accuracy_score(y[train_mask].cpu(), y_hat.argmax(dim=-1)[train_mask].cpu())

    if epoch % test_interval == 0:
        model.eval()
        with torch.no_grad():
            y_hat_eval = model(X, x_1, incidence_1)
            val_acc = accuracy_score(y[val_mask].cpu(), y_hat_eval.argmax(dim=-1)[val_mask].cpu())
            test_acc = accuracy_score(y[test_mask].cpu(), y_hat_eval.argmax(dim=-1)[test_mask].cpu())

        print(
            f"Epoch {epoch:03d} | "
            f"Loss {loss.item():.4f} | "
            f"Train {train_acc:.2f} | "
            f"Val {val_acc:.2f} | "
            f"Test {test_acc:.2f}"
        )


cpu
X shape: torch.Size([2368, 768])
Classes: ['Case_Based' 'Genetic_Algorithms' 'Neural_Networks'
 'Probabilistic_Methods' 'Reinforcement_Learning' 'Rule_Learning' 'Theory']
y shape: torch.Size([2368])
Num nodes: 2368 | Num hyperedges: 160
Incidence matrix shape: torch.Size([2368, 160])
Epoch 005 | Loss 1.2152 | Train 0.70 | Val 0.76 | Test 0.76
Epoch 010 | Loss 0.9742 | Train 0.76 | Val 0.80 | Test 0.81
Epoch 015 | Loss 0.7880 | Train 0.80 | Val 0.82 | Test 0.80
Epoch 020 | Loss 0.6692 | Train 0.82 | Val 0.83 | Test 0.81
Epoch 025 | Loss 0.5807 | Train 0.83 | Val 0.85 | Test 0.81
Epoch 030 | Loss 0.5179 | Train 0.85 | Val 0.84 | Test 0.83
Epoch 035 | Loss 0.4695 | Train 0.86 | Val 0.85 | Test 0.82
Epoch 040 | Loss 0.4234 | Train 0.88 | Val 0.85 | Test 0.84
Epoch 045 | Loss 0.3924 | Train 0.88 | Val 0.85 | Test 0.84
Epoch 050 | Loss 0.3509 | Train 0.91 | Val 0.85 | Test 0.84


# **3.PubMed clusterswise**

In [None]:
pubmed_clusters = pd.read_csv("./PubMded_clusterwise.csv")[['PMID', 'label', 'cluster_hypergraph']]

print("Número de columnas:", len(cora_clusters.columns))
print(pubmed_clusters.columns)
print(pubmed_clusters.head())
print("Filas en clusters:", len(pubmed_clusters))

loaded_embeddings = np.load("PubMed_embeddings.npz")["embeddings"]
print("Embeddings shape:", loaded_embeddings.shape)


if len(pubmed_clusters) == loaded_embeddings.shape[0]:
    pubmed_clusters['embeddings'] = list(loaded_embeddings) 
    pubmed_clusters['embeddings'] = [emb for emb in loaded_embeddings]


print(pubmed_clusters.head(2))
print(f"Column: {pubmed_clusters.columns}")

Número de columnas: 9
Index(['PMID', 'label', 'cluster_hypergraph'], dtype='object')
       PMID  label  cluster_hypergraph
0  12187484      1                  31
1   2344352      1                  31
2  14654069      1                  31
3  16443886      2                  44
4   2684155      1                  31
Filas en clusters: 19716
Embeddings shape: (19716, 768)
       PMID  label  cluster_hypergraph  \
0  12187484      1                  31   
1   2344352      1                  31   

                                          embeddings  
0  [-0.2475476, 0.6143033, 1.0025382, -0.2145277,...  
1  [-0.045406226, 0.7384806, 0.7458422, -0.247402...  
Column: Index(['PMID', 'label', 'cluster_hypergraph', 'embeddings'], dtype='object')


In [87]:
import torch
import pandas as pd
import numpy as np
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from topomodelx.nn.hypergraph.hmpnn import HMPNN

torch.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

df = pubmed_clusters.copy()

df = df.reset_index(drop=True)

# X: embeddings CLS
X = torch.tensor(
    np.stack(df["embeddings"].values),
    dtype=torch.float32
).to(device)

print("X shape:", X.shape)  # (num_nodes, emb_dim)

le = LabelEncoder()
y_np = le.fit_transform(df["label"])
y = torch.tensor(y_np, dtype=torch.long).to(device)

print("Classes:", le.classes_)
print("y shape:", y.shape)


clusters = df["cluster_hypergraph"].unique()
cluster_to_idx = {c: i for i, c in enumerate(clusters)}

num_nodes = len(df)
num_hyperedges = len(clusters)

print("Num nodes:", num_nodes)
print("Num hyperedges:", num_hyperedges)
row_idx = []
col_idx = []

for node_idx, cluster in enumerate(df["cluster_hypergraph"]):
    hyperedge_idx = cluster_to_idx[cluster]
    row_idx.append(node_idx)
    col_idx.append(hyperedge_idx)

indices = torch.tensor([row_idx, col_idx], dtype=torch.long)
values = torch.ones(len(row_idx), dtype=torch.float32)

incidence_1 = torch.sparse_coo_tensor(
    indices,
    values,
    size=(num_nodes, num_hyperedges)
).to(device)

print("Incidence matrix:", incidence_1)


x_1 = torch.zeros(
    (num_hyperedges, X.shape[1]),
    dtype=torch.float32
).to(device)


class Network(torch.nn.Module):
    def __init__(
        self, in_channels, hidden_channels, out_channels, task_level="node", **kwargs
    ):
        super().__init__()
        self.base_model = HMPNN(
            in_channels=in_channels,
            hidden_channels=hidden_channels,
            **kwargs
        )
        self.linear = torch.nn.Linear(hidden_channels, out_channels)
        self.out_pool = task_level == "graph"

    def forward(self, x_0, x_1, incidence_1):
        x_0, x_1 = self.base_model(x_0, x_1, incidence_1)
        x = torch.max(x_0, dim=0)[0] if self.out_pool else x_0
        return self.linear(x)


in_channels = X.shape[1]
hidden_channels = 128
n_layers = 1
out_channels = len(le.classes_)
task_level = "node"

model = Network(
    in_channels=in_channels,
    hidden_channels=hidden_channels,
    out_channels=out_channels,
    n_layers=n_layers,
    task_level=task_level,
).to(device)


num_nodes = len(df)
perm = torch.randperm(num_nodes)

train_size = int(0.7 * num_nodes)
val_size = int(0.15 * num_nodes)

train_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)

train_mask[perm[:train_size]] = True
val_mask[perm[train_size:train_size + val_size]] = True
test_mask[perm[train_size + val_size:]] = True


optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = torch.nn.CrossEntropyLoss()

num_epochs = 50
test_interval = 5

for epoch in range(1, num_epochs + 1):
    model.train()
    optimizer.zero_grad()

    y_hat = model(X, x_1, incidence_1)
    loss = loss_fn(y_hat[train_mask], y[train_mask])
    loss.backward()
    optimizer.step()

    train_acc = accuracy_score(
        y[train_mask].cpu(),
        y_hat.argmax(dim=-1)[train_mask].cpu()
    )

    if epoch % test_interval == 0:
        model.eval()
        with torch.no_grad():
            y_hat = model(X, x_1, incidence_1)

            val_acc = accuracy_score(
                y[val_mask].cpu(),
                y_hat.argmax(dim=-1)[val_mask].cpu()
            )
            test_acc = accuracy_score(
                y[test_mask].cpu(),
                y_hat.argmax(dim=-1)[test_mask].cpu()
            )

        print(
            f"Epoch {epoch:03d} | "
            f"Loss {loss.item():.4f} | "
            f"Train {train_acc:.2f} | "
            f"Val {val_acc:.2f} | "
            f"Test {test_acc:.2f}"
        )



cpu
X shape: torch.Size([19716, 768])
Classes: [1 2 3]
y shape: torch.Size([19716])
Num nodes: 19716
Num hyperedges: 50
Incidence matrix: tensor(indices=tensor([[    0,     1,     2,  ..., 19713, 19714, 19715],
                       [    0,     0,     0,  ...,     0,     2,    21]]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]),
       size=(19716, 50), nnz=19716, layout=torch.sparse_coo)
Epoch 005 | Loss 0.5742 | Train 0.88 | Val 0.87 | Test 0.88
Epoch 010 | Loss 0.4203 | Train 0.89 | Val 0.89 | Test 0.89
Epoch 015 | Loss 0.3445 | Train 0.90 | Val 0.89 | Test 0.90
Epoch 020 | Loss 0.3067 | Train 0.90 | Val 0.90 | Test 0.90
Epoch 025 | Loss 0.2822 | Train 0.90 | Val 0.90 | Test 0.90
Epoch 030 | Loss 0.2746 | Train 0.90 | Val 0.90 | Test 0.90
Epoch 035 | Loss 0.2681 | Train 0.91 | Val 0.90 | Test 0.90
Epoch 040 | Loss 0.2634 | Train 0.91 | Val 0.90 | Test 0.91
Epoch 045 | Loss 0.2619 | Train 0.91 | Val 0.90 | Test 0.91
Epoch 050 | Loss 0.2591 | Train 0.91 | Val 0.90 | Test 0.91

# **Pubmed TF-IDF (para comparar)**


In [None]:
import pandas as pd
import ast
import numpy as np

# Limpiar los datos
print("Cargando datos...")
df = pd.read_csv("./PubMded_clusterwise.csv")[['PMID', 'label', 'TFIDF']]

print(f"Filas cargadas: {len(df)}")
print("Columnas:", df.columns.tolist())

def safe_parse_tfidf(text):
    if pd.isna(text) or not text:
        return {}
    try:
        return ast.literal_eval(text)
    except (ValueError, SyntaxError):
        print("Problema al parsear fila:", text[:100], "...")
        return {}

print("Parseando TF-IDF strings a diccionarios...")
df['tfidf_dict'] = df['TFIDF'].apply(safe_parse_tfidf)

# Construir el vocabulario completo (todas las palabras únicas) para hacer bien TF-IDF
print("Recolectando todo el vocabulario...")
all_terms = set()

for tfidf_dict in df['tfidf_dict']:
    all_terms.update(tfidf_dict.keys())

vocab = sorted(all_terms)
vocab_size = len(vocab)

print(f"Vocabulario total encontrado: {vocab_size} términos únicos")
print("Primeros 10 términos (ejemplo):", vocab[:10])
print("Últimos 10 términos (ejemplo):", vocab[-10:])

# Crear matriz de TF-IDF (todos los vectores mismo tamaño y orden)
print("Creando matriz densa TF-IDF...")
tfidf_matrix = np.zeros((len(df), vocab_size), dtype=np.float32)

for i, tfidf_dict in enumerate(df['tfidf_dict']):
    for term, value in tfidf_dict.items():
        j = vocab.index(term)
        tfidf_matrix[i, j] = value


X_tfidf = torch.from_numpy(tfidf_matrix)  # o .to(device) si tienes GPU
print("Tensor listo para modelo:", X_tfidf.shape)
print("Tipo de datos:", X_tfidf.dtype)

df['tfidf_vector'] = [row for row in tfidf_matrix]

In [103]:
df = df[['PMID', 'label', 'tfidf_vector']]
print(df.head())

pubmed_pairwise_links = pd.read_csv("./Pairwise_cite_PubMed.csv")
pubmed_pairwise_links.head()
print(f"Number of links: {len(pubmed_pairwise_links)}")

print(pubmed_pairwise_links.head())

       PMID  label                                       tfidf_vector
0  12187484      1  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1   2344352      1  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2  14654069      1  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3  16443886      2  [0.00832153, 0.0, 0.0, 0.0, 0.0, 0.0, 0.019853...
4   2684155      1  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Number of links: 44337
     source    target
0  19127292  17363749
1  19668377  17293876
2   1313726   3002783
3  19110882  14578298
4  18606979  10333910


In [None]:
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from topomodelx.nn.hypergraph.hmpnn import HMPNN

torch.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

df = df.copy()
df = df.reset_index(drop=True)

# X: embeddings TF-IDF
X = torch.tensor(
    np.stack(df["tfidf_vector"].values),
    dtype=torch.float32
).to(device)
print("X shape:", X.shape)

# Y: label 
le = LabelEncoder()
y_np = le.fit_transform(df["label"])
y = torch.tensor(y_np, dtype=torch.long).to(device)
print("Classes:", le.classes_)
print("y shape:", y.shape)

# Crear un mapping de PMID → índice en X/y
pmid_to_idx = {pmid: idx for idx, pmid in enumerate(df["PMID"].values)}

# Construir incidence matrix a partir de pairwise links
links = pubmed_pairwise_links.copy()
row_idx = []
col_idx = []

for i, row in links.iterrows():
    src = row['source']
    tgt = row['target']

    if src not in pmid_to_idx or tgt not in pmid_to_idx:
        continue

    hyperedge_idx = i  # cada par tiene su propio hyperedge

    row_idx.append(pmid_to_idx[src])
    col_idx.append(hyperedge_idx)
    row_idx.append(pmid_to_idx[tgt])
    col_idx.append(hyperedge_idx)

num_nodes = len(df)
num_hyperedges = len(links)

indices = torch.tensor([row_idx, col_idx], dtype=torch.long)
values = torch.ones(len(row_idx), dtype=torch.float32)

incidence_1 = torch.sparse_coo_tensor(
    indices,
    values,
    size=(num_nodes, num_hyperedges)
).to(device)

print("Incidence matrix shape:", incidence_1.shape)

x_1 = torch.zeros(
    (num_hyperedges, X.shape[1]),
    dtype=torch.float32
).to(device)

# Modelo HMPNN
class Network(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, task_level="node", **kwargs):
        super().__init__()
        self.base_model = HMPNN(
            in_channels=in_channels,
            hidden_channels=hidden_channels,
            **kwargs
        )
        self.linear = torch.nn.Linear(hidden_channels, out_channels)
        self.out_pool = task_level == "graph"

    def forward(self, x_0, x_1, incidence_1):
        x_0, x_1 = self.base_model(x_0, x_1, incidence_1)
        x = torch.max(x_0, dim=0)[0] if self.out_pool else x_0
        return self.linear(x)

in_channels = X.shape[1]
hidden_channels = 128
n_layers = 1
out_channels = len(le.classes_)
task_level = "node"

model = Network(
    in_channels=in_channels,
    hidden_channels=hidden_channels,
    out_channels=out_channels,
    n_layers=n_layers,
    task_level=task_level,
).to(device)

# Máscaras
perm = torch.randperm(num_nodes)
train_size = int(0.7 * num_nodes)
val_size = int(0.15 * num_nodes)

train_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)

train_mask[perm[:train_size]] = True
val_mask[perm[train_size:train_size + val_size]] = True
test_mask[perm[train_size + val_size:]] = True


optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = torch.nn.CrossEntropyLoss()
num_epochs = 100
test_interval = 5

for epoch in range(1, num_epochs + 1):
    model.train()
    optimizer.zero_grad()

    y_hat = model(X, x_1, incidence_1)
    loss = loss_fn(y_hat[train_mask], y[train_mask])
    loss.backward()
    optimizer.step()

    train_acc = accuracy_score(y[train_mask].cpu(), y_hat.argmax(dim=-1)[train_mask].cpu())

    if epoch % test_interval == 0:
        model.eval()
        with torch.no_grad():
            y_hat_eval = model(X, x_1, incidence_1)
            val_acc = accuracy_score(y[val_mask].cpu(), y_hat_eval.argmax(dim=-1)[val_mask].cpu())
            test_acc = accuracy_score(y[test_mask].cpu(), y_hat_eval.argmax(dim=-1)[test_mask].cpu())

        print(
            f"Epoch {epoch:03d} | "
            f"Loss {loss.item():.4f} | "
            f"Train {train_acc:.2f} | "
            f"Val {val_acc:.2f} | "
            f"Test {test_acc:.2f}"
        )


cpu
X shape: torch.Size([19716, 500])
Classes: [1 2 3]
y shape: torch.Size([19716])
Incidence matrix shape: torch.Size([19716, 44337])
Epoch 005 | Loss 0.9234 | Train 0.53 | Val 0.40 | Test 0.41
Epoch 010 | Loss 0.6992 | Train 0.68 | Val 0.39 | Test 0.39
Epoch 015 | Loss 0.5430 | Train 0.83 | Val 0.40 | Test 0.41
Epoch 020 | Loss 0.4615 | Train 0.85 | Val 0.68 | Test 0.68
Epoch 025 | Loss 0.4116 | Train 0.86 | Val 0.70 | Test 0.70
Epoch 030 | Loss 0.3674 | Train 0.88 | Val 0.64 | Test 0.64
Epoch 035 | Loss 0.3353 | Train 0.89 | Val 0.74 | Test 0.74
Epoch 040 | Loss 0.3137 | Train 0.89 | Val 0.77 | Test 0.77
Epoch 045 | Loss 0.2935 | Train 0.90 | Val 0.75 | Test 0.75
Epoch 050 | Loss 0.2763 | Train 0.90 | Val 0.80 | Test 0.80
Epoch 055 | Loss 0.2625 | Train 0.91 | Val 0.84 | Test 0.84
Epoch 060 | Loss 0.2510 | Train 0.91 | Val 0.87 | Test 0.87
Epoch 065 | Loss 0.2378 | Train 0.92 | Val 0.87 | Test 0.88
Epoch 070 | Loss 0.2260 | Train 0.92 | Val 0.88 | Test 0.88
Epoch 075 | Loss 0.2151 |

# **4.PubMed clusterwise LLM**

In [None]:
pubmed_clusters = pd.read_csv("./PubMded_clusterwise.csv")[['PMID', 'label', 'cluster_hypergraph']]

print("Número de columnas:", len(cora_clusters.columns))
print(pubmed_clusters.columns)
print(pubmed_clusters.head())
print("Filas en clusters:", len(pubmed_clusters))

loaded_embeddings = np.load("PubMed_embeddings.npz")["embeddings"]
print("Embeddings shape:", loaded_embeddings.shape)


if len(pubmed_clusters) == loaded_embeddings.shape[0]:
    pubmed_clusters['embeddings'] = list(loaded_embeddings)  
    pubmed_clusters['embeddings'] = [emb for emb in loaded_embeddings]


print(pubmed_clusters.head(2))
print(f"Column: {pubmed_clusters.columns}")

df = pubmed_clusters.copy()

Número de columnas: 9
Index(['PMID', 'label', 'cluster_hypergraph'], dtype='object')
       PMID  label  cluster_hypergraph
0  12187484      1                  31
1   2344352      1                  31
2  14654069      1                  31
3  16443886      2                  44
4   2684155      1                  31
Filas en clusters: 19716
Embeddings shape: (19716, 768)
       PMID  label  cluster_hypergraph  \
0  12187484      1                  31   
1   2344352      1                  31   

                                          embeddings  
0  [-0.2475476, 0.6143033, 1.0025382, -0.2145277,...  
1  [-0.045406226, 0.7384806, 0.7458422, -0.247402...  
Column: Index(['PMID', 'label', 'cluster_hypergraph', 'embeddings'], dtype='object')


In [None]:
nodes_to_remove = {
    -1:  [3277013, 17720018],
    0:   [16921608, 15691219],
    1:   [17988433, 17519307],
    2:   [17476355, 14652300, 1349989],
    3:   [8269790, 7956637],
    4:   [1568757, 2351024],
    10:  [6480821, 8024653],
    13:  [7971976, 8772485],
    17:  [3540010, 7674911, 14598880],
    22:  [2200729, 2204503, 10411548, 2529158],
    25:  [9063410, 7702375, 2491424],
    26:  [3065002, 8281737],
    28:  [16242708, 17212763, 8278373, 10797469],
    29:  [8095192, 7575994],
    31:  [15946965, 8893974],
    33:  [8777718, 18981116],
    35:  [6236118, 143386],
    37:  [16342958, 1899406],
    41:  [8549009, 3338379],
    44:  [7556961, 8886558, 16530770, 17603822],
    45:  [19734534, 7702885],
    48:  [7758877, 8039603, 9096978]
}



inconsistencies = []

for cluster_id, paper_ids in nodes_to_remove.items():
    for pid in paper_ids:
        row = df[df["PMID"] == pid]

        if row.empty:
            inconsistencies.append(
                (cluster_id, pid, "paper_id not found in df")
            )
        else:
            real_cluster = row["cluster_hypergraph"].iloc[0]
            if real_cluster != cluster_id:
                inconsistencies.append(
                    (cluster_id, pid, f"belongs to cluster {real_cluster}")
                )

if len(inconsistencies) == 0:
    print("No inconsistencies found. All nodes belong to their specified clusters.")
else:
    print("Inconsistencies found:")
    for cluster_id, pid, reason in inconsistencies:
        print(f"Cluster {cluster_id} | PMID {pid} | {reason}")

✅ No inconsistencies found. All nodes belong to their specified clusters.


In [None]:
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from topomodelx.nn.hypergraph.hmpnn import HMPNN

torch.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

df = df.reset_index(drop=True)

# X: embeddings CLS
X = torch.tensor(
    np.stack(df["embeddings"].values),
    dtype=torch.float32
).to(device)
print("X shape:", X.shape)

# Y: label codificado
le = LabelEncoder()
y_np = le.fit_transform(df["label"])
y = torch.tensor(y_np, dtype=torch.long).to(device)
print("Classes:", le.classes_)
print("y shape:", y.shape)

nodes_to_remove_sets = {k: set(v) for k, v in nodes_to_remove.items()}

clusters = df["cluster_hypergraph"].unique()
cluster_to_idx = {c: i for i, c in enumerate(clusters)}

num_nodes = len(df)
num_hyperedges = len(clusters)
print("Num nodes:", num_nodes, "| Num hyperedges:", num_hyperedges)

# Incidence matrix
row_idx = []
col_idx = []

for node_idx, cluster in enumerate(df["cluster_hypergraph"]):
    paper_id = df["PMID"].iloc[node_idx]
    if cluster in nodes_to_remove_sets and paper_id in nodes_to_remove_sets[cluster]:
        continue

    hyperedge_idx = cluster_to_idx[cluster]
    row_idx.append(node_idx)
    col_idx.append(hyperedge_idx)

indices = torch.tensor([row_idx, col_idx], dtype=torch.long)
values = torch.ones(len(row_idx), dtype=torch.float32)

incidence_1 = torch.sparse_coo_tensor(
    indices,
    values,
    size=(num_nodes, num_hyperedges)
).to(device)

print("Incidence matrix shape:", incidence_1.shape)

x_1 = torch.zeros(
    (num_hyperedges, X.shape[1]),
    dtype=torch.float32
).to(device)

# Lo mismo de siempre...
class Network(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, task_level="node", **kwargs):
        super().__init__()
        self.base_model = HMPNN(
            in_channels=in_channels,
            hidden_channels=hidden_channels,
            **kwargs
        )
        self.linear = torch.nn.Linear(hidden_channels, out_channels)
        self.out_pool = task_level == "graph"

    def forward(self, x_0, x_1, incidence_1):
        x_0, x_1 = self.base_model(x_0, x_1, incidence_1)
        x = torch.max(x_0, dim=0)[0] if self.out_pool else x_0
        return self.linear(x)

in_channels = X.shape[1]
hidden_channels = 128
n_layers = 1
out_channels = len(le.classes_)
task_level = "node"

model = Network(
    in_channels=in_channels,
    hidden_channels=hidden_channels,
    out_channels=out_channels,
    n_layers=n_layers,
    task_level=task_level,
).to(device)

perm = torch.randperm(num_nodes)
train_size = int(0.6 * num_nodes)
val_size = int(0.2 * num_nodes)

train_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)

train_mask[perm[:train_size]] = True
val_mask[perm[train_size:train_size + val_size]] = True
test_mask[perm[train_size + val_size:]] = True

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = torch.nn.CrossEntropyLoss()
num_epochs = 50
test_interval = 5

for epoch in range(1, num_epochs + 1):
    model.train()
    optimizer.zero_grad()
    y_hat = model(X, x_1, incidence_1)
    loss = loss_fn(y_hat[train_mask], y[train_mask])
    loss.backward()
    optimizer.step()

    train_acc = accuracy_score(y[train_mask].cpu(), y_hat.argmax(dim=-1)[train_mask].cpu())

    if epoch % test_interval == 0:
        model.eval()
        with torch.no_grad():
            y_hat_eval = model(X, x_1, incidence_1)
            val_acc = accuracy_score(y[val_mask].cpu(), y_hat_eval.argmax(dim=-1)[val_mask].cpu())
            test_acc = accuracy_score(y[test_mask].cpu(), y_hat_eval.argmax(dim=-1)[test_mask].cpu())

        print(
            f"Epoch {epoch:03d} | "
            f"Loss {loss.item():.4f} | "
            f"Train {train_acc:.2f} | "
            f"Val {val_acc:.2f} | "
            f"Test {test_acc:.2f}"
        )


cpu
X shape: torch.Size([19716, 768])
Classes: [1 2 3]
y shape: torch.Size([19716])
Num nodes: 19716 | Num hyperedges: 50
Incidence matrix shape: torch.Size([19716, 50])
Epoch 005 | Loss 0.5743 | Train 0.88 | Val 0.87 | Test 0.88
Epoch 010 | Loss 0.4201 | Train 0.89 | Val 0.89 | Test 0.89
Epoch 015 | Loss 0.3425 | Train 0.90 | Val 0.89 | Test 0.90
Epoch 020 | Loss 0.3083 | Train 0.90 | Val 0.89 | Test 0.90
Epoch 025 | Loss 0.2833 | Train 0.90 | Val 0.89 | Test 0.90
Epoch 030 | Loss 0.2745 | Train 0.90 | Val 0.90 | Test 0.90
Epoch 035 | Loss 0.2679 | Train 0.91 | Val 0.90 | Test 0.90
Epoch 040 | Loss 0.2666 | Train 0.90 | Val 0.90 | Test 0.91
Epoch 045 | Loss 0.2629 | Train 0.91 | Val 0.90 | Test 0.91
Epoch 050 | Loss 0.2602 | Train 0.91 | Val 0.90 | Test 0.91


# **5.Cora citationwise**



In [122]:
cora = pd.read_csv("./datos_para_cora_citation.csv")
cora_citation_ids = set(cora.paper_id)
print(cora_citation_ids)

with open('cora_with_embeddings.pkl', 'rb') as f:
    datos = pickle.load(f)

datos = datos[['paper_id', 'topic2', 'cls_embedding']]

print(datos.head())

{851968, 1122304, 1155073, 1114118, 1105932, 1114125, 368657, 253971, 8213, 696342, 696343, 696345, 696346, 8224, 630817, 262178, 35, 950305, 40, 1114153, 385067, 131117, 131122, 16437, 188471, 16451, 1138755, 1130567, 1130568, 16461, 180301, 1114192, 16471, 155736, 16474, 155738, 16476, 1130586, 106590, 573535, 16485, 32872, 1130600, 1114222, 573553, 114, 65650, 117, 65653, 221302, 270456, 1122425, 409725, 1114239, 128, 130, 1106052, 1130634, 1130637, 647315, 1122460, 1130653, 164, 82087, 82090, 180399, 82098, 1130678, 1106103, 1130680, 1106112, 245955, 590022, 1114331, 1114336, 385251, 631015, 688361, 647408, 1114352, 131315, 73972, 33013, 131317, 131318, 647413, 1106172, 213246, 41216, 229635, 270600, 631052, 1122574, 606479, 1114388, 1122580, 114966, 647447, 1138968, 1138970, 139547, 1130780, 1114398, 213279, 288, 1130808, 1106236, 672064, 672070, 672071, 49482, 1114442, 1122642, 1130847, 90470, 1106287, 1106298, 8581, 24966, 98693, 205192, 1114502, 98698, 205196, 8591, 1114512, 11

In [None]:
cora = pd.read_csv("./datos_para_cora_citation.csv")
cora_citation_ids = set(cora['paper_id'])   

print(f"Número de paper_ids válidos en el grafo de citas: {len(cora_citation_ids)}")

# DataFrame con embeddings
with open('cora_with_embeddings.pkl', 'rb') as f:
    datos = pickle.load(f)
datos = datos[['paper_id', 'topic2', 'cls_embedding']]

print("Tamaño original de datos:", len(datos))

datos_filtrado = datos[datos['paper_id'].isin(cora_citation_ids)].copy()

## JS0N HYperedges:
import json
with open('hyperedges_cora.json', 'r', encoding='utf-8') as archivo:
    datos = json.load(archivo)

print(datos)

Número de paper_ids válidos en el grafo de citas: 2319
Tamaño original de datos: 2368

Primeras filas del DataFrame filtrado:
   paper_id                  topic2  \
0        35      Genetic_Algorithms   
1        40      Genetic_Algorithms   
2       114  Reinforcement_Learning   
3       117  Reinforcement_Learning   
4       128  Reinforcement_Learning   

                                       cls_embedding  
0  [-0.093049884, 0.8792597, 0.7796652, -0.175753...  
1  [-0.10484472, 0.42675668, 0.74734676, -0.07206...  
2  [0.89663494, 0.08120963, 1.0009942, 0.02328234...  
3  [-0.4162119, -0.17980981, 0.45048836, 0.100121...  
4  [0.44965452, -0.20131657, 0.6979245, 1.5324184...  
{'851968': [278394, 230879, 1140231], '1114118': [119712, 910], '1114125': [128, 114], '950305': [739707, 1365], '262178': [1138968, 85449, 1114442], '35': [66563, 1128453, 1127430, 1033, 1125386, 573964, 66556, 634902, 634904, 1128985, 568857, 573978, 128540, 141342, 45599, 141347, 206371, 1131557, 178727, 

In [None]:
import torch
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from topomodelx.nn.hypergraph.hmpnn import HMPNN
import pickle

torch.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Cargar IDs válidos
cora = pd.read_csv("./datos_para_cora_citation.csv")
cora_citation_ids = set(cora['paper_id'])
print(f"Número de paper_ids válidos en el grafo de citas: {len(cora_citation_ids)}")

# Cargar embeddings
with open('cora_with_embeddings.pkl', 'rb') as f:
    datos = pickle.load(f)

datos = datos[['paper_id', 'topic2', 'cls_embedding']]
# Filtrar solo los paper_ids válidos
df = datos[datos['paper_id'].isin(cora_citation_ids)].reset_index(drop=True)
print("Filas después de filtrar:", len(df))
print(df.head())

# Cargar hyperedges del JSON
with open('hyperedges_cora.json', 'r', encoding='utf-8') as f:
    hyperedges_dict = json.load(f)

print(f"Number of hyperedges from JSON: {len(hyperedges_dict)}")
print(list(hyperedges_dict.items())[:3]) 

# X, y
X = torch.tensor(
    np.stack(df["cls_embedding"].values),
    dtype=torch.float32
).to(device)
print("X shape:", X.shape)

le = LabelEncoder()
y_np = le.fit_transform(df["topic2"])
y = torch.tensor(y_np, dtype=torch.long).to(device)
print("Classes:", le.classes_)
print("y shape:", y.shape)

paper_to_idx = {pid: idx for idx, pid in enumerate(df['paper_id'].values)}

# Incidence matrix y lo mismo de siempre...
row_idx = []
col_idx = []

for h_idx, (key, nodes) in enumerate(hyperedges_dict.items()):
    all_nodes = [int(key)] + nodes  

    for node in all_nodes:
        if node in paper_to_idx:  
            row_idx.append(paper_to_idx[node])
            col_idx.append(h_idx)

num_nodes = len(df)
num_hyperedges = len(hyperedges_dict)

indices = torch.tensor([row_idx, col_idx], dtype=torch.long)
values = torch.ones(len(row_idx), dtype=torch.float32)

incidence_1 = torch.sparse_coo_tensor(
    indices,
    values,
    size=(num_nodes, num_hyperedges)
).to(device)

print("Incidence matrix shape:", incidence_1.shape)

x_1 = torch.zeros(
    (num_hyperedges, X.shape[1]),
    dtype=torch.float32
).to(device)

class Network(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, task_level="node", **kwargs):
        super().__init__()
        self.base_model = HMPNN(
            in_channels=in_channels,
            hidden_channels=hidden_channels,
            **kwargs
        )
        self.linear = torch.nn.Linear(hidden_channels, out_channels)
        self.out_pool = task_level == "graph"

    def forward(self, x_0, x_1, incidence_1):
        x_0, x_1 = self.base_model(x_0, x_1, incidence_1)
        x = torch.max(x_0, dim=0)[0] if self.out_pool else x_0
        return self.linear(x)

in_channels = X.shape[1]
hidden_channels = 128
n_layers = 1
out_channels = len(le.classes_)
task_level = "node"

model = Network(
    in_channels=in_channels,
    hidden_channels=hidden_channels,
    out_channels=out_channels,
    n_layers=n_layers,
    task_level=task_level
).to(device)

perm = torch.randperm(num_nodes)
train_size = int(0.8 * num_nodes)
val_size = int(0.1 * num_nodes)

train_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)

train_mask[perm[:train_size]] = True
val_mask[perm[train_size:train_size + val_size]] = True
test_mask[perm[train_size + val_size:]] = True

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = torch.nn.CrossEntropyLoss()

num_epochs = 100
test_interval = 5

for epoch in range(1, num_epochs + 1):
    model.train()
    optimizer.zero_grad()

    y_hat = model(X, x_1, incidence_1)
    loss = loss_fn(y_hat[train_mask], y[train_mask])
    loss.backward()
    optimizer.step()

    train_acc = accuracy_score(
        y[train_mask].cpu(),
        y_hat.argmax(dim=-1)[train_mask].cpu()
    )

    if epoch % test_interval == 0:
        model.eval()
        with torch.no_grad():
            y_hat_eval = model(X, x_1, incidence_1)

            val_acc = accuracy_score(
                y[val_mask].cpu(),
                y_hat_eval.argmax(dim=-1)[val_mask].cpu()
            )
            test_acc = accuracy_score(
                y[test_mask].cpu(),
                y_hat_eval.argmax(dim=-1)[test_mask].cpu()
            )

        print(
            f"Epoch {epoch:03d} | "
            f"Loss {loss.item():.4f} | "
            f"Train {train_acc:.2f} | "
            f"Val {val_acc:.2f} | "
            f"Test {test_acc:.2f}"
        )


cpu
Número de paper_ids válidos en el grafo de citas: 2319
Filas después de filtrar: 2319
   paper_id                  topic2  \
0        35      Genetic_Algorithms   
1        40      Genetic_Algorithms   
2       114  Reinforcement_Learning   
3       117  Reinforcement_Learning   
4       128  Reinforcement_Learning   

                                       cls_embedding  
0  [-0.093049884, 0.8792597, 0.7796652, -0.175753...  
1  [-0.10484472, 0.42675668, 0.74734676, -0.07206...  
2  [0.89663494, 0.08120963, 1.0009942, 0.02328234...  
3  [-0.4162119, -0.17980981, 0.45048836, 0.100121...  
4  [0.44965452, -0.20131657, 0.6979245, 1.5324184...  
Number of hyperedges from JSON: 2708
[('851968', [278394, 230879, 1140231]), ('1114118', [119712, 910]), ('1114125', [128, 114])]
X shape: torch.Size([2319, 768])
Classes: ['Case_Based' 'Genetic_Algorithms' 'Neural_Networks'
 'Probabilistic_Methods' 'Reinforcement_Learning' 'Rule_Learning' 'Theory']
y shape: torch.Size([2319])
Incidence matrix

# **6.Cora citationwise LLM**

In [None]:
nodes_to_remove = {
    35:     [66563, 486840, 573964, 210872, 97645],
    114:    [1114125],
    910:    [1104379, 1105344, 1118848, 1120858, 1122460, 1135137],
    1365:   [1105062],
    230879: [696346],
    2653:   [102406, 107177],
    2658:   [1135899, 1132948],
    35922:  [1105116, 1140547],
    3191:   [105865, 129896, 129897, 137873, 162664, 308920, 310742, 423463, 561364],
    3229:   [1109392, 1154251],
    3231:   [328370, 636098, 1102761, 1115471],
    19621:  [1128846, 1135082],
    20193:  [112813, 1130653, 1152244],
    4330:   [1103737, 1109439],
    4584:   [1031453],
    8224:   [1132815, 1153148, 1110426, 1128531, 1111788],
    24966:  [27627, 1104449, 1105344, 1131149, 1123576],
    8703:   [10435, 27535, 51866, 51909, 1102751, 1119078],
    10169:  [636098],
    12182:  [12165, 12210, 321861, 429781, 1106418, 1117249],
    31353:  [10531, 31336, 31927, 43698, 194617, 686532, 1123576, 1129442, 1135746, 1152162],
    15429:  [10169, 1107572]
}



inconsistencies = []

for cluster_id, paper_ids in nodes_to_remove.items():
    for pid in paper_ids:
        row = df[df["paper_id"] == pid]

        if row.empty:
            inconsistencies.append(
                (cluster_id, pid, "paper_id not found in df")
            )

if len(inconsistencies) == 0:
    print("No inconsistencies found. All nodes belong to their specified clusters.")
else:
    print("Inconsistencies found:")
    for cluster_id, pid, reason in inconsistencies:
        print(f"Cluster {cluster_id} | PMID {pid} | {reason}")

✅ No inconsistencies found. All nodes belong to their specified clusters.


In [None]:
import torch
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from topomodelx.nn.hypergraph.hmpnn import HMPNN
import pickle

torch.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Nodos y embeddings
cora = pd.read_csv("./datos_para_cora_citation.csv")
cora_citation_ids = set(cora['paper_id'])
print(f"Número de paper_ids válidos en el grafo de citas: {len(cora_citation_ids)}")

with open('cora_with_embeddings.pkl', 'rb') as f:
    datos = pickle.load(f)

datos = datos[['paper_id', 'topic2', 'cls_embedding']]
df = datos[datos['paper_id'].isin(cora_citation_ids)].reset_index(drop=True)
print("Filas después de filtrar:", len(df))
print(df.head())

# Hyperedges desde JSON
with open('hyperedges_cora.json', 'r', encoding='utf-8') as f:
    hyperedges_dict = json.load(f)

print(f"Number of hyperedges from JSON: {len(hyperedges_dict)}")

# x, y
X = torch.tensor(
    np.stack(df["cls_embedding"].values),
    dtype=torch.float32
).to(device)
print("X shape:", X.shape)

le = LabelEncoder()
y_np = le.fit_transform(df["topic2"])
y = torch.tensor(y_np, dtype=torch.long).to(device)
print("Classes:", le.classes_)
print("y shape:", y.shape)

paper_to_idx = {pid: idx for idx, pid in enumerate(df['paper_id'].values)}

row_idx = []
col_idx = []
hyperedge_counter = 0  

for key, nodes in hyperedges_dict.items():
    key_int = int(key)
    remove_nodes = set(nodes_to_remove.get(key_int, []))

    all_nodes = [key_int] + nodes
    filtered_nodes = [n for n in all_nodes if n not in remove_nodes]

    for node in filtered_nodes:
        if node in paper_to_idx:
            row_idx.append(paper_to_idx[node])
            col_idx.append(hyperedge_counter)
    hyperedge_counter += 1  

    # Crear hyperedges individuales para nodos que elimino
    for node in remove_nodes:
        if node in paper_to_idx:
            row_idx.append(paper_to_idx[node])
            col_idx.append(hyperedge_counter)
        hyperedge_counter += 1  

num_nodes = len(df)
num_hyperedges = hyperedge_counter

indices = torch.tensor([row_idx, col_idx], dtype=torch.long)
values = torch.ones(len(row_idx), dtype=torch.float32)

incidence_1 = torch.sparse_coo_tensor(
    indices,
    values,
    size=(num_nodes, num_hyperedges)
).to(device)
print("Incidence matrix shape:", incidence_1.shape)

x_1 = torch.zeros(
    (num_hyperedges, X.shape[1]),
    dtype=torch.float32
).to(device)

class Network(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, task_level="node", **kwargs):
        super().__init__()
        self.base_model = HMPNN(
            in_channels=in_channels,
            hidden_channels=hidden_channels,
            **kwargs
        )
        self.linear = torch.nn.Linear(hidden_channels, out_channels)
        self.out_pool = task_level == "graph"

    def forward(self, x_0, x_1, incidence_1):
        x_0, x_1 = self.base_model(x_0, x_1, incidence_1)
        x = torch.max(x_0, dim=0)[0] if self.out_pool else x_0
        return self.linear(x)

in_channels = X.shape[1]
hidden_channels = 128
n_layers = 1
out_channels = len(le.classes_)
task_level = "node"

model = Network(
    in_channels=in_channels,
    hidden_channels=hidden_channels,
    out_channels=out_channels,
    n_layers=n_layers,
    task_level=task_level
).to(device)

# Dividir train/val/test
perm = torch.randperm(num_nodes)
train_size = int(0.8 * num_nodes)
val_size = int(0.1 * num_nodes)

train_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)

train_mask[perm[:train_size]] = True
val_mask[perm[train_size:train_size + val_size]] = True
test_mask[perm[train_size + val_size:]] = True

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = torch.nn.CrossEntropyLoss()

num_epochs = 100
test_interval = 5

for epoch in range(1, num_epochs + 1):
    model.train()
    optimizer.zero_grad()

    y_hat = model(X, x_1, incidence_1)
    loss = loss_fn(y_hat[train_mask], y[train_mask])
    loss.backward()
    optimizer.step()

    train_acc = accuracy_score(
        y[train_mask].cpu(),
        y_hat.argmax(dim=-1)[train_mask].cpu()
    )

    if epoch % test_interval == 0:
        model.eval()
        with torch.no_grad():
            y_hat_eval = model(X, x_1, incidence_1)

            val_acc = accuracy_score(
                y[val_mask].cpu(),
                y_hat_eval.argmax(dim=-1)[val_mask].cpu()
            )
            test_acc = accuracy_score(
                y[test_mask].cpu(),
                y_hat_eval.argmax(dim=-1)[test_mask].cpu()
            )

        print(
            f"Epoch {epoch:03d} | "
            f"Loss {loss.item():.4f} | "
            f"Train {train_acc:.2f} | "
            f"Val {val_acc:.2f} | "
            f"Test {test_acc:.2f}"
        )


cpu
Número de paper_ids válidos en el grafo de citas: 2319
Filas después de filtrar: 2319
   paper_id                  topic2  \
0        35      Genetic_Algorithms   
1        40      Genetic_Algorithms   
2       114  Reinforcement_Learning   
3       117  Reinforcement_Learning   
4       128  Reinforcement_Learning   

                                       cls_embedding  
0  [-0.093049884, 0.8792597, 0.7796652, -0.175753...  
1  [-0.10484472, 0.42675668, 0.74734676, -0.07206...  
2  [0.89663494, 0.08120963, 1.0009942, 0.02328234...  
3  [-0.4162119, -0.17980981, 0.45048836, 0.100121...  
4  [0.44965452, -0.20131657, 0.6979245, 1.5324184...  
Number of hyperedges from JSON: 2708
X shape: torch.Size([2319, 768])
Classes: ['Case_Based' 'Genetic_Algorithms' 'Neural_Networks'
 'Probabilistic_Methods' 'Reinforcement_Learning' 'Rule_Learning' 'Theory']
y shape: torch.Size([2319])
Incidence matrix shape: torch.Size([2319, 2786])
Epoch 005 | Loss 1.3131 | Train 0.70 | Val 0.69 | Test 0.78


## **7.PubMed Citationwise**

In [None]:
pubmed_clusters = pd.read_csv("./PubMded_clusterwise.csv")[['PMID', 'label']]

print("Número de columnas:", len(cora_clusters.columns))
print(pubmed_clusters.columns)
print(pubmed_clusters.head())
print("Filas en clusters:", len(pubmed_clusters))

loaded_embeddings = np.load("PubMed_embeddings.npz")["embeddings"]
print("Embeddings shape:", loaded_embeddings.shape)


if len(pubmed_clusters) == loaded_embeddings.shape[0]:
    pubmed_clusters['embeddings'] = list(loaded_embeddings)  
    pubmed_clusters['embeddings'] = [emb for emb in loaded_embeddings]


print(pubmed_clusters.head(2))
print(f"Column: {pubmed_clusters.columns}")

## JS0N HYperedges:
import json
with open('hyperedges_pubmed.json', 'r', encoding='utf-8') as archivo:
    datos = json.load(archivo)

print(datos)

Número de columnas: 1
Index(['PMID', 'label'], dtype='object')
       PMID  label
0  12187484      1
1   2344352      1
2  14654069      1
3  16443886      2
4   2684155      1
Filas en clusters: 19716
Embeddings shape: (19716, 768)
       PMID  label                                         embeddings
0  12187484      1  [-0.2475476, 0.6143033, 1.0025382, -0.2145277,...
1   2344352      1  [-0.045406226, 0.7384806, 0.7458422, -0.247402...
Column: Index(['PMID', 'label', 'embeddings'], dtype='object')
{'11272194': [18483609, 16710474, 16537919], '11796484': [17472435, 16823478, 17433304, 18664617, 18048763, 17535961], '10616837': [11563971, 17623014, 15647337, 18474939, 18319310], '11272209': [12060768, 18654634], '11272210': [19654863], '11272211': [18654634, 16229747, 18297260, 19672314], '10616858': [17290035, 16801574], '9437215': [15201238], '16777248': [19364331], '1703973': [1737841, 8371347, 1359788, 1658791], '8388664': [7506712], '17039422': [19436648, 18628530, 18686043, 1876

In [None]:
import torch
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from topomodelx.nn.hypergraph.hmpnn import HMPNN
import pickle

torch.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

pubmed_clusters = pd.read_csv("./PubMded_clusterwise.csv")[['PMID', 'label']]

print("Número de columnas:", len(cora_clusters.columns))
print(pubmed_clusters.columns)
print(pubmed_clusters.head())
print("Filas en clusters:", len(pubmed_clusters))

loaded_embeddings = np.load("PubMed_embeddings.npz")["embeddings"]
print("Embeddings shape:", loaded_embeddings.shape)


if len(pubmed_clusters) == loaded_embeddings.shape[0]:
    pubmed_clusters['embeddings'] = list(loaded_embeddings) 
    pubmed_clusters['embeddings'] = [emb for emb in loaded_embeddings]


print(pubmed_clusters.head(2))
print(f"Column: {pubmed_clusters.columns}")

df = pubmed_clusters.copy()
with open('hyperedges_pubmed.json', 'r', encoding='utf-8') as archivo:
    hyperedges_dict = json.load(archivo)

print(f"Number of hyperedges from JSON: {len(hyperedges_dict)}")
print(list(hyperedges_dict.items())[:3])

# X, y
X = torch.tensor(
    np.stack(df["embeddings"].values),
    dtype=torch.float32
).to(device)
print("X shape:", X.shape)

le = LabelEncoder()
y_np = le.fit_transform(df["label"])
y = torch.tensor(y_np, dtype=torch.long).to(device)
print("Classes:", le.classes_)
print("y shape:", y.shape)

paper_to_idx = {pid: idx for idx, pid in enumerate(df['PMID'].values)}

# Incidence matrix
row_idx = []
col_idx = []

for h_idx, (key, nodes) in enumerate(hyperedges_dict.items()):
    all_nodes = [int(key)] + nodes 

    for node in all_nodes:
        if node in paper_to_idx:  # solo nodos presentes en df
            row_idx.append(paper_to_idx[node])
            col_idx.append(h_idx)

num_nodes = len(df)
num_hyperedges = len(hyperedges_dict)

indices = torch.tensor([row_idx, col_idx], dtype=torch.long)
values = torch.ones(len(row_idx), dtype=torch.float32)

incidence_1 = torch.sparse_coo_tensor(
    indices,
    values,
    size=(num_nodes, num_hyperedges)
).to(device)

print("Incidence matrix shape:", incidence_1.shape)

x_1 = torch.zeros(
    (num_hyperedges, X.shape[1]),
    dtype=torch.float32
).to(device)

# Lo de siempre... (futura línea de trabajo: poner más layers)
class Network(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, task_level="node", **kwargs):
        super().__init__()
        self.base_model = HMPNN(
            in_channels=in_channels,
            hidden_channels=hidden_channels,
            **kwargs
        )
        self.linear = torch.nn.Linear(hidden_channels, out_channels)
        self.out_pool = task_level == "graph"

    def forward(self, x_0, x_1, incidence_1):
        x_0, x_1 = self.base_model(x_0, x_1, incidence_1)
        x = torch.max(x_0, dim=0)[0] if self.out_pool else x_0
        return self.linear(x)

in_channels = X.shape[1]
hidden_channels = 128
n_layers = 1
out_channels = len(le.classes_)
task_level = "node"

model = Network(
    in_channels=in_channels,
    hidden_channels=hidden_channels,
    out_channels=out_channels,
    n_layers=n_layers,
    task_level=task_level
).to(device)

perm = torch.randperm(num_nodes)
train_size = int(0.6 * num_nodes)
val_size = int(0.2 * num_nodes)

train_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)

train_mask[perm[:train_size]] = True
val_mask[perm[train_size:train_size + val_size]] = True
test_mask[perm[train_size + val_size:]] = True

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = torch.nn.CrossEntropyLoss()

num_epochs = 100
test_interval = 5

for epoch in range(1, num_epochs + 1):
    model.train()
    optimizer.zero_grad()

    y_hat = model(X, x_1, incidence_1)
    loss = loss_fn(y_hat[train_mask], y[train_mask])
    loss.backward()
    optimizer.step()

    train_acc = accuracy_score(
        y[train_mask].cpu(),
        y_hat.argmax(dim=-1)[train_mask].cpu()
    )

    if epoch % test_interval == 0:
        model.eval()
        with torch.no_grad():
            y_hat_eval = model(X, x_1, incidence_1)

            val_acc = accuracy_score(
                y[val_mask].cpu(),
                y_hat_eval.argmax(dim=-1)[val_mask].cpu()
            )
            test_acc = accuracy_score(
                y[test_mask].cpu(),
                y_hat_eval.argmax(dim=-1)[test_mask].cpu()
            )

        print(
            f"Epoch {epoch:03d} | "
            f"Loss {loss.item():.4f} | "
            f"Train {train_acc:.2f} | "
            f"Val {val_acc:.2f} | "
            f"Test {test_acc:.2f}"
        )


cpu
Número de columnas: 1
Index(['PMID', 'label'], dtype='object')
       PMID  label
0  12187484      1
1   2344352      1
2  14654069      1
3  16443886      2
4   2684155      1
Filas en clusters: 19716
Embeddings shape: (19716, 768)
       PMID  label                                         embeddings
0  12187484      1  [-0.2475476, 0.6143033, 1.0025382, -0.2145277,...
1   2344352      1  [-0.045406226, 0.7384806, 0.7458422, -0.247402...
Column: Index(['PMID', 'label', 'embeddings'], dtype='object')
Number of hyperedges from JSON: 19717
[('11272194', [18483609, 16710474, 16537919]), ('11796484', [17472435, 16823478, 17433304, 18664617, 18048763, 17535961]), ('10616837', [11563971, 17623014, 15647337, 18474939, 18319310])]
X shape: torch.Size([19716, 768])
Classes: [1 2 3]
y shape: torch.Size([19716])
Incidence matrix shape: torch.Size([19716, 19717])
Epoch 005 | Loss 0.6530 | Train 0.80 | Val 0.80 | Test 0.80
Epoch 010 | Loss 0.4159 | Train 0.91 | Val 0.91 | Test 0.90
Epoch 015 | 

# **8.PubMed Citationswise**

In [None]:
nodes_to_remove = {
    9742976:   [18620046],
    8366922:   [18840781, 18364392, 16259490],
    11832527:  [12941712, 16936143],
    19479186:  [15662004, 9217892],
    18776148:  [9259273],
    18664617:  [17982429],
    17349009:  [8958223, 18422727, 15161749, 19037920],
    16215165:  [8835919, 7955687],
    18561508:  [12788877, 15983242],
    19364331:  [17517853],
    12560454:  [10867717],
    3309680:   [16075062, 16109069],
    19436665:  [16978370, 16283239],
    11333990:  [17897465, 17877832],
    9732337:   [18697899, 19127292],
    3057885:   [3260201],
    17463246:  [18544707, 18366806],
    18078023:  [8922349, 16784180],
    18423879:  [16801515, 10761967],
    8637860:   [15184501, 10190896],
    1697648:   [8200974, 8423231],
    8232539:   [8920894, 9053453],
    19956106:  [19956105],
    18628530:  [12843147],
    3899825:   [18442638],
    9294791:   [8404431, 8968014],
    18292465:  [15161785],
    17293876:  [17688680],
    18782870:  [11017071],
    16847277:  [11978678, 10389838],
    7694152:   [8775937, 10430939],
    16371630:  [18070658],
    17463248:  [18366646, 19578398],
    17463249:  [18714373, 18544707],
    18729180:  [12663577, 14657818, 15857233, 1551485, 12401759],
    9362527:   [12975475],
    18493227:  [17437080],
    18437223:  [10218775]
}


inconsistencies = []

for cluster_id, paper_ids in nodes_to_remove.items():
    for pid in paper_ids:
        row = df[df["PMID"] == pid]

        if row.empty:
            inconsistencies.append(
                (cluster_id, pid, "paper_id not found in df")
            )

if len(inconsistencies) == 0:
    print("No inconsistencies found. All nodes belong to their specified clusters.")
else:
    print("Inconsistencies found:")
    for cluster_id, pid, reason in inconsistencies:
        print(f"Cluster {cluster_id} | PMID {pid} | {reason}")

✅ No inconsistencies found. All nodes belong to their specified clusters.


In [None]:
import torch
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from topomodelx.nn.hypergraph.hmpnn import HMPNN
import pickle

torch.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

pubmed_clusters = pd.read_csv("./PubMded_clusterwise.csv")[['PMID', 'label']]

print("Número de columnas:", len(cora_clusters.columns))
print(pubmed_clusters.columns)
print(pubmed_clusters.head())
print("Filas en clusters:", len(pubmed_clusters))

loaded_embeddings = np.load("PubMed_embeddings.npz")["embeddings"]
print("Embeddings shape:", loaded_embeddings.shape)


if len(pubmed_clusters) == loaded_embeddings.shape[0]:
    pubmed_clusters['embeddings'] = list(loaded_embeddings)  
    pubmed_clusters['embeddings'] = [emb for emb in loaded_embeddings]


print(pubmed_clusters.head(2))
print(f"Column: {pubmed_clusters.columns}")

df = pubmed_clusters.copy()

with open('hyperedges_pubmed.json', 'r', encoding='utf-8') as archivo:
    hyperedges_dict = json.load(archivo)

print(f"Number of hyperedges from JSON: {len(hyperedges_dict)}")
print(list(hyperedges_dict.items())[:3])


X = torch.tensor(
    np.stack(df["embeddings"].values),
    dtype=torch.float32
).to(device)
print("X shape:", X.shape)

le = LabelEncoder()
y_np = le.fit_transform(df["label"])
y = torch.tensor(y_np, dtype=torch.long).to(device)
print("Classes:", le.classes_)
print("y shape:", y.shape)

paper_to_idx = {pid: idx for idx, pid in enumerate(df['PMID'].values)}

# Construir incidence matrix con nodos eliminados en hyperedges individuales
row_idx = []
col_idx = []
hyperedge_counter = 0  # contador de hyperedges

for key, nodes in hyperedges_dict.items():
    key_int = int(key)
    remove_nodes = set(nodes_to_remove.get(key_int, []))

    all_nodes = [key_int] + nodes
    filtered_nodes = [n for n in all_nodes if n not in remove_nodes]

    for node in filtered_nodes:
        if node in paper_to_idx:
            row_idx.append(paper_to_idx[node])
            col_idx.append(hyperedge_counter)
    hyperedge_counter += 1  

    for node in remove_nodes:
        if node in paper_to_idx:
            row_idx.append(paper_to_idx[node])
            col_idx.append(hyperedge_counter)
        hyperedge_counter += 1  # cada nodo removed es un hyperedge

num_nodes = len(df)
num_hyperedges = hyperedge_counter

indices = torch.tensor([row_idx, col_idx], dtype=torch.long)
values = torch.ones(len(row_idx), dtype=torch.float32)

incidence_1 = torch.sparse_coo_tensor(
    indices,
    values,
    size=(num_nodes, num_hyperedges)
).to(device)
print("Incidence matrix shape:", incidence_1.shape)

x_1 = torch.zeros(
    (num_hyperedges, X.shape[1]),
    dtype=torch.float32
).to(device)

class Network(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, task_level="node", **kwargs):
        super().__init__()
        self.base_model = HMPNN(
            in_channels=in_channels,
            hidden_channels=hidden_channels,
            **kwargs
        )
        self.linear = torch.nn.Linear(hidden_channels, out_channels)
        self.out_pool = task_level == "graph"

    def forward(self, x_0, x_1, incidence_1):
        x_0, x_1 = self.base_model(x_0, x_1, incidence_1)
        x = torch.max(x_0, dim=0)[0] if self.out_pool else x_0
        return self.linear(x)

in_channels = X.shape[1]
hidden_channels = 128
n_layers = 1
out_channels = len(le.classes_)
task_level = "node"

model = Network(
    in_channels=in_channels,
    hidden_channels=hidden_channels,
    out_channels=out_channels,
    n_layers=n_layers,
    task_level=task_level
).to(device)

perm = torch.randperm(num_nodes)
train_size = int(0.8 * num_nodes)
val_size = int(0.1 * num_nodes)

train_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)

train_mask[perm[:train_size]] = True
val_mask[perm[train_size:train_size + val_size]] = True
test_mask[perm[train_size + val_size:]] = True

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = torch.nn.CrossEntropyLoss()

num_epochs = 80
test_interval = 5

for epoch in range(1, num_epochs + 1):
    model.train()
    optimizer.zero_grad()

    y_hat = model(X, x_1, incidence_1)
    loss = loss_fn(y_hat[train_mask], y[train_mask])
    loss.backward()
    optimizer.step()

    train_acc = accuracy_score(
        y[train_mask].cpu(),
        y_hat.argmax(dim=-1)[train_mask].cpu()
    )

    if epoch % test_interval == 0:
        model.eval()
        with torch.no_grad():
            y_hat_eval = model(X, x_1, incidence_1)

            val_acc = accuracy_score(
                y[val_mask].cpu(),
                y_hat_eval.argmax(dim=-1)[val_mask].cpu()
            )
            test_acc = accuracy_score(
                y[test_mask].cpu(),
                y_hat_eval.argmax(dim=-1)[test_mask].cpu()
            )

        print(
            f"Epoch {epoch:03d} | "
            f"Loss {loss.item():.4f} | "
            f"Train {train_acc:.2f} | "
            f"Val {val_acc:.2f} | "
            f"Test {test_acc:.2f}"
        )


cpu
Número de columnas: 1
Index(['PMID', 'label'], dtype='object')
       PMID  label
0  12187484      1
1   2344352      1
2  14654069      1
3  16443886      2
4   2684155      1
Filas en clusters: 19716
Embeddings shape: (19716, 768)
       PMID  label                                         embeddings
0  12187484      1  [-0.2475476, 0.6143033, 1.0025382, -0.2145277,...
1   2344352      1  [-0.045406226, 0.7384806, 0.7458422, -0.247402...
Column: Index(['PMID', 'label', 'embeddings'], dtype='object')
Number of hyperedges from JSON: 19717
[('11272194', [18483609, 16710474, 16537919]), ('11796484', [17472435, 16823478, 17433304, 18664617, 18048763, 17535961]), ('10616837', [11563971, 17623014, 15647337, 18474939, 18319310])]
X shape: torch.Size([19716, 768])
Classes: [1 2 3]
y shape: torch.Size([19716])
Incidence matrix shape: torch.Size([19716, 19783])
Epoch 005 | Loss 0.6562 | Train 0.78 | Val 0.80 | Test 0.80
Epoch 010 | Loss 0.4199 | Train 0.91 | Val 0.91 | Test 0.90
Epoch 015 | 