2.6.0+cpu


In [7]:
import pandas as pd
import torch
import numpy as np
import networkx as nx
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv, Node2Vec
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import kneighbors_graph
import torch.nn.functional as F
from torch.nn import Linear

# 1. Load and prepare the dataset
# Every column except "Outcome" is a feature; "Outcome" is the class label.
df = pd.read_csv("diabetes.csv")
X = df.drop(columns=["Outcome"]).values
y = df["Outcome"].values

# Standardize the features. The scaler is fit on all rows (no hold-out split
# is made anywhere in this cell).
scaler = StandardScaler()
X = scaler.fit_transform(X)

X_tensor = torch.tensor(X, dtype=torch.float)
y_tensor = torch.tensor(y, dtype=torch.long)

# 2. Build k-NN graph and extract edge index
# Each row is linked to its 10 nearest rows in the standardized feature space.
A = kneighbors_graph(X, n_neighbors=10, mode='connectivity')
G = nx.from_scipy_sparse_array(A)
# The edge list is transposed so that edge_index is laid out as (2, num_edges).
edge_index = torch.tensor(list(G.edges), dtype=torch.long).t().contiguous()
edge_index = torch.cat([edge_index, edge_index.flip(0)], dim=1)  # Make it bidirectional

# 3. Apply Node2Vec (Skip-gram-style embeddings)
# NOTE: constructing Node2Vec raises ImportError unless the 'pyg-lib' or
# 'torch-cluster' package is installed (this is the error recorded at the end
# of this cell's output).
# embedding_dim=64 must match the input_dim given to the GNN further down.
node2vec = Node2Vec(edge_index=edge_index, embedding_dim=64, walk_length=20,
                    context_size=10, walks_per_node=10, num_negative_samples=1,
                    p=1, q=1, sparse=True)

# sparse=True above goes together with the SparseAdam optimizer used here.
embed_optimizer = torch.optim.SparseAdam(list(node2vec.parameters()), lr=0.01)

def train_node2vec():
    """Train the module-level ``node2vec`` model and return its mean loss.

    Runs 100 passes over the random-walk loader provided by ``node2vec``.
    Each batch yields positive and negative walks, which
    ``Node2Vec.loss`` requires as arguments; calling ``loss()`` without
    them raises ``TypeError``.

    Returns:
        The loss averaged over every optimisation step taken
        (``0.0`` if the loader produced no batches).
    """
    node2vec.train()
    # The loader samples the positive/negative random walks that loss() needs.
    loader = node2vec.loader(batch_size=128, shuffle=True)
    # Walks are created on CPU; move them to wherever the embeddings live.
    device = node2vec.embedding.weight.device

    total_loss = 0.0
    num_steps = 0
    for _ in range(100):
        for pos_rw, neg_rw in loader:
            embed_optimizer.zero_grad()
            loss = node2vec.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            embed_optimizer.step()
            total_loss += loss.item()
            num_steps += 1
    # Average over the steps actually taken rather than a hard-coded count.
    return total_loss / num_steps if num_steps else 0.0

# Fit the Node2Vec embeddings and report the averaged training loss.
print("Training Node2Vec embeddings...")
loss = train_node2vec()
print(f"Node2Vec training loss: {loss:.4f}")

# 4. Replace X with Node2Vec embeddings
# The detached embedding table is used as the node feature matrix, so the
# standardized raw features (X_tensor) are not passed to the GNN and no
# gradients flow back into the Node2Vec model.
data = Data(x=node2vec.embedding.weight.detach(), edge_index=edge_index, y=y_tensor)

# 5. Define the GNN model using GraphSAGE
class GNN(torch.nn.Module):
    """Three stacked SAGEConv layers followed by a linear two-class head.

    Args:
        input_dim: Size of each input node feature vector.
        hidden_dim: Output size of the first and second SAGEConv layers.
        output_dim: Output size of the third SAGEConv layer (input to the
            linear head).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Layers are created in the same order as they are applied.
        self.conv1 = SAGEConv(input_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, hidden_dim)
        self.conv3 = SAGEConv(hidden_dim, output_dim)
        self.fc = Linear(output_dim, 2)

    def forward(self, data):
        """Return per-node log-probabilities over the two classes.

        ``data`` must expose ``x`` (node features) and ``edge_index``.
        """
        edge_index = data.edge_index

        hidden = F.relu(self.conv1(data.x, edge_index))
        # Dropout is applied only after the first layer, and only in training.
        hidden = F.dropout(hidden, p=0.3, training=self.training)
        hidden = F.relu(self.conv2(hidden, edge_index))
        hidden = F.relu(self.conv3(hidden, edge_index))

        logits = self.fc(hidden)
        return F.log_softmax(logits, dim=1)

# 6. Train the GNN model
# input_dim=64 matches the embedding_dim of the Node2Vec features in data.x.
model = GNN(input_dim=64, hidden_dim=32, output_dim=16)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
# NOTE(review): GNN.forward already returns log_softmax output, while
# CrossEntropyLoss is documented to take unnormalized logits. The second
# normalisation should be a no-op on log-probabilities, but confirm whether
# NLLLoss was the intended criterion.
criterion = torch.nn.CrossEntropyLoss()

def train():
    """Perform one full-graph optimisation step on ``model``.

    Uses the module-level ``model``, ``optimizer``, ``criterion`` and
    ``data``; the loss is computed against every label in ``data.y``.

    Returns:
        The loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()
    step_loss = criterion(model(data), data.y)
    step_loss.backward()
    optimizer.step()
    return step_loss.item()

# Run 300 training steps, logging the loss on every 20th epoch (0, 20, 40, ...).
for epoch in range(300):
    loss = train()
    if epoch % 20:
        continue
    print(f'Epoch {epoch}, Loss: {loss:.4f}')

# 7. Evaluate the model
def test():
    """Print the model's classification accuracy over all nodes in ``data``.

    The score is computed on the same module-level ``data`` object that
    ``train()`` fits on, so it is not a held-out estimate.
    """
    model.eval()
    with torch.no_grad():
        predictions = model(data).argmax(dim=1)
    num_correct = (predictions == data.y).sum().item()
    acc = num_correct / len(data.y)
    print(f'Accuracy: {acc:.4f}')

# Report the accuracy of the trained model.
test()


ImportError: 'Node2Vec' requires either the 'pyg-lib' or 'torch-cluster' package