# Treinamento de GNN GraphSAGE

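Este notebook demonstra como carregar um grafo de dependências, construir um modelo **GraphSAGE** usando PyTorch Geometric e registrar métricas e modelos no MLflow. O grafo é lido a partir do JSON gerado pelo script `generate_graph.py`. Como o código não lê features dos nós a partir do JSON, são sempre geradas features aleatórias, concatenadas com a codificação one-hot da categoria de cada nó. Atenção: a categoria também é usada como rótulo de classificação, portanto as acurácias reportadas servem apenas para demonstrar o fluxo de treinamento e de registro no MLflow.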

> **Nota:** para executar este notebook, é necessário ter o PyTorch, PyTorch Geometric e MLflow instalados no ambiente.

In [None]:
import json
from pathlib import Path
import numpy as np
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
import mlflow

# Load the graph description from a JSON file
def load_graph(path):
    """Read the UTF-8 encoded JSON file at ``path`` and return the decoded object."""
    with open(path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

# Build a PyTorch Geometric dataset from the graph dictionary
def build_dataset(graph, *, rand_dim=16, seed=None, include_onehot=True):
    """Turn a graph dictionary into a PyG ``Data`` object for node classification.

    Node features are ``rand_dim`` standard-normal values, optionally followed
    by a one-hot encoding of the node category. The label of each node is the
    index of its category in the sorted list of distinct categories.

    NOTE: with ``include_onehot=True`` (the default, kept for backward
    compatibility) the label is part of the features, so any accuracy measured
    on this dataset is trivially high. Pass ``include_onehot=False`` to train
    on the random features and graph structure only.

    Args:
        graph: Mapping with a ``'nodes'`` list (each item has a ``'category'``)
            and an ``'edges'`` list (each item has integer ``'source'`` and
            ``'target'``). Edge endpoints are treated as 1-based positions in
            the ``'nodes'`` list -- presumably the convention of the generator
            script; verify against the producer of the JSON.
        rand_dim: Number of random feature columns.
        seed: When given, the random features come from a dedicated generator
            seeded with this value; when ``None`` NumPy's global random state
            is used.
        include_onehot: Whether to append the category one-hot columns.

    Returns:
        Tuple ``(data, categories)`` where ``data`` holds ``x`` (float32),
        ``edge_index`` (int64, shape ``[2, 2 * len(edges)]``) and ``y`` (int64),
        and ``categories`` is the sorted list of distinct category values.

    Raises:
        ValueError: If an edge refers to a node position outside
            ``1..len(nodes)``.
    """
    nodes = graph['nodes']
    edges = graph['edges']
    categories = sorted({n['category'] for n in nodes})
    cat_to_idx = {cat: i for i, cat in enumerate(categories)}
    num_nodes = len(nodes)

    y = np.array([cat_to_idx[n['category']] for n in nodes], dtype=np.int64)

    if seed is None:
        x_rand = np.random.randn(num_nodes, rand_dim).astype(np.float32)
    else:
        rng = np.random.default_rng(seed)
        x_rand = rng.standard_normal((num_nodes, rand_dim)).astype(np.float32)

    feature_parts = [x_rand]
    if include_onehot:
        x_onehot = np.zeros((num_nodes, len(categories)), dtype=np.float32)
        # Row i gets a 1.0 in the column of node i's category.
        x_onehot[np.arange(num_nodes), y] = 1.0
        feature_parts.append(x_onehot)
    x = np.concatenate(feature_parts, axis=1)

    edge_list = []
    for e in edges:
        s = e['source'] - 1
        t = e['target'] - 1
        # Fail early with a readable message instead of handing invalid
        # (negative or too large) indices to the message-passing layers.
        if not (0 <= s < num_nodes and 0 <= t < num_nodes):
            raise ValueError(
                f"edge ({e['source']}, {e['target']}) refers to a node outside "
                f"1..{num_nodes}"
            )
        # Store both directions so the graph is treated as undirected.
        edge_list.append((s, t))
        edge_list.append((t, s))

    # The explicit reshape keeps the result at shape [2, 0] when there are no
    # edges; a bare ``.t()`` on an empty tensor would leave it at shape [0].
    edge_index = (
        torch.tensor(edge_list, dtype=torch.long)
        .reshape(len(edge_list), 2)
        .t()
        .contiguous()
    )
    data = Data(
        x=torch.tensor(x, dtype=torch.float32),
        edge_index=edge_index,
        y=torch.tensor(y),
    )
    return data, categories

class GraphSAGE(torch.nn.Module):
    """Two stacked ``SAGEConv`` layers with a ReLU between them."""

    def __init__(self, in_channels, hidden_channels, out_channels):
        """Create the two convolutions: in -> hidden and hidden -> out."""
        super().__init__()
        # The attribute names are also the state_dict keys, so they stay fixed.
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return the un-normalised output of the second convolution."""
        hidden = torch.relu(self.conv1(x, edge_index))
        output = self.conv2(hidden, edge_index)
        return output

def train(data, num_classes, run_name='gnn_notebook', mlflow_uri='http://127.0.0.1:5000',
          *, hidden_channels=32, lr=0.01, epochs=20, train_ratio=0.8, seed=None):
    """Train a GraphSAGE node classifier and log the run to MLflow.

    Nodes are shuffled and split into a training part and a test part. Each
    epoch performs one full-graph optimisation step on the training nodes,
    then evaluates accuracy on both parts and logs the metrics.

    Args:
        data: Graph object exposing ``x``, ``edge_index``, ``y``, ``num_nodes``
            and ``to(device)`` (as returned by ``build_dataset``).
        num_classes: Number of output classes of the model.
        run_name: Name given to the MLflow run.
        mlflow_uri: MLflow tracking URI.
        hidden_channels: Width of the hidden GraphSAGE layer.
        lr: Learning rate of the Adam optimiser.
        epochs: Number of training epochs.
        train_ratio: Fraction of nodes used for training, in ``(0, 1]``.
        seed: When given, seeds both the node split and PyTorch's global RNG
            (used for weight initialisation); when ``None`` the global NumPy
            and PyTorch random states are used unchanged.

    Returns:
        The trained model, left in evaluation mode.

    Raises:
        ValueError: If the graph has no nodes or ``train_ratio`` is outside
            ``(0, 1]``.
    """
    num_nodes = data.num_nodes
    if num_nodes < 1:
        raise ValueError('cannot train on a graph without nodes')
    if not 0.0 < train_ratio <= 1.0:
        raise ValueError(f'train_ratio must be in (0, 1], got {train_ratio}')

    if seed is not None:
        # Must happen before the model is built so the initial weights are
        # reproducible as well.
        torch.manual_seed(seed)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    data = data.to(device)
    model = GraphSAGE(data.x.size(1), hidden_channels, num_classes).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    if seed is None:
        idx = np.arange(num_nodes)
        np.random.shuffle(idx)
    else:
        idx = np.random.default_rng(seed).permutation(num_nodes)

    # Keep at least one training node: an empty training set would give a NaN
    # loss and NaN gradients on very small graphs.
    split = max(1, int(train_ratio * num_nodes))
    train_idx = torch.tensor(idx[:split], dtype=torch.long, device=device)
    test_idx = torch.tensor(idx[split:], dtype=torch.long, device=device)
    has_test = split < num_nodes

    mlflow.set_tracking_uri(mlflow_uri)
    with mlflow.start_run(run_name=run_name):
        mlflow.log_params({
            'num_nodes': num_nodes,
            'num_classes': num_classes,
            'hidden_channels': hidden_channels,
            'lr': lr,
            'epochs': epochs,
            'train_ratio': train_ratio,
            'framework': 'pytorch_geometric',
        })
        for epoch in range(1, epochs + 1):
            model.train()
            optimizer.zero_grad()
            out = model(data.x, data.edge_index)
            loss = F.cross_entropy(out[train_idx], data.y[train_idx])
            loss.backward()
            optimizer.step()
            loss_value = loss.item()

            # Second forward pass so the metrics reflect the updated weights.
            model.eval()
            with torch.no_grad():
                pred = model(data.x, data.edge_index).argmax(dim=1)
                train_acc = (pred[train_idx] == data.y[train_idx]).float().mean().item()
                if has_test:
                    test_acc = (pred[test_idx] == data.y[test_idx]).float().mean().item()
                else:
                    # No held-out nodes: report NaN on screen, log nothing.
                    test_acc = float('nan')

            mlflow.log_metric('train_loss', loss_value, step=epoch)
            mlflow.log_metric('train_accuracy', train_acc, step=epoch)
            if has_test:
                mlflow.log_metric('test_accuracy', test_acc, step=epoch)
            print(f'Epoch {epoch:02d} loss={loss_value:.4f} train_acc={train_acc:.3f} test_acc={test_acc:.3f}')
        # Store the trained model as an artifact of the run.
        mlflow.pytorch.log_model(model, artifact_path='model')
    return model


In [None]:
# Load the sample graph and build the dataset.
# `data` and `categories` are notebook globals reused by the training cell.
graph_path = '../data/sample_graph.json'  # adjust this path as needed
graph = load_graph(graph_path)
data, categories = build_dataset(graph)
print(f'Número de nós: {data.num_nodes}, número de classes: {len(categories)}')


In [None]:
# Train the model and log the run to MLflow.
# No mlflow_uri is passed, so train() uses its default tracking URI.
trained_model = train(data, num_classes=len(categories), run_name='gnn_notebook_experiment')
