In [5]:
import os
import yaml
import pickle
import numpy as np
import pandas as pd
import networkx as nx
import torch
from torch import nn
import torch.nn.functional as F
from torch_geometric.utils import from_networkx, train_test_split_edges, negative_sampling
from torch_geometric.nn import SAGEConv

print("Imports OK")

# %% Charger configuration
with open('../config.yaml', 'r') as f:
    config = yaml.safe_load(f)

CACHE_PATH = config['graph']['cache_path']
PARTITION_PATH = "../results/communities/louvain_partition.pkl"
MODEL_OUTPUT = config['graph']['MODEL_PATH']

print("Configuration charg√©e")


Imports OK
Configuration charg√©e


In [6]:
# %% Chargement du graphe
with open(CACHE_PATH, 'rb') as f:
    G_nx = pickle.load(f)

print(f"Graphe charg√© : {G_nx.number_of_nodes()} n≈ìuds, {G_nx.number_of_edges()} ar√™tes")

# Charger Louvain
with open(PARTITION_PATH, "rb") as f:
    partition = pickle.load(f)

print(f"Communaut√©s charg√©es : {len(set(partition.values()))}")


Graphe charg√© : 10363 n≈ìuds, 300869 ar√™tes
Communaut√©s charg√©es : 45


In [7]:
import torch
from torch_geometric.data import Data
import networkx as nx

print("üîÑ Conversion NetworkX ‚Üí PyTorch Geometric (avec remapping)...\n")

# ===== √âTAPE 1 : CR√âER UN MAPPING CONTINU =====
node_list = list(G_nx.nodes())
node_to_idx = {node: idx for idx, node in enumerate(node_list)}
print(f"‚úì Mapping cr√©√© pour {len(node_list)} n≈ìuds")
print(f"  Exemple: n≈ìud {node_list[0]} ‚Üí index 0")
print(f"  Exemple: n≈ìud {node_list[-1]} ‚Üí index {len(node_list)-1}")

# ===== √âTAPE 2 : CONVERTIR LES AR√äTES =====
edges_remapped = [
    (node_to_idx[u], node_to_idx[v]) 
    for u, v in G_nx.edges()
]
edge_index = torch.tensor(edges_remapped, dtype=torch.long).t().contiguous()

print(f"\n‚úì edge_index cr√©√©: {edge_index.shape}")
print(f"  Min index: {edge_index.min().item()}")
print(f"  Max index: {edge_index.max().item()}")
print(f"  Attendu max: {len(node_list) - 1}")

# V√©rification critique
assert edge_index.max().item() < len(node_list), \
    f"‚ùå ERREUR: edge_index contient {edge_index.max().item()} mais seulement {len(node_list)} n≈ìuds!"
assert edge_index.min().item() >= 0, \
    f"‚ùå ERREUR: edge_index contient des indices n√©gatifs!"

# ===== √âTAPE 3 : FEATURES (avec remapping) =====
print("\nCalcul des features...")
deg = dict(G_nx.degree())
pagerank = nx.pagerank(G_nx, max_iter=50)
clustering = nx.clustering(G_nx)

x = torch.tensor([
    [deg[node], pagerank[node], clustering[node]] 
    for node in node_list  # ‚Üê Utiliser node_list dans l'ordre
], dtype=torch.float)

print(f"‚úì Features cr√©√©es: {x.shape}")

# ===== √âTAPE 4 : COMMUNAUT√âS (avec remapping) =====
community = torch.tensor(
    [partition.get(node, -1) for node in node_list],
    dtype=torch.long
)

missing = (community == -1).sum().item()
if missing > 0:
    print(f"‚ö†Ô∏è {missing} n≈ìuds sans communaut√© (assign√©s √† -1)")

# ===== √âTAPE 5 : CR√âER L'OBJET DATA =====
data = Data(
    x=x,
    edge_index=edge_index,
    community=community,
    num_nodes=len(node_list)
)

print("\n" + "="*60)
print("üìä OBJET DATA CR√â√â ET V√âRIFI√â")
print("="*60)
print(f"  N≈ìuds: {data.num_nodes}")
print(f"  Ar√™tes: {data.edge_index.shape[1]}")
print(f"  Features: {data.x.shape}")
print(f"  edge_index range: [{edge_index.min().item()}, {edge_index.max().item()}]")
print(f"  Communaut√©s: {community.max().item() + 1}")
print("="*60)

# ===== V√âRIFICATION FINALE =====
assert data.edge_index.max() < data.num_nodes, "‚ùå Indices hors limites!"
print("\n‚úÖ Toutes les v√©rifications pass√©es - Pr√™t pour le split!\n")

üîÑ Conversion NetworkX ‚Üí PyTorch Geometric (avec remapping)...

‚úì Mapping cr√©√© pour 10363 n≈ìuds
  Exemple: n≈ìud 50562026 ‚Üí index 0
  Exemple: n≈ìud 2115828967 ‚Üí index 10362

‚úì edge_index cr√©√©: torch.Size([2, 300869])
  Min index: 0
  Max index: 10362
  Attendu max: 10362

Calcul des features...
‚úì Features cr√©√©es: torch.Size([10363, 3])

üìä OBJET DATA CR√â√â ET V√âRIFI√â
  N≈ìuds: 10363
  Ar√™tes: 300869
  Features: torch.Size([10363, 3])
  edge_index range: [0, 10362]
  Communaut√©s: 45

‚úÖ Toutes les v√©rifications pass√©es - Pr√™t pour le split!



In [8]:
# %% Split edges (VERSION CORRIG√âE)
from torch_geometric.transforms import RandomLinkSplit

print(f"Avant split - edge_index: {data.edge_index.shape}")

transform = RandomLinkSplit(
    num_val=0.05,
    num_test=0.1,
    is_undirected=True,
    add_negative_train_samples=True,
    neg_sampling_ratio=1.0
)

train_data, val_data, test_data = transform(data)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_data = train_data.to(device)
val_data = val_data.to(device)
test_data = test_data.to(device)

print("\n‚úÖ Train/val/test split OK")
print(f"  Train edges: {train_data.edge_index.size(1)}")
print(f"  Val edges: {val_data.edge_label_index.size(1)}")      # ‚Üê CORRIG√â ICI
print(f"  Test edges: {test_data.edge_label_index.size(1)}")    # ‚Üê CORRIG√â ICI
print(f"  Device: {device}")

Avant split - edge_index: torch.Size([2, 300869])

‚úÖ Train/val/test split OK
  Train edges: 511480
  Val edges: 30086
  Test edges: 60172
  Device: cpu


In [9]:
# %% Model d√©finition (VERSION AM√âLIOR√âE)
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv

class GraphSAGE(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, dropout=0.5):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)
        self.dropout = dropout
    
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)  # ‚Üê Dropout ajout√©
        x = self.conv2(x, edge_index)
        return x

class EdgePredictor(nn.Module):
    def __init__(self, embedding_dim):
        super().__init__()
        # Option 1 : Concat√©nation (votre version)
        self.lin = nn.Linear(embedding_dim * 2, 1)
    
    def forward(self, z, edge_index):
        src = z[edge_index[0]]
        dst = z[edge_index[1]]
        # Concat√©nation
        out = torch.cat([src, dst], dim=1)
        return torch.sigmoid(self.lin(out)).squeeze()  # ‚Üê squeeze() ajout√©

# OU version alternative avec produit scalaire (plus simple et souvent meilleure)
class EdgePredictorDot(nn.Module):
    """Pr√©diction par produit scalaire (recommand√©)"""
    def __init__(self):
        super().__init__()
    
    def forward(self, z, edge_index):
        src = z[edge_index[0]]
        dst = z[edge_index[1]]
        # Produit scalaire
        return torch.sigmoid((src * dst).sum(dim=-1))

# Initialisation
embedding_dim = 64
gnn = GraphSAGE(
    in_channels=train_data.x.size(1),  # 3 features
    hidden_channels=128,
    out_channels=embedding_dim,
    dropout=0.5
).to(device)

edge_predictor = EdgePredictor(embedding_dim).to(device)
# OU
# edge_predictor = EdgePredictorDot().to(device)  # Version plus simple

print("‚úÖ Mod√®les cr√©√©s")
print(f"  GNN params: {sum(p.numel() for p in gnn.parameters()):,}")
print(f"  Edge predictor params: {sum(p.numel() for p in edge_predictor.parameters()):,}")

‚úÖ Mod√®les cr√©√©s
  GNN params: 17,344
  Edge predictor params: 129


In [10]:
# %% Init model
encoder = GraphSAGE(
    in_channels=train_data.x.size(1),  # ‚Üê Utiliser train_data, pas data
    hidden_channels=128,
    out_channels=64,
    dropout=0.5
).to(device)

predictor = EdgePredictor(64).to(device)  # ‚Üê Adapter √† out_channels

optimizer = torch.optim.Adam(
    list(encoder.parameters()) + list(predictor.parameters()),
    lr=0.01,
    weight_decay=5e-4  # ‚Üê R√©gularisation
)

criterion = nn.BCELoss()  # Loss pour classification binaire

print("‚úÖ Mod√®les initialis√©s")
print(f"  Encoder: {sum(p.numel() for p in encoder.parameters()):,} params")
print(f"  Predictor: {sum(p.numel() for p in predictor.parameters()):,} params")

‚úÖ Mod√®les initialis√©s
  Encoder: 17,344 params
  Predictor: 129 params


In [11]:
print("üìä Analyse de la distribution des degr√©s...\n")

# Calculer les degr√©s
node_degrees = dict(G_nx.degree())
degrees_array = np.array([node_degrees[node] for node in node_list])

# Calculer percentiles
degree_p50 = np.percentile(degrees_array, 50)
degree_p90 = np.percentile(degrees_array, 90)
degree_p95 = np.percentile(degrees_array, 95)

print(f"Distribution des degr√©s:")
print(f"   M√©diane (p50): {degree_p50:.0f}")
print(f"   p90: {degree_p90:.0f}")
print(f"   p95 (seuil hubs): {degree_p95:.0f}")
print(f"   Max: {degrees_array.max():.0f}")

# Identifier les n≈ìuds par cat√©gorie
low_degree_mask = degrees_array <= degree_p50
mid_degree_mask = (degrees_array > degree_p50) & (degrees_array <= degree_p90)
high_degree_mask = (degrees_array > degree_p90) & (degrees_array <= degree_p95)
extreme_hub_mask = degrees_array > degree_p95

low_nodes = np.where(low_degree_mask)[0]
mid_nodes = np.where(mid_degree_mask)[0]
high_nodes = np.where(high_degree_mask)[0]
extreme_hubs = np.where(extreme_hub_mask)[0]

print(f"\nCat√©gorisation:")
print(f"   Low (‚â§p50): {len(low_nodes)} n≈ìuds")
print(f"   Mid (p50-p90): {len(mid_nodes)} n≈ìuds")
print(f"   High (p90-p95): {len(high_nodes)} n≈ìuds")
print(f"   Extreme (>p95): {len(extreme_hubs)} n≈ìuds")
print(f"\n‚ö†Ô∏è Les extreme hubs seront EXCLUS du negative sampling")


üìä Analyse de la distribution des degr√©s...

Distribution des degr√©s:
   M√©diane (p50): 11
   p90: 278
   p95 (seuil hubs): 312
   Max: 568

Cat√©gorisation:
   Low (‚â§p50): 5263 n≈ìuds
   Mid (p50-p90): 4277 n≈ìuds
   High (p90-p95): 314 n≈ìuds
   Extreme (>p95): 509 n≈ìuds

‚ö†Ô∏è Les extreme hubs seront EXCLUS du negative sampling


In [12]:
def train_with_stratified_negatives():
    """Training avec negative sampling stratifi√© pour √©viter biais hub"""
    encoder.train()
    predictor.train()
    optimizer.zero_grad()
    
    # Encoder le graphe
    z = encoder(train_data.x, train_data.edge_index)
    
    # ===== NEGATIVE SAMPLING STRATIFI√â =====
    num_pos = train_data.edge_label_index[:, train_data.edge_label == 1].size(1)
    
    # Ratio: 50% low, 30% mid, 20% high (pas d'extreme hubs)
    num_low = int(num_pos * 0.5)
    num_mid = int(num_pos * 0.3)
    num_high = int(num_pos * 0.2)
    
    # Sources al√©atoires
    src_low = torch.randint(0, train_data.num_nodes, (num_low,), device=device)
    src_mid = torch.randint(0, train_data.num_nodes, (num_mid,), device=device)
    src_high = torch.randint(0, train_data.num_nodes, (num_high,), device=device)
    
    # Targets stratifi√©s (SANS extreme hubs)
    dst_low = torch.from_numpy(np.random.choice(low_nodes, num_low, replace=True)).to(device)
    dst_mid = torch.from_numpy(np.random.choice(mid_nodes, num_mid, replace=True)).to(device)
    dst_high = torch.from_numpy(np.random.choice(high_nodes, num_high, replace=True)).to(device)
    
    # Cr√©er edge_index n√©gatif
    neg_src = torch.cat([src_low, src_mid, src_high])
    neg_dst = torch.cat([dst_low, dst_mid, dst_high])
    neg_edge_index = torch.stack([neg_src, neg_dst])
    
    # ===== PR√âDICTIONS =====
    pos_edge_index = train_data.edge_label_index[:, train_data.edge_label == 1]
    pos_pred = predictor(z, pos_edge_index)
    neg_pred = predictor(z, neg_edge_index)
    
    # ===== LOSS AVEC POND√âRATION =====
    # P√©naliser plus les erreurs sur low-degree nodes
    pos_target_degrees = degrees_array[pos_edge_index[1].cpu().numpy()]
    pos_weights = torch.from_numpy(1.0 / np.log(pos_target_degrees + 2)).float().to(device)
    pos_weights = pos_weights / pos_weights.sum() * len(pos_weights)
    
    pos_loss = F.binary_cross_entropy(pos_pred, torch.ones_like(pos_pred), weight=pos_weights)
    neg_loss = F.binary_cross_entropy(neg_pred, torch.zeros_like(neg_pred))
    
    loss = pos_loss + neg_loss
    
    loss.backward()
    
    # Gradient clipping
    torch.nn.utils.clip_grad_norm_(encoder.parameters(), max_norm=1.0)
    torch.nn.utils.clip_grad_norm_(predictor.parameters(), max_norm=1.0)
    
    optimizer.step()
    
    return loss.item(), pos_pred.mean().item(), neg_pred.mean().item()

print("‚úÖ Fonction d'entra√Ænement cr√©√©e (avec stratification anti-hub)")



‚úÖ Fonction d'entra√Ænement cr√©√©e (avec stratification anti-hub)


In [13]:
from sklearn.metrics import roc_auc_score

@torch.no_grad()
def test_with_degree_analysis(data_split):
    """Test avec analyse par cat√©gorie de degr√©"""
    encoder.eval()
    predictor.eval()
    
    z = encoder(data_split.x, train_data.edge_index)
    pred = predictor(z, data_split.edge_label_index)
    
    pred_cpu = pred.cpu().numpy()
    label_cpu = data_split.edge_label.cpu().numpy()
    
    # M√©triques globales
    auc = roc_auc_score(label_cpu, pred_cpu)
    pred_binary = (pred > 0.5).cpu().numpy()
    acc = (pred_binary == label_cpu).mean()
    
    # Analyse par degr√©
    target_degrees = degrees_array[data_split.edge_label_index[1].cpu().numpy()]
    
    low_mask = target_degrees <= degree_p50
    high_mask = target_degrees > degree_p90
    
    auc_low = roc_auc_score(label_cpu[low_mask], pred_cpu[low_mask]) if low_mask.sum() > 10 else None
    auc_high = roc_auc_score(label_cpu[high_mask], pred_cpu[high_mask]) if high_mask.sum() > 10 else None
    
    return auc, acc, auc_low, auc_high

print("‚úÖ Fonction de test cr√©√©e (avec analyse par degr√©)")


‚úÖ Fonction de test cr√©√©e (avec analyse par degr√©)


In [14]:
print("üöÄ D√©but de l'entra√Ænement (anti-hub bias)...\n")

best_val_auc = 0
patience = 25
patience_counter = 0
history = {
    'train_loss': [], 
    'val_auc': [], 
    'val_auc_low': [], 
    'val_auc_high': []
}

for epoch in range(1, 201):
    loss, pos_mean, neg_mean = train_with_stratified_negatives()
    history['train_loss'].append(loss)
    
    if epoch % 5 == 0:
        val_auc, val_acc, val_auc_low, val_auc_high = test_with_degree_analysis(val_data)
        
        history['val_auc'].append(val_auc)
        history['val_auc_low'].append(val_auc_low if val_auc_low else 0)
        history['val_auc_high'].append(val_auc_high if val_auc_high else 0)
        
        print(f"Epoch {epoch:03d} | Loss: {loss:.4f} | Val AUC: {val_auc:.4f} | "
              f"Pos: {pos_mean:.3f} | Neg: {neg_mean:.3f}")
        
        if val_auc_low and val_auc_high:
            print(f"           | AUC Low-deg: {val_auc_low:.4f} | AUC High-deg: {val_auc_high:.4f}")
        
        # Early stopping
        if val_auc > best_val_auc:
            best_val_auc = val_auc
            patience_counter = 0
            
            torch.save({
                'encoder': encoder.state_dict(),
                'predictor': predictor.state_dict(),
                'val_auc': val_auc,
                'epoch': epoch
            }, 'best_model.pt')
            print(f"           | ‚úÖ Meilleur mod√®le sauvegard√©")
        else:
            patience_counter += 1
        
        if patience_counter >= patience:
            print(f"\n‚ö†Ô∏è Early stopping √† l'epoch {epoch}")
            break

print("\n‚úÖ Entra√Ænement termin√©!")



üöÄ D√©but de l'entra√Ænement (anti-hub bias)...

Epoch 005 | Loss: 75.5039 | Val AUC: 0.2366 | Pos: 0.000 | Neg: 0.001
           | AUC Low-deg: 0.6727 | AUC High-deg: 0.0748
           | ‚úÖ Meilleur mod√®le sauvegard√©
Epoch 010 | Loss: 29.2397 | Val AUC: 0.8332 | Pos: 0.856 | Neg: 0.610
           | AUC Low-deg: 0.6811 | AUC High-deg: 0.3993
           | ‚úÖ Meilleur mod√®le sauvegard√©
Epoch 015 | Loss: 29.4014 | Val AUC: 0.8381 | Pos: 0.938 | Neg: 0.510
           | AUC Low-deg: 0.6818 | AUC High-deg: 0.5067
           | ‚úÖ Meilleur mod√®le sauvegard√©
Epoch 020 | Loss: 27.1053 | Val AUC: 0.8724 | Pos: 0.965 | Neg: 0.684
           | AUC Low-deg: 0.6756 | AUC High-deg: 0.5000
           | ‚úÖ Meilleur mod√®le sauvegard√©
Epoch 025 | Loss: 13.2318 | Val AUC: 0.8231 | Pos: 0.221 | Neg: 0.337
           | AUC Low-deg: 0.4189 | AUC High-deg: 0.9737
Epoch 030 | Loss: 10.5426 | Val AUC: 0.2755 | Pos: 0.409 | Neg: 0.268
           | AUC Low-deg: 0.4965 | AUC High-deg: 0.9630
Epoch 035

In [15]:
print("üìä √âvaluation sur le set de test...\n")

# Charger meilleur mod√®le
checkpoint = torch.load('best_model.pt')
encoder.load_state_dict(checkpoint['encoder'])
predictor.load_state_dict(checkpoint['predictor'])

# Test final
test_auc, test_acc, test_auc_low, test_auc_high = test_with_degree_analysis(test_data)

print(f"{'='*60}")
print(f"R√âSULTATS FINAUX")
print(f"{'='*60}")
print(f"  Best Val AUC: {best_val_auc:.4f}")
print(f"  Test AUC (global): {test_auc:.4f}")
print(f"  Test Acc: {test_acc:.4f}")
if test_auc_low:
    print(f"  Test AUC (low-degree): {test_auc_low:.4f}")
if test_auc_high:
    print(f"  Test AUC (high-degree): {test_auc_high:.4f}")
print(f"{'='*60}")


üìä √âvaluation sur le set de test...

R√âSULTATS FINAUX
  Best Val AUC: 0.9143
  Test AUC (global): 0.9123
  Test Acc: 0.8498
  Test AUC (low-degree): 0.4903
  Test AUC (high-degree): 0.9686


In [16]:
print("\nüî¨ Diagnostic des embeddings...\n")

encoder.eval()
with torch.no_grad():
    final_embeddings = encoder(data.x.to(device), data.edge_index.to(device))

# Analyser similarit√© des hubs
hub_indices = extreme_hubs[:min(10, len(extreme_hubs))]
hub_embeds = final_embeddings[hub_indices].cpu().numpy()

from sklearn.metrics.pairwise import cosine_similarity
hub_sim = cosine_similarity(hub_embeds)
np.fill_diagonal(hub_sim, 0)
avg_hub_sim = hub_sim.mean()

print(f"Similarit√© moyenne entre hubs: {avg_hub_sim:.4f}")
if avg_hub_sim > 0.9:
    print(f"‚ö†Ô∏è Embeddings des hubs trop similaires (risque de biais)")
    print(f"   ‚Üí Consid√©rer : plus de dropout, plus d'epochs, ou moins de hidden_channels")
else:
    print(f"‚úÖ Embeddings des hubs bien diff√©renci√©s")

# Analyser tous les embeddings
all_embeds = final_embeddings.cpu().numpy()
all_sim = cosine_similarity(all_embeds[:100])  # √âchantillon
np.fill_diagonal(all_sim, 0)
avg_all_sim = all_sim.mean()

print(f"\nSimilarit√© moyenne (√©chantillon global): {avg_all_sim:.4f}")
print(f"√âcart-type des embeddings: {all_embeds.std():.4f}")




üî¨ Diagnostic des embeddings...

Similarit√© moyenne entre hubs: 0.6638
‚úÖ Embeddings des hubs bien diff√©renci√©s

Similarit√© moyenne (√©chantillon global): 0.5727
√âcart-type des embeddings: 1.6200


In [17]:
print("\nüíæ Sauvegarde COMPL√àTE du mod√®le...\n")

torch.save({
    'model_state': {
        'encoder': encoder.state_dict(),
        'predictor': predictor.state_dict()
    },
    'embeddings': final_embeddings.cpu(),
    'node_to_idx': node_to_idx,
    'metrics': {
        'val_auc': best_val_auc,
        'test_auc': test_auc,
        'test_acc': test_acc,
        'test_auc_low': test_auc_low if test_auc_low else 0,
        'test_auc_high': test_auc_high if test_auc_high else 0
    },
    'hyperparameters': {
        'in_channels': train_data.x.size(1),
        'hidden_channels': 128,
        'embedding_dim': 64,
        'dropout': 0.5
    },
    'degree_stats': {
        'p50': float(degree_p50),
        'p90': float(degree_p90),
        'p95': float(degree_p95),
        'max': float(degrees_array.max())
    },
    'metadata': {
        'num_nodes': len(node_to_idx),
        'num_edges': data.edge_index.shape[1],
        'training_date': str(pd.Timestamp.now()),
        'device': str(device),
        'hub_embedding_similarity': float(avg_hub_sim),
        'global_embedding_similarity': float(avg_all_sim),
        'training_method': 'stratified_negative_sampling_v2'
    }
}, MODEL_OUTPUT)

print(f"‚úÖ Mod√®le sauvegard√©: {MODEL_OUTPUT}")



üíæ Sauvegarde COMPL√àTE du mod√®le...

‚úÖ Mod√®le sauvegard√©: ../results/link_prediction_model.pt


In [18]:
print("\nüß™ V√©rification du fichier sauvegard√©...\n")

test_checkpoint = torch.load(MODEL_OUTPUT, map_location='cpu', weights_only=False)

print(f"‚úÖ Fichier valide:")
print(f"   Embeddings: {test_checkpoint['embeddings'].shape}")
print(f"   N≈ìuds: {len(test_checkpoint['node_to_idx'])}")
print(f"   Test AUC: {test_checkpoint['metrics']['test_auc']:.4f}")
print(f"   Hub similarity: {test_checkpoint['metadata']['hub_embedding_similarity']:.4f}")
print(f"   Training method: {test_checkpoint['metadata']['training_method']}")



üß™ V√©rification du fichier sauvegard√©...

‚úÖ Fichier valide:
   Embeddings: torch.Size([10363, 64])
   N≈ìuds: 10363
   Test AUC: 0.9123
   Hub similarity: 0.6638
   Training method: stratified_negative_sampling_v2


In [19]:
print("\nüß™ Test de pr√©dictions sur quelques exemples...\n")

# Prendre un chercheur al√©atoire
test_author_idx = np.random.choice(len(node_list))
test_author_id = node_list[test_author_idx]

print(f"Chercheur test: {test_author_id} (degree: {node_degrees[test_author_id]})")

# Prendre 10 candidats al√©atoires
candidate_indices = np.random.choice(len(node_list), 10, replace=False)

# Pr√©dire
encoder.eval()
with torch.no_grad():
    z = encoder(data.x.to(device), data.edge_index.to(device))
    
    edge_index = torch.tensor([
        [test_author_idx] * 10,
        candidate_indices.tolist()
    ], dtype=torch.long, device=device)
    
    scores = predictor(z, edge_index).cpu().numpy()

print(f"\nTop 5 scores:")
top_5_idx = np.argsort(scores)[-5:][::-1]
for rank, idx in enumerate(top_5_idx, 1):
    cand_id = node_list[candidate_indices[idx]]
    cand_deg = node_degrees[cand_id]
    print(f"  {rank}. Candidat {cand_id} (degree: {cand_deg}) ‚Üí score: {scores[idx]:.4f}")

print("\n‚úÖ Test de pr√©diction OK")




üß™ Test de pr√©dictions sur quelques exemples...

Chercheur test: 2064770039 (degree: 35)

Top 5 scores:
  1. Candidat 48650879 (degree: 128) ‚Üí score: 0.4601
  2. Candidat 2046135 (degree: 9) ‚Üí score: 0.3702
  3. Candidat 1700880 (degree: 9) ‚Üí score: 0.2607
  4. Candidat 39393520 (degree: 12) ‚Üí score: 0.2534
  5. Candidat 2302053110 (degree: 7) ‚Üí score: 0.1980

‚úÖ Test de pr√©diction OK


In [20]:
print("\nüìã Cr√©ation d'exemples pour tests API...\n")

# Top chercheurs par degr√©
top_by_degree = sorted(node_degrees.items(), key=lambda x: x[1], reverse=True)[:20]

# √âchantillon al√©atoire
random_sample = np.random.choice(node_list, min(10, len(node_list)), replace=False)

example_nodes = {
    'top_connected': [str(node) for node, deg in top_by_degree[:10]],
    'random_sample': [str(node) for node in random_sample],
    'low_degree': [str(node) for node in node_list if node_degrees[node] < degree_p50][:10],
    'mid_degree': [str(node) for node in node_list if degree_p50 < node_degrees[node] < degree_p90][:10]
}

examples_path = "../results/example_researchers.json"
import json
with open(examples_path, 'w') as f:
    json.dump(example_nodes, indent=2, fp=f)

print(f"‚úÖ Exemples sauvegard√©s: {examples_path}")
print(f"\nTop 5 chercheurs pour tester:")
for i, (node, deg) in enumerate(top_by_degree[:5], 1):
    print(f"   {i}. {node} ({deg} collaborations)")

print(f"\n{'='*60}")
print(f"‚úÖ PR√äT POUR L'API!")
print(f"{'='*60}")
print(f"\nFichiers cr√©√©s:")
print(f"   {MODEL_OUTPUT}")
print(f"   {PARTITION_PATH}")
print(f"   {examples_path}")
print(f"{'='*60}")


üìã Cr√©ation d'exemples pour tests API...

‚úÖ Exemples sauvegard√©s: ../results/example_researchers.json

Top 5 chercheurs pour tester:
   1. 46617804 (568 collaborations)
   2. 39589154 (535 collaborations)
   3. 1727524 (531 collaborations)
   4. 2010057 (528 collaborations)
   5. 2322150 (525 collaborations)

‚úÖ PR√äT POUR L'API!

Fichiers cr√©√©s:
   ../results/link_prediction_model.pt
   ../results/communities/louvain_partition.pkl
   ../results/example_researchers.json
