# Construction Professionnelle de Graphe d'Articles Scientifiques

**Objectif**: Construire un graphe d'articles optimisé pour GNN et analyse de communautés

## Pipeline:
1. Chargement des embeddings et métadonnées
2. Construction du graphe k-NN avec poids de similarité
3. Détection des communautés (Louvain)
4. Analyse topologique avancée
5. Export pour GNN (PyTorch Geometric format)
6. Visualisations professionnelles

## 📦 Imports et Configuration

In [1]:
import numpy as np
import pandas as pd
import networkx as nx
from pathlib import Path
from typing import Tuple, Dict, List, Optional
from dataclasses import dataclass
import pickle
import json

# Machine Learning
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from community import community_louvain

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

# Configuration
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices emitted by the libraries
# used further down — confirm that a blanket 'ignore' is really wanted.
warnings.filterwarnings('ignore')
# Plot defaults shared by every figure drawn later in the notebook.
sns.set_style("whitegrid")
plt.rcParams['figure.dpi'] = 100

print("‚úÖ Imports r√©ussis")

‚úÖ Imports r√©ussis


In [2]:
# Project configuration: every path is derived from the processed-data folder
BASE_PATH = Path("../data/processed")
OUTPUT_PATH = BASE_PATH / "graph_outputs"
# Create the output folder (and missing parents); no error if it already exists
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

# Input files
EMBEDDINGS_PATH = BASE_PATH / "embeddings.npy"
ARTICLES_PATH = BASE_PATH / "cleaned_articles.csv"

# Graph parameters
K_NEIGHBORS = 10  # the k-NN cells query K_NEIGHBORS + 1 neighbours and skip the first column
SIMILARITY_THRESHOLD = 0.2  # edges whose similarity is below this value are dropped
USE_COSINE = True  # cosine metric when True, euclidean otherwise
SAMPLE_SIZE = None  # None = use the whole dataset

print(f"üìÇ R√©pertoire de sortie: {OUTPUT_PATH}")
print(f"üîß K-voisins: {K_NEIGHBORS}")
print(f"üîß Seuil de similarit√©: {SIMILARITY_THRESHOLD}")

üìÇ R√©pertoire de sortie: ..\data\processed\graph_outputs
üîß K-voisins: 10
üîß Seuil de similarit√©: 0.2


## 📂 1. Chargement des Données

In [3]:
# Load the embedding matrix and the article table, align their lengths and
# optionally draw a reproducible random sample of both.
print("\n" + "="*70)
print("üìÇ CHARGEMENT DES DONN√âES")
print("="*70)

# Load embeddings, failing early with the offending path if the file is absent
print(f"\nüîÑ Chargement des embeddings...")
if not EMBEDDINGS_PATH.exists():
    raise FileNotFoundError(f"Embeddings introuvables: {EMBEDDINGS_PATH}")

embeddings = np.load(EMBEDDINGS_PATH)
if embeddings.ndim != 2:
    # Later steps index shape[1]; reject anything that is not a matrix
    # here, where the cause is obvious.
    raise ValueError(f"Embeddings 2D attendus, shape={embeddings.shape}")
print(f"   ‚úÖ Shape: {embeddings.shape}")
print(f"   ‚úÖ Type: {embeddings.dtype}")
print(f"   ‚úÖ Taille m√©moire: {embeddings.nbytes / 1024**2:.2f} MB")

# Load articles
print(f"\nüîÑ Chargement des articles...")
if not ARTICLES_PATH.exists():
    raise FileNotFoundError(f"Articles introuvables: {ARTICLES_PATH}")

articles_df = pd.read_csv(ARTICLES_PATH)
print(f"   ‚úÖ {len(articles_df):,} articles charg√©s")
print(f"   ‚úÖ Colonnes: {list(articles_df.columns[:5])}...")

# Keep only the rows present on both sides. A length mismatch usually means the
# two files were produced from different snapshots, so report it instead of
# truncating silently.
n_articles, n_embeddings = len(articles_df), embeddings.shape[0]
n_samples = min(n_articles, n_embeddings)
if n_articles != n_embeddings:
    print(f"\n   ATTENTION: {n_articles:,} articles pour {n_embeddings:,} embeddings"
          f" -> troncature aux {n_samples:,} premiers (alignement a verifier)")
articles_df = articles_df.iloc[:n_samples].reset_index(drop=True)
embeddings = embeddings[:n_samples]

# Optional sampling with a fixed seed; the same positions are taken from both
# the table and the matrix so rows stay paired.
if SAMPLE_SIZE and n_samples > SAMPLE_SIZE:
    print(f"\n‚ö†Ô∏è  √âchantillonnage: {SAMPLE_SIZE} articles")
    indices = np.random.RandomState(42).choice(n_samples, SAMPLE_SIZE, replace=False)
    articles_df = articles_df.iloc[indices].reset_index(drop=True)
    embeddings = embeddings[indices]

print(f"\n‚úÖ Donn√©es finales: {len(articles_df):,} articles, dim={embeddings.shape[1]}")


üìÇ CHARGEMENT DES DONN√âES

üîÑ Chargement des embeddings...
   ‚úÖ Shape: (816359, 384)
   ‚úÖ Type: float32
   ‚úÖ Taille m√©moire: 1195.84 MB

üîÑ Chargement des articles...
   ‚úÖ 816,359 articles charg√©s
   ‚úÖ Colonnes: ['cord_uid', 'sha', 'source_x', 'title', 'doi']...

‚úÖ Donn√©es finales: 816,359 articles, dim=384


In [4]:
# Quick look at the loaded data: first three title/abstract rows, then global
# summary statistics of the embedding matrix.
print("\nüìä Aper√ßu des articles:")
display(articles_df[['title', 'abstract']].head(3))

print("\nüìä Statistiques des embeddings:")
for stat_label, reducer in (
    ("Moyenne", np.mean),
    ("Std", np.std),
    ("Min", np.min),
    ("Max", np.max),
):
    print(f"   {stat_label}: {reducer(embeddings):.4f}")


üìä Aper√ßu des articles:


Unnamed: 0,title,abstract
0,Clinical features of culture-proven Mycoplasma...,OBJECTIVE: This retrospective chart review des...
1,Nitric oxide: a pro-inflammatory mediator in l...,Inflammatory diseases of the respiratory tract...
2,Surfactant protein-D and pulmonary host defense,Surfactant protein-D (SP-D) participates in th...



üìä Statistiques des embeddings:
   Moyenne: -0.0008
   Std: 0.0510
   Min: -0.2988
   Max: 0.2857


## 🔗 2. Construction du Graphe k-NN

In [None]:
# ============================================================================
# OPTIMIZED REPLACEMENT CELL (no FAISS dependency)
# Section: k-NN graph construction
# Optimizations: batched search, sparse edge storage, lower peak memory
# ============================================================================

from sklearn.neighbors import NearestNeighbors
from scipy.sparse import lil_matrix, csr_matrix
import gc

print("\n" + "="*70)
print("üîó CONSTRUCTION DU GRAPHE k-NN OPTIMIS√âE (sklearn)")
print("="*70)

# ========================
# TUNED PARAMETERS
# NOTE: these assignments override K_NEIGHBORS, SIMILARITY_THRESHOLD and
# SAMPLE_SIZE set in the project-configuration cell.
# ========================
K_NEIGHBORS = 10
SIMILARITY_THRESHOLD = 0.35  # Higher = sparser graph = faster
BATCH_SIZE = 20000  # Number of query rows per k-NN search call
N_JOBS = -1  # Use every available CPU

# Option 1: sample for a quick trial run (recommended for 800K articles)
# SAMPLE_SIZE = 100000  # Uncomment to try on 100K articles first
SAMPLE_SIZE = None  # None = all articles (warning: very slow!)

print(f"\nüîß Configuration:")
print(f"   Articles: {len(articles_df):,}")
print(f"   K-voisins: {K_NEIGHBORS}")
print(f"   Seuil: {SIMILARITY_THRESHOLD}")
print(f"   Batch size: {BATCH_SIZE:,}")
print(f"   CPU cores: tous disponibles")
if SAMPLE_SIZE:
    print(f"   ‚ö†Ô∏è  MODE √âCHANTILLON: {SAMPLE_SIZE:,} articles")

# ========================
# SAMPLING (optional but recommended)
# ========================
if SAMPLE_SIZE and len(embeddings) > SAMPLE_SIZE:
    print(f"\nüìä √âchantillonnage de {SAMPLE_SIZE:,} articles...")
    # A private RandomState draws the same sample as np.random.seed(42) followed
    # by np.random.choice, without reseeding the global generator shared by the
    # rest of the session (and matches the style of the data-loading cell).
    sample_rng = np.random.RandomState(42)
    sample_indices = sample_rng.choice(len(embeddings), SAMPLE_SIZE, replace=False)
    sample_indices = np.sort(sample_indices)  # Keep the original row order

    # Apply the same positions to both containers so rows stay paired
    embeddings = embeddings[sample_indices]
    articles_df = articles_df.iloc[sample_indices].reset_index(drop=True)

    print(f"   ‚úÖ √âchantillon cr√©√©: {len(embeddings):,} articles")

n_samples = len(embeddings)

# ========================
# NORMALIZATION
# Scale every embedding row to unit L2 norm; all-zero rows are divided by 1
# and therefore stay zero.
# ========================
print(f"\nüîß Normalisation des embeddings...")
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
norms[norms == 0] = 1.0
embeddings_normalized = embeddings / norms
print(f"   ‚úÖ Normes moyennes: {np.linalg.norm(embeddings_normalized, axis=1).mean():.4f}")

if n_samples <= 50000:
    # ========================
    # OPTION B: single search call (datasets up to 50K rows)
    # ========================
    print(f"\nüöÄ MODE DIRECT (dataset petit: {n_samples:,} articles)")

    nn_model = NearestNeighbors(
        n_neighbors=K_NEIGHBORS + 1,
        metric='cosine',
        algorithm='auto',
        n_jobs=N_JOBS
    )
    nn_model.fit(embeddings_normalized)

    print(f"   Recherche k-NN...")
    distances, indices = nn_model.kneighbors(embeddings_normalized, return_distance=True)
    similarities = 1.0 - distances

    del nn_model
    gc.collect()
else:
    # ========================
    # OPTION A: batched search (recommended for ~800K rows)
    # ========================
    print(f"\nüöÄ MODE BATCH (dataset large: {n_samples:,} articles)")
    print(f"   Construction de l'index k-NN une seule fois...")

    # Fit once; every batch below queries this same model
    nn_model = NearestNeighbors(
        n_neighbors=K_NEIGHBORS + 1,
        metric='cosine',
        algorithm='brute',
        n_jobs=N_JOBS
    )
    nn_model.fit(embeddings_normalized)
    print(f"   ‚úÖ Index k-NN construit")

    # Query BATCH_SIZE rows at a time and collect the per-batch results
    print(f"\nüîç Recherche k-NN par batch de {BATCH_SIZE:,}...")

    all_distances = []
    all_indices = []

    batch_starts = range(0, n_samples, BATCH_SIZE)
    for batch_no, start in enumerate(tqdm(batch_starts, desc="k-NN batch search")):
        stop = min(start + BATCH_SIZE, n_samples)

        batch_distances, batch_indices = nn_model.kneighbors(
            embeddings_normalized[start:stop], return_distance=True
        )
        all_distances.append(batch_distances)
        all_indices.append(batch_indices)

        # Run the garbage collector on every fifth batch (starting with the first)
        if batch_no % 5 == 0:
            gc.collect()

    # Stack the per-batch blocks back into full (n_samples, k+1) arrays
    distances = np.vstack(all_distances)
    indices = np.vstack(all_indices)

    # Cosine distance -> cosine similarity
    similarities = 1.0 - distances

    del all_distances, all_indices, nn_model
    gc.collect()

# Column 0 is excluded from the summary: it is treated as the query row itself
print(f"\n‚úÖ k-NN termin√©")
print(f"   Similarit√© moyenne: {similarities[:, 1:].mean():.4f}")
print(f"   Similarit√© min: {similarities[:, 1:].min():.4f}")
print(f"   Similarit√© max: {similarities[:, 1:].max():.4f}")

# The normalized copy is no longer needed once neighbours are known
del embeddings_normalized
gc.collect()

# ========================
# EDGE CONSTRUCTION (vectorized, stored as an upper-triangular sparse matrix)
#
# Inputs read from the notebook namespace: indices / similarities (k-NN result,
# column 0 is skipped as the query row itself), n_samples, SIMILARITY_THRESHOLD.
# Outputs: edge_matrix (CSR, entry [a, b] with a < b holds the edge weight) and
# edge_count (number of distinct undirected edges).
#
# Compared with the previous per-element loop:
#   * mutual neighbours (a->b and b->a) are counted once, so edge_count really
#     is the number of unique edges reported below;
#   * when both directions are present the larger similarity is kept;
#   * the work is done with NumPy array operations instead of one Python-level
#     sparse assignment per candidate edge.
# ========================
from scipy.sparse import csr_matrix  # local import keeps this step self-contained

print(f"\nüèóÔ∏è  Construction des ar√™tes (seuil={SIMILARITY_THRESHOLD})...")
print(f"   Utilisation de matrice sparse pour efficacit√© m√©moire...")

# Flatten the neighbour table (without column 0) into parallel edge arrays
neighbors_per_row = indices.shape[1] - 1
edge_src = np.repeat(np.arange(indices.shape[0], dtype=np.int64), neighbors_per_row)
edge_dst = indices[:, 1:].astype(np.int64).ravel()
# Compare in float64, exactly as the scalar float(...) comparison did
edge_sim = similarities[:, 1:].astype(np.float64).ravel()

# Threshold filter; self-pairs are dropped (the old loop skipped i == neighbour too)
keep = (edge_sim >= SIMILARITY_THRESHOLD) & (edge_src != edge_dst)
edge_lo = np.minimum(edge_src[keep], edge_dst[keep])
edge_hi = np.maximum(edge_src[keep], edge_dst[keep])
edge_sim = edge_sim[keep]
del edge_src, edge_dst, keep

# One integer key per unordered pair; sort by (key, similarity) and keep the
# last entry of every key group, i.e. the highest similarity for that pair.
pair_key = edge_lo * np.int64(n_samples) + edge_hi
order = np.lexsort((edge_sim, pair_key))
sorted_key = pair_key[order]
is_group_last = np.ones(sorted_key.shape[0], dtype=bool)
is_group_last[:-1] = sorted_key[1:] != sorted_key[:-1]
selected = order[is_group_last]

edge_matrix = csr_matrix(
    (edge_sim[selected].astype(np.float32), (edge_lo[selected], edge_hi[selected])),
    shape=(n_samples, n_samples),
)
# Zero-weight entries would be invisible to .nonzero() downstream; drop them
# so edge_count matches what the graph-building step will actually add.
edge_matrix.eliminate_zeros()
edge_count = int(edge_matrix.nnz)
del pair_key, order, sorted_key, is_group_last, selected, edge_lo, edge_hi, edge_sim

print(f"\n‚úÖ {edge_count:,} ar√™tes uniques cr√©√©es")
print(f"   Ar√™tes par n≈ìud (moyenne): {edge_count / n_samples:.2f}")

# Sanity check on sparsity
if edge_count / n_samples < 2:
    print(f"   ‚ö†Ô∏è  Graphe tr√®s sparse! R√©duisez SIMILARITY_THRESHOLD √† 0.25-0.3")
elif edge_count / n_samples > 50:
    print(f"   ‚ö†Ô∏è  Graphe tr√®s dense! Augmentez SIMILARITY_THRESHOLD √† 0.4-0.5")

# Free the k-NN result arrays
del distances, indices, similarities
gc.collect()

# ========================
# NETWORKX GRAPH CREATION
#
# Builds the undirected graph G from articles_df (one node per row, carrying a
# truncated title and source) and from edge_matrix (one weighted edge per
# stored non-zero entry).
#
# Node attributes are read column-wise and edge triplets are taken straight
# from the COO arrays; the previous version created one pandas Series per
# article (iterrows) and did one sparse-matrix lookup per edge.
# ========================
print(f"\nüî® Cr√©ation du graphe NetworkX...")

G = nx.Graph()


def _column_as_text(frame, column, max_len):
    """Return column values as strings cut to max_len ('' for a missing column)."""
    if column not in frame.columns:
        return [''] * len(frame)
    return [str(value)[:max_len] for value in frame[column].tolist()]


# Nodes: id is the DataFrame index label, attributes are truncated strings
print(f"   Ajout de {n_samples:,} n≈ìuds...")
node_titles = _column_as_text(articles_df, 'title', 150)
node_sources = _column_as_text(articles_df, 'source_x', 50)
G.add_nodes_from(
    (node_id, {'title': title, 'source': source})
    for node_id, title, source in zip(articles_df.index.tolist(), node_titles, node_sources)
)
del node_titles, node_sources
gc.collect()

# Edges: read (row, col, weight) triplets from the COO view of the matrix
print(f"   Ajout de {edge_count:,} ar√™tes...")
edge_coo = edge_matrix.tocoo()
stored_nonzero = edge_coo.data != 0  # same selection as edge_matrix.nonzero()
edge_rows = edge_coo.row[stored_nonzero]
edge_cols = edge_coo.col[stored_nonzero]
edge_vals = edge_coo.data[stored_nonzero]

# Convert to Python objects 50K edges at a time to keep peak memory bounded
EDGE_CHUNK = 50000
for start in tqdm(range(0, len(edge_vals), EDGE_CHUNK), desc="Extracting edges", mininterval=1.0):
    stop = start + EDGE_CHUNK
    G.add_weighted_edges_from(zip(
        edge_rows[start:stop].tolist(),
        edge_cols[start:stop].tolist(),
        edge_vals[start:stop].tolist(),
    ))

del edge_matrix, edge_coo, stored_nonzero, edge_rows, edge_cols, edge_vals
gc.collect()

print(f"\n‚úÖ Graphe cr√©√©:")
print(f"   N≈ìuds: {G.number_of_nodes():,}")
print(f"   Ar√™tes: {G.number_of_edges():,}")
print(f"   Densit√©: {nx.density(G):.8f}")

# ========================
# QUICK STATISTICS
# Degree summary of G plus a warning when some nodes have no edge at all.
# ========================
print(f"\nüìä Statistiques rapides:")
degrees = dict(G.degree())
degree_values = list(degrees.values())

print(f"   Degr√© moyen: {np.mean(degree_values):.2f}")
print(f"   Degr√© m√©dian: {np.median(degree_values):.0f}")
print(f"   Degr√© min: {min(degree_values)}")
print(f"   Degr√© max: {max(degree_values)}")

# Isolated nodes, read from the degree table computed above
isolated = [node for node, node_degree in degrees.items() if node_degree == 0]
if isolated:
    print(f"   ‚ö†Ô∏è  N≈ìuds isol√©s: {len(isolated)} ({len(isolated)/n_samples*100:.2f}%)")
    print(f"       Consid√©rez r√©duire SIMILARITY_THRESHOLD")

print(f"\nüíæ M√©moire lib√©r√©e, pr√™t pour la d√©tection de communaut√©s...")

# ========================
# HINTS FOR FURTHER SPEED-UPS
# ========================
for tip_line in (
    "\nüí° CONSEIL PERFORMANCE:",
    "   Pour 800K+ articles, consid√©rez:",
    "   1. √âchantillonner 100K-200K articles (d√©commenter SAMPLE_SIZE)",
    "   2. Augmenter SIMILARITY_THRESHOLD √† 0.4-0.5",
    "   3. R√©duire K_NEIGHBORS √† 5-8",
    "   4. Ou installer FAISS: pip install faiss-cpu",
):
    print(tip_line)


üîó CONSTRUCTION DU GRAPHE k-NN OPTIMIS√âE (sklearn)

üîß Configuration:
   Articles: 100,000
   K-voisins: 10
   Seuil: 0.35
   Batch size: 20,000
   CPU cores: tous disponibles

üîß Normalisation des embeddings...
   ‚úÖ Normes moyennes: 1.0000

üöÄ MODE BATCH (dataset large: 100,000 articles)
   Construction de l'index k-NN une seule fois...
   ‚úÖ Index k-NN construit

üîç Recherche k-NN par batch de 20,000...


k-NN batch search:   0%|          | 0/5 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [7]:
# Legacy k-NN cell: fit one NearestNeighbors model on all embeddings and query
# every row in a single call, then turn distances into edge weights.
print("\n" + "="*70)
print("üîó CONSTRUCTION DU GRAPHE k-NN")
print("="*70)

# Unit-normalize rows only when the cosine metric is requested
if not USE_COSINE:
    embeddings_normalized = embeddings
else:
    print("\nüîß Normalisation des embeddings (cosine similarity)...")
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # all-zero rows are divided by 1 and stay zero
    embeddings_normalized = embeddings / norms
    print(f"   ‚úÖ Normes moyennes: {np.linalg.norm(embeddings_normalized, axis=1).mean():.4f}")

# Fit the neighbour index
print(f"\nüîÑ Construction de l'index k-NN (k={K_NEIGHBORS})...")
if USE_COSINE:
    metric = "cosine"
else:
    metric = "euclidean"
# One extra neighbour is requested because the first column is skipped later
nn_model = NearestNeighbors(n_neighbors=K_NEIGHBORS + 1, metric=metric, n_jobs=-1)
nn_model.fit(embeddings_normalized)
print(f"   ‚úÖ Index k-NN construit (metric={metric})")

# Query all rows at once
print(f"\nüîç Recherche des {K_NEIGHBORS} plus proches voisins...")
distances, indices = nn_model.kneighbors(embeddings_normalized, return_distance=True)

# Weights: cosine similarity, or inverse distance for the euclidean metric
weights = (1.0 - distances) if USE_COSINE else 1.0 / (1.0 + distances)

print(f"   ‚úÖ Voisins trouv√©s")
print(f"   ‚úÖ Similarit√© moyenne: {weights[:, 1:].mean():.4f}")
print(f"   ‚úÖ Similarit√© min: {weights[:, 1:].min():.4f}")
print(f"   ‚úÖ Similarit√© max: {weights[:, 1:].max():.4f}")


üîó CONSTRUCTION DU GRAPHE k-NN

üîß Normalisation des embeddings (cosine similarity)...
   ‚úÖ Normes moyennes: 1.0000

üîÑ Construction de l'index k-NN (k=10)...
   ‚úÖ Index k-NN construit (metric=cosine)

üîç Recherche des 10 plus proches voisins...


KeyboardInterrupt: 

In [None]:
# Build the (source, neighbour, similarity) edge list, keeping only pairs whose
# similarity reaches SIMILARITY_THRESHOLD. Column 0 of the k-NN result is
# skipped as the query row itself.
#
# The filter is a single NumPy mask instead of a Python loop over every
# (row, neighbour) cell; the resulting list holds the same tuples, as plain
# Python ints/floats, in the same row-major order.
# NOTE: the list is directed (a->b and b->a may both appear); duplicates are
# merged later when the graph is built, so the count printed here can exceed
# the final number of graph edges.
print(f"\nüèóÔ∏è  Construction des ar√™tes (seuil={SIMILARITY_THRESHOLD})...")

neighbor_ids = indices[:, 1:]
neighbor_sims = weights[:, 1:]
# Compare in float64, as the scalar float(...) comparison did
keep = neighbor_sims.astype(np.float64) >= SIMILARITY_THRESHOLD
source_ids = np.nonzero(keep)[0]

edges = list(zip(
    source_ids.tolist(),
    neighbor_ids[keep].tolist(),
    neighbor_sims[keep].tolist(),
))
del neighbor_ids, neighbor_sims, keep, source_ids

print(f"\n‚úÖ {len(edges):,} ar√™tes cr√©√©es")
print(f"   Ar√™tes par n≈ìud (moyenne): {len(edges) / len(articles_df):.2f}")

In [None]:
# Create the NetworkX graph from the article table and the edge list
print("\nüî® Cr√©ation du graphe NetworkX...")

G = nx.Graph()

# One node per article row, carrying a few descriptive attributes
print("   Ajout des n≈ìuds...")
for node_id, article in articles_df.iterrows():
    raw_year = article.get('year')
    node_attrs = {
        'title': str(article.get('title', '')),
        'abstract': str(article.get('abstract', ''))[:500],  # cap attribute size
        'source': str(article.get('source_x', '')),
        'year': int(raw_year) if pd.notna(raw_year) else 0,
    }
    G.add_node(node_id, **node_attrs)

# Weighted edges: no self-loops, and a repeated pair keeps its highest weight
print("   Ajout des ar√™tes...")
for src, dst, weight in tqdm(edges, desc="Adding edges"):
    if src == dst:
        continue
    if not G.has_edge(src, dst):
        G.add_edge(src, dst, weight=weight)
        continue
    if weight > G[src][dst]['weight']:
        G[src][dst]['weight'] = weight

print(f"\n‚úÖ Graphe cr√©√©:")
print(f"   N≈ìuds: {G.number_of_nodes():,}")
print(f"   Ar√™tes: {G.number_of_edges():,}")
print(f"   Densit√©: {nx.density(G):.6f}")

## 🧩 3. Détection des Communautés

In [None]:
# Run Louvain community detection on G and attach the community id of each
# node to the matching article row.
print("\n" + "="*70)
print("üß© D√âTECTION DES COMMUNAUT√âS (Louvain)")
print("="*70)

print("\nüîÑ Ex√©cution de l'algorithme de Louvain...")
partition = community_louvain.best_partition(G, weight='weight', random_state=42)

# Order the node -> community mapping by node id and store it as a column
# (assumes node ids line up with the DataFrame row order — verify if nodes are ever relabelled)
communities = pd.Series(partition).sort_index().values
articles_df['community'] = communities

# Size of every community, largest first, and the number of communities
community_sizes = articles_df['community'].value_counts().sort_values(ascending=False)
n_communities = int(articles_df['community'].nunique())

print(f"\n‚úÖ {n_communities} communaut√©s d√©tect√©es")
print(f"\nüìä Distribution des tailles:")
print(f"   Taille moyenne: {community_sizes.mean():.1f} articles")
print(f"   M√©diane: {community_sizes.median():.0f} articles")
print(f"   Plus grande: {community_sizes.max():,} articles")
print(f"   Plus petite: {community_sizes.min()} articles")

print(f"\nüèÜ Top 10 des communaut√©s:")
for comm_id, size in community_sizes.iloc[:10].items():
    print(f"   Community {comm_id}: {size:,} articles ({size/len(articles_df)*100:.1f}%)")

In [None]:
# Modularity of the Louvain partition, with a coarse qualitative verdict
modularity = community_louvain.modularity(partition, G, weight='weight')
print(f"\nüìà Modularit√©: {modularity:.4f}")

# Thresholds: above 0.4 excellent, above 0.3 good, otherwise weak
if modularity > 0.4:
    modularity_verdict = "   ‚úÖ Excellente structure communautaire"
elif modularity > 0.3:
    modularity_verdict = "   ‚úÖ Bonne structure communautaire"
else:
    modularity_verdict = "   ‚ö†Ô∏è  Structure communautaire faible"
print(modularity_verdict)

## 📊 4. Analyse Topologique Avancée

In [None]:
# Basic topology report: size, density, connectivity and connected components.
#
# The connected components are computed once and reused for the connectivity
# flag and for the size of the largest component (previously the graph was
# traversed by nx.is_connected and again by nx.connected_components, and the
# largest component was searched twice). An empty graph no longer raises.
print("\n" + "="*70)
print("üìä ANALYSE TOPOLOGIQUE")
print("="*70)

# Basic metrics
print("\nüî¢ M√©triques de base:")
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges()
density = nx.density(G)

# Single traversal; a graph is connected exactly when it has one component
components = list(nx.connected_components(G))
graph_is_connected = len(components) == 1

print(f"   N≈ìuds: {n_nodes:,}")
print(f"   Ar√™tes: {n_edges:,}")
print(f"   Densit√©: {density:.6f}")
print(f"   Type: {'Connexe' if graph_is_connected else 'D√©connect√©'}")

# Connected components
print("\nüîó Composantes connexes:")
largest_component_size = max(map(len, components), default=0)
largest_component_pct = largest_component_size / n_nodes * 100 if n_nodes else 0.0
print(f"   Nombre: {len(components)}")
print(f"   Plus grande: {largest_component_size:,} n≈ìuds ({largest_component_pct:.1f}%)")

if len(components) > 1:
    component_sizes = sorted([len(c) for c in components], reverse=True)
    print(f"   Top 5 tailles: {component_sizes[:5]}")

In [None]:
# Degree distribution of G and the five best-connected articles
print("\nüìà Distribution des degr√©s:")
degrees = dict(G.degree())
degree_values = list(degrees.values())

print(f"   Degr√© moyen: {np.mean(degree_values):.2f}")
print(f"   Degr√© m√©dian: {np.median(degree_values):.0f}")
print(f"   Degr√© min: {min(degree_values)}")
print(f"   Degr√© max: {max(degree_values)}")
print(f"   Std: {np.std(degree_values):.2f}")

# Highest-degree nodes first; ties keep their original node order
ranked_by_degree = sorted(degrees.items(), key=lambda item: item[1], reverse=True)
top_nodes = ranked_by_degree[:5]
print(f"\nüåü Top 5 n≈ìuds par degr√©:")
for node_id, degree in top_nodes:
    node_attrs = G.nodes[node_id]
    title = node_attrs.get('title', 'N/A')[:60]
    print(f"   Node {node_id}: {degree} connexions - {title}...")

In [None]:
# Weighted average clustering coefficient of G, with a qualitative verdict
print("\nüî∫ Coefficient de clustering:")
avg_clustering = nx.average_clustering(G, weight='weight')
print(f"   Moyenne: {avg_clustering:.4f}")

# Thresholds: above 0.5 strong, above 0.3 moderate, otherwise weak
if avg_clustering > 0.5:
    clustering_verdict = "   ‚úÖ Fort clustering (structure hi√©rarchique)"
elif avg_clustering > 0.3:
    clustering_verdict = "   ‚úÖ Clustering mod√©r√©"
else:
    clustering_verdict = "   ‚ö†Ô∏è  Faible clustering"
print(clustering_verdict)

In [None]:
# Summary statistics of the 'weight' attribute over all edges of G
print("\n‚öñÔ∏è  Distribution des poids des ar√™tes:")
edge_weights = [attrs['weight'] for _src, _dst, attrs in G.edges(data=True)]

# Each statistic is computed and printed in turn, in this order
for weight_label, reducer in (
    ("Poids moyen", np.mean),
    ("Poids m√©dian", np.median),
    ("Poids min", min),
    ("Poids max", max),
):
    print(f"   {weight_label}: {reducer(edge_weights):.4f}")

## 💾 5. Sauvegarde des Résultats

In [None]:
# Persist the graph, the article table with community labels, and a JSON
# summary of the graph statistics and construction parameters.
print("\n" + "="*70)
print("üíæ SAUVEGARDE DES R√âSULTATS")
print("="*70)

# 1. Save the NetworkX graph.
# nx.write_gpickle no longer exists in NetworkX 3.x; pickling the graph object
# directly writes an equivalent file and works on both old and new releases.
# Reload with pickle.load (only from files you produced yourself).
graph_path = OUTPUT_PATH / 'article_graph.gpickle'
with open(graph_path, 'wb') as graph_file:
    pickle.dump(G, graph_file, protocol=pickle.HIGHEST_PROTOCOL)
print(f"\n‚úÖ Graphe NetworkX: {graph_path}")
print(f"   Taille: {graph_path.stat().st_size / 1024**2:.2f} MB")

# 2. Save the articles together with their community column
csv_path = OUTPUT_PATH / 'articles_with_communities.csv'
articles_df.to_csv(csv_path, index=False)
print(f"\n‚úÖ Articles + communaut√©s: {csv_path}")
print(f"   {len(articles_df):,} articles avec {n_communities} communaut√©s")

# 3. Save the metadata. Values are cast to built-in int/float so that NumPy
# scalars never reach json.dump, which cannot serialize them.
metadata = {
    'graph_stats': {
        'n_nodes': int(n_nodes),
        'n_edges': int(n_edges),
        'density': float(density),
        'n_communities': n_communities,
        'modularity': float(modularity),
        'avg_clustering': float(avg_clustering),
        'avg_degree': float(np.mean(degree_values))
    },
    'construction_params': {
        'k_neighbors': K_NEIGHBORS,
        'similarity_threshold': SIMILARITY_THRESHOLD,
        'use_cosine': USE_COSINE,
        'embedding_dim': int(embeddings.shape[1])
    },
    'community_stats': {
        'n_communities': n_communities,
        'avg_size': float(community_sizes.mean()),
        'median_size': float(community_sizes.median()),
        'max_size': int(community_sizes.max()),
        'min_size': int(community_sizes.min())
    }
}

metadata_path = OUTPUT_PATH / 'graph_metadata.json'
# Explicit encoding so the file does not depend on the platform default
with open(metadata_path, 'w', encoding='utf-8') as f:
    json.dump(metadata, f, indent=2)
print(f"\n‚úÖ M√©tadonn√©es: {metadata_path}")

In [None]:
# 4. Export for PyTorch Geometric (GNN)
#
# Writes a pickle holding: edge_index (2, 2E) with every undirected edge in both
# directions, edge_attr (2E,) with the matching weights, x (node features =
# embeddings), y (community labels) and num_nodes.
#
# Edges and weights are collected in one pass over G, edge_index is always
# 2-dimensional (shape (2, 0) for a graph without edges) and always int64,
# regardless of the platform's default integer size.
print("\nüî• Export pour PyTorch Geometric...")

# Collect endpoints and weights together so their order cannot diverge
edge_list = []
edge_weights_list = []
for node_u, node_v, edge_data in G.edges(data=True):
    edge_list.append((node_u, node_v))
    edge_weights_list.append(edge_data['weight'])

# edge_index in COO layout: forward edges first, then the reversed copies
forward_edges = np.array(edge_list, dtype=np.int64).reshape(-1, 2)
edge_index = np.concatenate([forward_edges, forward_edges[:, ::-1]], axis=0).T  # Shape: (2, num_edges)

# edge_attr (weights), repeated once for the reversed copies
edge_attr = np.array(edge_weights_list + edge_weights_list, dtype=np.float64)

# Save
pyg_data = {
    'edge_index': edge_index,
    'edge_attr': edge_attr,
    'x': embeddings,  # Node features
    'y': communities,  # Node labels (communities)
    'num_nodes': n_nodes
}

pyg_path = OUTPUT_PATH / 'graph_pyg_format.pkl'
with open(pyg_path, 'wb') as f:
    pickle.dump(pyg_data, f)

print(f"‚úÖ Format PyG: {pyg_path}")
print(f"   edge_index shape: {edge_index.shape}")
print(f"   edge_attr shape: {edge_attr.shape}")
print(f"   x shape: {embeddings.shape}")
print(f"   y shape: {communities.shape}")

## 📊 6. Visualisations Professionnelles

In [None]:
# Draws the first six panels of the summary figure on a 3x3 grid. Reads
# degree_values, edge_weights, community_sizes and the scalar statistics
# (n_nodes, n_edges, density, n_communities, modularity, avg_clustering)
# computed in earlier cells.
print("\n" + "="*70)
print("üìä VISUALISATIONS")
print("="*70)

# Create the figure and its 3x3 layout
fig = plt.figure(figsize=(20, 12))
gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

# 1. Degree distribution (dashed red line marks the mean degree)
ax1 = fig.add_subplot(gs[0, 0])
ax1.hist(degree_values, bins=50, edgecolor='black', alpha=0.7, color='steelblue')
ax1.set_xlabel('Degr√©', fontsize=11)
ax1.set_ylabel('Nombre de n≈ìuds', fontsize=11)
ax1.set_title('Distribution des Degr√©s', fontsize=12, fontweight='bold')
ax1.axvline(np.mean(degree_values), color='red', linestyle='--', 
            label=f'Moyenne: {np.mean(degree_values):.1f}')
ax1.legend()
ax1.grid(True, alpha=0.3)

# 2. Edge-weight distribution (dashed red line marks SIMILARITY_THRESHOLD)
ax2 = fig.add_subplot(gs[0, 1])
ax2.hist(edge_weights, bins=50, edgecolor='black', alpha=0.7, color='coral')
ax2.set_xlabel('Poids de similarit√©', fontsize=11)
ax2.set_ylabel('Nombre d\'ar√™tes', fontsize=11)
ax2.set_title('Distribution des Poids d\'Ar√™tes', fontsize=12, fontweight='bold')
ax2.axvline(SIMILARITY_THRESHOLD, color='red', linestyle='--',
            label=f'Seuil: {SIMILARITY_THRESHOLD}')
ax2.legend()
ax2.grid(True, alpha=0.3)

# 3. Sizes of the 15 largest communities (largest at the top)
ax3 = fig.add_subplot(gs[0, 2])
top_comms = community_sizes.head(15)
ax3.barh(range(len(top_comms)), top_comms.values, color='mediumseagreen', alpha=0.8)
ax3.set_yticks(range(len(top_comms)))
ax3.set_yticklabels([f'Comm {i}' for i in top_comms.index])
ax3.set_xlabel('Nombre d\'articles', fontsize=11)
ax3.set_title('Top 15 Communaut√©s', fontsize=12, fontweight='bold')
ax3.invert_yaxis()
ax3.grid(True, alpha=0.3, axis='x')

# 4. Degree distribution with a logarithmic count axis
ax4 = fig.add_subplot(gs[1, 0])
ax4.hist(degree_values, bins=50, edgecolor='black', alpha=0.7, color='purple')
ax4.set_xlabel('Degr√©', fontsize=11)
ax4.set_ylabel('Nombre de n≈ìuds (log)', fontsize=11)
ax4.set_yscale('log')
ax4.set_title('Distribution des Degr√©s (√©chelle log)', fontsize=12, fontweight='bold')
ax4.grid(True, alpha=0.3)

# 5. Global statistics rendered as a text box (axes hidden)
ax5 = fig.add_subplot(gs[1, 1])
ax5.axis('off')
stats_text = f"""
STATISTIQUES DU GRAPHE
{'='*40}

üìä Structure:
   ‚Ä¢ N≈ìuds: {n_nodes:,}
   ‚Ä¢ Ar√™tes: {n_edges:,}
   ‚Ä¢ Densit√©: {density:.6f}
   ‚Ä¢ Degr√© moyen: {np.mean(degree_values):.2f}

üß© Communaut√©s:
   ‚Ä¢ Nombre: {n_communities}
   ‚Ä¢ Taille moyenne: {community_sizes.mean():.1f}
   ‚Ä¢ Modularit√©: {modularity:.4f}

üî∫ Clustering:
   ‚Ä¢ Coefficient: {avg_clustering:.4f}

‚öñÔ∏è  Poids des ar√™tes:
   ‚Ä¢ Moyenne: {np.mean(edge_weights):.4f}
   ‚Ä¢ M√©diane: {np.median(edge_weights):.4f}
"""
ax5.text(0.1, 0.5, stats_text, fontsize=11, verticalalignment='center',
         family='monospace', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.3))

# 6. Empirical cumulative distribution of node degrees
ax6 = fig.add_subplot(gs[1, 2])
sorted_degrees = np.sort(degree_values)
# Fraction of nodes whose degree is <= the value at the same position
cumulative = np.arange(1, len(sorted_degrees) + 1) / len(sorted_degrees)
ax6.plot(sorted_degrees, cumulative, linewidth=2, color='darkblue')
ax6.set_xlabel('Degr√©', fontsize=11)
ax6.set_ylabel('Proportion cumulative', fontsize=11)
ax6.set_title('Distribution Cumulative des Degr√©s', fontsize=12, fontweight='bold')
ax6.grid(True, alpha=0.3)
ax6.set_xlim(left=0)

# 7. Heatmap of per-community metrics (20 largest communities)
#
# For each community: node count, number of internal edges, density of the
# induced subgraph and mean internal degree. Colours show each metric min-max
# normalized across the displayed communities; cell labels show the raw values.
#
# Changes from the previous version:
#   * labels use a per-metric format (the single '.0f' format printed every
#     density as "0" and rounded the mean degree to an integer);
#   * a metric that is constant across communities no longer divides by zero
#     during normalization;
#   * community membership is collected in one pass over the nodes.
ax7 = fig.add_subplot(gs[2, :])
top_20_comms = community_sizes.head(20).index

# Group node ids by community in a single scan (only the displayed communities)
members_by_comm = {comm_id: [] for comm_id in top_20_comms}
for node in G.nodes():
    member_bucket = members_by_comm.get(communities[node])
    if member_bucket is not None:
        member_bucket.append(node)

comm_metrics = []
for comm_id in top_20_comms:
    nodes_in_comm = members_by_comm[comm_id]
    subgraph = G.subgraph(nodes_in_comm)
    comm_metrics.append([
        len(nodes_in_comm),
        subgraph.number_of_edges(),
        nx.density(subgraph) if len(nodes_in_comm) > 1 else 0,
        np.mean([d for _, d in subgraph.degree()]) if len(nodes_in_comm) > 0 else 0
    ])

comm_metrics_df = pd.DataFrame(comm_metrics, 
                               columns=['Taille', 'Ar√™tes', 'Densit√©', 'Degr√© moy'],
                               index=[f'C{i}' for i in top_20_comms])

# Min-max normalize each metric for the colour scale; a zero range is replaced
# by 1 so a constant column maps to 0 instead of NaN
metric_min = comm_metrics_df.min()
metric_range = (comm_metrics_df.max() - metric_min).replace(0, 1)
comm_metrics_norm = (comm_metrics_df - metric_min) / metric_range

# Pre-formatted labels, one format per metric row (same order as the columns):
# counts as integers, density with 4 decimals, mean degree with 1 decimal
annot_formats = ['{:.0f}', '{:.0f}', '{:.4f}', '{:.1f}']
annot_labels = np.array([
    [metric_format.format(value) for value in comm_metrics_df[column]]
    for metric_format, column in zip(annot_formats, comm_metrics_df.columns)
])

# fmt='' because the annotations are already strings
sns.heatmap(comm_metrics_norm.T, annot=annot_labels, fmt='', 
            cmap='YlOrRd', cbar_kws={'label': 'Valeur normalis√©e'},
            linewidths=0.5, ax=ax7)
ax7.set_title('M√©triques des Top 20 Communaut√©s', fontsize=12, fontweight='bold')
ax7.set_xlabel('Communaut√©', fontsize=11)
ax7.set_ylabel('M√©trique', fontsize=11)

# Title the current figure, write it to OUTPUT_PATH as a 300-dpi PNG, then
# display it.
plt.suptitle('Analyse Compl√®te du Graphe d\'Articles Scientifiques', 
             fontsize=16, fontweight='bold', y=0.995)

# Save before plt.show()
viz_path = OUTPUT_PATH / 'graph_analysis.png'
plt.savefig(viz_path, dpi=300, bbox_inches='tight')
print(f"\n‚úÖ Visualisation sauvegard√©e: {viz_path}")
plt.show()

print("\n‚úÖ Visualisations termin√©es!")

# BONUS: 2-D UMAP projection of the embeddings, coloured by community.
# Skipped with a hint when the optional umap-learn package is not installed.
#
# The sampled row positions are stored in umap_sample_idx; the previous name,
# indices, overwrote the k-NN neighbour matrix that the edge-building cells
# read, so re-running those cells after this one used the wrong array.
try:
    from umap import UMAP
    
    print("\nüó∫Ô∏è  G√©n√©ration de la visualisation UMAP...")
    
    # Project at most 3000 points; sample with a fixed seed when there are more
    sample_size = min(3000, len(embeddings))
    if len(embeddings) > sample_size:
        umap_sample_idx = np.random.RandomState(42).choice(len(embeddings), sample_size, replace=False)
        emb_sample = embeddings[umap_sample_idx]
        comm_sample = communities[umap_sample_idx]
    else:
        emb_sample = embeddings
        comm_sample = communities
    
    # UMAP
    reducer = UMAP(n_components=2, random_state=42, n_neighbors=15, min_dist=0.1)
    embedding_2d = reducer.fit_transform(emb_sample)
    
    # Scatter plot, one colour per community id.
    # NOTE(review): 'tab20' has 20 distinct colours, so with more communities
    # several ids share a colour — confirm this is acceptable.
    plt.figure(figsize=(14, 10))
    scatter = plt.scatter(embedding_2d[:, 0], embedding_2d[:, 1], 
                         c=comm_sample, s=15, alpha=0.6, cmap='tab20')
    plt.colorbar(scatter, label='ID Communaut√©')
    plt.title(f'Projection UMAP des Embeddings (color√©s par communaut√©)\n{sample_size} articles √©chantillonn√©s',
             fontsize=14, fontweight='bold')
    plt.xlabel('UMAP 1', fontsize=12)
    plt.ylabel('UMAP 2', fontsize=12)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    
    umap_path = OUTPUT_PATH / 'embeddings_umap_communities.png'
    plt.savefig(umap_path, dpi=300, bbox_inches='tight')
    print(f"‚úÖ UMAP sauvegard√©: {umap_path}")
    plt.show()
    
except ImportError:
    print("‚ö†Ô∏è  UMAP non install√© (pip install umap-learn)")

# Closing banner: lists the output files and repeats the headline numbers
# (n_nodes, n_edges, n_communities, modularity) computed in earlier cells.
print("\n" + "="*70)
print("‚úÖ CONSTRUCTION DU GRAPHE TERMIN√âE!")
print("="*70)

# NOTE(review): this list is static text; item 6 is printed even when the UMAP
# step was skipped because umap-learn is not installed.
print(f"\nüì¶ Fichiers g√©n√©r√©s dans {OUTPUT_PATH}:")
print(f"   1. article_graph.gpickle - Graphe NetworkX complet")
print(f"   2. articles_with_communities.csv - Articles + communaut√©s")
print(f"   3. graph_metadata.json - M√©tadonn√©es et statistiques")
print(f"   4. graph_pyg_format.pkl - Format PyTorch Geometric")
print(f"   5. graph_analysis.png - Visualisations compl√®tes")
print(f"   6. embeddings_umap_communities.png - Projection UMAP")

# Headline figures
print(f"\nüìä R√©sum√© final:")
print(f"   ‚Ä¢ {n_nodes:,} articles dans le graphe")
print(f"   ‚Ä¢ {n_edges:,} connexions s√©mantiques")
print(f"   ‚Ä¢ {n_communities} communaut√©s d√©tect√©es")
print(f"   ‚Ä¢ Modularit√©: {modularity:.4f}")
print(f"   ‚Ä¢ Pr√™t pour GNN et analyses avanc√©es!")


   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 🔥 7. Exemple d'Utilisation avec PyTorch Geometric\n",
    "\n",
    "Code pour charger et utiliser le graphe avec des GNN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Exemple de chargement pour PyTorch Geometric\n",
    "\n",
    "print(\"\\n\" + \"=\"*70)\n",
    "print(\"üî• EXEMPLE PYTORCH GEOMETRIC\")\n",
    "print(\"=\"*70)\n",
    "\n",
    "# Code d'exemple (n√©cessite: pip install torch-geometric)\n",
    "example_code = \"\"\"\n",
    "import torch\n",
    "from torch_geometric.data import Data\n",
    "import pickle\n",
    "\n",
    "# 1. Charger les donn√©es\n",
    "with open('graph_outputs/graph_pyg_format.pkl', 'rb') as f:\n",
    "    pyg_data = pickle.load(f)\n",
    "\n",
    "# 2. Cr√©er l'objet Data PyG\n",
    "data = Data(\n",
    "    x=torch.FloatTensor(pyg_data['x']),           # Node features (embeddings)\n",
    "    edge_index=torch.LongTensor(pyg_data['edge_index']),  # Edges\n",
    "    edge_attr=torch.FloatTensor(pyg_data['edge_attr']).unsqueeze(1),  # Edge weights\n",
    "    y=torch.LongTensor(pyg_data['y'])             # Labels (communities)\n",
    ")\n",
    "\n",
    "print(f\"Graph loaded: {data}\")\n",
    "print(f\"  - Nodes: {data.num_nodes}\")\n",
    "print(f\"  - Edges: {data.num_edges}\")\n",
    "print(f\"  - Features: {data.num_node_features}\")\n",
    "print(f\"  - Classes: {data.y.unique().numel()}\")\n",
    "\n",
    "# 3. Exemple de GNN simple\n",
    "from torch_geometric.nn import GCNConv, global_mean_pool\n",
    "import torch.nn.functional as F\n",
    "\n",
    "class ArticleGNN(torch.nn.Module):\n",
    "    def __init__(self, in_channels, hidden_channels, num_classes):\n",
    "        super().__init__()\n",
    "        self.conv1 = GCNConv(in_channels, hidden_channels)\n",
    "        self.conv2 = GCNConv(hidden_channels, hidden_channels)\n",
    "        self.conv3 = GCNConv(hidden_channels, num_classes)\n",
    "    \n",
    "    def forward(self, x, edge_index, edge_weight=None):\n",
    "        x = self.conv1(x, edge_index, edge_weight)\n",
    "        x = F.relu(x)\n",
    "        x = F.dropout(x, p=0.5, training=self.training)\n",
    "        \n",
    "        x = self.conv2(x, edge_index, edge_weight)\n",
    "        x = F.relu(x)\n",
    "        x = F.dropout(x, p=0.5, training=self.training)\n",
    "        \n",
    "        x = self.conv3(x, edge_index, edge_weight)\n",
    "        return F.log_softmax(x, dim=1)\n",
    "\n",
    "# 4. Initialiser le mod√®le\n",
    "model = ArticleGNN(\n",
    "    in_channels=data.num_node_features,\n",
    "    hidden_channels=64,\n",
    "    num_classes=data.y.unique().numel()\n",
    ")\n",
    "\n",
    "print(f\"\\nMod√®le cr√©√©: {model}\")\n",
    "print(f\"Nombre de param√®tres: {sum(p.numel() for p in model.parameters())}\")\n",
    "\n",
    "# 5. Training loop (exemple simplifi√©)\n",
    "# optimizer = torch.optim.Adam(model.parameters(), lr=0.01)\n",
    "# criterion = torch.nn.NLLLoss()\n",
    "# ...\n",
    "\"\"\"\n",
    "\n",
    "print(\"\\nüìù Code d'exemple pour GNN:\")\n",
    "print(example_code)\n",
    "\n",
    "print(\"\\nüí° Pour utiliser ce code:\")\n",
    "print(\"   1. pip install torch torch-geometric\")\n",
    "print(\"   2. Chargez graph_pyg_format.pkl\")\n",
    "print(\"   3. Adaptez l'architecture GNN √† vos besoins\")\n",
    "print(\"   4. Entra√Ænez sur les communaut√©s ou d'autres t√¢ches\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}