# Construction du graphe d'articles et communautés

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import networkx as nx
from sklearn.neighbors import NearestNeighbors
from community import community_louvain
import matplotlib.pyplot as plt
# --- Pipeline configuration -------------------------------------------------
# Directory holding the preprocessed artefacts read and written by this notebook.
BASE_PATH = Path("../data/processed")
# Embedding matrix, loaded with np.load.
EMB_PATH = BASE_PATH.joinpath("embeddings.npy")
# Article tables: DF1 is tried first, DF2 is the fallback.
DF1 = BASE_PATH.joinpath("articles_for_embeddings.csv")
DF2 = BASE_PATH.joinpath("cleaned_articles.csv")

# Number of neighbours requested per article when building the k-NN graph.
K = 10
# Minimum similarity a neighbour pair needs to become an edge.
SIM_THRESHOLD = 0.2
# True: cosine similarity; False: similarity derived from Euclidean distance.
USE_COSINE = True
# Optional cap on the number of articles; None (or 0) disables subsampling.
SAMPLE_SIZE = None


In [2]:
print("üìÇ Chargement des embeddings et des articles...")
if not EMB_PATH.exists():
    raise FileNotFoundError(str(EMB_PATH))
emb = np.load(EMB_PATH)
if DF1.exists():
    df = pd.read_csv(DF1)
elif DF2.exists():
    df = pd.read_csv(DF2)
else:
    raise FileNotFoundError("Aucun CSV d'articles trouv√©")
n = min(len(df), emb.shape[0])
df = df.iloc[:n].reset_index(drop=True)
emb = emb[:n]
if SAMPLE_SIZE and n > SAMPLE_SIZE:
    df = df.sample(n=SAMPLE_SIZE, random_state=42).reset_index(drop=True)
    emb = emb[df.index.values]
print(f"‚úÖ {len(df):,} articles, dim={emb.shape[1]}")
df[['title']].head(2)


üìÇ Chargement des embeddings et des articles...


  df = pd.read_csv(DF1)


‚úÖ 772,650 articles, dim=384


Unnamed: 0,title
0,Clinical features of culture-proven Mycoplasma...
1,Nitric oxide: a pro-inflammatory mediator in l...


In [3]:
print("üîó Construction k-NN...")
if USE_COSINE:
    norms = np.linalg.norm(emb, axis=1, keepdims=True)
    norms[norms==0] = 1.0
    emb_n = emb / norms
    nn = NearestNeighbors(n_neighbors=K+1, metric="cosine")
    nn.fit(emb_n)
    dists, indices = nn.kneighbors(emb_n, return_distance=True)
    sims = 1.0 - dists
else:
    nn = NearestNeighbors(n_neighbors=K+1, metric="euclidean")
    nn.fit(emb)
    dists, indices = nn.kneighbors(emb, return_distance=True)
    sims = 1.0 / (1.0 + dists)
edges = []
for i in range(indices.shape[0]):
    for j in range(1, indices.shape[1]):
        nbr = int(indices[i, j])
        w = float(sims[i, j])
        if w >= SIM_THRESHOLD:
            edges.append((i, nbr, w))
len(edges)


üîó Construction k-NN...


KeyboardInterrupt: 

In [None]:
print("üß± Cr√©ation du graphe NetworkX...")
G = nx.Graph()
G.add_nodes_from(range(len(df)))
for i, row in df.iterrows():
    G.nodes[i]['title'] = row.get('title','')
for i, j, w in edges:
    if i != j:
        if G.has_edge(i, j):
            if w > G[i][j].get('weight', 0.0):
                G[i][j]['weight'] = w
        else:
            G.add_edge(i, j, weight=w)
G.number_of_nodes(), G.number_of_edges()


In [None]:
print("üß© D√©tection des communaut√©s (Louvain)...")
partition = community_louvain.best_partition(G, weight='weight', random_state=42)
comm = pd.Series(partition).sort_index().values
df_out = df.copy()
df_out['community'] = comm
df_out.head(3)


In [None]:
# Persist the graph (pickle) and the article table with community labels (CSV).
import pickle

graph_path = BASE_PATH / 'article_graph.gpickle'
csv_path = BASE_PATH / 'articles_with_communities.csv'
# nx.write_gpickle was removed in NetworkX 3.0. Pickling the graph directly
# writes the same kind of file and works on both old and new versions.
with open(graph_path, 'wb') as fh:
    pickle.dump(G, fh, protocol=pickle.HIGHEST_PROTOCOL)
df_out.to_csv(csv_path, index=False)
print(f"üíæ Graphe: {graph_path}")
print(f"üíæ Communaut√©s: {csv_path}")


In [None]:
print("üìà Statistiques")
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges()
deg = [d for _, d in G.degree()]
n_comm = int(df_out['community'].nunique())
sizes = df_out['community'].value_counts().head(10)
print(f"Noeuds: {n_nodes:,}, Ar√™tes: {n_edges:,}, Communaut√©s: {n_comm}")
print("Top tailles de communaut√©s:")
print(sizes.to_string())
plt.figure(figsize=(10,4))
plt.hist(deg, bins=50, edgecolor='black')
plt.title('Distribution des degr√©s')
plt.xlabel('Degr√©')
plt.ylabel("Nombre d'articles")
plt.tight_layout()
plt.show()


In [None]:
# 2-D UMAP projection of a random sample of embeddings, coloured by community.
from umap import UMAP
print("üó∫Ô∏è Visualisation UMAP par communaut√© (√©chantillon)")
# Sample at most 2000 rows, without replacement, with a fixed seed.
n = min(2000, emb.shape[0])
rng = np.random.RandomState(42)
ix = rng.choice(emb.shape[0], n, replace=False)
emb_s = emb[ix]
# Community labels of the sampled rows, selected by position like `emb_s`.
lab_s = df_out['community'].values[ix]
um = UMAP(n_components=2, random_state=42)
xy = um.fit_transform(emb_s)

fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(xy[:, 0], xy[:, 1], c=lab_s, s=4, cmap='tab20', alpha=0.7)
ax.set_title('UMAP des embeddings color√©s par communaut√©')
fig.tight_layout()
plt.show()
