# Analyse du graphe de connaissances

Ce notebook analyse le graphe de connaissances construit à partir des documents.

In [None]:
import sys
sys.path.append('..')

import json
from pathlib import Path
from src.graph.graph_queries import GraphQueries
from dotenv import load_dotenv
import os
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
from collections import Counter

## 1. Connexion au graphe Neo4j

In [None]:
# Load settings from a .env file (if any) into the process environment.
load_dotenv()

# Read the three Neo4j settings up front; os.getenv yields None for any
# variable that is not set.
neo4j_uri = os.getenv("NEO4J_URI")
neo4j_user = os.getenv("NEO4J_USER")
neo4j_password = os.getenv("NEO4J_PASSWORD")

# Query helper shared by every later cell of the notebook.
graph_queries = GraphQueries(uri=neo4j_uri, user=neo4j_user, password=neo4j_password)

# NOTE(review): printed unconditionally -- whether the constructor really
# opens/validates a connection is not visible from this notebook.
print("Connexion établie au graphe Neo4j")

## 2. Statistiques du graphe

In [None]:
# Count entities for each entity type of interest.
entity_types = ['PERSON', 'ORG', 'GPE', 'DATE', 'EVENT', 'PRODUCT']

# One query per type; the count is the length of the returned list.
# NOTE(review): each query passes limit=1000, so a count presumably
# saturates at 1000 for larger types -- confirm against GraphQueries.
type_counts = {
    label: len(graph_queries.search_entities_by_type(label, limit=1000))
    for label in entity_types
}

# Report the types from most to least frequent.
print("Nombre d'entités par type:")
ranked_counts = sorted(type_counts.items(), key=lambda pair: pair[1], reverse=True)
for label, count in ranked_counts:
    print(f"  {label}: {count}")

# Grand total over the types listed above.
total_entities = sum(type_counts.values())
print(f"\nTotal d'entités: {total_entities}")

## 3. Analyse des relations

In [None]:
# Collect well-connected entities among a sample of PERSON / ORG / GPE nodes.
important_entities = []

for etype in ['PERSON', 'ORG', 'GPE']:
    # Only the first 20 entities returned for each type are inspected.
    for candidate in graph_queries.search_entities_by_type(etype, limit=20):
        entity_name = candidate['name']
        neighbors = graph_queries.get_neighbors(entity_name, max_depth=1)
        degree = len(neighbors)

        # Keep only entities with strictly more than 5 neighbours.
        if degree <= 5:
            continue

        important_entities.append({
            'name': entity_name,
            'type': etype,
            'connections': degree,
            'neighbors': neighbors
        })

# Most connected first.
important_entities.sort(key=lambda item: item['connections'], reverse=True)

# Show the top ten.
print("Entités les plus connectées:")
for rank, item in enumerate(important_entities[:10], start=1):
    print(f"  {rank}. {item['name']} ({item['type']}): {item['connections']} connexions")

## 4. Visualisation du graphe

In [None]:
# Build and draw a small sub-graph around the most connected entities.
import networkx as nx

# Hubs: the five most connected entities found in the previous cell.
top_entities = important_entities[:5]
G = nx.Graph()

for entity in top_entities:
    # A hub may already exist as a neighbour of an earlier hub; add_node then
    # simply updates its attributes with the hub's own type and size.
    G.add_node(entity['name'], type=entity['type'], size=entity['connections'])

    # Attach the first three neighbours of each hub.
    for neighbor in entity['neighbors'][:3]:
        neighbor_name = neighbor['entity']['name']
        neighbor_type = neighbor['entity'].get('type', 'unknown')

        # add_node on an existing node overwrites the attributes passed in,
        # so only set the type when the node is new or still untyped.
        # Otherwise a hub reached again as a neighbour could have its type
        # (and therefore its colour) replaced by 'unknown'.
        if neighbor_name in G:
            current_type = G.nodes[neighbor_name].get('type', 'unknown')
        else:
            current_type = 'unknown'
        if current_type == 'unknown':
            G.add_node(neighbor_name, type=neighbor_type)

        G.add_edge(entity['name'], neighbor_name)

# Draw the sub-graph.
plt.figure(figsize=(12, 8))

# Node positions from a force-directed layout.
pos = nx.spring_layout(G, k=2, iterations=50)

# One colour per entity type; grey for anything unrecognised.
color_map = {
    'PERSON': '#FF6B6B',
    'ORG': '#4ECDC4',
    'GPE': '#45B7D1',
    'DATE': '#FFA07A',
    'EVENT': '#98D8C8',
    'PRODUCT': '#F7DC6F',
    'unknown': '#95A5A6'
}

# Hubs carry a 'size' attribute (their connection count) and are scaled by
# 10; plain neighbours have none and get a fixed size of 300.
node_colors = [
    color_map.get(attrs.get('type', 'unknown'), color_map['unknown'])
    for _, attrs in G.nodes(data=True)
]
node_sizes = [
    attrs['size'] * 10 if 'size' in attrs else 300
    for _, attrs in G.nodes(data=True)
]

# Nodes, edges, then labels.
nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=node_sizes, alpha=0.8)
nx.draw_networkx_edges(G, pos, alpha=0.5, width=1)
nx.draw_networkx_labels(G, pos, font_size=10, font_weight='bold')

plt.title("Sous-graphe des entités les plus connectées", fontsize=16)
plt.axis('off')
plt.tight_layout()
plt.show()

## 5. Analyse des chemins

In [None]:
# Look for a path between the two most connected entities (needs at least
# two entries in top_entities, which comes from the visualisation cell).
if len(top_entities) >= 2:
    entity1, entity2 = top_entities[0]['name'], top_entities[1]['name']

    print(f"Recherche du chemin entre {entity1} et {entity2}...")
    path = graph_queries.find_path(entity1, entity2)

    if not path:
        print("Aucun chemin direct trouvé.")
    else:
        print("\nChemin trouvé:")
        nodes, relationships = path['nodes'], path['relationships']

        # Print each node, followed by the relationship at the same
        # position when there is one.
        for step, node in enumerate(nodes, start=1):
            print(f"  {step}. {node['name']} ({node.get('type', 'unknown')})")
            if step <= len(relationships):
                rel = relationships[step - 1]
                print(f"     → [{rel.get('type', 'relates_to')}] →")

## 6. Métriques du graphe

In [None]:
# Basic degree metrics.
print("Métriques du graphe:")

# Sample: connection counts of the first 20 entries of important_entities.
# These were pre-filtered to more than 5 connections, so the figures describe
# that subset rather than the whole graph.
degrees = [item['connections'] for item in important_entities[:20]]

# Skip everything when the sample is empty (avoids dividing by zero).
if degrees:
    avg_degree = sum(degrees) / len(degrees)
    max_degree = max(degrees)
    min_degree = min(degrees)

    print(f"  Degré moyen: {avg_degree:.2f}")
    print(f"  Degré maximum: {max_degree}")
    print(f"  Degré minimum: {min_degree}")

    # How many sampled entities share each degree, in ascending degree order.
    degree_counts = Counter(degrees)
    print("\nDistribution des degrés:")
    for degree in sorted(degree_counts):
        print(f"  Degré {degree}: {degree_counts[degree]} entités")

## 7. Sauvegarde des analyses

In [None]:
# Gather the results of the previous cells into one JSON-ready structure.
analysis_data = {
    'entity_type_distribution': type_counts,
    'important_entities': important_entities[:20],
    'graph_metrics': {
        'total_entities': total_entities,
        # avg_degree / max_degree are only defined when the metrics cell
        # found at least one sampled entity; fall back to 0 otherwise.
        'avg_degree': avg_degree if 'avg_degree' in locals() else 0,
        'max_degree': max_degree if 'max_degree' in locals() else 0
    }
}

# Write the report, creating the output directory if needed.
Path("../data/analysis").mkdir(parents=True, exist_ok=True)
# ensure_ascii=False emits non-ASCII characters verbatim, so the file is
# opened explicitly as UTF-8 instead of relying on the platform's locale
# encoding (which can fail on accented names or yield a non-UTF-8 file).
with open("../data/analysis/graph_analysis.json", 'w', encoding='utf-8') as f:
    json.dump(analysis_data, f, ensure_ascii=False, indent=2)

print("\nAnalyse sauvegardée dans ../data/analysis/graph_analysis.json")

# Release the graph connection now that the analysis is complete.
graph_queries.close()
print("Connexion fermée.")