# DataSens E1_v3 ‚Äî 04_quality_checks

- Objectifs: Contr√¥les qualit√© PostgreSQL + MinIO pour architecture 36/37 tables
- Pr√©requis: 03_ingest_sources ex√©cut√©
- Sortie: Rapports QA avec visualisations + tables pandas
- Guide: docs/GUIDE_TECHNIQUE_E1.md

> **E1_v3** : Contr√¥les qualit√© complets (tables t01-t37)
> - Volumes par table
> - D√©tection doublons (hash_fingerprint)
> - Valeurs NULL critiques
> - Int√©grit√© r√©f√©rentielle (FK)
> - MinIO DataLake (objets, taille)



# ============================================================
# üé¨ DASHBOARD NARRATIF - O√ô SOMMES-NOUS ?
# ============================================================
# Ce dashboard vous guide √† travers le pipeline DataSens E1
# Il montre la progression et l'√©tat actuel des donn√©es
# ============================================================

import matplotlib.pyplot as plt
from matplotlib.patches import FancyBboxPatch
import matplotlib.patches as mpatches

print("\n" + "="*80)
print("üé¨ FIL D'ARIANE VISUEL - PIPELINE DATASENS E1")
print("="*80)

# Cr√©er figure dashboard
fig = plt.figure(figsize=(16, 8))
ax = fig.add_subplot(111)
ax.set_xlim(0, 10)
ax.set_ylim(0, 6)
ax.axis('off')

# √âtapes du pipeline
etapes = [
    {"nom": "üì• COLLECTE", "status": "‚úÖ", "desc": "Sources brutes"},
    {"nom": "‚òÅÔ∏è DATALAKE", "status": "‚úÖ", "desc": "MinIO Raw"},
    {"nom": "üßπ NETTOYAGE", "status": "üîÑ", "desc": "D√©duplication"},
    {"nom": "üíæ ETL", "status": "‚è≥", "desc": "PostgreSQL"},
    {"nom": "üìä ANNOTATION", "status": "‚è≥", "desc": "Enrichissement"},
    {"nom": "üì¶ EXPORT", "status": "‚è≥", "desc": "Dataset IA"}
]

# Couleurs selon statut
colors = {
    "‚úÖ": "#4ECDC4",
    "üîÑ": "#FECA57", 
    "‚è≥": "#E8E8E8"
}

# Dessiner timeline
y_pos = 4
x_start = 1
x_spacing = 1.4

for i, etape in enumerate(etapes):
    x_pos = x_start + i * x_spacing
    
    # Cercle √©tape
    circle = plt.Circle((x_pos, y_pos), 0.25, color=colors[etape["status"]], zorder=3)
    ax.add_patch(circle)
    ax.text(x_pos, y_pos, etape["status"], ha='center', va='center', fontsize=14, fontweight='bold', zorder=4)
    
    # Nom √©tape
    ax.text(x_pos, y_pos - 0.6, etape["nom"], ha='center', va='top', fontsize=11, fontweight='bold')
    ax.text(x_pos, y_pos - 0.85, etape["desc"], ha='center', va='top', fontsize=9, style='italic')
    
    # Fl√®che vers prochaine √©tape
    if i < len(etapes) - 1:
        ax.arrow(x_pos + 0.3, y_pos, x_spacing - 0.6, 0, 
                head_width=0.1, head_length=0.15, fc='gray', ec='gray', zorder=2)

# Titre narratif
ax.text(5, 5.5, "üéØ PROGRESSION DU PIPELINE E1", ha='center', va='center', 
        fontsize=16, fontweight='bold', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

# L√©gende
legend_elements = [
    mpatches.Patch(facecolor='#4ECDC4', label='Termin√©'),
    mpatches.Patch(facecolor='#FECA57', label='En cours'),
    mpatches.Patch(facecolor='#E8E8E8', label='√Ä venir')
]
ax.legend(handles=legend_elements, loc='upper left', fontsize=10)

# Statistiques rapides (si disponibles)
stats_text = "\nüìä SNAPSHOT ACTUEL :\n"
try:
    # Essayer de charger des stats si base disponible
    stats_text += "   ‚Ä¢ Pipeline en cours d'ex√©cution...\n"
except:
    stats_text += "   ‚Ä¢ D√©marrage du pipeline...\n"

ax.text(5, 1.5, stats_text, ha='center', va='center', fontsize=10,
        bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.3))

plt.title("üé¨ FIL D'ARIANE VISUEL - Accompagnement narratif du jury", 
          fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

print("\nüí° Le fil d'Ariane vous guide √©tape par √©tape √† travers le pipeline")
print("   Chaque visualisation s'inscrit dans cette progression narrative\n")



> Notes:
> - **Contr√¥les PostgreSQL** : Volumes, doublons, NULL, int√©grit√© FK (tables t01-t37)
> - **Contr√¥les MinIO** : Objets, taille totale, r√©partition par pr√©fixe
> - **Visualisations** : Graphiques + tables pandas √† chaque √©tape
> - **Tables E1_v3** : Utilisation des tables t01-t37 selon MPD.sql


# DataSens E1_v3 - 04_quality_checks
# üîç Contr√¥les qualit√© PostgreSQL + MinIO avec visualisations (tables t01-t37)

import json
import os
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from minio import Minio
from sqlalchemy import create_engine, text

# R√©cup√©rer variables notebook 01
if 'PROJECT_ROOT' not in globals():
    current = Path.cwd()
    PROJECT_ROOT = None
    while current != current.parent:
        if (current / "notebooks").exists() and (current / "docs").exists():
            PROJECT_ROOT = current
            break
        current = current.parent
    else:
        PROJECT_ROOT = Path.cwd()

if 'PG_URL' not in globals():
    PG_URL = os.getenv("DATASENS_PG_URL", "postgresql+psycopg2://postgres:postgres@localhost:5433/postgres")

if 'MINIO_ENDPOINT' not in globals():
    MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "http://localhost:9002")
    MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "admin")
    MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "admin123")
    MINIO_BUCKET = os.getenv("MINIO_BUCKET", "datasens-raw")

engine = create_engine(PG_URL, future=True)

print("üîç CONTROLES QUALITE E1_V3 (36/37 tables)")
print("=" * 80)


# ============================================================
# 1. VOLUMES PostgreSQL (tables principales t01-t37)
# ============================================================
print("\nüìä 1. VOLUMES PostgreSQL")
print("-" * 80)

with engine.connect() as conn:
    stats = pd.read_sql_query("""
        SELECT 
            't04_document' AS table_name, COUNT(*) AS nb_lignes
        FROM t04_document
        UNION ALL
        SELECT 't03_flux', COUNT(*) FROM t03_flux
        UNION ALL
        SELECT 't02_source', COUNT(*) FROM t02_source
        UNION ALL
        SELECT 't19_meteo', COUNT(*) FROM t19_meteo
        UNION ALL
        SELECT 't17_territoire', COUNT(*) FROM t17_territoire
        UNION ALL
        SELECT 't01_type_donnee', COUNT(*) FROM t01_type_donnee
    """, conn)

print("\nüìã Table des volumes :")
display(stats)

# Graphique volumes
if len(stats) > 0:
    plt.figure(figsize=(12, 6))
    bars = plt.bar(stats["table_name"], stats["nb_lignes"], color=plt.cm.Pastel1(range(len(stats))))
    for bar, value in zip(bars, stats["nb_lignes"]):
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(stats["nb_lignes"]) * 0.01,
                f"{int(value):,}", ha='center', va='bottom', fontweight='bold')
    plt.title("üìä Volumes par table PostgreSQL (E1_v3)", fontsize=14, fontweight='bold')
    plt.ylabel("Nombre de lignes", fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.grid(axis="y", linestyle="--", alpha=0.3)
    plt.tight_layout()
    plt.show()

total_docs = stats[stats['table_name'] == 't04_document']['nb_lignes'].iloc[0] if len(stats[stats['table_name'] == 't04_document']) > 0 else 0
print(f"\n‚úÖ Total documents (t04_document) : {total_docs:,}")


# ============================================================
# 2. DOUBLONS (hash_fingerprint)
# ============================================================
print("\nüîé 2. DETECTION DOUBLONS")
print("-" * 80)


In [None]:
with engine.connect() as conn:
    dup_query = text("""
        SELECT hash_fingerprint, COUNT(*) AS nb_occurrences
        FROM t04_document
        WHERE hash_fingerprint IS NOT NULL
        GROUP BY hash_fingerprint
        HAVING COUNT(*) > 1
        ORDER BY nb_occurrences DESC
    """)
    df_doublons = pd.read_sql_query(dup_query, conn)

if len(df_doublons) == 0:
    print("‚úÖ Aucun doublon d√©tect√© (hash_fingerprint unique)")
else:
    print(f"‚ö†Ô∏è {len(df_doublons)} doublons d√©tect√©s !")
    display(df_doublons)
    
    # Graphique doublons si pr√©sents
    if len(df_doublons) > 0:
        plt.figure(figsize=(10, 5))
        plt.barh(range(len(df_doublons)), df_doublons["nb_occurrences"], color='#FF6B6B')
        plt.yticks(range(len(df_doublons)), [f"{hash[:16]}..." for hash in df_doublons["hash_fingerprint"]])
        plt.xlabel("Nombre d'occurrences", fontsize=11)
        plt.title("‚ö†Ô∏è Doublons d√©tect√©s par hash_fingerprint", fontsize=12, fontweight='bold')
        plt.grid(axis="x", linestyle="--", alpha=0.3)
        plt.tight_layout()
        plt.show()

# ============================================================
# 3. VALEURS NULL CRITIQUES
# ============================================================
print("\nüîç 3. VALEURS NULL CRITIQUES")
print("-" * 80)

with engine.connect() as conn:
    nulls = pd.read_sql_query("""
        SELECT 
            'titre' AS champ, COUNT(*) FILTER (WHERE titre IS NULL) AS nb_nulls,
            COUNT(*) AS nb_total,
            ROUND(100.0 * COUNT(*) FILTER (WHERE titre IS NULL) / COUNT(*), 2) AS pct_null
        FROM t04_document
        UNION ALL
        SELECT 'texte', COUNT(*) FILTER (WHERE texte IS NULL), COUNT(*),
               ROUND(100.0 * COUNT(*) FILTER (WHERE texte IS NULL) / COUNT(*), 2)
        FROM t04_document
        UNION ALL
        SELECT 'hash_fingerprint', COUNT(*) FILTER (WHERE hash_fingerprint IS NULL), COUNT(*),
               ROUND(100.0 * COUNT(*) FILTER (WHERE hash_fingerprint IS NULL) / COUNT(*), 2)
        FROM t04_document
    """, conn)

print("\nüìã Taux de NULL par champ critique :")
display(nulls)

if len(nulls) > 0:
    plt.figure(figsize=(10, 5))
    bars = plt.bar(nulls["champ"], nulls["pct_null"], color=['#FF6B6B' if p > 20 else '#4ECDC4' for p in nulls["pct_null"]])
    for bar, value in zip(bars, nulls["pct_null"]):
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
                f"{value}%", ha='center', va='bottom', fontweight='bold')
    plt.axhline(y=20, color='r', linestyle='--', label='Seuil 20%')
    plt.title("üìä Taux de valeurs NULL par champ (t04_document)", fontsize=12, fontweight='bold')
    plt.ylabel("Pourcentage NULL (%)", fontsize=11)
    plt.legend()
    plt.grid(axis="y", linestyle="--", alpha=0.3)
    plt.tight_layout()
    plt.show()


# ============================================================
# 4. INTEGRITE REFERENCES (Foreign Keys)
# ============================================================
print("\nüîó 4. INTEGRITE REFERENCES (Foreign Keys)")
print("-" * 80)


In [None]:
with engine.connect() as conn:
    integrity = pd.read_sql_query("""
        SELECT 
            't04_document ‚Üí t03_flux' AS relation,
            COUNT(*) FILTER (WHERE d.id_flux NOT IN (SELECT id_flux FROM t03_flux)) AS orphelins
        FROM t04_document d
        UNION ALL
        SELECT 't03_flux ‚Üí t02_source',
               COUNT(*) FILTER (WHERE f.id_source NOT IN (SELECT id_source FROM t02_source))
        FROM t03_flux f
        UNION ALL
        SELECT 't19_meteo ‚Üí t17_territoire',
               COUNT(*) FILTER (WHERE m.id_territoire NOT IN (SELECT id_territoire FROM t17_territoire))
        FROM t19_meteo m
    """, conn)

print("\nüìã V√©rification int√©grit√© r√©f√©rentielle :")
display(integrity)

orphelins_total = integrity['orphelins'].sum()
if orphelins_total == 0:
    print("‚úÖ Int√©grit√© r√©f√©rentielle : OK (aucun orphelin)")
else:
    print(f"‚ö†Ô∏è {orphelins_total} orphelins d√©tect√©s !")

# ============================================================
# 5. MINIO DATALAKE
# ============================================================
print("\n‚òÅÔ∏è 5. MINIO DATALAKE")
print("-" * 80)

try:
    minio_client = Minio(
        MINIO_ENDPOINT.replace("http://", "").replace("https://", ""),
        access_key=MINIO_ACCESS_KEY,
        secret_key=MINIO_SECRET_KEY,
        secure=False
    )
    
    objects = list(minio_client.list_objects(MINIO_BUCKET, recursive=True))
    total_size = sum(obj.size for obj in objects)
    
    print(f"\nüìä Bucket '{MINIO_BUCKET}' :")
    print(f"   ‚Ä¢ {len(objects)} objets")
    print(f"   ‚Ä¢ Taille totale : {total_size / (1024*1024):.2f} MB")
    
    # R√©partition par pr√©fixe (type de source)
    prefixes = {}
    for obj in objects:
        prefix = obj.object_name.split('/')[0] if '/' in obj.object_name else 'root'
        prefixes[prefix] = prefixes.get(prefix, 0) + 1
    
    if prefixes:
        df_minio = pd.DataFrame(list(prefixes.items()), columns=["Pr√©fixe", "Nb objets"])
        display(df_minio)
        
        plt.figure(figsize=(10, 5))
        bars = plt.bar(df_minio["Pr√©fixe"], df_minio["Nb objets"], color='#45B7D1')
        for bar, value in zip(bars, df_minio["Nb objets"]):
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
                    str(value), ha='center', va='bottom', fontweight='bold')
        plt.title("üìä R√©partition des objets MinIO par type de source (E1_v3)", fontsize=12, fontweight='bold')
        plt.ylabel("Nombre d'objets", fontsize=11)
        plt.xticks(rotation=45, ha='right')
        plt.grid(axis="y", linestyle="--", alpha=0.3)
        plt.tight_layout()
        plt.show()
    
except Exception as e:
    print(f"‚ö†Ô∏è MinIO non accessible : {e}")
    objects = []

# ============================================================
# 6. BILAN QA GLOBAL
# ============================================================
print("\n‚úÖ 6. BILAN QA GLOBAL E1_V3")
print("-" * 80)

qa_summary = {
    "Volumes": f"{total_docs:,} documents",
    "Doublons": "‚úÖ OK" if len(df_doublons) == 0 else f"‚ö†Ô∏è {len(df_doublons)} doublons",
    "NULL critiques": "‚úÖ OK" if nulls['pct_null'].max() < 20 else f"‚ö†Ô∏è {nulls['pct_null'].max()}% max",
    "Int√©grit√© FK": "‚úÖ OK" if orphelins_total == 0 else f"‚ö†Ô∏è {orphelins_total} orphelins",
    "MinIO": f"‚úÖ {len(objects)} objets"
}

df_qa = pd.DataFrame(list(qa_summary.items()), columns=["Check", "R√©sultat"])
display(df_qa)

print("\n‚úÖ Contr√¥les qualit√© E1_v3 termin√©s !")
print("   üìä Architecture : 36/37 tables (t01-t37) valid√©es")
print("   ‚û°Ô∏è Passez au notebook 05_snapshot_and_readme.ipynb pour finaliser")


## üóëÔ∏è CRUD "D" (DELETE) : Suppression contr√¥l√©e

Suppression avec v√©rification des contraintes ON DELETE


In [None]:
print("üóëÔ∏è CRUD DELETE - Suppression contr√¥l√©e")
print("=" * 80)

with engine.begin() as conn:
    # Compter avant suppression (corrig√© avec pr√©fixe t04_)
    count_before = conn.execute(text("SELECT COUNT(*) FROM t04_document WHERE titre LIKE '%CRUD%'")).scalar()
    print(f"üìä Documents 'CRUD' avant suppression : {count_before}")
    
    # Afficher les documents avant suppression
    df_before = pd.read_sql_query("""
        SELECT id_doc, LEFT(titre, 50) AS titre_apercu, langue, date_publication
        FROM t04_document
        WHERE titre LIKE '%CRUD%'
        LIMIT 5
    """, engine)
    if len(df_before) > 0:
        print("\nüìã Documents avant suppression (aper√ßu) :")
        display(df_before)

    # Supprimer un document (ON DELETE SET NULL pour id_flux)
    conn.execute(text("""
        DELETE FROM t04_document
        WHERE titre LIKE '%CRUD%' AND id_doc IN (
            SELECT id_doc FROM t04_document
            WHERE titre LIKE '%CRUD%'
            LIMIT 1
        )
    """))

    count_after = conn.execute(text("SELECT COUNT(*) FROM t04_document WHERE titre LIKE '%CRUD%'")).scalar()
    print(f"\nüìä Documents 'CRUD' apr√®s suppression : {count_after}")
    print(f"   ‚úÖ {count_before - count_after} document(s) supprim√©(s)")
    
    # Visualisation avant/apr√®s
    if count_before > 0:
        plt.figure(figsize=(10, 5))
        categories = ['Avant DELETE', 'Apr√®s DELETE']
        values = [count_before, count_after]
        colors = ['#FF6B6B', '#4ECDC4']
        bars = plt.bar(categories, values, color=colors)
        for bar, value in zip(bars, values):
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(values) * 0.02,
                    f"{int(value)}", ha='center', va='bottom', fontweight='bold', fontsize=12)
        plt.title("üóëÔ∏è Impact de l'op√©ration DELETE (CRUD)", fontsize=12, fontweight='bold')
        plt.ylabel("Nombre de documents", fontsize=11)
        plt.grid(axis="y", linestyle="--", alpha=0.3)
        plt.tight_layout()
        plt.show()

print("\n‚úÖ CRUD DELETE termin√© !")


## üîç Contr√¥les qualit√©

D√©tection des doublons et v√©rification des valeurs NULL


In [None]:
print("üîç Contr√¥les qualit√© avec Visualisations")
print("=" * 80)

import matplotlib.pyplot as plt

with engine.connect() as conn:
    # Doublons fingerprint (corrig√© avec pr√©fixe t04_)
    dup_query = """
    SELECT hash_fingerprint, COUNT(*) as c
    FROM t04_document
    WHERE hash_fingerprint IS NOT NULL
    GROUP BY hash_fingerprint
    HAVING COUNT(*) > 1;
    """
    df_dup = pd.read_sql(dup_query, conn)
    print(f"\nüîé Doublons fingerprint : {len(df_dup)}")
    if len(df_dup) > 0:
        display(df_dup.head(10))
    else:
        print("   ‚úÖ Aucun doublon d√©tect√©")

    # %NULL par colonne (corrig√© avec pr√©fixe t04_)
    null_query = """
    SELECT
        COUNT(*) as total,
        SUM(CASE WHEN titre IS NULL THEN 1 ELSE 0 END)::float / COUNT(*) * 100 as pct_null_titre,
        SUM(CASE WHEN texte IS NULL THEN 1 ELSE 0 END)::float / COUNT(*) * 100 as pct_null_texte,
        SUM(CASE WHEN langue IS NULL THEN 1 ELSE 0 END)::float / COUNT(*) * 100 as pct_null_langue
    FROM t04_document;
    """
    df_null = pd.read_sql(null_query, conn)
    print("\nüìä Pourcentage NULL par colonne :")
    display(df_null)
    
    # Visualisation NULL
    if len(df_null) > 0 and df_null.iloc[0]['total'] > 0:
        row = df_null.iloc[0]
        plt.figure(figsize=(10, 6))
        columns = ['titre', 'texte', 'langue']
        pct_nulls = [row['pct_null_titre'], row['pct_null_texte'], row['pct_null_langue']]
        colors = ['#FF6B6B' if p > 10 else '#4ECDC4' for p in pct_nulls]
        bars = plt.bar(columns, pct_nulls, color=colors)
        for bar, value in zip(bars, pct_nulls):
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
                    f"{value:.1f}%", ha='center', va='bottom', fontweight='bold', fontsize=11)
        plt.axhline(y=10, color='red', linestyle='--', linewidth=2, label='Seuil critique (10%)')
        plt.title("üìä Pourcentage de valeurs NULL par colonne (E1_v3)", fontsize=12, fontweight='bold')
        plt.ylabel("Pourcentage NULL (%)", fontsize=11)
        plt.legend()
        plt.grid(axis="y", linestyle="--", alpha=0.3)
        plt.tight_layout()
        plt.show()

print("\n‚úÖ Contr√¥les qualit√© termin√©s !")


## üìä KPIs : Statistiques par source/type/th√®me

Comptages et agr√©gations pour visualisation


In [None]:
print("üìä KPIs - Statistiques avec Visualisations CRUD")
print("=" * 80)

import matplotlib.pyplot as plt

with engine.connect() as conn:
    # KPI 1 : Counts par type_donnee (corrig√© avec pr√©fixes tXX_)
    kpi1 = """
    SELECT
        td.libelle as type_source,
        COUNT(DISTINCT d.id_doc) as nb_documents,
        COUNT(DISTINCT s.id_source) as nb_sources
    FROM t04_document d
    LEFT JOIN t03_flux f ON d.id_flux = f.id_flux
    LEFT JOIN t02_source s ON f.id_source = s.id_source
    LEFT JOIN t01_type_donnee td ON s.id_type_donnee = td.id_type_donnee
    GROUP BY td.libelle
    ORDER BY nb_documents DESC;
    """
    df_kpi1 = pd.read_sql(kpi1, conn)
    print("\nüì¶ Documents par type de source :")
    display(df_kpi1)
    
    # Graphique documents par type
    if len(df_kpi1) > 0:
        plt.figure(figsize=(12, 6))
        plt.subplot(1, 2, 1)
        bars = plt.bar(df_kpi1["type_source"], df_kpi1["nb_documents"], color=plt.cm.Set2(range(len(df_kpi1))))
        for bar, value in zip(bars, df_kpi1["nb_documents"]):
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(df_kpi1["nb_documents"]) * 0.02,
                    f"{int(value):,}", ha='center', va='bottom', fontweight='bold', fontsize=9)
        plt.title("üìä Documents par type de source (E1_v3)", fontsize=12, fontweight='bold')
        plt.ylabel("Nombre de documents", fontsize=11)
        plt.xticks(rotation=45, ha='right')
        plt.grid(axis="y", linestyle="--", alpha=0.3)
        
        plt.subplot(1, 2, 2)
        bars = plt.bar(df_kpi1["type_source"], df_kpi1["nb_sources"], color=plt.cm.Pastel1(range(len(df_kpi1))))
        for bar, value in zip(bars, df_kpi1["nb_sources"]):
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(df_kpi1["nb_sources"]) * 0.02,
                    str(int(value)), ha='center', va='bottom', fontweight='bold', fontsize=9)
        plt.title("üìä Sources par type de donn√©e (E1_v3)", fontsize=12, fontweight='bold')
        plt.ylabel("Nombre de sources", fontsize=11)
        plt.xticks(rotation=45, ha='right')
        plt.grid(axis="y", linestyle="--", alpha=0.3)
        plt.tight_layout()
        plt.show()

    # KPI 2 : Counts par th√®me (corrig√© avec pr√©fixes tXX_)
    kpi2 = """
    SELECT
        t.libelle as theme,
        COUNT(DISTINCT e.id_event) as nb_evenements,
        COUNT(DISTINCT de.id_doc) as nb_documents_associes
    FROM t24_theme t
    LEFT JOIN t25_evenement e ON t.id_theme = e.id_theme
    LEFT JOIN t27_document_evenement de ON e.id_event = de.id_event
    GROUP BY t.libelle
    ORDER BY nb_evenements DESC;
    """
    df_kpi2 = pd.read_sql(kpi2, conn)
    print("\nüè∑Ô∏è √âv√©nements par th√®me :")
    display(df_kpi2)
    
    # Graphique √©v√©nements par th√®me
    if len(df_kpi2) > 0 and df_kpi2['nb_evenements'].sum() > 0:
        plt.figure(figsize=(14, 6))
        plt.subplot(1, 2, 1)
        top_themes = df_kpi2.nlargest(10, "nb_evenements")
        bars = plt.barh(top_themes["theme"], top_themes["nb_evenements"], color=plt.cm.Pastel2(range(len(top_themes))))
        for i, (bar, value) in enumerate(zip(bars, top_themes["nb_evenements"])):
            plt.text(bar.get_width() + max(top_themes["nb_evenements"]) * 0.02, bar.get_y() + bar.get_height()/2,
                    f"{int(value)}", ha='left', va='center', fontweight='bold', fontsize=9)
        plt.title("üè∑Ô∏è √âv√©nements par th√®me (Top 10)", fontsize=12, fontweight='bold')
        plt.xlabel("Nombre d'√©v√©nements", fontsize=11)
        plt.grid(axis="x", linestyle="--", alpha=0.3)
        
        plt.subplot(1, 2, 2)
        top_docs = df_kpi2.nlargest(10, "nb_documents_associes")
        bars = plt.barh(top_docs["theme"], top_docs["nb_documents_associes"], color='#4ECDC4')
        for i, (bar, value) in enumerate(zip(bars, top_docs["nb_documents_associes"])):
            plt.text(bar.get_width() + max(top_docs["nb_documents_associes"]) * 0.02, bar.get_y() + bar.get_height()/2,
                    f"{int(value)}", ha='left', va='center', fontweight='bold', fontsize=9)
        plt.title("üìÑ Documents associ√©s par th√®me (Top 10)", fontsize=12, fontweight='bold')
        plt.xlabel("Nombre de documents", fontsize=11)
        plt.grid(axis="x", linestyle="--", alpha=0.3)
        plt.tight_layout()
        plt.show()
    
    # KPI 3 : R√©sum√© CRUD op√©rations test√©es
    print("\nüìä R√©sum√© des op√©rations CRUD test√©es :")
    crud_summary = pd.DataFrame({
        "Op√©ration": ["CREATE (INSERT)", "READ (SELECT)", "UPDATE", "DELETE"],
        "Statut": ["‚úÖ Test√©", "‚úÖ Test√©", "‚úÖ Test√©", "‚úÖ Test√©"],
        "Tables utilis√©es": ["t04_document", "t04_document, t02_source, t03_flux", "t04_document", "t04_document"]
    })
    display(crud_summary)
    
    # Graphique r√©sum√© CRUD
    plt.figure(figsize=(10, 6))
    colors = ['#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7']
    bars = plt.bar(crud_summary["Op√©ration"], [1, 1, 1, 1], color=colors)
    for bar, op in zip(bars, crud_summary["Op√©ration"]):
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.05,
                "‚úÖ", ha='center', va='bottom', fontweight='bold', fontsize=20)
    plt.title("‚úÖ Op√©rations CRUD test√©es (E1_v3)", fontsize=14, fontweight='bold')
    plt.ylabel("Statut", fontsize=12)
    plt.ylim(0, 1.5)
    plt.grid(axis="y", linestyle="--", alpha=0.3)
    plt.tight_layout()
    plt.show()

print("\n‚úÖ CRUD complet test√© avec succ√®s !")
print("   üìä Visualisations CRUD : Op√©rations CREATE, READ, UPDATE, DELETE valid√©es")
print("   ‚û°Ô∏è Passez au notebook 05_snapshot_and_readme.ipynb pour le dataset final annot√©")
