# 🔄 DataSens E1 — Notebook 4 : Tests CRUD Complets

**🎯 Objectif** : Démontrer les opérations CRUD (Create, Read, Update, Delete) sur les tables principales

---

## 📋 Contenu de ce notebook

1. **CRUD "C" (Create)** : Insertion de documents, météo, indicateurs
2. **CRUD "R" (Read)** : Requêtes jointes complexes
3. **CRUD "U" (Update)** : Mise à jour de champs
4. **CRUD "D" (Delete)** : Suppression contrôlée (ON DELETE)
5. **Contrôles qualité** : Détection doublons, %NULL par colonne
6. **KPIs** : Counts par source/type_donnee, par thème/événement

---

## 🔒 RGPD & Gouvernance

⚠️ **Rappel** : Suppressions avec ON DELETE CASCADE pour intégrité référentielle



In [None]:
# Configuration (réutiliser depuis notebooks précédents)
import os
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL

# Locate the project root: when the kernel runs from a "notebooks" folder the
# .env file is looked up one level above, otherwise in the current directory.
NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == "notebooks" else NOTEBOOK_DIR
load_dotenv(PROJECT_ROOT / ".env")

# Connection settings come from the environment, with local-development defaults.
PG_HOST = os.getenv("POSTGRES_HOST", "localhost")
PG_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
PG_DB = os.getenv("POSTGRES_DB", "datasens")
PG_USER = os.getenv("POSTGRES_USER", "ds_user")
PG_PASS = os.getenv("POSTGRES_PASS", "ds_pass")

# Build the DSN through URL.create so that credentials containing reserved
# characters ("@", "/", ":", "%", ...) are escaped instead of corrupting the URL.
# PG_URL is rendered back to a plain string so it keeps the type it had before.
PG_URL = URL.create(
    drivername="postgresql+psycopg2",
    username=PG_USER,
    password=PG_PASS,
    host=PG_HOST,
    port=PG_PORT,
    database=PG_DB,
).render_as_string(hide_password=False)
engine = create_engine(PG_URL, future=True)

# create_engine() is lazy and opens no connection by itself: run a trivial query
# so the success message below is only printed once the database has answered.
with engine.connect() as conn:
    conn.execute(text("SELECT 1"))

print("‚úÖ Connexion PostgreSQL √©tablie")
print(f"   üìç {PG_HOST}:{PG_PORT}/{PG_DB}")


## ✅ CRUD "C" (CREATE) : Insertion de données

Insertion d'exemples pour tester les contraintes d'intégrité


In [None]:
print("üìù CRUD CREATE - Insertion d'exemples")
print("=" * 80)

with engine.begin() as conn:
    # 1. Cr√©er un document
    result = conn.execute(text("""
        INSERT INTO document (titre, texte, langue, hash_fingerprint)
        VALUES (:titre, :texte, :langue, :hash)
        RETURNING id_doc
    """), {
        "titre": "Test CRUD Create",
        "texte": "Document de test pour d√©monstration CRUD",
        "langue": "fr",
        "hash": "test_hash_1234567890abcdef"
    })
    id_doc = result.scalar()
    print(f"‚úÖ Document cr√©√© : id_doc = {id_doc}")

    # 2. Cr√©er un relev√© m√©t√©o
    # D'abord s'assurer qu'un territoire existe
    result = conn.execute(text("""
        INSERT INTO territoire (ville, code_insee, lat, lon)
        VALUES ('Paris', '75056', 48.8566, 2.3522)
        ON CONFLICT (code_insee) DO UPDATE SET code_insee = EXCLUDED.code_insee
        RETURNING id_territoire
    """))
    id_territoire = result.scalar()

    result = conn.execute(text("""
        INSERT INTO meteo (id_territoire, date_obs, temperature, humidite, vent_kmh, pression, meteo_type)
        VALUES (:t, NOW(), :temp, :hum, :vent, :pres, :type)
        RETURNING id_meteo
    """), {
        "t": id_territoire,
        "temp": 18.5,
        "hum": 65.0,
        "vent": 15.0,
        "pres": 1013.25,
        "type": "CLOUDS"
    })
    id_meteo = result.scalar()
    print(f"‚úÖ Relev√© m√©t√©o cr√©√© : id_meteo = {id_meteo}")

    # 3. Cr√©er un indicateur
    result = conn.execute(text("""
        SELECT id_type_indic FROM type_indicateur WHERE code = 'POPULATION'
    """)).scalar()

    if result:
        id_type_indic = result
        conn.execute(text("""
            INSERT INTO indicateur (id_territoire, id_type_indic, valeur, annee)
            VALUES (:t, :ti, :val, :annee)
        """), {
            "t": id_territoire,
            "ti": id_type_indic,
            "val": 2161000.0,
            "annee": 2023
        })
        print("‚úÖ Indicateur cr√©√© pour Paris (population 2023)")

print("\n‚úÖ CRUD CREATE termin√© !")


## 📖 CRUD "R" (READ) : Requêtes jointes

Lecture des données avec jointures complexes


In [None]:
print("üìñ CRUD READ - Requ√™tes jointes")
print("=" * 80)

# Requ√™te 1 : Documents avec territoire et source
query1 = """
SELECT
    d.id_doc,
    LEFT(d.titre, 50) as titre_extrait,
    d.langue,
    t.ville,
    s.nom as source,
    f.date_collecte
FROM document d
LEFT JOIN territoire t ON d.id_territoire = t.id_territoire
LEFT JOIN flux f ON d.id_flux = f.id_flux
LEFT JOIN source s ON f.id_source = s.id_source
ORDER BY d.id_doc DESC
LIMIT 10;
"""

df_read = pd.read_sql(query1, engine)
print(f"\nüìÑ {len(df_read)} documents avec jointures :\n")
print(df_read.to_string(index=False))

# Requ√™te 2 : M√©t√©o avec territoire
query2 = """
SELECT
    t.ville,
    m.date_obs,
    m.temperature,
    m.humidite,
    m.meteo_type
FROM meteo m
JOIN territoire t ON m.id_territoire = t.id_territoire
ORDER BY m.date_obs DESC
LIMIT 5;
"""

df_meteo = pd.read_sql(query2, engine)
print("\nüå¶Ô∏è Derniers relev√©s m√©t√©o :\n")
print(df_meteo.to_string(index=False))

print("\n‚úÖ CRUD READ termin√© !")


## ✏️ CRUD "U" (UPDATE) : Mise à jour

Modification de champs existants


In [None]:
# CRUD "Update": change the title and language of one test document and
# report the updated row.
print("‚úèÔ∏è CRUD UPDATE - Mise √† jour")
print("=" * 80)

# engine.begin() commits on success and rolls back on error.
with engine.begin() as conn:
    # Update a single document whose title contains 'CRUD'.
    # ORDER BY id_doc DESC makes the LIMIT 1 pick deterministic (the highest
    # id_doc among the matches); without it PostgreSQL may return any
    # matching row.
    result = conn.execute(text("""
        UPDATE document
        SET langue = :langue, titre = :titre
        WHERE id_doc = (
            SELECT id_doc FROM document
            WHERE titre LIKE '%CRUD%'
            ORDER BY id_doc DESC
            LIMIT 1
        )
        RETURNING id_doc, titre, langue
    """), {
        "langue": "fr",
        "titre": "Test CRUD Update - Modifi√©"
    })

    # RETURNING yields one row when a document matched, none otherwise.
    row = result.fetchone()
    if row:
        print(f"‚úÖ Document mis √† jour : id_doc={row[0]}, titre='{row[1]}', langue='{row[2]}'")
    else:
        print("‚ö†Ô∏è Aucun document √† mettre √† jour")

print("\n‚úÖ CRUD UPDATE termin√© !")


## 🗑️ CRUD "D" (DELETE) : Suppression contrôlée

Suppression avec vérification des contraintes ON DELETE


In [None]:
print("üóëÔ∏è CRUD DELETE - Suppression contr√¥l√©e")
print("=" * 80)

with engine.begin() as conn:
    # Compter avant suppression
    count_before = conn.execute(text("SELECT COUNT(*) FROM document WHERE titre LIKE '%CRUD%'")).scalar()
    print(f"üìä Documents 'CRUD' avant suppression : {count_before}")

    # Supprimer un document (ON DELETE SET NULL pour id_flux)
    conn.execute(text("""
        DELETE FROM document
        WHERE titre LIKE '%CRUD%' AND id_doc IN (
            SELECT id_doc FROM document
            WHERE titre LIKE '%CRUD%'
            LIMIT 1
        )
    """))

    count_after = conn.execute(text("SELECT COUNT(*) FROM document WHERE titre LIKE '%CRUD%'")).scalar()
    print(f"üìä Documents 'CRUD' apr√®s suppression : {count_after}")
    print(f"   ‚úÖ {count_before - count_after} document(s) supprim√©(s)")

print("\n‚úÖ CRUD DELETE termin√© !")


## 🔍 Contrôles qualité

Détection des doublons et vérification des valeurs NULL


In [None]:
print("üîç Contr√¥les qualit√©")
print("=" * 80)

with engine.connect() as conn:
    # Doublons fingerprint
    dup_query = """
    SELECT hash_fingerprint, COUNT(*) as c
    FROM document
    WHERE hash_fingerprint IS NOT NULL
    GROUP BY hash_fingerprint
    HAVING COUNT(*) > 1;
    """
    df_dup = pd.read_sql(dup_query, conn)
    print(f"\nüîé Doublons fingerprint : {len(df_dup)}")
    if len(df_dup) > 0:
        print(df_dup.head())
    else:
        print("   ‚úÖ Aucun doublon d√©tect√©")

    # %NULL par colonne
    null_query = """
    SELECT
        COUNT(*) as total,
        SUM(CASE WHEN titre IS NULL THEN 1 ELSE 0 END)::float / COUNT(*) * 100 as pct_null_titre,
        SUM(CASE WHEN texte IS NULL THEN 1 ELSE 0 END)::float / COUNT(*) * 100 as pct_null_texte,
        SUM(CASE WHEN langue IS NULL THEN 1 ELSE 0 END)::float / COUNT(*) * 100 as pct_null_langue
    FROM document;
    """
    df_null = pd.read_sql(null_query, conn)
    print("\nüìä Pourcentage NULL par colonne :")
    print(df_null.to_string(index=False))

print("\n‚úÖ Contr√¥les qualit√© termin√©s !")


## 📊 KPIs : Statistiques par source/type/thème

Comptages et agrégations pour visualisation


In [None]:
print("üìä KPIs - Statistiques")
print("=" * 80)

with engine.connect() as conn:
    # KPI 1 : Counts par type_donnee
    kpi1 = """
    SELECT
        td.libelle as type_source,
        COUNT(DISTINCT d.id_doc) as nb_documents,
        COUNT(DISTINCT s.id_source) as nb_sources
    FROM document d
    LEFT JOIN flux f ON d.id_flux = f.id_flux
    LEFT JOIN source s ON f.id_source = s.id_source
    LEFT JOIN type_donnee td ON s.id_type_donnee = td.id_type_donnee
    GROUP BY td.libelle
    ORDER BY nb_documents DESC;
    """
    df_kpi1 = pd.read_sql(kpi1, conn)
    print("\nüì¶ Documents par type de source :")
    print(df_kpi1.to_string(index=False))

    # KPI 2 : Counts par th√®me
    kpi2 = """
    SELECT
        t.libelle as theme,
        COUNT(DISTINCT e.id_event) as nb_evenements,
        COUNT(DISTINCT de.id_doc) as nb_documents_associes
    FROM theme t
    LEFT JOIN evenement e ON t.id_theme = e.id_theme
    LEFT JOIN document_evenement de ON e.id_event = de.id_event
    GROUP BY t.libelle
    ORDER BY nb_evenements DESC;
    """
    df_kpi2 = pd.read_sql(kpi2, conn)
    print("\nüè∑Ô∏è √âv√©nements par th√®me :")
    print(df_kpi2.to_string(index=False))

print("\n‚úÖ CRUD complet test√© avec succ√®s !")
print("   ‚û°Ô∏è Passez au notebook 05_snapshot_and_readme.ipynb")
