# 📥 DataSens E1 — Notebook 3 : Ingestion des 5 Sources

**🎯 Objectif** : Ingérer réellement les 5 types de sources avec traçabilité complète

---

## 📋 Plan d'ingestion

1. **Fichier plat CSV** : Kaggle (50% → Postgres, 50% → raw)
2. **Base de données** : Kaggle SQLite → Postgres
3. **API** : OpenWeatherMap → meteo + flux
4. **Web Scraping** : MonAvisCitoyen (dry-run) → document
5. **Big Data** : GDELT GKG → evenement + document_evenement

**Traçabilité** : Manifest JSON par run avec chemins, compteurs, horodatages

---

## 🔒 RGPD & Gouvernance

⚠️ **Rappel** : Pas de données personnelles directes (hash SHA-256), respect robots.txt



In [None]:
# Configuration et imports (architecture pipeline compl√®te)
import hashlib
import json
import logging
import os
import time
import traceback
from datetime import UTC, datetime
from pathlib import Path

import pandas as pd
import requests
from dotenv import load_dotenv
from minio import Minio
from sqlalchemy import create_engine, text
from tqdm import tqdm

# Configuration
# Resolve the project root: when the kernel runs from the "notebooks" folder,
# the project root is its parent; otherwise the current directory is used.
NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == "notebooks" else NOTEBOOK_DIR
load_dotenv(PROJECT_ROOT / ".env")

# PostgreSQL connection settings (environment variables, with local defaults).
PG_HOST = os.getenv("POSTGRES_HOST", "localhost")
PG_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
PG_DB = os.getenv("POSTGRES_DB", "datasens")
PG_USER = os.getenv("POSTGRES_USER", "ds_user")
PG_PASS = os.getenv("POSTGRES_PASS", "ds_pass")

# Percent-encode the credentials before embedding them in the URL: a raw "@",
# ":" or "/" in the user name or password would otherwise be parsed as a URL
# delimiter and corrupt the connection string.
# NOTE(review): assumes the values in .env are stored raw (not already
# percent-encoded) - confirm.
from urllib.parse import quote

PG_URL = (
    f"postgresql+psycopg2://{quote(PG_USER, safe='')}:{quote(PG_PASS, safe='')}"
    f"@{PG_HOST}:{PG_PORT}/{PG_DB}"
)
engine = create_engine(PG_URL, future=True)

# MinIO (DataLake) settings
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "http://localhost:9000")
MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "miniouser")
MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "miniosecret")
MINIO_BUCKET = os.getenv("MINIO_BUCKET", "datasens-raw")

# Local working directories for raw files, run manifests and logs.
RAW_DIR = PROJECT_ROOT / "data" / "raw"
MANIFESTS_DIR = RAW_DIR / "manifests"
LOGS_DIR = PROJECT_ROOT / "logs"

# Create the directories (no error if they already exist)
RAW_DIR.mkdir(parents=True, exist_ok=True)
MANIFESTS_DIR.mkdir(parents=True, exist_ok=True)
LOGS_DIR.mkdir(parents=True, exist_ok=True)

# =====================================================
# LOGGING SYSTEM (same layout as datasens_E1_v2.ipynb)
# =====================================================
# One pair of log files per execution of this cell, stamped with the UTC start time.
log_timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
log_file = LOGS_DIR / f"collecte_{log_timestamp}.log"
error_file = LOGS_DIR / f"errors_{log_timestamp}.log"

logger = logging.getLogger("DataSens")
logger.setLevel(logging.DEBUG)

# logging.getLogger returns the same object for the same name, so re-running
# this cell would stack a second set of handlers and duplicate every message.
# Detach and close any handlers left over from a previous run first.
for _stale_handler in list(logger.handlers):
    logger.removeHandler(_stale_handler)
    _stale_handler.close()

file_formatter = logging.Formatter(
    "%(asctime)s | %(levelname)-8s | %(name)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)
console_formatter = logging.Formatter(
    "[%(asctime)s] %(levelname)s - %(message)s",
    datefmt="%H:%M:%S"
)

# Main log file: INFO and above.
file_handler = logging.FileHandler(log_file, encoding="utf-8")
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(file_formatter)

# Dedicated error file: ERROR and above only.
error_handler = logging.FileHandler(error_file, encoding="utf-8")
error_handler.setLevel(logging.ERROR)
error_handler.setFormatter(file_formatter)

# Console output: INFO and above, with the shorter format.
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(console_formatter)

logger.addHandler(file_handler)
logger.addHandler(error_handler)
logger.addHandler(console_handler)

def log_error(source: str, error: Exception, context: str = ""):
    """Log an error message followed by the traceback of ``error``.

    Args:
        source: Short label of the component reporting the error.
        error: The exception to report.
        context: Optional description of what was being attempted.

    Two ERROR records are emitted on the module logger: the formatted message,
    then the traceback text.
    """
    error_msg = f"[{source}] {context}: {error!s}"
    logger.error(error_msg)
    # Build the traceback from the exception object itself rather than from the
    # "currently handled" exception, so the output matches `error` even when
    # this helper is called outside an `except` block.
    tb_text = "".join(traceback.format_exception(type(error), error, error.__traceback__))
    logger.error(f"Traceback:\n{tb_text}")

# Announce that logging is ready and where the two log files are written.
for _startup_message in (
    "üöÄ Syst√®me de logging initialis√©",
    f"üìÅ Logs: {log_file}",
    f"‚ùå Erreurs: {error_file}",
):
    logger.info(_startup_message)

# =====================================================
# MINIO CLIENT (DataLake)
# =====================================================
try:
    # The MinIO client expects "host:port" without a scheme; TLS is selected
    # separately from the scheme of the configured endpoint.
    _minio_host = MINIO_ENDPOINT.replace("http://", "").replace("https://", "")
    _minio_use_tls = MINIO_ENDPOINT.startswith("https")
    minio_client = Minio(
        _minio_host,
        access_key=MINIO_ACCESS_KEY,
        secret_key=MINIO_SECRET_KEY,
        secure=_minio_use_tls,
    )

    def ensure_bucket(bucket: str = MINIO_BUCKET):
        """Create ``bucket`` on the MinIO server when it does not exist yet."""
        if minio_client.bucket_exists(bucket):
            return
        minio_client.make_bucket(bucket)

    def minio_upload(local_path: Path, dest_key: str) -> str:
        """Upload a local file to the DataLake bucket and return its s3:// URI."""
        ensure_bucket(MINIO_BUCKET)
        minio_client.fput_object(MINIO_BUCKET, dest_key, str(local_path))
        return f"s3://{MINIO_BUCKET}/{dest_key}"

    # Touch the server once so that an unreachable MinIO is detected right away.
    ensure_bucket()
    logger.info(f"‚úÖ MinIO OK ‚Üí bucket: {MINIO_BUCKET}")
except Exception as e:
    # MinIO is optional: fall back to a stub that only reports the local path.
    logger.warning(f"‚ö†Ô∏è MinIO non disponible: {e} - Mode local uniquement")
    minio_client = None

    def minio_upload(local_path: Path, dest_key: str) -> str:
        """Local-only fallback: nothing is uploaded, a local:// URI is returned."""
        return f"local://{local_path}"

# =====================================================
# FONCTIONS UTILITAIRES
# =====================================================
def ts() -> str:
    """Timestamp UTC ISO compact"""
    return datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ")

def sha256(s: str) -> str:
    """Return the hexadecimal SHA-256 digest of ``s`` encoded as UTF-8.

    Used as the deduplication fingerprint of a text.
    """
    digest = hashlib.sha256()
    digest.update(s.encode("utf-8"))
    return digest.hexdigest()

def get_source_id(conn, nom: str) -> int:
    """Look up ``id_source`` in table ``source`` by name.

    Args:
        conn: Open database connection used to run the query.
        nom: Value matched against ``source.nom``.

    Returns:
        The ``id_source`` of the first matching row, or ``None`` when no row
        matches (a warning is logged in that case).
    """
    logger.info(f"[get_source_id] Recherche source: {nom}")
    lookup = text("SELECT id_source FROM source WHERE nom = :nom")
    row = conn.execute(lookup, {"nom": nom}).fetchone()
    if not row:
        logger.warning(f"   ‚Üí Source non trouv√©e: {nom}")
        return None
    logger.info(f"   ‚Üí id_source trouv√©: {row[0]}")
    return row[0]

def create_flux(conn, id_source: int, format_type: str = "csv", manifest_uri: str = None) -> int:
    """Insert one row into ``flux`` and return the generated ``id_flux``.

    Args:
        conn: Open database connection used to run the INSERT.
        id_source: Identifier of the source the flux belongs to.
        format_type: Value stored in column ``format``.
        manifest_uri: Value stored in column ``manifest_uri`` (may be ``None``).

    Returns:
        The ``id_flux`` returned by the INSERT ... RETURNING statement.
    """
    logger.info(f"[create_flux] Cr√©ation flux pour id_source={id_source}, format={format_type}")
    insert_stmt = text("""
        INSERT INTO flux (id_source, format, manifest_uri)
        VALUES (:id_source, :format, :manifest_uri)
        RETURNING id_flux
    """)
    params = {"id_source": id_source, "format": format_type, "manifest_uri": manifest_uri}
    id_flux = conn.execute(insert_stmt, params).scalar()
    logger.info(f"   ‚Üí id_flux cr√©√©: {id_flux}")
    return id_flux

def ensure_territoire(conn, ville: str, code_insee: str = None, lat: float = None, lon: float = None) -> int:
    """Return the ``id_territoire`` for ``ville``, inserting the row if needed.

    The lookup is done on ``ville`` only; ``code_insee``, ``lat`` and ``lon``
    are used solely when a new row has to be inserted.

    Args:
        conn: Open database connection used to run the statements.
        ville: City name matched against ``territoire.ville``.
        code_insee: Value stored in ``code_insee`` on insertion.
        lat: Value stored in ``lat`` on insertion.
        lon: Value stored in ``lon`` on insertion.

    Returns:
        The existing or newly created ``id_territoire``.
    """
    logger.info(f"[ensure_territoire] V√©rification territoire: ville={ville}")
    lookup = text("SELECT id_territoire FROM territoire WHERE ville = :ville")
    existing = conn.execute(lookup, {"ville": ville}).fetchone()
    if existing:
        logger.info(f"   ‚Üí id_territoire existant: {existing[0]}")
        return existing[0]

    # Not found: create the territory and hand back its generated key.
    insert_stmt = text("""
        INSERT INTO territoire (ville, code_insee, lat, lon)
        VALUES (:ville, :code_insee, :lat, :lon)
        RETURNING id_territoire
    """)
    params = {"ville": ville, "code_insee": code_insee, "lat": lat, "lon": lon}
    id_territoire = conn.execute(insert_stmt, params).scalar()
    logger.info(f"   ‚Üí id_territoire cr√©√©: {id_territoire}")
    return id_territoire

def insert_documents(conn, docs: list) -> int:
    """Insert documents one by one, skipping duplicates and isolating failures.

    Each element of ``docs`` is a dict providing the bind parameters
    ``id_flux``, ``id_territoire``, ``titre``, ``texte``, ``langue``,
    ``date_publication`` and ``hash_fingerprint``. Rows whose
    ``hash_fingerprint`` already exists are skipped by
    ``ON CONFLICT ... DO NOTHING``.

    Args:
        conn: Open database connection; the caller owns the enclosing transaction.
        docs: Documents to insert.

    Returns:
        Number of rows actually inserted (duplicates and failed rows excluded).
    """
    logger.info(f"[insert_documents] Insertion de {len(docs)} documents...")
    insert_stmt = text("""
                INSERT INTO document (id_flux, id_territoire, titre, texte, langue, date_publication, hash_fingerprint)
                VALUES (:id_flux, :id_territoire, :titre, :texte, :langue, :date_publication, :hash_fingerprint)
                ON CONFLICT (hash_fingerprint) DO NOTHING
                RETURNING id_doc
            """)
    inserted = 0
    for doc in docs:
        try:
            # Run each INSERT inside its own SAVEPOINT: in PostgreSQL a failed
            # statement aborts the surrounding transaction, so without this a
            # single bad row would make every following insert fail as well.
            with conn.begin_nested():
                id_doc = conn.execute(insert_stmt, doc).scalar()
        except Exception as e:
            log_error("insert_documents", e, "Erreur insertion document")
            continue
        # RETURNING yields no row (None) when ON CONFLICT skipped the insert.
        if id_doc is not None:
            # A missing or None title must not break the log line.
            titre = str(doc.get("titre") or "")
            logger.info(f"   ‚Üí Document ins√©r√©: id_doc={id_doc}, titre={titre[:40]}")
            inserted += 1
    logger.info(f"   ‚Üí Total ins√©r√©s: {inserted}/{len(docs)}")
    return inserted

# Print a short recap of the active configuration (MinIO shows the bucket
# name, or 'Mode local' when the client could not be initialised).
_minio_label = MINIO_BUCKET if minio_client else 'Mode local'
print(
    "\n".join(
        [
            "‚úÖ Configuration pipeline charg√©e",
            f"   üìç PostgreSQL : {PG_HOST}:{PG_PORT}/{PG_DB}",
            f"   ‚òÅÔ∏è MinIO : {_minio_label}",
            f"   üìÇ Raw data : {RAW_DIR}",
            f"   üìÑ Logs : {LOGS_DIR}",
            "\n‚úÖ Pipeline DataLake + PostgreSQL pr√™t !",
        ]
    )
)


## 📄 Source 1/5 : Fichier plat CSV (Kaggle)

**Architecture hybride (comme datasens_E1_v2.ipynb)** :
- **50% → PostgreSQL** : Données structurées pour requêtes SQL
- **50% → MinIO DataLake** : Données brutes pour analyses Big Data futures

**Process** :
1. Chargement CSV depuis `data/raw/kaggle/`
2. Calcul SHA256 fingerprint pour déduplication
3. Split 50/50 par position (première moitié → PostgreSQL, seconde moitié → MinIO)
4. Upload 50% vers MinIO (DataLake)
5. Insertion 50% dans PostgreSQL avec traçabilité (id_flux)


In [None]:
logger.info("üìÑ SOURCE 1/5 : Fichier plat CSV (Kaggle)")
logger.info("=" * 80)

# Candidate input files, in priority order: the canonical sample first, then
# any other CSV found in the kaggle folders.
# Path.exists() never expands a "*.csv" wildcard, so the folders are listed
# with glob(). Files named kaggle_raw_* are this cell's own exports (written
# below into the same folder) and must not be re-ingested as input.
kaggle_csv_paths = [RAW_DIR / "kaggle" / "kaggle_sample.csv"]
for kaggle_dir in (
    PROJECT_ROOT / "data" / "raw" / "kaggle",
    Path.cwd() / "data" / "raw" / "kaggle",
):
    kaggle_csv_paths.extend(
        candidate
        for candidate in sorted(kaggle_dir.glob("*.csv"))
        if not candidate.name.startswith("kaggle_raw_")
    )

kaggle_csv_path = next((path for path in kaggle_csv_paths if path.is_file()), None)

if kaggle_csv_path is None:
    # No dataset available: write a tiny demo sample so the cell still runs.
    logger.warning("‚ö†Ô∏è Fichier Kaggle non trouv√© ‚Äî Cr√©ation √©chantillon pour d√©mo")
    sample_data = pd.DataFrame({
        "text": [
            "Great product, very satisfied!",
            "Service terrible, avoid at all costs",
            "Excellent quality, recommend",
            "Bon produit, je recommande",
            "Mauvais service, d√©√ßu"
        ],
        "langue": ["en", "en", "en", "fr", "fr"],
        "date": [datetime.now(UTC)] * 5
    })
    kaggle_csv_path = RAW_DIR / "kaggle" / "kaggle_sample.csv"
    kaggle_csv_path.parent.mkdir(parents=True, exist_ok=True)
    sample_data.to_csv(kaggle_csv_path, index=False)
    logger.info(f"   ‚úÖ √âchantillon cr√©√© : {kaggle_csv_path.name}")

# Load the CSV
df_kaggle = pd.read_csv(kaggle_csv_path)
logger.info(f"üìä {len(df_kaggle)} lignes charg√©es")

# 50/50 split (hybrid architecture: PostgreSQL + MinIO).
# The split is positional: first half to PostgreSQL, second half to the DataLake.
df_kaggle["hash_fingerprint"] = df_kaggle["text"].apply(lambda x: sha256(str(x)))
mid_point = len(df_kaggle) // 2
df_pg = df_kaggle.iloc[:mid_point].copy()  # 50% -> PostgreSQL
df_raw = df_kaggle.iloc[mid_point:].copy()  # 50% -> MinIO DataLake

logger.info(f"   ‚Ä¢ 50% PostgreSQL : {len(df_pg)} lignes")
logger.info(f"   ‚Ä¢ 50% MinIO DataLake : {len(df_raw)} lignes")

# Save the raw half locally, then upload it to MinIO
raw_output = RAW_DIR / "kaggle" / f"kaggle_raw_{ts()}.csv"
# The input may come from another folder, so make sure the output folder exists.
raw_output.parent.mkdir(parents=True, exist_ok=True)
df_raw.to_csv(raw_output, index=False)
logger.info(f"   ‚úÖ Sauvegard√© local : {raw_output.name}")

# MinIO upload (raw half to the DataLake); fall back to a local URI on failure
try:
    minio_uri = minio_upload(raw_output, f"kaggle/{raw_output.name}")
    logger.info(f"   ‚òÅÔ∏è Upload MinIO : {minio_uri}")
except Exception as e:
    log_error("MinIO", e, "Upload fichier Kaggle")
    minio_uri = f"local://{raw_output}"

# Insert the PostgreSQL half
with engine.begin() as conn:
    id_source = get_source_id(conn, "Kaggle CSV")
    if not id_source:
        # First run: register the source under the 'Fichier plat' data type.
        id_type = conn.execute(text("SELECT id_type_donnee FROM type_donnee WHERE libelle = 'Fichier plat'")).scalar()
        conn.execute(text("""
            INSERT INTO source (id_type_donnee, nom, url, fiabilite)
            VALUES (:id_type, 'Kaggle CSV', 'https://www.kaggle.com', 0.8)
        """), {"id_type": id_type})
        id_source = conn.execute(text("SELECT id_source FROM source WHERE nom = 'Kaggle CSV'")).scalar()

    id_flux = create_flux(conn, id_source, "csv", minio_uri)

    # Build the document batch; missing optional columns fall back to defaults.
    docs = [
        {
            "id_flux": id_flux,
            "id_territoire": None,
            "titre": "",
            "texte": str(row["text"]),
            "langue": row.get("langue", "en"),
            "date_publication": row.get("date", datetime.now(UTC)),
            "hash_fingerprint": row["hash_fingerprint"]
        }
        for _, row in df_pg.iterrows()
    ]

    inserted = insert_documents(conn, docs)

logger.info(f"\n‚úÖ Source 1/5 termin√©e : {inserted} docs PostgreSQL + {len(df_raw)} docs MinIO")


## 🔧 Architecture Pipeline (Référence datasens_E1_v2.ipynb)

**Ce notebook suit l'architecture du pipeline existant** :

✅ **Logging structuré** : `logs/collecte_*.log` + `logs/errors_*.log`  
✅ **MinIO DataLake** : Upload automatique fichiers bruts → `s3://datasens-raw/`  
✅ **PostgreSQL** : Insertion structurée avec traçabilité (flux, manifests)  
✅ **Fonctions helpers** : `create_flux()`, `insert_documents()`, `ensure_territoire()`, `minio_upload()`  
✅ **Déduplication** : Hash SHA-256 pour éviter doublons  
✅ **RGPD** : Pas de données personnelles directes  

**Sources 2-5** : Implémentées ci-dessous avec vraies sources (code extrait de `datasens_E1_v2.ipynb`)


## 🌦️ Source 2/5 : API OpenWeatherMap

Collecte de données météo en temps réel via l'API OpenWeatherMap.

**Villes collectées** : Paris, Lyon, Marseille, Lille

**Données récupérées** :
- Température (°C), Humidité (%), Pression (hPa)
- Description météo (clair, nuageux, pluie...)
- Vitesse du vent (reçue en m/s, stockée en km/h)
- Timestamp de mesure

**Stockage** :
- **PostgreSQL** : Table `meteo` avec géolocalisation (id_territoire FK)
- **MinIO** : CSV brut pour historisation complète

**RGPD** : Aucune donnée personnelle, données publiques uniquement


In [None]:
logger.info("üå¶Ô∏è SOURCE 2/5 : API OpenWeatherMap")
logger.info("=" * 80)

# The API key comes from the environment; without it the whole source is skipped.
OWM_API_KEY = os.getenv("OWM_API_KEY")
if not OWM_API_KEY:
    logger.warning("‚ö†Ô∏è OWM_API_KEY manquante dans .env - Source 2 ignor√©e")
else:
    OWM_CITIES = ["Paris,FR", "Lyon,FR", "Marseille,FR", "Lille,FR"]

    # One current-weather request per city; failures are logged and skipped.
    rows = []
    for c in tqdm(OWM_CITIES, desc="OWM"):
        try:
            r = requests.get(
                "https://api.openweathermap.org/data/2.5/weather",
                params={"q": c, "appid": OWM_API_KEY, "units": "metric", "lang": "fr"},
                timeout=10
            )
            if r.status_code == 200:
                j = r.json()
                rows.append({
                    "ville": j["name"],
                    "lat": j["coord"]["lat"],
                    "lon": j["coord"]["lon"],
                    "date_obs": pd.to_datetime(j["dt"], unit="s"),
                    "temperature": j["main"]["temp"],
                    "humidite": j["main"]["humidity"],
                    # Wind speed is multiplied by 3.6 (m/s -> km/h); missing wind counts as 0.
                    "vent_kmh": (j.get("wind", {}).get("speed") or 0) * 3.6,
                    "pression": j.get("main", {}).get("pressure"),
                    "meteo_type": j["weather"][0]["main"] if j.get("weather") else None
                })
            else:
                # Previously dropped silently: make rejected requests visible
                # (e.g. invalid key or unknown city).
                logger.warning(f"   OpenWeatherMap {c}: HTTP {r.status_code} (skip)")
        except Exception as e:
            log_error("OpenWeatherMap", e, f"Collecte m√©t√©o {c}")

        time.sleep(1)  # Respect rate limit

    if rows:
        # Keep the raw observations as CSV (local copy + DataLake).
        dfm = pd.DataFrame(rows)
        local = RAW_DIR / "api" / "owm" / f"owm_{ts()}.csv"
        local.parent.mkdir(parents=True, exist_ok=True)
        dfm.to_csv(local, index=False)

        try:
            minio_uri = minio_upload(local, f"api/owm/{local.name}")
            logger.info(f"   ‚òÅÔ∏è Upload MinIO : {minio_uri}")
        except Exception as e:
            log_error("MinIO", e, "Upload fichier OWM")
            minio_uri = f"local://{local}"

        # PostgreSQL insertion
        with engine.begin() as conn:
            id_source = get_source_id(conn, "OpenWeatherMap")
            if not id_source:
                # First run: register the source under the 'API' data type.
                id_type = conn.execute(text("SELECT id_type_donnee FROM type_donnee WHERE libelle = 'API'")).scalar()
                if id_type:
                    conn.execute(text("""
                        INSERT INTO source (id_type_donnee, nom, url, fiabilite)
                        VALUES (:id_type, 'OpenWeatherMap', 'https://openweathermap.org/api', 0.9)
                    """), {"id_type": id_type})
                    id_source = conn.execute(text("SELECT id_source FROM source WHERE nom = 'OpenWeatherMap'")).scalar()
                else:
                    logger.warning("   ‚ö†Ô∏è Type 'API' non trouv√© dans type_donnee")

            if id_source:
                id_flux = create_flux(conn, id_source, "json", minio_uri)

                # Insert territories and weather observations
                nb_inserted = 0
                for _, obs in dfm.iterrows():
                    tid = ensure_territoire(conn, ville=obs["ville"], lat=obs["lat"], lon=obs["lon"])
                    try:
                        # SAVEPOINT per row: a failed INSERT would otherwise
                        # abort the PostgreSQL transaction and make all the
                        # following statements fail too.
                        with conn.begin_nested():
                            conn.execute(text("""
                            INSERT INTO meteo(id_territoire, date_obs, temperature, humidite, vent_kmh, pression, meteo_type)
                            VALUES(:t, :d, :T, :H, :V, :P, :MT)
                        """), {
                                "t": tid, "d": obs["date_obs"], "T": obs["temperature"],
                                "H": obs["humidite"], "V": obs["vent_kmh"], "P": obs["pression"], "MT": obs["meteo_type"]
                            })
                        nb_inserted += 1
                    except Exception as e:
                        log_error("meteo", e, f"Insertion relev√© {obs['ville']}")

                # Report the rows actually written, not the rows collected.
                logger.info(f"‚úÖ Source 2/5 termin√©e : {nb_inserted} relev√©s m√©t√©o ins√©r√©s")
            else:
                logger.warning("   ‚ö†Ô∏è Source OpenWeatherMap non cr√©√©e - insertion m√©t√©o ignor√©e")
    else:
        logger.warning("‚ö†Ô∏è Aucun relev√© m√©t√©o collect√©")


## 📰 Source 3/5 : Flux RSS Multi-Sources (Presse française)

Collecte d'articles d'actualité via 3 flux RSS français complémentaires.

**Sources** :
- **Franceinfo** : flux principal actualités nationales
- **20 Minutes** : actualités françaises grand public
- **Le Monde** : presse de référence

**Extraction** : titre, description, date publication, URL source

**Stockage** : PostgreSQL + MinIO

**Déduplication** : SHA256 sur (titre + description) pour éviter doublons inter-sources

**Parser** : Utilisation de `feedparser` pour robustesse


In [None]:
logger.info("üì∞ SOURCE 3/5 : Flux RSS Multi-Sources (Presse fran√ßaise)")
logger.info("=" * 80)

# feedparser is optional: without it the whole source is skipped.
try:
    import feedparser
except ImportError:
    logger.error("‚ùå Module feedparser manquant - install: pip install feedparser")
    feedparser = None

if feedparser:
    RSS_SOURCES = {
        "Franceinfo": "https://www.francetvinfo.fr/titres.rss",
        "20 Minutes": "https://www.20minutes.fr/feeds/rss-une.xml",
        "Le Monde": "https://www.lemonde.fr/rss/une.xml"
    }

    all_rss_items = []

    for source_name, rss_url in RSS_SOURCES.items():
        logger.info(f"üì° Source : {source_name}")
        logger.info(f"   URL : {rss_url}")

        try:
            feed = feedparser.parse(rss_url)

            if len(feed.entries) == 0:
                logger.warning("   ‚ö†Ô∏è Aucun article trouv√©")
                continue

            source_items = []
            # The loop variable is named `entry` so it cannot be confused with
            # the exception name `e` used by the handler below.
            for entry in feed.entries[:100]:  # Max 100 per source
                titre = entry.get("title", "").strip()
                texte = (entry.get("summary", "") or entry.get("description", "") or "").strip()
                dp = pd.to_datetime(entry.get("published", ""), errors="coerce")
                url = entry.get("link", "")

                # Keep only entries that have both a title and a body.
                if titre and texte:
                    source_items.append({
                        "titre": titre,
                        "texte": texte,
                        # Unparseable or missing dates fall back to the collection time.
                        "date_publication": dp if pd.notna(dp) else datetime.now(UTC),
                        "langue": "fr",
                        "source_media": source_name,
                        "url": url
                    })

            all_rss_items.extend(source_items)
            logger.info(f"   ‚úÖ {len(source_items)} articles collect√©s")

        except Exception as e:
            log_error(f"RSS_{source_name}", e, "Parsing flux RSS")
            logger.warning(f"   ‚ö†Ô∏è Erreur : {str(e)[:80]}")

        time.sleep(1)  # Respect rate limit

    # DataFrame consolidation
    if len(all_rss_items) > 0:
        dfr = pd.DataFrame(all_rss_items)

        # Cross-source deduplication on the fingerprint of title + body
        dfr["hash_fingerprint"] = dfr.apply(lambda row: sha256(row["titre"] + " " + row["texte"]), axis=1)
        nb_avant = len(dfr)
        dfr = dfr.drop_duplicates(subset=["hash_fingerprint"])
        nb_apres = len(dfr)

        logger.info(f"üßπ D√©duplication : {nb_avant} ‚Üí {nb_apres} articles uniques ({nb_avant - nb_apres} doublons supprim√©s)")

        # Distribution per source
        logger.info("üìä Distribution par source :")
        for source in dfr["source_media"].value_counts().items():
            logger.info(f"   {source[0]:15s} : {source[1]:3d} articles")

        # Local save + MinIO upload
        local = RAW_DIR / "rss" / f"rss_multi_sources_{ts()}.csv"
        local.parent.mkdir(parents=True, exist_ok=True)
        dfr.to_csv(local, index=False)

        try:
            minio_uri = minio_upload(local, f"rss/{local.name}")
            logger.info(f"   ‚òÅÔ∏è Upload MinIO : {minio_uri}")
        except Exception as e:
            log_error("MinIO", e, "Upload fichier RSS")
            minio_uri = f"local://{local}"

        # PostgreSQL insertion
        with engine.begin() as conn:
            id_source = get_source_id(conn, "Flux RSS Multi-Sources")
            if not id_source:
                # The previous single query matched 'API' OR 'Web Scraping' and
                # took whichever row the database returned first. Look the two
                # labels up in a fixed order so the result is deterministic.
                id_type = None
                for libelle in ("API", "Web Scraping"):
                    id_type = conn.execute(
                        text("SELECT id_type_donnee FROM type_donnee WHERE libelle = :libelle"),
                        {"libelle": libelle},
                    ).scalar()
                    if id_type:
                        break
                if id_type:
                    conn.execute(text("""
                        INSERT INTO source (id_type_donnee, nom, url, fiabilite)
                        VALUES (:id_type, 'Flux RSS Multi-Sources', 'https://www.francetvinfo.fr/titres.rss', 0.95)
                    """), {"id_type": id_type})
                    id_source = conn.execute(text("SELECT id_source FROM source WHERE nom = 'Flux RSS Multi-Sources'")).scalar()
                else:
                    # Same diagnostic as the OpenWeatherMap cell gives for a missing type.
                    logger.warning("   Types 'API' / 'Web Scraping' absents de type_donnee")

            if id_source:
                id_flux = create_flux(conn, id_source, "rss", minio_uri)

                # Build the document batch
                docs = []
                for _, row in dfr.iterrows():
                    docs.append({
                        "id_flux": id_flux,
                        "id_territoire": None,
                        "titre": row["titre"],
                        "texte": row["texte"],
                        "langue": row["langue"],
                        "date_publication": row["date_publication"],
                        "hash_fingerprint": row["hash_fingerprint"]
                    })

                inserted = insert_documents(conn, docs)
                logger.info(f"‚úÖ Source 3/5 termin√©e : {inserted} articles RSS ins√©r√©s")
            else:
                logger.warning("   ‚ö†Ô∏è Source RSS non cr√©√©e - insertion ignor√©e")
    else:
        logger.warning("‚ö†Ô∏è Aucun article RSS collect√©")
else:
    logger.warning("‚ö†Ô∏è Module feedparser manquant - Source 3 ignor√©e")


## 🌐 Source 4/5 : Web Scraping Multi-Sources (Dry-run MonAvisCitoyen)

Collecte de données citoyennes depuis sources légales et éthiques (version simplifiée pour E1).

**Sources implémentées (dry-run)** :
- **Vie-publique.fr** (RSS) : Consultations citoyennes nationales
- **data.gouv.fr** (API) : Open Data datasets CSV officiels

**Éthique & Légalité** :
- ✅ Open Data gouvernemental (.gouv.fr)
- ✅ Respect robots.txt
- ✅ APIs officielles uniquement
- ✅ Aucun scraping de sites privés sans autorisation

**Stockage** :
- **PostgreSQL** : Documents structurés
- **MinIO** : CSV bruts pour audit


In [None]:
logger.info("üåê SOURCE 4/5 : Web Scraping Multi-Sources (Dry-run)")
logger.info("=" * 80)

all_scraping_data = []

# ============================================================
# SOURCE 1 : VIE-PUBLIQUE.FR (RSS)
# ============================================================
logger.info("üèõÔ∏è Source 1/2 : Vie-publique.fr (RSS)")

try:
    # `feedparser` is the module (or None) set by the RSS cell above.
    if feedparser:
        feed_url = "https://www.vie-publique.fr/rss"
        feed = feedparser.parse(feed_url)

        for entry in feed.entries[:50]:
            # published_parsed can be present but None when the date could not
            # be parsed; testing the value (not just the attribute) keeps one
            # bad entry from aborting the whole feed.
            published = entry.get("published_parsed")
            all_scraping_data.append({
                "titre": entry.get("title", ""),
                "texte": entry.get("summary", entry.get("description", "")),
                "source_site": "vie-publique.fr",
                "url": entry.get("link", ""),
                "date_publication": datetime(*published[:6], tzinfo=UTC) if published else datetime.now(UTC),
                "langue": "fr"
            })

        logger.info(f"‚úÖ Vie-publique.fr: {len([d for d in all_scraping_data if 'vie-publique' in d['source_site']])} articles collect√©s")
    else:
        logger.warning("   ‚ö†Ô∏è Module feedparser manquant")
except Exception as e:
    log_error("ViePublique", e, "Parsing RSS feed")
    logger.warning(f"   ‚ö†Ô∏è Vie-publique.fr: {str(e)[:100]} (skip)")

# ============================================================
# SOURCE 2 : DATA.GOUV.FR (official API)
# ============================================================
logger.info("üìä Source 2/2 : data.gouv.fr (API officielle)")

try:
    url = "https://www.data.gouv.fr/api/1/datasets/"
    params = {"q": "france", "page_size": 50}
    response = requests.get(url, params=params, timeout=10)
    response.raise_for_status()

    data = response.json()
    for dataset in data.get("data", []):
        # A missing, null or malformed created_at falls back to the collection
        # time instead of raising and discarding every remaining dataset.
        created_raw = dataset.get("created_at")
        try:
            created_at = (
                datetime.fromisoformat(created_raw.replace("Z", "+00:00"))
                if created_raw
                else datetime.now(UTC)
            )
        except (AttributeError, ValueError):
            created_at = datetime.now(UTC)

        all_scraping_data.append({
            "titre": dataset.get("title", ""),
            # `or` also covers a description that is present but null/empty.
            "texte": dataset.get("description") or dataset.get("title", ""),
            "source_site": "data.gouv.fr",
            "url": f"https://www.data.gouv.fr/fr/datasets/{dataset.get('slug', '')}",
            "date_publication": created_at,
            "langue": "fr"
        })

    logger.info(f"‚úÖ data.gouv.fr: {len([d for d in all_scraping_data if 'data.gouv' in d['source_site']])} datasets collect√©s")

except Exception as e:
    log_error("DataGouv", e, "Collecte datasets Open Data")
    logger.warning(f"   ‚ö†Ô∏è data.gouv.fr: {str(e)[:100]} (skip)")

# ============================================================
# CONSOLIDATION AND STORAGE
# ============================================================
if len(all_scraping_data) > 0:
    df_scraping = pd.DataFrame(all_scraping_data)

    # Cleaning: drop texts of 20 characters or fewer, then deduplicate on the
    # fingerprint of the first 500 characters.
    df_scraping = df_scraping[df_scraping["texte"].str.len() > 20].copy()
    df_scraping["hash_fingerprint"] = df_scraping["texte"].apply(lambda t: sha256(t[:500]))
    df_scraping = df_scraping.drop_duplicates(subset=["hash_fingerprint"])

    logger.info(f"üìà Total collect√©: {len(df_scraping)} documents citoyens")
    logger.info(f"   ‚Ä¢ Vie Publique: {len(df_scraping[df_scraping['source_site'].str.contains('vie-publique', na=False)])}")
    logger.info(f"   ‚Ä¢ Data.gouv: {len(df_scraping[df_scraping['source_site'].str.contains('data.gouv', na=False)])}")

    # Storage: local CSV + MinIO
    scraping_dir = RAW_DIR / "scraping" / "multi"
    scraping_dir.mkdir(parents=True, exist_ok=True)
    local = scraping_dir / f"scraping_multi_{ts()}.csv"
    df_scraping.to_csv(local, index=False)

    try:
        minio_uri = minio_upload(local, f"scraping/multi/{local.name}")
        logger.info(f"   ‚òÅÔ∏è Upload MinIO : {minio_uri}")
    except Exception as e:
        log_error("MinIO", e, "Upload fichier scraping")
        minio_uri = f"local://{local}"

    # Storage: PostgreSQL
    with engine.begin() as conn:
        id_source = get_source_id(conn, "Web Scraping Multi-Sources")
        if not id_source:
            # First run: register the source under the 'Web Scraping' data type.
            id_type = conn.execute(text("SELECT id_type_donnee FROM type_donnee WHERE libelle = 'Web Scraping'")).scalar()
            if id_type:
                conn.execute(text("""
                    INSERT INTO source (id_type_donnee, nom, url, fiabilite)
                    VALUES (:id_type, 'Web Scraping Multi-Sources', 'https://www.data.gouv.fr', 0.85)
                """), {"id_type": id_type})
                id_source = conn.execute(text("SELECT id_source FROM source WHERE nom = 'Web Scraping Multi-Sources'")).scalar()

        if id_source:
            id_flux = create_flux(conn, id_source, "html", minio_uri)

            docs = []
            for _, row in df_scraping.iterrows():
                docs.append({
                    "id_flux": id_flux,
                    "id_territoire": None,
                    "titre": row["titre"],
                    "texte": row["texte"],
                    "langue": row["langue"],
                    "date_publication": row["date_publication"],
                    "hash_fingerprint": row["hash_fingerprint"]
                })

            inserted = insert_documents(conn, docs)
            logger.info(f"‚úÖ Source 4/5 termin√©e : {inserted} documents scraping ins√©r√©s")
        else:
            logger.warning("   ‚ö†Ô∏è Source scraping non cr√©√©e - insertion ignor√©e")
else:
    logger.warning("‚ö†Ô∏è Aucune donn√©e collect√©e depuis les sources web scraping")


## 🌍 Source 5/5 : GDELT GKG France (Big Data)

Téléchargement et analyse de données Big Data depuis GDELT Project (Global Database of Events, Language, and Tone) avec **focus France**.

**Source** : http://data.gdeltproject.org/gdeltv2/

**Format** : GKG 2.0 (Global Knowledge Graph) - Fichiers CSV.zip (~300 MB/15min)

**Contenu Big Data** :
- Événements mondiaux géolocalisés
- **Tonalité émotionnelle** (V2Tone : -100 négatif → +100 positif)
- **Thèmes extraits** (V2Themes : PROTEST, HEALTH, ECONOMY, TERROR...)
- **Entités nommées** (V2Persons, V2Organizations)
- **Géolocalisation** (V2Locations avec codes pays)

**Filtrage France** :
- Sélection événements avec localisation France (code pays FR)
- Extraction tonalité moyenne France
- Top thèmes français

**Stratégie Big Data** :
- Téléchargement fichier dernières 15min (~6-300 MB brut)
- Parsing colonnes V2* nommées (27 colonnes GKG)
- Filtrage géographique France → échantillon
- Storage MinIO (fichier brut complet)
- Insertion PostgreSQL (événements France)


In [None]:
logger.info("üåç SOURCE 5/5 : GDELT GKG France (Big Data)")
logger.info("=" * 80)

import csv
import zipfile

# Demo limits: rows parsed from the GKG file and events written to PostgreSQL.
GDELT_MAX_ROWS = 5000
GDELT_MAX_EVENTS = 100

# GKG 2.1 record layout: 27 tab-separated fields, in file order.
# The names used further down (V2.1DATE, V2SourceCommonName, V2Themes,
# V2Locations) keep their historical spelling; the tone lives in "V1.5Tone"
# (there is no separate V2 tone field in the format).
GKG_COLUMNS = [
    "GKGRECORDID", "V2.1DATE", "V2SourceCollectionIdentifier", "V2SourceCommonName",
    "V2DocumentIdentifier", "V1Counts", "V2.1Counts", "V1Themes", "V2Themes",
    "V1Locations", "V2Locations", "V1Persons", "V2Persons", "V1Organizations",
    "V2Organizations", "V1.5Tone", "V2.1EnhancedDates", "V2GCAM", "V2.1SharingImage",
    "V2.1RelatedImages", "V2.1SocialImageEmbeds", "V2.1SocialVideoEmbeds",
    "V2.1Quotations", "V2.1AllNames", "V2.1Amounts", "V2.1TranslationInfo",
    "V2ExtrasXML",
]


def parse_tone(tone_str):
    """Return the first comma-separated value of a GKG tone field as a float.

    Returns None when the cell is missing, empty or not numeric.
    """
    if pd.isna(tone_str) or tone_str == "":
        return None
    try:
        return float(str(tone_str).split(",")[0])
    except ValueError:
        return None


def parse_gkg_date(raw_date):
    """Convert a GKG ``YYYYMMDDHHMMSS`` value into a timezone-aware UTC datetime.

    Falls back to the date part only (``YYYYMMDD``), then to the current UTC
    time when the value cannot be parsed at all.
    """
    # A column containing NaN is read as float, hence the ".0" suffix removal.
    digits = str(raw_date).strip().split(".")[0]
    for width, fmt in ((14, "%Y%m%d%H%M%S"), (8, "%Y%m%d")):
        if len(digits) >= width:
            try:
                return datetime.strptime(digits[:width], fmt).replace(tzinfo=UTC)
            except ValueError:
                continue
    return datetime.now(UTC)


def gkg_text(value, default=""):
    """Return a GKG cell as ``str``, substituting *default* for missing (NaN) cells."""
    return default if pd.isna(value) else str(value)


# Fetch the most recent GKG file (published every 15 minutes).
try:
    # lastupdate.txt lists the latest export / mentions / gkg files.
    update_url = "http://data.gdeltproject.org/gdeltv2/lastupdate.txt"
    r = requests.get(update_url, timeout=15)

    if r.status_code == 200:
        lines = r.text.strip().split("\n")
        # Keep only the GKG entry (not export nor mentions).
        gkg_line = [line for line in lines if ".gkg.csv.zip" in line and "translation" not in line]

        if gkg_line:
            # Line format: size hash url
            parts = gkg_line[0].split()
            gkg_url = parts[2] if len(parts) >= 3 else parts[-1]
            file_size_mb = int(parts[0]) / 1024 / 1024 if parts[0].isdigit() else 0

            logger.info(f"üì• T√©l√©chargement GDELT GKG ({file_size_mb:.1f} MB)")
            logger.info(f"   URL: {gkg_url}")

            gkg_r = requests.get(gkg_url, timeout=120)

            if gkg_r.status_code == 200:
                # Keep the raw ZIP on disk before any parsing.
                zip_filename = gkg_url.split("/")[-1]
                zip_path = RAW_DIR / "gdelt" / zip_filename
                zip_path.parent.mkdir(parents=True, exist_ok=True)

                with zip_path.open("wb") as f:
                    f.write(gkg_r.content)

                logger.info(f"   ‚úÖ T√©l√©charg√©: {zip_path.name} ({len(gkg_r.content) / 1024 / 1024:.1f} MB)")

                # Upload the complete raw file to MinIO; fall back to a local URI.
                try:
                    minio_uri = minio_upload(zip_path, f"gdelt/{zip_path.name}")
                    logger.info(f"   ‚òÅÔ∏è Upload MinIO : {minio_uri}")
                except Exception as e:
                    log_error("MinIO", e, "Upload fichier GDELT")
                    minio_uri = f"local://{zip_path}"

                # Extraction and parsing
                with zipfile.ZipFile(zip_path, "r") as z:
                    csv_filename = z.namelist()[0]
                    logger.info(f"\nüìä Parsing: {csv_filename}")

                    try:
                        # Stream straight from the archive member: only the
                        # first GDELT_MAX_ROWS rows are needed, so the whole
                        # file is never materialised in memory.
                        with z.open(csv_filename) as f:
                            df_gkg = pd.read_csv(
                                f,
                                sep="\t",
                                header=None,
                                names=GKG_COLUMNS,
                                # GKG fields are never quoted; stray quote
                                # characters must not merge rows.
                                quoting=csv.QUOTE_NONE,
                                encoding_errors="replace",
                                on_bad_lines="skip",
                                low_memory=False,
                                nrows=GDELT_MAX_ROWS,
                            )

                        logger.info(f"   üìà Total lignes charg√©es: {len(df_gkg):,}")

                        # France filter: any location whose country code is FR.
                        logger.info("\nüá´üá∑ Filtrage √©v√©nements France...")
                        df_france = df_gkg[
                            df_gkg["V2Locations"].fillna("").astype(str).str.contains("#FR#", regex=False)
                        ].copy()

                        # Guard the ratio against an empty download.
                        france_pct = len(df_france) / len(df_gkg) * 100 if len(df_gkg) else 0.0
                        logger.info(f"   ‚úÖ √âv√©nements France: {len(df_france):,} ({france_pct:.1f}%)")

                        if not df_france.empty:
                            # Emotional tone: first value of the tone field.
                            df_france["tone_value"] = df_france["V1.5Tone"].apply(parse_tone)
                            avg_tone = df_france["tone_value"].mean()

                            logger.info(f"üìä Tonalit√© moyenne France: {avg_tone:.2f} (-100=tr√®s n√©gatif, +100=tr√®s positif)")

                            # PostgreSQL insertion (events and documents)
                            with engine.begin() as conn:
                                id_source = get_source_id(conn, "GDELT GKG")
                                if not id_source:
                                    id_type = conn.execute(text("SELECT id_type_donnee FROM type_donnee WHERE libelle = 'Big Data'")).scalar()
                                    if id_type:
                                        id_source = conn.execute(text("""
                                            INSERT INTO source (id_type_donnee, nom, url, fiabilite)
                                            VALUES (:id_type, 'GDELT GKG', 'http://data.gdeltproject.org/gdeltv2/', 0.9)
                                            RETURNING id_source
                                        """), {"id_type": id_type}).scalar()

                                if id_source:
                                    id_flux = create_flux(conn, id_source, "csv", minio_uri)

                                    inserted_events = 0
                                    inserted_docs = 0

                                    for _, row in df_france.head(GDELT_MAX_EVENTS).iterrows():
                                        event_id = None
                                        doc_id = None
                                        try:
                                            # One SAVEPOINT per row: a failed statement
                                            # rolls back this row only instead of aborting
                                            # the whole PostgreSQL transaction.
                                            with conn.begin_nested():
                                                # Enhanced themes look like "THEME,offset;THEME,offset":
                                                # keep the first theme name without its offset.
                                                themes_str = gkg_text(row["V2Themes"])
                                                theme_libelle = themes_str.split(";")[0].split(",")[0].strip() or "GENERAL"

                                                theme_id_val = conn.execute(text("""
                                                    SELECT id_theme FROM theme WHERE libelle = :libelle
                                                """), {"libelle": theme_libelle}).scalar()

                                                if theme_id_val is None:
                                                    theme_id_val = conn.execute(text("""
                                                        INSERT INTO theme (libelle, description)
                                                        VALUES (:libelle, :desc)
                                                        RETURNING id_theme
                                                    """), {"libelle": theme_libelle, "desc": f"Th√®me GDELT: {theme_libelle}"}).scalar()

                                                # NOTE(review): every event stores the batch average
                                                # tone, not the row's own "tone_value" - confirm intent.
                                                event_id = conn.execute(text("""
                                                    INSERT INTO evenement (id_theme, date_event, avg_tone, source_event)
                                                    VALUES (:theme, :date_event, :tone, :source)
                                                    RETURNING id_event
                                                """), {
                                                    "theme": theme_id_val,
                                                    "date_event": parse_gkg_date(row["V2.1DATE"]),
                                                    "tone": avg_tone,
                                                    "source": "GDELT"
                                                }).scalar()

                                                # Associated document; missing source names must
                                                # neither crash the slice nor leak "nan" into the hash.
                                                source_name = gkg_text(row.get("V2SourceCommonName"))
                                                doc_text = f"{source_name} - {themes_str[:200]}"
                                                doc_hash = sha256(doc_text)

                                                # RETURNING yields nothing when the fingerprint already exists.
                                                doc_id = conn.execute(text("""
                                                    INSERT INTO document (id_flux, id_territoire, titre, texte, langue, date_publication, hash_fingerprint)
                                                    VALUES (:id_flux, NULL, :titre, :texte, 'en', :date_pub, :hash)
                                                    ON CONFLICT (hash_fingerprint) DO NOTHING
                                                    RETURNING id_doc
                                                """), {
                                                    "id_flux": id_flux,
                                                    "titre": (source_name or "GDELT Event")[:200],
                                                    "texte": doc_text,
                                                    "date_pub": datetime.now(UTC),
                                                    "hash": doc_hash
                                                }).scalar()

                                                if doc_id and event_id:
                                                    # Link the document to its event.
                                                    conn.execute(text("""
                                                        INSERT INTO document_evenement (id_doc, id_event)
                                                        VALUES (:doc_id, :event_id)
                                                        ON CONFLICT DO NOTHING
                                                    """), {"doc_id": doc_id, "event_id": event_id})

                                        except Exception as e:
                                            log_error("GDELT", e, "Insertion √©v√©nement/document")
                                        else:
                                            # Count what was really written: an event is stored
                                            # even when its document was a duplicate.
                                            if event_id:
                                                inserted_events += 1
                                            if doc_id:
                                                inserted_docs += 1

                                    logger.info(f"‚úÖ Source 5/5 termin√©e : {inserted_events} √©v√©nements France ins√©r√©s ({inserted_docs} docs)")
                                else:
                                    logger.warning("   ‚ö†Ô∏è Source GDELT non cr√©√©e - insertion ignor√©e")
                        else:
                            logger.warning("   ‚ö†Ô∏è Aucun √©v√©nement France trouv√© dans ce fichier")

                    except Exception as e:
                        log_error("GDELT", e, "Parsing CSV")
                        logger.warning(f"   ‚ùå Erreur parsing CSV: {str(e)[:100]}")
                        logger.info("   i Fichier brut sauvegard√© sur MinIO")

            else:
                logger.error(f"   ‚ùå Erreur t√©l√©chargement GKG: {gkg_r.status_code}")
        else:
            logger.warning("   ‚ö†Ô∏è Aucun fichier GKG trouv√© dans lastupdate.txt")
    else:
        logger.error(f"   ‚ùå Erreur acc√®s lastupdate.txt: {r.status_code}")

except Exception as e:
    log_error("GDELT", e, "Collecte Big Data")
    logger.warning(f"‚ùå Erreur GDELT: {str(e)[:200]}")
    logger.info("i GDELT peut √™tre temporairement indisponible (service tiers)")


## 📋 Création du Manifest JSON

Génération d'un manifest JSON pour traçabilité complète de toutes les ingestions.


## 📊 Baromètres DataSens - Sources Métier (E2/E3)

Les 5 sources de base (E1) sont complètes. Pour enrichir le dataset avec des données métier spécialisées, voici **10 types de baromètres** à implémenter dans les phases E2/E3 :

### 📋 Liste des Baromètres

1. **🔹 Baromètre de confiance politique & sociale**
   - **Source** : CEVIPOF – La confiance des Français dans la politique
   - **Thématique** : Société, gouvernance, démocratie, institutions
   - **Format** : CSV / PDF / API
   - **Mapping E1** : API / Fichier plat

2. **🔹 Baromètre des émotions et du moral des Français**
   - **Source** : Kantar Public / Ipsos Mood of France
   - **Thématique** : Joie, anxiété, colère, espoir (→ table EMOTION)
   - **Format** : CSV / scraping
   - **Mapping E1** : CSV / Web Scraping

3. **🔹 Baromètre environnemental**
   - **Source** : ADEME / IFOP pour la transition écologique
   - **Thématique** : Écologie, énergie, climat, sobriété
   - **Format** : Dataset plat + API
   - **Mapping E1** : API / CSV

4. **🔹 Baromètre économique et social**
   - **Source** : INSEE Conjoncture + BVA Observatoire social
   - **Thématique** : Pouvoir d'achat, chômage, inflation, emploi
   - **Format** : Base SQL / CSV
   - **Mapping E1** : Base de données / CSV

5. **🔹 Baromètre des médias et de la confiance**
   - **Source** : La Croix – Baromètre Kantar sur les médias
   - **Thématique** : Information, confiance médiatique, fake news
   - **Format** : Web scraping
   - **Mapping E1** : Web Scraping

6. **🔹 Baromètre sport & cohésion sociale**
   - **Source** : Ministère des Sports / CNOSF / Paris 2024
   - **Thématique** : Sport, bien-être, fierté nationale, cohésion
   - **Format** : CSV / API
   - **Mapping E1** : CSV / API

7. **🔹 Baromètre des discriminations et égalité**
   - **Source** : Défenseur des Droits / IFOP
   - **Thématique** : Inclusion, diversité, égalité femmes-hommes
   - **Format** : CSV / API
   - **Mapping E1** : CSV / API

8. **🔹 Baromètre santé mentale et bien-être**
   - **Source** : Santé Publique France – CoviPrev
   - **Thématique** : Stress, anxiété, santé mentale post-COVID
   - **Format** : CSV
   - **Mapping E1** : CSV

9. **🔹 Baromètre climat social et tensions**
   - **Source** : Elabe / BFMTV Opinion 2024
   - **Thématique** : Colère, frustration, confiance, peur
   - **Format** : Web Scraping
   - **Mapping E1** : Web Scraping

10. **🔹 Baromètre innovation et IA**
    - **Source** : CNIL / France IA / Capgemini Research Institute
    - **Thématique** : Adoption de l'IA, confiance numérique
    - **Format** : PDF / API
    - **Mapping E1** : API / PDF scraping

### 📚 Documentation Complète

Voir `docs/BAROMETRES_SOURCES.md` pour :
- Détails par baromètre (URLs, format, tables PostgreSQL)
- Plan d'implémentation E2/E3
- Notes techniques et RGPD

### 🎯 Plan d'Implémentation

**Phase E2 (Priorité)** :
1. Baromètre économique et social (INSEE)
2. Baromètre des émotions (Kantar/Ipsos)
3. Baromètre santé mentale (Santé Publique France)

**Phase E3 (Complément)** :
4-10. Autres baromètres selon priorités métier

**Architecture** : Tous les baromètres suivront le même pipeline que les sources E1 :
- Logging structuré
- Upload MinIO
- Insertion PostgreSQL avec helpers
- Déduplication SHA-256


In [None]:
# =====================================================
# SOURCES 2, 3, 4, 5: TO BE IMPLEMENTED WITH REAL SOURCES
# =====================================================
#
# To follow the pipeline architecture of notebook datasens_E1_v2.ipynb,
# sources 2-5 must be implemented with:
# 1. Real collection from API / database / scraping / GDELT
# 2. MinIO upload for DataLake traceability
# 3. PostgreSQL insertion through the helpers (create_flux, insert_documents)
# 4. Full logging through logger.info/error
#
# See notebook datasens_E1_v2.ipynb for complete implementations:
# - Source 2: Kaggle DB (SQLite -> Postgres via Pandas)
# - Source 3: OpenWeatherMap API (see Cell 20 of the existing notebook)
# - Source 4: Web scraping MonAvisCitoyen (see Cell 26 of the existing notebook)
# - Source 5: GDELT GKG Big Data (see Cell 28 of the existing notebook)
#
# NOTE(review): the scraping and GDELT cells earlier in this notebook already
# ingest sources 4 and 5, so this reminder and the "à implémenter" labels in
# the manifest below look stale - confirm before relying on the manifest.

logger.info("\nüìã Pour sources 2-5 : Voir notebooks/datasens_E1_v2.ipynb")
logger.info("   ‚Üí Exemples complets avec vraies API keys et collectes r√©elles")

# =====================================================
# MANIFEST JSON (final traceability record)
# =====================================================
logger.info("üìã Cr√©ation du manifest JSON")
logger.info("=" * 80)

# Row counts of the main tables, read back from PostgreSQL after ingestion.
with engine.connect() as conn:
    counts = {
        "documents": conn.execute(text("SELECT COUNT(*) FROM document")).scalar(),
        "flux": conn.execute(text("SELECT COUNT(*) FROM flux")).scalar(),
        "sources": conn.execute(text("SELECT COUNT(*) FROM source")).scalar(),
        "meteo": conn.execute(text("SELECT COUNT(*) FROM meteo")).scalar(),
        "evenements": conn.execute(text("SELECT COUNT(*) FROM evenement")).scalar(),
    }

manifest = {
    "run_id": ts(),
    "timestamp_utc": datetime.now(UTC).isoformat(),
    "notebook_version": "03_ingest_sources.ipynb",
    "sources_ingested": [
        "Kaggle CSV (fichier plat - 50% PG + 50% MinIO)",
        "Kaggle DB (base de donn√©es - √† impl√©menter)",
        "OpenWeatherMap (API - √† impl√©menter)",
        "MonAvisCitoyen (scraping - √† impl√©menter)",
        "GDELT GKG (big data - √† impl√©menter)"
    ],
    "counts": counts,
    "postgres_db": PG_DB,
    "minio_bucket": MINIO_BUCKET,
    "raw_data_location": str(RAW_DIR),
    "log_file": str(log_file)
}

# Write the manifest locally first, then mirror it to MinIO.
manifest_path = MANIFESTS_DIR / f"manifest_{manifest['run_id']}.json"
manifest_path.parent.mkdir(parents=True, exist_ok=True)

with manifest_path.open("w", encoding="utf-8") as f:
    json.dump(manifest, f, indent=2, ensure_ascii=False)

# The local file exists from this point on, whatever happens to the upload,
# so its creation is reported before (and independently of) the MinIO step.
logger.info(f"‚úÖ Manifest cr√©√© : {manifest_path.name}")

try:
    manifest_minio_uri = minio_upload(manifest_path, f"manifests/{manifest_path.name}")
    logger.info(f"‚òÅÔ∏è Manifest MinIO : {manifest_minio_uri}")
except Exception as e:
    # Best effort: keep a local URI so the run still has a manifest reference.
    log_error("MinIO", e, "Upload manifest")
    manifest_minio_uri = f"local://{manifest_path}"

logger.info("\nüìä R√©sum√© ingestion :")
for key, value in counts.items():
    logger.info(f"   ‚Ä¢ {key}: {value}")

logger.info("\n‚úÖ Ingestion termin√©e ! (Source 1/5 compl√®te, sources 2-5 √† documenter)")
logger.info("   ‚û°Ô∏è Passez au notebook 04_crud_tests.ipynb")
