# DataSens E1_v2 — 03_ingest_sources

- Objectifs: ingestion réelle (ex: RSS), création `flux` + `document`
- Prérequis: 02_schema_create
- Sortie: lignes insérées + fichier brut (manifest local)
- Guide: docs/GUIDE_TECHNIQUE_E1.md



> Notes:
> - Lecture d’un flux RSS (Franceinfo) via `feedparser`.
> - Construction d’un DataFrame normalisé: `titre`, `texte`, `date_publication`, `langue`.
> - Sauvegarde du brut en CSV (traçabilité) et insertion en base.
> - `get_source_id` assure l’existence de la source; `flux` matérialise la collecte.


In [None]:
# DataSens E1_v2 - 03_ingest_sources
# Ingestion demo: only the RSS source is implemented in this cell (Kaggle 50/50 and OWM are placeholders, not ingested here)
import os
import time
from pathlib import Path

import feedparser
import pandas as pd
from sqlalchemy import create_engine, text

# Filesystem layout: raw dumps are written under <current working dir>/data/raw.
ROOT = Path.cwd()
RAW = ROOT.joinpath("data", "raw")

# Database URL: the DATASENS_PG_URL environment variable wins; otherwise the
# local default below is used.
PG_URL = os.environ.get(
    "DATASENS_PG_URL",
    "postgresql+psycopg2://ds_user:ds_pass@localhost:5432/datasens",
)
engine = create_engine(PG_URL, future=True)

# Helper: resolve (or create) the `source` row that a flux is attached to
def get_source_id(nom, conn=None):
    """Return the ``id_source`` of the source named *nom*, creating it if missing.

    Args:
        nom: Value matched against ``source.nom``.
        conn: Open SQLAlchemy connection to run the statements on. When
            omitted, the lookup/creation runs in its own short
            ``engine.begin()`` transaction, which is committed before the
            function returns.

    Returns:
        The existing or newly created ``id_source``.
    """
    if conn is None:
        # No caller transaction supplied: use a dedicated one instead of
        # relying on whatever connection happens to be bound globally.
        with engine.begin() as own_conn:
            return get_source_id(nom, own_conn)

    source_id = conn.execute(
        text("SELECT id_source FROM source WHERE nom=:n"), {"n": nom}
    ).scalar()
    if source_id is not None:
        return source_id

    # Reuse the existing 'API' data type when there is one; inserting a new
    # row for every missing source would duplicate it (or violate a unique
    # constraint, if the schema declares one).
    type_id = conn.execute(
        text(
            "SELECT id_type_donnee FROM type_donnee "
            "WHERE libelle='API' ORDER BY id_type_donnee LIMIT 1"
        )
    ).scalar()
    if type_id is None:
        type_id = conn.execute(
            text("INSERT INTO type_donnee(libelle) VALUES ('API') RETURNING id_type_donnee")
        ).scalar()

    return conn.execute(
        text("INSERT INTO source(id_type_donnee,nom,url,fiabilite) VALUES (:t,:n,'',0.8) RETURNING id_source"),
        {"t": type_id, "n": nom},
    ).scalar()

# RSS minimal demo: read each configured feed and normalise its first entries
# into dicts with the keys titre / texte / date_publication / langue.
RSS = {"Franceinfo": "https://www.francetvinfo.fr/titres.rss"}
items = []
for name, url in RSS.items():
    try:
        feed = feedparser.parse(url)
        # feedparser flags fetch/parse problems through `bozo` rather than
        # raising, so surface that case explicitly when nothing was parsed.
        if getattr(feed, "bozo", False) and not feed.entries:
            print(f"⚠️ RSS: flux {name} illisible ({url}): {getattr(feed, 'bozo_exception', None)}")
            continue
        for e in feed.entries[:20]:  # demo cap: at most 20 entries per feed
            items.append({
                "titre": (e.get("title") or "").strip(),
                "texte": (e.get("summary") or e.get("description") or "").strip(),
                # Unparseable or missing dates become NaT (errors="coerce")
                "date_publication": pd.to_datetime(e.get("published", None), errors="coerce"),
                "langue": "fr",
            })
    except Exception as exc:
        # Best-effort ingestion: one failing feed must not abort the others,
        # but the failure is reported instead of being silently dropped.
        print(f"⚠️ RSS: échec de lecture du flux {name} ({url}): {exc}")

if items:
    df = pd.DataFrame(items)

    # Keep a raw CSV copy of what was fetched; its path is stored on the flux
    # row as manifest_uri.
    local = RAW / "rss" / f"rss_{int(time.time())}.csv"
    local.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(local, index=False)

    # Built once, executed per row. The fingerprint is the SHA-256 hex digest
    # of "<titre> <texte>", computed in SQL.
    # NOTE(review): digest() presumably comes from the pgcrypto extension, and
    # ON CONFLICT DO NOTHING presumably relies on a unique constraint on
    # document (e.g. hash_fingerprint) — confirm against the schema notebook.
    insert_document = text("""
            INSERT INTO document(id_flux,titre,texte,langue,date_publication,hash_fingerprint)
            VALUES(:f,:t,:x,:l,:d,substr(encode(digest(coalesce(:t,'')||' '||coalesce(:x,''),'sha256'),'hex'),1,64))
            ON CONFLICT DO NOTHING
            """)

    inserted = 0
    with engine.begin() as conn:
        # NOTE(review): get_source_id is called without a connection argument;
        # confirm which connection/transaction it runs on.
        sid = get_source_id("Flux RSS Multi-Sources (Franceinfo + 20 Minutes + Le Monde)")
        fid = conn.execute(
            text("INSERT INTO flux(id_source,format,manifest_uri) VALUES (:s,'rss',:m) RETURNING id_flux"),
            {"s": sid, "m": str(local)},
        ).scalar()
        for row in df.to_dict("records"):
            published = row["date_publication"]
            result = conn.execute(insert_document, {
                "f": fid,
                "t": row["titre"],
                "x": row["texte"],
                "l": row["langue"],
                # NaT (unparseable date) must be sent as NULL, not as "NaT"
                "d": None if pd.isna(published) else published,
            })
            # Rows skipped by ON CONFLICT report a rowcount of 0; guard against
            # drivers that return -1 when the count is unknown.
            inserted += max(result.rowcount, 0)
    print(f"✅ RSS: {inserted}/{len(df)} articles insérés")
else:
    print("⚠️ RSS: aucune donnée")

