# DataSens E1_v2 — 02_schema_create

- Objectifs: DDL PostgreSQL (noyau du schéma : 13 tables créées par ce notebook ; cible annoncée : 18 tables)
- Prérequis: 01_setup_env + PostgreSQL démarré (`DATASENS_PG_URL`)
- Sortie: schéma Merise relationnel
- Guide: docs/GUIDE_TECHNIQUE_E1.md



> Notes:
> - E1_v2 passe sur PostgreSQL (environnement réaliste).
> - `create_engine(PG_URL)` prépare la connexion via SQLAlchemy.
> - Le bloc DDL crée les tables si elles sont absentes (clés étrangères, contraintes, index implicites), puis trois index explicites.
> - Les rôles: `source` (provenance), `flux` (collecte), `document` (contenu).


In [None]:
# DataSens E1_v2 - 02_schema_create
# 💾 Schéma PostgreSQL complet (18 tables) + Bootstrap référentiels

import os
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine, text
from datetime import datetime

# Reuse the PG_URL already defined by notebook 01 when it is present in this
# kernel; otherwise read DATASENS_PG_URL from the environment, falling back to
# a local default connection string.
try:
    PG_URL
except NameError:
    PG_URL = os.getenv(
        "DATASENS_PG_URL",
        "postgresql+psycopg2://postgres:postgres@localhost:5433/postgres",
    )

engine = create_engine(PG_URL, future=True)

# Only host, port and database name are printed here.
print(f"📂 Connexion PostgreSQL : {engine.url.host}:{engine.url.port}/{engine.url.database}")

# Schema DDL: tables followed by indexes. Every statement uses IF NOT EXISTS,
# so re-running the cell against an already-initialised database is harmless.
# Tables are declared in dependency order (a referenced table always appears
# before the table holding the foreign key).
#
# NOTE(review): document.hash_fingerprint is declared UNIQUE, and PostgreSQL
# backs a UNIQUE constraint with an index of its own, so idx_document_hash
# looks redundant — confirm before dropping it.
ddl_sql = """
CREATE TABLE IF NOT EXISTS type_donnee (
  id_type_donnee SERIAL PRIMARY KEY,
  libelle VARCHAR(100) NOT NULL
);

CREATE TABLE IF NOT EXISTS source (
  id_source SERIAL PRIMARY KEY,
  id_type_donnee INT REFERENCES type_donnee(id_type_donnee),
  nom VARCHAR(100) NOT NULL,
  url TEXT,
  fiabilite FLOAT
);

CREATE TABLE IF NOT EXISTS flux (
  id_flux SERIAL PRIMARY KEY,
  id_source INT NOT NULL REFERENCES source(id_source) ON DELETE CASCADE,
  date_collecte TIMESTAMP NOT NULL DEFAULT NOW(),
  format VARCHAR(20),
  manifest_uri TEXT
);

CREATE TABLE IF NOT EXISTS territoire (
  id_territoire SERIAL PRIMARY KEY,
  ville VARCHAR(120),
  code_insee VARCHAR(10),
  lat FLOAT,
  lon FLOAT
);

CREATE TABLE IF NOT EXISTS document (
  id_doc SERIAL PRIMARY KEY,
  id_flux INT REFERENCES flux(id_flux) ON DELETE SET NULL,
  id_territoire INT REFERENCES territoire(id_territoire) ON DELETE SET NULL,
  titre TEXT,
  texte TEXT,
  langue VARCHAR(10),
  date_publication TIMESTAMP,
  hash_fingerprint VARCHAR(64) UNIQUE
);

CREATE TABLE IF NOT EXISTS type_indicateur (
  id_type_indic SERIAL PRIMARY KEY,
  code VARCHAR(50) UNIQUE,
  libelle VARCHAR(100),
  unite VARCHAR(20)
);

CREATE TABLE IF NOT EXISTS source_indicateur (
  id_source_indic SERIAL PRIMARY KEY,
  nom VARCHAR(100),
  url TEXT
);

CREATE TABLE IF NOT EXISTS indicateur (
  id_indic SERIAL PRIMARY KEY,
  id_territoire INT NOT NULL REFERENCES territoire(id_territoire) ON DELETE CASCADE,
  id_type_indic INT NOT NULL REFERENCES type_indicateur(id_type_indic),
  id_source_indic INT REFERENCES source_indicateur(id_source_indic),
  valeur FLOAT,
  annee INT
);

CREATE TABLE IF NOT EXISTS meteo (
  id_meteo SERIAL PRIMARY KEY,
  id_territoire INT NOT NULL REFERENCES territoire(id_territoire) ON DELETE CASCADE,
  date_obs TIMESTAMP NOT NULL,
  temperature FLOAT,
  humidite FLOAT,
  vent_kmh FLOAT,
  pression FLOAT,
  meteo_type VARCHAR(50)
);

CREATE TABLE IF NOT EXISTS theme (
  id_theme SERIAL PRIMARY KEY,
  libelle VARCHAR(100),
  description TEXT
);

CREATE TABLE IF NOT EXISTS evenement (
  id_event SERIAL PRIMARY KEY,
  id_theme INT REFERENCES theme(id_theme),
  date_event TIMESTAMP,
  avg_tone FLOAT,
  source_event VARCHAR(50)
);

CREATE TABLE IF NOT EXISTS document_evenement (
  id_doc INT REFERENCES document(id_doc) ON DELETE CASCADE,
  id_event INT REFERENCES evenement(id_event) ON DELETE CASCADE,
  PRIMARY KEY (id_doc, id_event)
);

CREATE TABLE IF NOT EXISTS document_theme (
  id_doc INT REFERENCES document(id_doc) ON DELETE CASCADE,
  id_theme INT REFERENCES theme(id_theme) ON DELETE CASCADE,
  PRIMARY KEY (id_doc, id_theme)
);

-- Index pour performance
CREATE INDEX IF NOT EXISTS idx_document_hash ON document(hash_fingerprint);
CREATE INDEX IF NOT EXISTS idx_document_flux ON document(id_flux);
CREATE INDEX IF NOT EXISTS idx_flux_source ON flux(id_source);
"""

# engine.begin() opens a transaction that is committed when the block exits
# normally and rolled back if an exception is raised. The whole script is
# handed to the DBAPI driver in a single call; this relies on the driver
# accepting several ';'-separated statements in one execute.
with engine.begin() as conn:
    conn.exec_driver_sql(ddl_sql)

# Report the number of tables actually declared in the DDL instead of a
# hard-coded figure, so the message cannot drift away from the script.
_ddl_table_count = ddl_sql.count("CREATE TABLE IF NOT EXISTS")
print(f"✅ DDL PostgreSQL déployé ({_ddl_table_count} tables)")

# Bootstrap: reference data seeded into type_donnee and source.
# Each entry of "sources" is (name, type_donnee label, url, reliability score).
BOOTSTRAP = {
    "type_donnee": ["Fichier", "Base de Données", "API", "Web Scraping", "Big Data"],
    "sources": [
        ("Kaggle CSV", "Fichier", "kaggle://dataset", 0.8),
        ("Kaggle DB", "Base de Données", "kaggle://db", 0.8),
        ("OpenWeatherMap", "API", "https://api.openweathermap.org", 0.9),
        ("NewsAPI", "API", "https://newsapi.org", 0.85),
        ("Flux RSS Multi-Sources", "API", "https://rss-multi", 0.75),
        ("Web Scraping Multi-Sources", "Web Scraping", "multi", 0.75),
        ("GDELT GKG France", "Big Data", "http://data.gdeltproject.org/gkg/", 0.7)
    ]
}

# Hand-maintained list of table names used by the reporting code further down;
# it is not read back from the database.
tables_created = ["type_donnee", "source", "flux", "document", "territoire", 
                  "type_indicateur", "source_indicateur", "indicateur", 
                  "meteo", "theme", "evenement", "document_evenement", "document_theme"]

# Statements are built once and reused for every row. The INSERT ... SELECT ...
# WHERE NOT EXISTS form skips rows that already exist, so the cell can be
# re-run without creating duplicates.
_insert_type_donnee = text("""
            INSERT INTO type_donnee(libelle)
            SELECT :lbl WHERE NOT EXISTS (
              SELECT 1 FROM type_donnee WHERE libelle=:lbl
            )
        """)
_select_type_donnee_id = text("SELECT id_type_donnee FROM type_donnee WHERE libelle=:l")
_insert_source = text("""
            INSERT INTO source (id_type_donnee, nom, url, fiabilite)
            SELECT :id_td, :nom, :url, :fia
            WHERE NOT EXISTS (
              SELECT 1 FROM source WHERE nom=:nom
            )
        """)

# Single transaction: committed on normal exit, rolled back on any exception.
with engine.begin() as conn:
    # type_donnee labels first, because sources reference them.
    for lbl in BOOTSTRAP["type_donnee"]:
        conn.execute(_insert_type_donnee, {"lbl": lbl})

    # Sources: resolve the type label to its id, then insert if absent.
    for nom, td_lbl, url, fia in BOOTSTRAP["sources"]:
        id_td = conn.execute(_select_type_donnee_id, {"l": td_lbl}).scalar()
        if id_td is None:
            # Without this guard the source would be stored with a NULL type
            # (the column is nullable) and the mistake would go unnoticed.
            raise ValueError(
                f"type_donnee inconnu pour la source {nom!r} : {td_lbl!r}"
            )
        conn.execute(_insert_source, {"id_td": id_td, "nom": nom, "url": url, "fia": fia})

print("✅ Bootstrap référentiels effectué")

# Visualisation: bar chart of the number of tables per functional category.
categories = {
    "Collecte & Traçabilité": ["type_donnee", "source", "flux", "document"],
    "Géographie": ["territoire"],
    "Données Métier": ["meteo", "indicateur", "type_indicateur", "source_indicateur", "evenement", "theme"],
    "Relations": ["document_evenement", "document_theme"]
}

# For each category, count the entries of tables_created that belong to it.
cat_counts = {
    cat: sum(1 for t in tables_created if t in cat_list)
    for cat, cat_list in categories.items()
}

# Total shown in the title is derived from the plotted counts so it always
# matches the bars instead of relying on a hard-coded number.
total_tables = sum(cat_counts.values())

# Distribution chart
df_tables = pd.DataFrame(list(cat_counts.items()), columns=["Catégorie", "Nombre"])
plt.figure(figsize=(10, 6))
colors = plt.cm.Pastel1(range(len(df_tables)))
bars = plt.bar(df_tables["Catégorie"], df_tables["Nombre"], color=colors)
# Write each count just above its bar.
for bar, value in zip(bars, df_tables["Nombre"]):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
             str(value), ha='center', va='bottom', fontweight='bold', fontsize=11)
plt.title(f"📊 Schéma créé : Répartition des {total_tables} tables par catégorie", fontsize=14, fontweight='bold')
plt.xlabel("Catégorie", fontsize=12)
plt.ylabel("Nombre de tables", fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.grid(axis="y", linestyle="--", alpha=0.3)
plt.tight_layout()
plt.show()

# Summary tables: declared schema tables, then the reference data read back
# from the database.
# NOTE(review): display() is not imported in this file; presumably it is the
# helper injected by the IPython/Jupyter kernel — confirm if this code is ever
# run as a plain script.
print("\n📋 Tables créées :")
df_schema = pd.DataFrame({"Table": tables_created, "Statut": ["Créée"] * len(tables_created)})
display(df_schema)

# Queries are wrapped in text() and run on an explicit connection rather than
# passing a raw string with the Engine: that combination is accepted by both
# pandas 1.x and 2.x together with a future-style (2.0) SQLAlchemy engine.
with engine.connect() as conn:
    print("\n📋 Référentiels bootstrap (type_donnee) :")
    df_types = pd.read_sql_query(
        text("SELECT * FROM type_donnee ORDER BY id_type_donnee"), conn
    )
    display(df_types)

    print("\n📋 Référentiels bootstrap (source) :")
    # LEFT JOIN so that a source whose id_type_donnee is NULL (the column is
    # nullable) is still listed and counted in the summary below.
    df_sources = pd.read_sql_query(text("""
    SELECT s.id_source, s.nom, td.libelle AS type_donnee, s.url, s.fiabilite
    FROM source s
    LEFT JOIN type_donnee td ON s.id_type_donnee = td.id_type_donnee
    ORDER BY s.id_source
"""), conn)
    display(df_sources)

print(f"\n✅ Schéma créé : {len(tables_created)} tables + {len(df_types)} types + {len(df_sources)} sources")



Connexion PG: postgresql+psycopg2://ds_user:***@localhost:5432/datasens
✅ DDL de base déployé
