In [None]:
# DataSens E1_v3 — 01_setup_env

- Objectifs: Configuration complète MinIO + PostgreSQL + arborescence + logging
- Prérequis: Docker Compose lancé (MinIO + PostgreSQL), Python + venv, `pip install -r requirements.txt`
- Ordre global E1_v3: 01 → 02 → 03 → 04 → 05
- Guide: docs/GUIDE_TECHNIQUE_E1.md

> **E1_v3** : Architecture complète **36/37 tables** avec **TOUTES les sources réelles**
> - Sources complètes : Kaggle, OpenWeatherMap, RSS Multi, NewsAPI, Web Scraping (6 sources), GDELT Big Data
> - Baromètres : 10 types de baromètres d'opinion et indicateurs sociaux
> - Schéma complet : T01-T36 + T37 (archive_flux) selon MPD.sql


In [None]:
> Notes:
> - Configuration des connexions **MinIO (DataLake)** et **PostgreSQL (SGBD)**
> - Création de l'arborescence `data/raw/` avec **tous les sous-dossiers** pour sources complètes
> - Système de logging pour tracer toutes les opérations
> - Fonctions utilitaires (timestamp UTC, hash SHA256 pour déduplication)
> - **Références** : docs/datasens_MPD.sql (36 tables), docs/datasens_sources_dictionary.md, docs/datasens_barometer_themes.md
    # NOTE(review): the `try:` that should open this guard is not visible in this
    # view — confirm it exists in the notebook cell. Self-assignment raises
    # NameError when ROOT has not been defined yet.
    ROOT = ROOT
except NameError:
    # ROOT is undefined: fall back to the third ancestor of the resolved
    # working directory (parents[0] is the direct parent).
    from pathlib import Path
    ROOT = Path.cwd().resolve().parents[2]
# Project sub-directories derived from ROOT.
DATA_DIR = ROOT / 'data'
DOCS_DIR = ROOT / 'docs'
LOGS_DIR = ROOT / 'logs'
# Echo the resolved locations.
print('ROOT=', ROOT)
print('DATA_DIR=', DATA_DIR)
print('DOCS_DIR=', DOCS_DIR)
print('LOGS_DIR=', LOGS_DIR)


# DataSens E1_v3 - 01_setup_env
# üîß Configuration environnement : MinIO + PostgreSQL + Arborescence + Logging
# Architecture compl√®te : 36/37 tables + Toutes les sources

import datetime as dt
import hashlib
import logging
import os
from datetime import UTC, datetime
from pathlib import Path

from dotenv import load_dotenv

# Project-root detection: starting at the working directory, climb the
# ancestors (the filesystem root itself is not tested) and keep the first
# folder that contains both "notebooks" and "docs".
current = Path.cwd()
PROJECT_ROOT = None
for current in [current, *current.parents][:-1]:
    if all((current / marker).exists() for marker in ("notebooks", "docs")):
        PROJECT_ROOT = current
        break
else:
    # No ancestor matched: fall back to the working directory.
    PROJECT_ROOT = Path.cwd()

print(f"üìÇ Racine projet d√©tect√©e : {PROJECT_ROOT}")

# Load the project's .env file; when it is missing, drop a reference
# .env.example next to it (only if no such template exists yet).
env_path = PROJECT_ROOT / '.env'
loaded = load_dotenv(env_path)
if not loaded:
    print(f'‚ö†Ô∏è .env non trouv√©: {env_path}')
    env_example = PROJECT_ROOT / '.env.example'
    if not env_example.exists():
        # Template content; surrounding blank lines are stripped and a single
        # trailing newline is appended before writing.
        env_template = """
# PostgreSQL
POSTGRES_HOST=localhost
POSTGRES_PORT=5433
POSTGRES_DB=postgres
POSTGRES_USER=postgres
POSTGRES_PASS=postgres

# MinIO
MINIO_ENDPOINT=http://localhost:9002
MINIO_ACCESS_KEY=admin
MINIO_SECRET_KEY=admin123
MINIO_BUCKET=datasens-raw

# API Keys (pour toutes les sources E1_v3)
OWM_API_KEY=
NEWSAPI_KEY=
KAGGLE_USERNAME=
KAGGLE_KEY=
GDELT_BASE=http://data.gdeltproject.org/gkg/
"""
        env_example.write_text(env_template.strip() + "\n", encoding='utf-8')
        print(f'üìÑ .env.example cr√©√©: {env_example}')
else:
    print(f'‚úÖ .env charg√©: {env_path}')

# MinIO (data lake) settings, read from the environment with local defaults.
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "http://localhost:9002")
MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "admin")
MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "admin123")
MINIO_BUCKET = os.getenv("MINIO_BUCKET", "datasens-raw")

# PostgreSQL settings, read from the environment with local defaults.
PG_HOST = os.getenv("POSTGRES_HOST", "localhost")
PG_PORT = int(os.getenv("POSTGRES_PORT", "5433"))
PG_DB = os.getenv("POSTGRES_DB", "postgres")
PG_USER = os.getenv("POSTGRES_USER", "postgres")
PG_PASS = os.getenv("POSTGRES_PASS", "postgres")

# Credentials are percent-encoded so that reserved characters in the user name
# or password ("@", ":", "/", "%", spaces, ...) cannot corrupt the URL.
# For plain alphanumeric credentials the resulting URL is unchanged.
from urllib.parse import quote

PG_URL = (
    f"postgresql+psycopg2://{quote(PG_USER, safe='')}:{quote(PG_PASS, safe='')}"
    f"@{PG_HOST}:{PG_PORT}/{PG_DB}"
)

# API keys for the external sources (None when not configured).
KAGGLE_USERNAME = os.getenv("KAGGLE_USERNAME")
KAGGLE_KEY = os.getenv("KAGGLE_KEY")
OWM_API_KEY = os.getenv("OWM_API_KEY")
NEWSAPI_KEY = os.getenv("NEWSAPI_KEY")
GDELT_BASE = os.getenv("GDELT_BASE", "http://data.gdeltproject.org/gkg/")

# Summary of the loaded configuration; secrets are never printed.
print("\nüîê Configuration MinIO (DataLake) :")
print(f"   ‚Ä¢ Endpoint : {MINIO_ENDPOINT}")
print(f"   ‚Ä¢ Bucket   : {MINIO_BUCKET}")

print("\nüóÑÔ∏è Configuration PostgreSQL (SGBD) :")
print(f"   ‚Ä¢ Host     : {PG_HOST}:{PG_PORT}")
print(f"   ‚Ä¢ Database : {PG_DB}")
print(f"   ‚Ä¢ User     : {PG_USER}")
print(f"   ‚Ä¢ Sch√©ma   : 36/37 tables (T01-T36 + T37) selon datasens_MPD.sql")

print("\nüîë Cl√©s API (E1_v3 - Toutes les sources) :")
# Kaggle is only reported as configured when both the user name and the key are set.
print(f"   ‚Ä¢ Kaggle        : {'‚úÖ Configur√©e' if KAGGLE_USERNAME and KAGGLE_KEY else '‚ùå Manquante'}")
print(f"   ‚Ä¢ OpenWeatherMap: {'‚úÖ Configur√©e' if OWM_API_KEY else '‚ùå Manquante'}")
print(f"   ‚Ä¢ NewsAPI       : {'‚úÖ Configur√©e' if NEWSAPI_KEY else '‚ùå Manquante'}")

# Directory layout covering every E1_v3 source.
DATA_DIR = PROJECT_ROOT / 'data'
RAW_DIR = DATA_DIR / 'raw'
LOGS_DIR = PROJECT_ROOT / 'logs'

for top_level in (RAW_DIR, LOGS_DIR):
    top_level.mkdir(parents=True, exist_ok=True)

# One sub-folder of data/raw per source.
folders = [
    "kaggle",                    # Source 1: flat file
    "api/owm",                   # Source 2: OpenWeatherMap API
    "api/newsapi",               # Source 2: NewsAPI
    "rss",                       # Source 3: multi-source RSS feeds
    "scraping/multi",            # Source 4: multi-site web scraping (6 sources)
    "scraping/viepublique",      # Source 4: Vie-publique.fr
    "scraping/datagouv",         # Source 4: data.gouv.fr
    "gdelt",                     # Source 5: GDELT big data
    "manifests",                 # JSON manifest per run
]
for sub in folders:
    RAW_DIR.joinpath(sub).mkdir(parents=True, exist_ok=True)

print(f"\n‚úÖ Arborescence cr√©√©e: {RAW_DIR}")
print(f"   ‚Ä¢ {len(folders)} sous-dossiers pr√™ts (toutes sources E1_v3)")

# Logging: one UTC-stamped log file per execution of this cell, mirrored to
# the console.
stamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
log_file = LOGS_DIR / f"collecte_{stamp}.log"
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] %(levelname)s - %(message)s',
    datefmt='%H:%M:%S',
    handlers=[
        logging.FileHandler(log_file, encoding='utf-8'),
        logging.StreamHandler()
    ],
    # Without force=True, basicConfig() does nothing when the root logger
    # already has handlers (e.g. when this cell is re-run in the same kernel),
    # so the freshly announced log file would never receive any record.
    force=True,
)
logging.info("Syst√®me de logging initialis√© - E1_v3 (36/37 tables)")
print(f"üìÑ Log: {log_file}")

# Fonctions utilitaires
def ts() -> str:
    """Timestamp UTC ISO compact (YYYYMMDDTHHMMSSZ)"""
    return dt.datetime.now(tz=dt.UTC).strftime("%Y%m%dT%H%M%SZ")

def sha256_hash(s: str) -> str:
    """Return the hexadecimal SHA-256 digest of ``s`` encoded as UTF-8.

    Used as a content fingerprint for deduplication.
    """
    digest = hashlib.sha256()
    digest.update(s.encode("utf-8"))
    return digest.hexdigest()

print(f"\nüîß Utilitaires : ts()={ts()}, sha256()={sha256_hash('test')[:16]}...")

print("\n‚úÖ Configuration E1_v3 termin√©e !")



In [None]:
# ============================================================
# üé¨ DASHBOARD NARRATIF - O√ô SOMMES-NOUS ?
# ============================================================
# Ce dashboard vous guide √† travers le pipeline DataSens E1
# Il montre la progression et l'√©tat actuel des donn√©es
# ============================================================

import matplotlib.pyplot as plt
from matplotlib.patches import FancyBboxPatch
import matplotlib.patches as mpatches

print("\n" + "="*80)
print("üé¨ FIL D'ARIANE VISUEL - PIPELINE DATASENS E1")
print("="*80)

# Dashboard canvas: a single axes used as a free-form drawing area
# (fixed 10 x 6 coordinate system, no ticks or frame).
fig = plt.figure(figsize=(16, 8))
ax = fig.add_subplot(111)
ax.set_xlim(0, 10)
ax.set_ylim(0, 6)
ax.axis('off')

# Pipeline stages, in order, each with a status marker and a short description.
etapes = [
    {"nom": "üì• COLLECTE", "status": "‚úÖ", "desc": "Sources brutes"},
    {"nom": "‚òÅÔ∏è DATALAKE", "status": "‚úÖ", "desc": "MinIO Raw"},
    {"nom": "üßπ NETTOYAGE", "status": "üîÑ", "desc": "D√©duplication"},
    {"nom": "üíæ ETL", "status": "‚è≥", "desc": "PostgreSQL"},
    {"nom": "üìä ANNOTATION", "status": "‚è≥", "desc": "Enrichissement"},
    {"nom": "üì¶ EXPORT", "status": "‚è≥", "desc": "Dataset IA"}
]

# Fill colour of a stage's circle, keyed by its status marker.
colors = {
    "‚úÖ": "#4ECDC4",
    "üîÑ": "#FECA57",
    "‚è≥": "#E8E8E8"
}

# Timeline geometry (axes coordinates).
y_pos = 4
x_start = 1
x_spacing = 1.4

for i, etape in enumerate(etapes):
    x_pos = x_start + i * x_spacing

    # Stage circle with its status marker centred inside
    circle = plt.Circle((x_pos, y_pos), 0.25, color=colors[etape["status"]], zorder=3)
    ax.add_patch(circle)
    ax.text(x_pos, y_pos, etape["status"], ha='center', va='center', fontsize=14, fontweight='bold', zorder=4)

    # Stage name and description underneath the circle
    ax.text(x_pos, y_pos - 0.6, etape["nom"], ha='center', va='top', fontsize=11, fontweight='bold')
    ax.text(x_pos, y_pos - 0.85, etape["desc"], ha='center', va='top', fontsize=9, style='italic')

    # Arrow to the next stage (none after the last one)
    if i < len(etapes) - 1:
        ax.arrow(x_pos + 0.3, y_pos, x_spacing - 0.6, 0,
                 head_width=0.1, head_length=0.15, fc='gray', ec='gray', zorder=2)

# Banner title
ax.text(5, 5.5, "üéØ PROGRESSION DU PIPELINE E1", ha='center', va='center',
        fontsize=16, fontweight='bold', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

# Legend mapping the three colours to their meaning
legend_elements = [
    mpatches.Patch(facecolor='#4ECDC4', label='Termin√©'),
    mpatches.Patch(facecolor='#FECA57', label='En cours'),
    mpatches.Patch(facecolor='#E8E8E8', label='√Ä venir')
]
ax.legend(handles=legend_elements, loc='upper left', fontsize=10)

# Snapshot box. The text is static: the former try / bare-except wrapper only
# guarded a string concatenation that cannot fail, so its handler was dead
# code and has been removed.
stats_text = "\nüìä SNAPSHOT ACTUEL :\n"
stats_text += "   ‚Ä¢ Pipeline en cours d'ex√©cution...\n"

ax.text(5, 1.5, stats_text, ha='center', va='center', fontsize=10,
        bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.3))

plt.title("üé¨ FIL D'ARIANE VISUEL - Accompagnement narratif du jury",
          fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

print("\nüí° Le fil d'Ariane vous guide √©tape par √©tape √† travers le pipeline")
print("   Chaque visualisation s'inscrit dans cette progression narrative\n")



In [None]:
# Connectivity checks for MinIO and PostgreSQL

print("üîå Test des connexions...")
print("=" * 80)

# MinIO connection. On any failure the error is reported and minio_client is
# set to None so that later cells can test for availability.
try:
    from minio import Minio

    # The client takes "host:port" without scheme; TLS is selected through
    # `secure`, so it is derived from the configured scheme instead of being
    # forced to False (which made every https:// endpoint unreachable).
    minio_secure = MINIO_ENDPOINT.lower().startswith("https://")
    minio_host = MINIO_ENDPOINT.replace("http://", "").replace("https://", "").rstrip("/")

    minio_client = Minio(
        minio_host,
        access_key=MINIO_ACCESS_KEY,
        secret_key=MINIO_SECRET_KEY,
        secure=minio_secure
    )

    # Create the bucket when it does not exist yet
    if not minio_client.bucket_exists(MINIO_BUCKET):
        minio_client.make_bucket(MINIO_BUCKET)
        print(f"‚úÖ MinIO : Bucket '{MINIO_BUCKET}' cr√©√©")
    else:
        print(f"‚úÖ MinIO : Bucket '{MINIO_BUCKET}' existe d√©j√†")

    # Count the top-level entries already present (non-recursive listing)
    objects = list(minio_client.list_objects(MINIO_BUCKET, recursive=False))
    print(f"   ‚Ä¢ {len(objects)} objets existants dans le bucket")

except Exception as e:
    print(f"‚ùå MinIO : Erreur de connexion - {e}")
    print("   üí° V√©rifiez que Docker Compose est lanc√© : docker compose up -d")
    minio_client = None

# PostgreSQL connection. On any failure the error is reported and engine is
# set to None.
try:
    from sqlalchemy import create_engine, text

    engine = create_engine(PG_URL, future=True)

    # Round-trip a trivial query to prove the connection works
    with engine.connect() as conn:
        result = conn.execute(text("SELECT 1 as test"))
        test_value = result.scalar()

    if test_value != 1:
        print("‚ö†Ô∏è PostgreSQL : Connexion OK mais test inattendu")
    else:
        print(f"‚úÖ PostgreSQL : Connexion r√©ussie ({PG_HOST}:{PG_PORT}/{PG_DB})")

        # Count the tables currently present in the public schema
        with engine.connect() as conn:
            result = conn.execute(text("""
                SELECT COUNT(*) FROM information_schema.tables 
                WHERE table_schema = 'public'
            """))
            nb_tables = result.scalar()
            print(f"   ‚Ä¢ {nb_tables} tables existantes dans la base")
            print("   ‚Ä¢ E1_v3 attend 36/37 tables (architecture compl√®te selon datasens_MPD.sql)")

except Exception as err:
    print(f"‚ùå PostgreSQL : Erreur de connexion - {err}")
    print("   üí° V√©rifiez que Docker Compose est lanc√© : docker compose up -d")
    engine = None

print("\n‚úÖ Tests de connexion termin√©s !")
print("   ‚û°Ô∏è Passez au notebook 02_schema_create.ipynb pour cr√©er les 36/37 tables")


## 🎯 Introduction & Objectifs E1

**DataSens E1** : Construction du socle data avec :
- ✅ Modélisation Merise (MCD → MLD → MPD ciblé)
- ✅ Création et remplissage PostgreSQL (18 tables)
- ✅ CRUD complet testé depuis le notebook
- ✅ Ingestion réelle des **5 types de sources** :
  1. **Fichier plat** : Kaggle CSV
  2. **Base de données** : Export Kaggle SQLite → Postgres
  3. **API** : OpenWeatherMap (météo communes)
  4. **Web Scraping** : MonAvisCitoyen (dry-run, robots.txt)
  5. **Big Data** : GDELT GKG (échantillon journalier)
- ✅ Traçabilité et gouvernance (flux, manifest, versionning Git)

**Mode d'exécution** : Cellule par cellule (pas à pas)


In [1]:
# V√©rification environnement Python
import subprocess
import sys
from pathlib import Path

print("üîç V√©rification environnement Python")
print("=" * 80)
print(f"Python version : {sys.version}")
print(f"Python executable : {sys.executable}")

# V√©rifier version Python >= 3.11
version_info = sys.version_info
if version_info.major >= 3 and version_info.minor >= 11:
    print(f"‚úÖ Python {version_info.major}.{version_info.minor}.{version_info.micro} OK")
else:
    print(f"‚ö†Ô∏è Python {version_info.major}.{version_info.minor} ‚Äî Recommand√© Python 3.11+")


üîç V√©rification environnement Python
Python version : 3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 13:17:27) [MSC v.1929 64 bit (AMD64)]
Python executable : c:\Users\Utilisateur\Desktop\Datasens_Project\.venv\Scripts\python.exe
‚úÖ Python 3.12.7 OK


In [2]:
# Report which critical packages are installed in the kernel's environment
print("\nüì¶ Packages Python install√©s")
print("=" * 80)

packages_to_check = [
    "pandas",
    "sqlalchemy",
    "psycopg2",
    "requests",
    "beautifulsoup4",
    "python-dotenv",
]


def find_pip_line(pkg: str, pip_output: str) -> "str | None":
    """Return the ``pip list`` row describing ``pkg``, or None when absent.

    The distribution name (first column of each row) is compared exactly,
    ignoring case and treating "_" like "-". A distribution named
    "<pkg>-binary" is also accepted, so that ``psycopg2-binary`` satisfies
    ``psycopg2``. Matching the name column rather than searching the whole
    output avoids false positives such as ``geopandas`` for ``pandas``.
    """
    wanted = pkg.lower().replace("_", "-")
    accepted = {wanted, f"{wanted}-binary"}
    for line in pip_output.splitlines():
        columns = line.split()
        if columns and columns[0].lower().replace("_", "-") in accepted:
            return line
    return None


# Ask the pip of the running interpreter, so the answer matches this kernel
result = subprocess.run(
    [sys.executable, "-m", "pip", "list"],
    check=False, capture_output=True,
    text=True
)

if result.returncode == 0:
    installed_packages = result.stdout
    print("\nV√©rification packages critiques :\n")
    for pkg in packages_to_check:
        pip_line = find_pip_line(pkg, installed_packages)
        if pip_line is not None:
            print(f"  ‚úÖ {pip_line}")
        else:
            print(f"  ‚ùå {pkg} - √Ä installer : pip install {pkg}")

    # Only the first 500 characters of the full listing are shown
    print("\nüìã Liste compl√®te (pip list) :")
    print(installed_packages[:500] + "..." if len(installed_packages) > 500 else installed_packages)
else:
    print("‚ö†Ô∏è Impossible d'ex√©cuter pip list")



üì¶ Packages Python install√©s

V√©rification packages critiques :

  ‚úÖ pandas                   2.3.3
  ‚úÖ SQLAlchemy               2.0.44
  ‚úÖ psycopg2-binary          2.9.11
  ‚úÖ requests                 2.32.5
  ‚úÖ beautifulsoup4           4.14.2
  ‚úÖ python-dotenv            1.2.1

üìã Liste compl√®te (pip list) :
Package                  Version
------------------------ -----------
annotated-types          0.7.0
argon2-cffi              25.1.0
argon2-cffi-bindings     25.1.0
asttokens                3.0.0
beautifulsoup4           4.14.2
cachetools               6.2.1
certifi                  2025.10.5
cffi                     2.0.0
charset-normalizer       3.4.4
colorama                 0.4.6
comm                     0.2.3
contourpy                1.3.3
cycler                   0.12.1
debugpy            ...


## 📁 Création de l'arborescence projet

Structure du projet selon les conventions DataSens :


In [3]:
# Project root: the parent of the working directory when the kernel was
# started inside notebooks/, otherwise the working directory itself.
NOTEBOOK_DIR = Path.cwd()
if NOTEBOOK_DIR.name == "notebooks":
    PROJECT_ROOT = NOTEBOOK_DIR.parent
else:
    PROJECT_ROOT = NOTEBOOK_DIR

print("üìÅ Cr√©ation de l'arborescence projet")
print("=" * 80)
print(f"Racine projet : {PROJECT_ROOT}")

# Layout to create: base directory -> direct sub-directories
directories = {
    "data": ["raw", "silver", "gold"],
    "data/raw": ["kaggle", "api", "scraping", "gdelt", "manifests"],
    "data/raw/api": ["owm"],
    "data/raw/scraping": ["mav"],  # MonAvisCitoyen
    "logs": [],
    "docs": [],
    "notebooks": [],
}

# One report line per base directory and per sub-directory, in creation order
# (a folder listed both as a sub-directory and as a base appears twice).
created = []
for base_dir, subdirs in directories.items():
    base_path = PROJECT_ROOT / base_dir
    base_path.mkdir(parents=True, exist_ok=True)
    created.append(f"‚úÖ {base_dir}/")

    for subdir in subdirs:
        (base_path / subdir).mkdir(parents=True, exist_ok=True)
    created.extend(f"   ‚úÖ {base_dir}/{subdir}/" for subdir in subdirs)

print("\nüìÇ Dossiers cr√©√©s :")
print(*created, sep="\n")

print(f"\n‚úÖ Arborescence pr√™te ! ({len(created)} dossiers)")


üìÅ Cr√©ation de l'arborescence projet
Racine projet : c:\Users\Utilisateur\Desktop\Datasens_Project

üìÇ Dossiers cr√©√©s :
‚úÖ data/
   ‚úÖ data/raw/
   ‚úÖ data/silver/
   ‚úÖ data/gold/
‚úÖ data/raw/
   ‚úÖ data/raw/kaggle/
   ‚úÖ data/raw/api/
   ‚úÖ data/raw/scraping/
   ‚úÖ data/raw/gdelt/
   ‚úÖ data/raw/manifests/
‚úÖ data/raw/api/
   ‚úÖ data/raw/api/owm/
‚úÖ data/raw/scraping/
   ‚úÖ data/raw/scraping/mav/
‚úÖ logs/
‚úÖ docs/
‚úÖ notebooks/

‚úÖ Arborescence pr√™te ! (17 dossiers)


## ⚙️ Configuration .env

Création du fichier `.env` de développement avec variables PostgreSQL et API keys.

**⚠️ IMPORTANT** : Ce fichier ne doit JAMAIS être commité (dans .gitignore)


In [4]:
import os

from dotenv import load_dotenv

# Load .env when it exists
env_path = PROJECT_ROOT / ".env"
env_loaded = load_dotenv(env_path)

if env_loaded:
    print(f"‚úÖ Fichier .env charg√© : {env_path}")
else:
    print(f"‚ö†Ô∏è Fichier .env non trouv√© : {env_path}")

    env_example = PROJECT_ROOT / ".env.example"
    if env_example.exists():
        # Never overwrite an existing template: it may have been written or
        # customised earlier and silently replacing it would lose content.
        print(f"   Template existant : {env_example}")
    else:
        print("   Cr√©ation d'un .env.example pour r√©f√©rence...")

        # Write a reference template; the encoding is explicit so the file
        # does not depend on the platform's default code page.
        env_example.write_text("""
# PostgreSQL
POSTGRES_HOST=localhost
POSTGRES_PORT=5432
POSTGRES_DB=datasens
POSTGRES_USER=ds_user
POSTGRES_PASS=ds_pass

# API Keys (optionnelles pour d√©mo)
OWM_API_KEY=your_openweathermap_key_here
KAGGLE_USERNAME=your_kaggle_username
KAGGLE_KEY=your_kaggle_key

# Git (optionnel)
GIT_USER_NAME=Your Name
GIT_USER_EMAIL=your.email@example.com
""", encoding="utf-8")
        print(f"   üìÑ Template cr√©√© : {env_example}")

# Show the effective configuration (passwords and key values are never printed)
print("\nüîê Configuration charg√©e :")
print(f"   POSTGRES_HOST : {os.getenv('POSTGRES_HOST', 'localhost')}")
print(f"   POSTGRES_PORT : {os.getenv('POSTGRES_PORT', '5432')}")
print(f"   POSTGRES_DB   : {os.getenv('POSTGRES_DB', 'datasens')}")
print(f"   POSTGRES_USER : {os.getenv('POSTGRES_USER', 'ds_user')}")
print(f"   OWM_API_KEY   : {'‚úÖ Configur√©e' if os.getenv('OWM_API_KEY') else '‚ùå Manquante (optionnelle)'}")
print(f"   KAGGLE_USERNAME: {'‚úÖ Configur√©e' if os.getenv('KAGGLE_USERNAME') else '‚ùå Manquante (optionnelle)'}")


‚úÖ Fichier .env charg√© : c:\Users\Utilisateur\Desktop\Datasens_Project\.env

üîê Configuration charg√©e :
   POSTGRES_HOST : localhost
   POSTGRES_PORT : 5432
   POSTGRES_DB   : datasens
   POSTGRES_USER : ds_user
   OWM_API_KEY   : ‚úÖ Configur√©e
   KAGGLE_USERNAME: ‚úÖ Configur√©e


## 🗄️ Connexion PostgreSQL

Test de connexion à la base PostgreSQL (via Docker ou locale)


In [5]:
from sqlalchemy import create_engine, text

# Read the connection settings from the environment (local defaults otherwise)
PG_HOST = os.getenv("POSTGRES_HOST", "localhost")
PG_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
PG_DB = os.getenv("POSTGRES_DB", "datasens")
PG_USER = os.getenv("POSTGRES_USER", "ds_user")
PG_PASS = os.getenv("POSTGRES_PASS", "ds_pass")

# Connection URL. Credentials are percent-encoded so that reserved characters
# in the user name or password ("@", ":", "/", "%", spaces, ...) cannot
# corrupt the URL; plain alphanumeric credentials give the same URL as before.
from urllib.parse import quote

PG_URL = (
    f"postgresql+psycopg2://{quote(PG_USER, safe='')}:{quote(PG_PASS, safe='')}"
    f"@{PG_HOST}:{PG_PORT}/{PG_DB}"
)

print("üîå Test connexion PostgreSQL")
print("=" * 80)
# The password is masked in the displayed URL
print(f"URL : postgresql://{PG_USER}:***@{PG_HOST}:{PG_PORT}/{PG_DB}")

try:
    engine = create_engine(PG_URL, future=True)

    # Simple round-trip: SELECT 1
    with engine.connect() as conn:
        result = conn.execute(text("SELECT 1 as test"))
        test_value = result.scalar()

    if test_value == 1:
        print("‚úÖ Connexion PostgreSQL r√©ussie !")
        print(f"   üóÑÔ∏è Base de donn√©es : {PG_DB}")
        print(f"   üë§ Utilisateur : {PG_USER}")
        print(f"   üìç Serveur : {PG_HOST}:{PG_PORT}")
    else:
        print("‚ö†Ô∏è Connexion OK mais test inattendu")

except Exception as e:
    print(f"‚ùå Erreur de connexion : {e}")
    print("\nüí° V√©rifications :")
    print("   1. Docker Compose est-il d√©marr√© ? ‚Üí docker-compose up -d")
    # Report the port actually configured rather than a hard-coded 5432
    print(f"   2. PostgreSQL est-il accessible sur le port {PG_PORT} ?")
    print("   3. Les credentials dans .env sont-ils corrects ?")
    # Do not leave a stale or unusable engine behind for later cells
    engine = None


üîå Test connexion PostgreSQL
URL : postgresql://ds_user:***@localhost:5432/datasens
‚úÖ Connexion PostgreSQL r√©ussie !
   üóÑÔ∏è Base de donn√©es : datasens
   üë§ Utilisateur : ds_user
   üìç Serveur : localhost:5432


## 🔄 Initialisation Git

Initialisation du dépôt Git (si ce n'est pas déjà fait) et premier commit


In [6]:
import subprocess

print("üîÑ V√©rification Git")
print("=" * 80)

# V√©rifier si Git est install√©
try:
    git_version = subprocess.run(
        ["git", "--version"],
        check=False, capture_output=True,
        text=True
    )
    print(f"‚úÖ {git_version.stdout.strip()}")
except FileNotFoundError:
    print("‚ùå Git non install√© ‚Äî Installation requise : https://git-scm.com/")
    exit(1)

# V√©rifier si le projet est d√©j√† un d√©p√¥t Git
git_dir = PROJECT_ROOT / ".git"
if git_dir.exists():
    print(f"\n‚úÖ D√©p√¥t Git d√©j√† initialis√© : {PROJECT_ROOT}")

    # Afficher git status
    try:
        status = subprocess.run(
            ["git", "status", "--short"],
            check=False, cwd=PROJECT_ROOT,
            capture_output=True,
            text=True
        )
        if status.stdout.strip():
            print("\nüìã Fichiers modifi√©s/non suivis :")
            print(status.stdout)
        else:
            print("\nüìã Aucun changement (working tree clean)")
    except Exception as e:
        print(f"‚ö†Ô∏è Impossible de lire git status : {e}")
else:
    print(f"\n‚ö†Ô∏è D√©p√¥t Git non initialis√© dans {PROJECT_ROOT}")
    print("   üí° Initialisation manuelle recommand√©e :")
    print(f"      cd {PROJECT_ROOT}")
    print("      git init")
    print("      git add .")
    print('      git commit -m "Initial commit E1"')

print("\n‚úÖ Setup environnement termin√© !")
print("   ‚û°Ô∏è Passez au notebook 02_schema_create.ipynb")


üîÑ V√©rification Git
‚úÖ git version 2.49.0.windows.1

‚úÖ D√©p√¥t Git d√©j√† initialis√© : c:\Users\Utilisateur\Desktop\Datasens_Project

üìã Fichiers modifi√©s/non suivis :
 D docs/FIX_PDF_QUICK.md
 M docs/GUIDE_TECHNIQUE_JURY.md
 M docs/GUIDE_TECHNIQUE_JURY.pdf
 D docs/PDF_FORMATTING_INSTRUCTIONS.md
 D docs/fix_pdf_formatting.ps1
 M notebooks/datasens_E1_v2.ipynb
?? docs/ARCHITECTURE_PIPELINE_E1.md
?? docs/GUIDE_TECHNIQUE_JURY_V2.md
?? docs/datasens_dictionary.md
?? docs/e1_schema.sql
?? notebooks/01_setup_env.ipynb
?? notebooks/02_schema_create.ipynb
?? notebooks/03_ingest_sources.ipynb
?? notebooks/04_crud_tests.ipynb
?? notebooks/05_snapshot_and_readme.ipynb
?? notebooks/README_VERSIONNING.md
?? notebooks/data/raw/api/
?? notebooks/data/raw/gdelt/
?? notebooks/data/raw/kaggle/
?? notebooks/data/raw/manifests/
?? notebooks/data/raw/rss/
?? notebooks/data/raw/scraping/multi/scraping_multi_20251029T122841Z.csv


‚úÖ Setup environnement termin√© !
   ‚û°Ô∏è Passez au notebook 02_s