In [None]:
# DataSens logging setup (marker:datasens_logging)
# Root-logger bootstrap: INFO and above is sent to the console and to
# logs/datasens.log (relative to the current working directory).
import logging
import os

# FileHandler does not create missing parent directories, so do it up front.
os.makedirs('logs', exist_ok=True)

logging.basicConfig(
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('logs/datasens.log', encoding='utf-8'),
    ],
    format='%(asctime)s [%(levelname)s] %(message)s',
    level=logging.INFO,
)
logging.info('D√©marrage')


# DataSens E1_v2 — 01_setup_env

- Objectifs: configuration environnement, connexions MinIO + PostgreSQL, arborescence, logging
- Prérequis: Docker Compose lancé (MinIO + PostgreSQL), Python + venv, `pip install -r requirements.txt`
- Ordre global E1_v2: 01 → 02 → 03 → 04 → 05
- Guide: docs/GUIDE_TECHNIQUE_E1.md

> **E1_v2** : Collecte réelle avec sources réduites mais fonctionnelles (18 tables PostgreSQL)


In [None]:
# ============================================================
# üé¨ DASHBOARD NARRATIF - O√ô SOMMES-NOUS ?
# ============================================================
# Ce dashboard vous guide √† travers le pipeline DataSens E1
# Il montre la progression et l'√©tat actuel des donn√©es
# ============================================================

import matplotlib.pyplot as plt
from matplotlib.patches import FancyBboxPatch
import matplotlib.patches as mpatches

# Console banner announcing the pipeline breadcrumb dashboard.
print("\n" + "=" * 80)
print("üé¨ FIL D'ARIANE VISUEL - PIPELINE DATASENS E1")
print("=" * 80)

# One figure with a single axes; the axes uses a fixed 10 x 6 coordinate
# system and its frame/ticks are hidden.
fig, ax = plt.subplots(figsize=(16, 8))
ax.set(xlim=(0, 10), ylim=(0, 6))
ax.axis('off')

# Pipeline stages in display order, written as (name, status marker,
# description) and expanded into dicts keyed "nom" / "status" / "desc".
etapes = [
    dict(zip(("nom", "status", "desc"), stage))
    for stage in (
        ("üì• COLLECTE", "‚úÖ", "Sources brutes"),
        ("‚òÅÔ∏è DATALAKE", "‚úÖ", "MinIO Raw"),
        ("üßπ NETTOYAGE", "üîÑ", "D√©duplication"),
        ("üíæ ETL", "‚è≥", "PostgreSQL"),
        ("üìä ANNOTATION", "‚è≥", "Enrichissement"),
        ("üì¶ EXPORT", "‚è≥", "Dataset IA"),
    )
]

# Fill colour used for each status marker.
colors = dict(
    [
        ("‚úÖ", "#4ECDC4"),
        ("üîÑ", "#FECA57"),
        ("‚è≥", "#E8E8E8"),
    ]
)

# Timeline geometry in axes coordinates: baseline height, x of the first
# marker, and distance between consecutive markers.
y_pos = 4
x_start = 1
x_spacing = 1.4

last_index = len(etapes) - 1
for i, etape in enumerate(etapes):
    x_pos = x_start + i * x_spacing
    status = etape["status"]

    # Coloured disc for the stage, with its status marker drawn on top.
    ax.add_patch(plt.Circle((x_pos, y_pos), 0.25, color=colors[status], zorder=3))
    ax.text(x_pos, y_pos, status, ha='center', va='center',
            fontsize=14, fontweight='bold', zorder=4)

    # Stage name, then its description, stacked below the disc.
    for drop, key, font in (
        (0.6, "nom", {"fontsize": 11, "fontweight": 'bold'}),
        (0.85, "desc", {"fontsize": 9, "style": 'italic'}),
    ):
        ax.text(x_pos, y_pos - drop, etape[key], ha='center', va='top', **font)

    # Connector arrow towards the next stage; the last stage has none.
    if i == last_index:
        continue
    ax.arrow(x_pos + 0.3, y_pos, x_spacing - 0.6, 0,
             head_width=0.1, head_length=0.15, fc='gray', ec='gray', zorder=2)

# Headline banner near the top of the canvas.
ax.text(
    5, 5.5, "üéØ PROGRESSION DU PIPELINE E1",
    ha='center', va='center', fontsize=16, fontweight='bold',
    bbox={'boxstyle': 'round', 'facecolor': 'wheat', 'alpha': 0.5},
)

# Legend: one colour swatch per label.
legend_elements = [
    mpatches.Patch(facecolor=swatch, label=caption)
    for swatch, caption in (
        ('#4ECDC4', 'Termin√©'),
        ('#FECA57', 'En cours'),
        ('#E8E8E8', '√Ä venir'),
    )
]
ax.legend(handles=legend_elements, loc='upper left', fontsize=10)

# Quick stats panel below the timeline.
# No live statistics are loaded yet, so the panel shows a static status line.
# The previous bare `except:` wrapped a plain string concatenation that cannot
# raise, which made its fallback branch unreachable; it has been removed.
# When real stats loading is added here, catch the specific errors it can
# raise and fall back to a "pipeline starting" message explicitly.
stats_text = "\nüìä SNAPSHOT ACTUEL :\n"
stats_text += "   ‚Ä¢ Pipeline en cours d'ex√©cution...\n"

ax.text(5, 1.5, stats_text, ha='center', va='center', fontsize=10,
        bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.3))

plt.title("üé¨ FIL D'ARIANE VISUEL - Accompagnement narratif du jury", 
          fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

print("\nüí° Le fil d'Ariane vous guide √©tape par √©tape √† travers le pipeline")
print("   Chaque visualisation s'inscrit dans cette progression narrative\n")



> Notes:
> - Configuration des connexions **MinIO (DataLake)** et **PostgreSQL (SGBD)**
> - Création de l'arborescence `data/raw/` avec sous-dossiers par type de source
> - Système de logging pour tracer toutes les opérations
> - Fonctions utilitaires (timestamp UTC, hash SHA256 pour déduplication)


In [8]:
# DataSens E1_v2 - 01_setup_env
# üîß Configuration environnement : MinIO + PostgreSQL + Arborescence + Logging

import datetime as dt
import hashlib
import logging
import os
from datetime import UTC, datetime
from pathlib import Path

from dotenv import load_dotenv

# Locate the project root: the nearest directory, starting at the working
# directory and walking upwards, that contains both notebooks/ and docs/.
# The filesystem root itself is never tested; when nothing matches, the
# working directory is used.
start_dir = Path.cwd()
PROJECT_ROOT = next(
    (
        folder
        for folder in [start_dir, *start_dir.parents][:-1]
        if (folder / "notebooks").exists() and (folder / "docs").exists()
    ),
    start_dir,
)

print(f"üìÇ Racine projet d√©tect√©e : {PROJECT_ROOT}")

# Load environment variables from the .env file at the project root and
# report whether load_dotenv picked anything up.
env_path = PROJECT_ROOT / '.env'
loaded = load_dotenv(env_path)
print(f'‚úÖ .env charg√©: {env_path}' if loaded else f'‚ö†Ô∏è .env non trouv√©: {env_path}')

# Configuration MinIO (DataLake)
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "http://localhost:9002")
MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "admin")
MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "admin123")
MINIO_BUCKET = os.getenv("MINIO_BUCKET", "datasens-raw")

# Configuration PostgreSQL (SGBD)
PG_HOST = os.getenv("POSTGRES_HOST", "localhost")
PG_PORT = int(os.getenv("POSTGRES_PORT", "5433"))
PG_DB = os.getenv("POSTGRES_DB", "postgres")
PG_USER = os.getenv("POSTGRES_USER", "postgres")
PG_PASS = os.getenv("POSTGRES_PASS", "postgres")
PG_URL = f"postgresql+psycopg2://{PG_USER}:{PG_PASS}@{PG_HOST}:{PG_PORT}/{PG_DB}"

# Cl√©s API (optionnelles)
KAGGLE_USERNAME = os.getenv("KAGGLE_USERNAME")
KAGGLE_KEY = os.getenv("KAGGLE_KEY")
OWM_API_KEY = os.getenv("OWM_API_KEY")
NEWSAPI_KEY = os.getenv("NEWSAPI_KEY")
GDELT_BASE = os.getenv("GDELT_BASE", "http://data.gdeltproject.org/gkg/")

print("\nüîê Configuration MinIO (DataLake) :")
print(f"   ‚Ä¢ Endpoint : {MINIO_ENDPOINT}")
print(f"   ‚Ä¢ Bucket   : {MINIO_BUCKET}")

print("\nüóÑÔ∏è Configuration PostgreSQL (SGBD) :")
print(f"   ‚Ä¢ Host     : {PG_HOST}:{PG_PORT}")
print(f"   ‚Ä¢ Database : {PG_DB}")
print(f"   ‚Ä¢ User     : {PG_USER}")

print("\nüîë Cl√©s API :")
print(f"   ‚Ä¢ Kaggle        : {'‚úÖ Configur√©e' if KAGGLE_USERNAME else '‚ùå Manquante'}")
print(f"   ‚Ä¢ OpenWeatherMap: {'‚úÖ Configur√©e' if OWM_API_KEY else '‚ùå Manquante'}")
print(f"   ‚Ä¢ NewsAPI       : {'‚úÖ Configur√©e' if NEWSAPI_KEY else '‚ùå Manquante'}")

# Directory layout anchored at the project root.
DATA_DIR = PROJECT_ROOT / 'data'
RAW_DIR = DATA_DIR / 'raw'
LOGS_DIR = PROJECT_ROOT / 'logs'

# Sub-folders of data/raw, one per source type, plus manifests.
folders = ["kaggle", "api/owm", "api/newsapi", "rss", "scraping/multi",
           "scraping/viepublique", "scraping/datagouv", "gdelt", "manifests"]

# Create the two base folders first, then every raw sub-folder.
for directory in [RAW_DIR, LOGS_DIR] + [RAW_DIR / name for name in folders]:
    directory.mkdir(parents=True, exist_ok=True)

print(f"\n‚úÖ Arborescence cr√©√©e: {RAW_DIR}")
print(f"   ‚Ä¢ {len(folders)} sous-dossiers pr√™ts")

# Logging: one timestamped log file per run (UTC), mirrored to the console.
stamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
log_file = LOGS_DIR / f"collecte_{stamp}.log"
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] %(levelname)s - %(message)s',
    datefmt='%H:%M:%S',
    handlers=[
        logging.FileHandler(log_file, encoding='utf-8'),
        logging.StreamHandler()
    ],
    # basicConfig does nothing when the root logger already has handlers
    # (for instance after an earlier cell configured logging, or when this
    # cell is re-run). force=True removes and closes the existing handlers
    # first, so the file announced below really receives the records.
    force=True,
)
logging.info("Syst√®me de logging initialis√©")
print(f"üìÑ Log: {log_file}")

# Fonctions utilitaires
def ts() -> str:
    """Timestamp UTC ISO compact (YYYYMMDDTHHMMSSZ)"""
    return dt.datetime.now(tz=dt.UTC).strftime("%Y%m%dT%H%M%SZ")

def sha256_hash(s: str) -> str:
    """Return the hexadecimal SHA-256 digest of ``s`` encoded as UTF-8 (used for deduplication)."""
    hasher = hashlib.sha256()
    hasher.update(s.encode("utf-8"))
    return hasher.hexdigest()

print(f"\nüîß Utilitaires : ts()={ts()}, sha256()={sha256_hash('test')[:16]}...")

print("\n‚úÖ Configuration termin√©e !")


[13:51:48] INFO - Syst√®me de logging initialis√©


üìÇ Racine projet d√©tect√©e : c:\Users\Utilisateur\Desktop\DataSens
‚úÖ .env charg√©: c:\Users\Utilisateur\Desktop\DataSens\.env

üîê Configuration MinIO (DataLake) :
   ‚Ä¢ Endpoint : http://localhost:9000
   ‚Ä¢ Bucket   : datasens-raw

üóÑÔ∏è Configuration PostgreSQL (SGBD) :
   ‚Ä¢ Host     : localhost:5432
   ‚Ä¢ Database : datasens
   ‚Ä¢ User     : ds_user

üîë Cl√©s API :
   ‚Ä¢ Kaggle        : ‚úÖ Configur√©e
   ‚Ä¢ OpenWeatherMap: ‚úÖ Configur√©e
   ‚Ä¢ NewsAPI       : ‚úÖ Configur√©e

‚úÖ Arborescence cr√©√©e: c:\Users\Utilisateur\Desktop\DataSens\data\raw
   ‚Ä¢ 9 sous-dossiers pr√™ts
üìÑ Log: c:\Users\Utilisateur\Desktop\DataSens\logs\collecte_20251101_125148.log

üîß Utilitaires : ts()=20251101T125148Z, sha256()=9f86d081884c7d65...

‚úÖ Configuration termin√©e !


In [9]:
# Test des connexions MinIO et PostgreSQL

print("üîå Test des connexions...")
print("=" * 80)

# Connexion MinIO
try:
    from minio import Minio
    from minio.error import S3Error
    
    minio_client = Minio(
        MINIO_ENDPOINT.replace("http://", "").replace("https://", ""),
        access_key=MINIO_ACCESS_KEY,
        secret_key=MINIO_SECRET_KEY,
        secure=False
    )
    
    # Cr√©er le bucket s'il n'existe pas
    if not minio_client.bucket_exists(MINIO_BUCKET):
        minio_client.make_bucket(MINIO_BUCKET)
        print(f"‚úÖ MinIO : Bucket '{MINIO_BUCKET}' cr√©√©")
    else:
        print(f"‚úÖ MinIO : Bucket '{MINIO_BUCKET}' existe d√©j√†")
    
    # Lister les objets existants
    objects = list(minio_client.list_objects(MINIO_BUCKET, recursive=False))
    print(f"   ‚Ä¢ {len(list(objects))} objets existants dans le bucket")
    
except Exception as e:
    print(f"‚ùå MinIO : Erreur de connexion - {e}")
    print("   üí° V√©rifiez que Docker Compose est lanc√© : docker compose up -d")
    minio_client = None

# PostgreSQL connection check
try:
    from sqlalchemy import create_engine, text

    engine = create_engine(PG_URL, future=True)

    # Round-trip a constant to prove the server answers queries.
    with engine.connect() as conn:
        test_value = conn.execute(text("SELECT 1 as test")).scalar()

    if test_value != 1:
        print("‚ö†Ô∏è PostgreSQL : Connexion OK mais test inattendu")
    else:
        print(f"‚úÖ PostgreSQL : Connexion r√©ussie ({PG_HOST}:{PG_PORT}/{PG_DB})")

        # Count the tables present in the public schema.
        with engine.connect() as conn:
            result = conn.execute(text("""
                SELECT COUNT(*) FROM information_schema.tables 
                WHERE table_schema = 'public'
            """))
            nb_tables = result.scalar()
            print(f"   ‚Ä¢ {nb_tables} tables existantes dans la base")

except Exception as err:
    # Best effort: report the failure and leave engine set to None.
    print(f"‚ùå PostgreSQL : Erreur de connexion - {err}")
    print("   üí° V√©rifiez que Docker Compose est lanc√© : docker compose up -d")
    engine = None

print("\n‚úÖ Tests de connexion termin√©s !")


üîå Test des connexions...
‚úÖ MinIO : Bucket 'datasens-raw' existe d√©j√†


KeyboardInterrupt: 

In [4]:
# Override to root-level directories
try:
    ROOT = ROOT
except NameError:
    from pathlib import Path
    ROOT = Path.cwd().resolve().parents[2]
LOGS = ROOT / 'logs'
DATA = ROOT / 'data' / 'raw'
print('ROOT=', ROOT)
print('LOGS=', LOGS)
print('DATA(raw)=', DATA)


ROOT= C:\Users\Utilisateur\Desktop
LOGS= C:\Users\Utilisateur\Desktop\logs
DATA(raw)= C:\Users\Utilisateur\Desktop\data\raw


In [6]:
# Charger .env depuis la racine et cr√©er un .env.example si absent
from pathlib import Path
from dotenv import load_dotenv

# Define ROOT when no earlier cell did. Prefer PROJECT_ROOT when it exists;
# otherwise take the nearest directory at or above the working directory that
# contains both notebooks/ and docs/, falling back to the working directory.
# (A fixed parents[2] hop depends on the working directory's depth and can
# point outside the project.)
if 'ROOT' not in globals():
    _start = Path.cwd().resolve()
    ROOT = globals().get('PROJECT_ROOT') or next(
        (
            folder
            for folder in (_start, *_start.parents)
            if (folder / 'notebooks').exists() and (folder / 'docs').exists()
        ),
        _start,
    )

env_path = ROOT / '.env'
loaded = load_dotenv(env_path)
if loaded:
    print('‚úÖ .env charg√©:', env_path)
else:
    print('‚ö†Ô∏è .env non trouv√©:', env_path)
    # Leave a template next to the expected .env location, without ever
    # overwriting an existing one.
    example = ROOT / '.env.example'
    if not example.exists():
        example.write_text("""
# PostgreSQL
POSTGRES_HOST=localhost
POSTGRES_PORT=5432
POSTGRES_DB=datasens
POSTGRES_USER=ds_user
POSTGRES_PASS=ds_pass

# API Keys (optionnelles pour d√©mo)
OWM_API_KEY=
KAGGLE_USERNAME=
KAGGLE_KEY=

# Git (optionnel)
GIT_USER_NAME=
GIT_USER_EMAIL=
""".strip()+"\n", encoding='utf-8')
        print('üìÑ Exemple cr√©√©:', example)
    else:
        print('üìÑ Exemple d√©j√† pr√©sent:', example)



‚ö†Ô∏è .env non trouv√©: C:\Users\Utilisateur\Desktop\.env
üìÑ Exemple d√©j√† pr√©sent: C:\Users\Utilisateur\Desktop\.env.example


# DataSens E1_v2 — 01_setup_env

- Objectifs: arborescence raw, logging, .env
- Prérequis: Python, venv activé, `pip install -r requirements.txt`
- Ordre global E1_v2: 01 → 02 → 03 → 04 → 05
- Guide: docs/GUIDE_TECHNIQUE_E1.md


In [7]:
# DataSens E1_v2 - 01_setup_env
# .env config, raw directory tree, logging
import logging
from datetime import UTC, datetime
from pathlib import Path

# Raw-data tree, created under the current working directory.
ROOT = Path.cwd()
DATA = ROOT / "data" / "raw"
for sub in ["kaggle","api/owm","api/newsapi","rss","scraping/multi","scraping/viepublique","scraping/datagouv","gdelt","manifests"]:
    (DATA / sub).mkdir(parents=True, exist_ok=True)
print("‚úÖ Arborescence raw cr√©√©e:", DATA)

# Logging
# NOTE(review): logs are written beside ROOT (ROOT.parent / "logs") while the
# data tree lives under ROOT itself — confirm this asymmetry is intended.
LOGS = ROOT.parent / "logs"
LOGS.mkdir(parents=True, exist_ok=True)
stamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
log_file = LOGS / f"collecte_{stamp}.log"
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] %(levelname)s - %(message)s',
    datefmt='%H:%M:%S',
    handlers=[logging.FileHandler(log_file, encoding='utf-8'), logging.StreamHandler()],
    # Without force=True this call is ignored whenever the root logger was
    # already configured (earlier cell or re-run), and the log file printed
    # below would never be written.
    force=True,
)
logging.info("Syst√®me de logging initialis√©")
print("üìÑ Log:", log_file)



[12:20:39] INFO - Syst√®me de logging initialis√©


‚úÖ Arborescence raw cr√©√©e: c:\Users\Utilisateur\Desktop\DataSens\notebooks\datasens_E1_v2\data\raw
üìÑ Log: c:\Users\Utilisateur\Desktop\DataSens\notebooks\logs\collecte_20251101_112039.log
