In [None]:
from pathlib import Path

ROOT = Path.cwd().resolve().parents[2]
DATA_DIR = ROOT / 'data'
DOCS_DIR = ROOT / 'docs'
LOGS_DIR = ROOT / 'logs'
print('ROOT=', ROOT)
print('DATA_DIR=', DATA_DIR)
print('DOCS_DIR=', DOCS_DIR)
print('LOGS_DIR=', LOGS_DIR)


In [None]:
# Normalize paths to project root
try:
    ROOT = ROOT
except NameError:
    from pathlib import Path
    ROOT = Path.cwd().resolve().parents[2]
DATA_DIR = ROOT / 'data'
DOCS_DIR = ROOT / 'docs'
LOGS_DIR = ROOT / 'logs'
print('ROOT=', ROOT)
print('DATA_DIR=', DATA_DIR)
print('DOCS_DIR=', DOCS_DIR)
print('LOGS_DIR=', LOGS_DIR)


# üîß DataSens E1 ‚Äî Notebook 1 : Setup Environnement

**üéØ Objectif** : Configurer l'environnement de d√©veloppement et valider la stack technique

---

## üìã Contenu de ce notebook

1. Introduction et objectifs E1
2. V√©rification de l'environnement Python
3. Cr√©ation de l'arborescence projet
4. Configuration variables d'environnement (.env)
5. Connexion PostgreSQL (test)
6. Initialisation Git

---

## üîí RGPD & Gouvernance

‚ö†Ô∏è **Rappel important** :
- Pas de donn√©es personnelles directes (hash SHA-256 si n√©cessaire)
- Respect robots.txt pour le scraping ; throttle & user-agent explicite
- Droits d'usage document√©s par source (lien dans Markdown)
- Journaux d'ex√©cution (log simple CSV/JSON) + manifest par run



## üéØ Introduction & Objectifs E1

**DataSens E1** : Construction du socle data avec :
- ‚úÖ Mod√©lisation Merise (MCD ‚Üí MLD ‚Üí MPD cibl√©)
- ‚úÖ Cr√©ation et remplissage PostgreSQL (18 tables)
- ‚úÖ CRUD complet test√© depuis le notebook
- ‚úÖ Ingestion r√©elle des **5 types de sources** :
  1. **Fichier plat** : Kaggle CSV
  2. **Base de donn√©es** : Export Kaggle SQLite ‚Üí Postgres
  3. **API** : OpenWeatherMap (m√©t√©o communes)
  4. **Web Scraping** : MonAvisCitoyen (dry-run, robots.txt)
  5. **Big Data** : GDELT GKG (√©chantillon journalier)
- ‚úÖ Tra√ßabilit√© et gouvernance (flux, manifest, versionning Git)

**Mode d'ex√©cution** : Cellule par cellule (pas √† pas)


In [1]:
# V√©rification environnement Python
import subprocess
import sys
from pathlib import Path

print("üîç V√©rification environnement Python")
print("=" * 80)
print(f"Python version : {sys.version}")
print(f"Python executable : {sys.executable}")

# V√©rifier version Python >= 3.11
version_info = sys.version_info
if version_info.major >= 3 and version_info.minor >= 11:
    print(f"‚úÖ Python {version_info.major}.{version_info.minor}.{version_info.micro} OK")
else:
    print(f"‚ö†Ô∏è Python {version_info.major}.{version_info.minor} ‚Äî Recommand√© Python 3.11+")


üîç V√©rification environnement Python
Python version : 3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 13:17:27) [MSC v.1929 64 bit (AMD64)]
Python executable : c:\Users\Utilisateur\Desktop\Datasens_Project\.venv\Scripts\python.exe
‚úÖ Python 3.12.7 OK


In [2]:
# Liste des packages install√©s
print("\nüì¶ Packages Python install√©s")
print("=" * 80)

packages_to_check = [
    "pandas",
    "sqlalchemy",
    "psycopg2",
    "requests",
    "beautifulsoup4",
    "python-dotenv",
]

result = subprocess.run(
    [sys.executable, "-m", "pip", "list"],
    check=False, capture_output=True,
    text=True
)

if result.returncode == 0:
    installed_packages = result.stdout
    print("\nV√©rification packages critiques :\n")
    for pkg in packages_to_check:
        if pkg in installed_packages.lower():
            version = [line for line in installed_packages.split("\n") if pkg.lower() in line.lower()]
            if version:
                print(f"  ‚úÖ {version[0]}")
            else:
                print(f"  ‚úÖ {pkg} (version non d√©tect√©e)")
        else:
            print(f"  ‚ùå {pkg} - √Ä installer : pip install {pkg}")

    print("\nüìã Liste compl√®te (pip list) :")
    print(installed_packages[:500] + "..." if len(installed_packages) > 500 else installed_packages)
else:
    print("‚ö†Ô∏è Impossible d'ex√©cuter pip list")



üì¶ Packages Python install√©s

V√©rification packages critiques :

  ‚úÖ pandas                   2.3.3
  ‚úÖ SQLAlchemy               2.0.44
  ‚úÖ psycopg2-binary          2.9.11
  ‚úÖ requests                 2.32.5
  ‚úÖ beautifulsoup4           4.14.2
  ‚úÖ python-dotenv            1.2.1

üìã Liste compl√®te (pip list) :
Package                  Version
------------------------ -----------
annotated-types          0.7.0
argon2-cffi              25.1.0
argon2-cffi-bindings     25.1.0
asttokens                3.0.0
beautifulsoup4           4.14.2
cachetools               6.2.1
certifi                  2025.10.5
cffi                     2.0.0
charset-normalizer       3.4.4
colorama                 0.4.6
comm                     0.2.3
contourpy                1.3.3
cycler                   0.12.1
debugpy            ...


## üìÅ Cr√©ation de l'arborescence projet

Structure du projet selon les conventions DataSens :


In [3]:
# D√©terminer la racine du projet (parent de notebooks/)
NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == "notebooks" else NOTEBOOK_DIR

print("üìÅ Cr√©ation de l'arborescence projet")
print("=" * 80)
print(f"Racine projet : {PROJECT_ROOT}")

# Arborescence √† cr√©er
directories = {
    "data": ["raw", "silver", "gold"],
    "data/raw": ["kaggle", "api", "scraping", "gdelt", "manifests"],
    "data/raw/api": ["owm"],
    "data/raw/scraping": ["mav"],  # MonAvisCitoyen
    "logs": [],
    "docs": [],
    "notebooks": [],
}

created = []
for base_dir, subdirs in directories.items():
    base_path = PROJECT_ROOT / base_dir
    base_path.mkdir(parents=True, exist_ok=True)
    created.append(f"‚úÖ {base_dir}/")

    for subdir in subdirs:
        sub_path = base_path / subdir
        sub_path.mkdir(parents=True, exist_ok=True)
        created.append(f"   ‚úÖ {base_dir}/{subdir}/")

print("\nüìÇ Dossiers cr√©√©s :")
for item in created:
    print(item)

print(f"\n‚úÖ Arborescence pr√™te ! ({len(created)} dossiers)")


üìÅ Cr√©ation de l'arborescence projet
Racine projet : c:\Users\Utilisateur\Desktop\Datasens_Project

üìÇ Dossiers cr√©√©s :
‚úÖ data/
   ‚úÖ data/raw/
   ‚úÖ data/silver/
   ‚úÖ data/gold/
‚úÖ data/raw/
   ‚úÖ data/raw/kaggle/
   ‚úÖ data/raw/api/
   ‚úÖ data/raw/scraping/
   ‚úÖ data/raw/gdelt/
   ‚úÖ data/raw/manifests/
‚úÖ data/raw/api/
   ‚úÖ data/raw/api/owm/
‚úÖ data/raw/scraping/
   ‚úÖ data/raw/scraping/mav/
‚úÖ logs/
‚úÖ docs/
‚úÖ notebooks/

‚úÖ Arborescence pr√™te ! (17 dossiers)


## ‚öôÔ∏è Configuration .env

Cr√©ation du fichier `.env` de d√©veloppement avec variables PostgreSQL et API keys.

**‚ö†Ô∏è IMPORTANT** : Ce fichier ne doit JAMAIS √™tre commit√© (dans .gitignore)


In [4]:
import os

from dotenv import load_dotenv

# Charger .env s'il existe
env_path = PROJECT_ROOT / ".env"
env_loaded = load_dotenv(env_path)

if env_loaded:
    print(f"‚úÖ Fichier .env charg√© : {env_path}")
else:
    print(f"‚ö†Ô∏è Fichier .env non trouv√© : {env_path}")
    print("   Cr√©ation d'un .env.example pour r√©f√©rence...")

    # Cr√©er un .env.example
    env_example = PROJECT_ROOT / ".env.example"
    env_example.write_text("""
# PostgreSQL
POSTGRES_HOST=localhost
POSTGRES_PORT=5432
POSTGRES_DB=datasens
POSTGRES_USER=ds_user
POSTGRES_PASS=ds_pass

# API Keys (optionnelles pour d√©mo)
OWM_API_KEY=your_openweathermap_key_here
KAGGLE_USERNAME=your_kaggle_username
KAGGLE_KEY=your_kaggle_key

# Git (optionnel)
GIT_USER_NAME=Your Name
GIT_USER_EMAIL=your.email@example.com
""")
    print(f"   üìÑ Template cr√©√© : {env_example}")

# Afficher configuration (sans afficher les mots de passe)
print("\nüîê Configuration charg√©e :")
print(f"   POSTGRES_HOST : {os.getenv('POSTGRES_HOST', 'localhost')}")
print(f"   POSTGRES_PORT : {os.getenv('POSTGRES_PORT', '5432')}")
print(f"   POSTGRES_DB   : {os.getenv('POSTGRES_DB', 'datasens')}")
print(f"   POSTGRES_USER : {os.getenv('POSTGRES_USER', 'ds_user')}")
print(f"   OWM_API_KEY   : {'‚úÖ Configur√©e' if os.getenv('OWM_API_KEY') else '‚ùå Manquante (optionnelle)'}")
print(f"   KAGGLE_USERNAME: {'‚úÖ Configur√©e' if os.getenv('KAGGLE_USERNAME') else '‚ùå Manquante (optionnelle)'}")


‚úÖ Fichier .env charg√© : c:\Users\Utilisateur\Desktop\Datasens_Project\.env

üîê Configuration charg√©e :
   POSTGRES_HOST : localhost
   POSTGRES_PORT : 5432
   POSTGRES_DB   : datasens
   POSTGRES_USER : ds_user
   OWM_API_KEY   : ‚úÖ Configur√©e
   KAGGLE_USERNAME: ‚úÖ Configur√©e


## üóÑÔ∏è Connexion PostgreSQL

Test de connexion √† la base PostgreSQL (via Docker ou locale)


In [5]:
from sqlalchemy import create_engine, text

# R√©cup√©rer variables d'environnement
PG_HOST = os.getenv("POSTGRES_HOST", "localhost")
PG_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
PG_DB = os.getenv("POSTGRES_DB", "datasens")
PG_USER = os.getenv("POSTGRES_USER", "ds_user")
PG_PASS = os.getenv("POSTGRES_PASS", "ds_pass")

# URL de connexion
PG_URL = f"postgresql+psycopg2://{PG_USER}:{PG_PASS}@{PG_HOST}:{PG_PORT}/{PG_DB}"

print("üîå Test connexion PostgreSQL")
print("=" * 80)
print(f"URL : postgresql://{PG_USER}:***@{PG_HOST}:{PG_PORT}/{PG_DB}")

try:
    engine = create_engine(PG_URL, future=True)

    # Test simple : SELECT 1
    with engine.connect() as conn:
        result = conn.execute(text("SELECT 1 as test"))
        test_value = result.scalar()

    if test_value == 1:
        print("‚úÖ Connexion PostgreSQL r√©ussie !")
        print(f"   üóÑÔ∏è Base de donn√©es : {PG_DB}")
        print(f"   üë§ Utilisateur : {PG_USER}")
        print(f"   üìç Serveur : {PG_HOST}:{PG_PORT}")
    else:
        print("‚ö†Ô∏è Connexion OK mais test inattendu")

except Exception as e:
    print(f"‚ùå Erreur de connexion : {e}")
    print("\nüí° V√©rifications :")
    print("   1. Docker Compose est-il d√©marr√© ? ‚Üí docker-compose up -d")
    print("   2. PostgreSQL est-il accessible sur le port 5432 ?")
    print("   3. Les credentials dans .env sont-ils corrects ?")


üîå Test connexion PostgreSQL
URL : postgresql://ds_user:***@localhost:5432/datasens
‚úÖ Connexion PostgreSQL r√©ussie !
   üóÑÔ∏è Base de donn√©es : datasens
   üë§ Utilisateur : ds_user
   üìç Serveur : localhost:5432


## üîÑ Initialisation Git

Initialisation du d√©p√¥t Git (si ce n'est pas d√©j√† fait) et premier commit


In [6]:
import subprocess

print("üîÑ V√©rification Git")
print("=" * 80)

# V√©rifier si Git est install√©
try:
    git_version = subprocess.run(
        ["git", "--version"],
        check=False, capture_output=True,
        text=True
    )
    print(f"‚úÖ {git_version.stdout.strip()}")
except FileNotFoundError:
    print("‚ùå Git non install√© ‚Äî Installation requise : https://git-scm.com/")
    exit(1)

# V√©rifier si le projet est d√©j√† un d√©p√¥t Git
git_dir = PROJECT_ROOT / ".git"
if git_dir.exists():
    print(f"\n‚úÖ D√©p√¥t Git d√©j√† initialis√© : {PROJECT_ROOT}")

    # Afficher git status
    try:
        status = subprocess.run(
            ["git", "status", "--short"],
            check=False, cwd=PROJECT_ROOT,
            capture_output=True,
            text=True
        )
        if status.stdout.strip():
            print("\nüìã Fichiers modifi√©s/non suivis :")
            print(status.stdout)
        else:
            print("\nüìã Aucun changement (working tree clean)")
    except Exception as e:
        print(f"‚ö†Ô∏è Impossible de lire git status : {e}")
else:
    print(f"\n‚ö†Ô∏è D√©p√¥t Git non initialis√© dans {PROJECT_ROOT}")
    print("   üí° Initialisation manuelle recommand√©e :")
    print(f"      cd {PROJECT_ROOT}")
    print("      git init")
    print("      git add .")
    print('      git commit -m "Initial commit E1"')

print("\n‚úÖ Setup environnement termin√© !")
print("   ‚û°Ô∏è Passez au notebook 02_schema_create.ipynb")


üîÑ V√©rification Git
‚úÖ git version 2.49.0.windows.1

‚úÖ D√©p√¥t Git d√©j√† initialis√© : c:\Users\Utilisateur\Desktop\Datasens_Project

üìã Fichiers modifi√©s/non suivis :
 D docs/FIX_PDF_QUICK.md
 M docs/GUIDE_TECHNIQUE_JURY.md
 M docs/GUIDE_TECHNIQUE_JURY.pdf
 D docs/PDF_FORMATTING_INSTRUCTIONS.md
 D docs/fix_pdf_formatting.ps1
 M notebooks/datasens_E1_v2.ipynb
?? docs/ARCHITECTURE_PIPELINE_E1.md
?? docs/GUIDE_TECHNIQUE_JURY_V2.md
?? docs/datasens_dictionary.md
?? docs/e1_schema.sql
?? notebooks/01_setup_env.ipynb
?? notebooks/02_schema_create.ipynb
?? notebooks/03_ingest_sources.ipynb
?? notebooks/04_crud_tests.ipynb
?? notebooks/05_snapshot_and_readme.ipynb
?? notebooks/README_VERSIONNING.md
?? notebooks/data/raw/api/
?? notebooks/data/raw/gdelt/
?? notebooks/data/raw/kaggle/
?? notebooks/data/raw/manifests/
?? notebooks/data/raw/rss/
?? notebooks/data/raw/scraping/multi/scraping_multi_20251029T122841Z.csv


‚úÖ Setup environnement termin√© !
   ‚û°Ô∏è Passez au notebook 02_s