## DATA PIPELINE
- Étapes : Download ➜ Extract ➜ EDA

Structure des dossiers :
- /data/raw/       : ZIP téléchargés
- /data/extracted/ : Fichiers extraits par dataset
- /data/cleaned/   : Fichiers nettoyés finaux

In [None]:
import os
import zipfile
from pathlib import Path
import subprocess
import papermill as pm

# SECURITY: the Kaggle API key must never be hard-coded in the notebook. A key
# that has been committed is compromised and should be revoked and regenerated
# from the Kaggle account settings.
# Credentials are expected to come from outside the source: either the
# KAGGLE_USERNAME / KAGGLE_KEY environment variables or a kaggle.json file.
# The check below only warns; it never sets or overrides any credential.
KAGGLE_CREDENTIALS_AVAILABLE = all(
    os.environ.get(var) for var in ("KAGGLE_USERNAME", "KAGGLE_KEY")
) or any(
    (config_dir / "kaggle.json").is_file()
    for config_dir in (
        Path(os.environ.get("KAGGLE_CONFIG_DIR", Path.home() / ".kaggle")),
        Path.home() / ".kaggle",
        Path.home() / ".config" / "kaggle",
    )
)

if not KAGGLE_CREDENTIALS_AVAILABLE:
    print(
        "⚠️ Aucun identifiant Kaggle détecté : définis KAGGLE_USERNAME / KAGGLE_KEY "
        "dans l'environnement ou crée un fichier kaggle.json."
    )

In [None]:
# Project root: the parent of the directory the notebook is executed from.
BASE_DIR = Path(".").resolve().parent

# All pipeline folders live under <BASE_DIR>/data.
_data_root = BASE_DIR / "data"
RAW_DIR = _data_root / "raw"
EXTRACTED_DIR = _data_root / "extracted"
CLEANED_DIR = _data_root / "cleaned"

# Create the three data folders (and any missing parents) before reporting.
for _folder in (RAW_DIR, EXTRACTED_DIR, CLEANED_DIR):
    _folder.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so the run log shows where files will land.
for _label, _folder in (
    ("BASE_DIR", BASE_DIR),
    ("RAW_DIR", RAW_DIR),
    ("EXTRACTED_DIR", EXTRACTED_DIR),
    ("CLEANED_DIR", CLEANED_DIR),
):
    print(f"{_label}: {_folder}")

In [None]:
# Dataset catalogue. Each entry carries, in this order: the Kaggle slug, the
# ZIP file name used under the raw folder, the extraction sub-folder name, and
# the file expected to be present once the archive has been extracted.
_DATASET_FIELDS = ("slug", "zip_name", "extract_dir", "expected_file")

DATASETS = [
    dict(zip(_DATASET_FIELDS, _row))
    for _row in (
        (
            "sobhanmoosavi/us-accidents",
            "USA_Accidents_Traffic.zip",
            "USA_Accidents_Traffic",
            "US_Accidents_March23.csv",
        ),
        (
            "ryanjt/airline-delay-cause",
            "USA_Airline_Delay_Cause.zip",
            "USA_Airline_Delay_Cause",
            "Airline_Delay_Cause.csv",
        ),
        (
            "chrico03/railroad-accident-and-incident-data",
            "Railroad_Accident_Incident_Data.zip",
            "Railroad_Accident_Incident_Data",
            "Rail_Equipment_Accident_Incident_Data.csv",
        ),
        (
            "natasha0786/supply-chain-dataset",
            "Supply_chain_dataset.zip",
            "Supply_chain_dataset",
            "dynamic_supply_chain_logistics_dataset_with_country.csv",
        ),
        (
            "sujalsuthar/amazon-delivery-dataset",
            "Amazon_Delivery_Dataset.zip",
            "Amazon_Delivery_Dataset",
            "amazon_delivery.csv",
        ),
        (
            "gabrielcabart/maritime-accidents-and-port-data",
            "Shipping_Accidents.zip",
            "Shipping_Accidents",
            "Shipping_Accidents.shp",
        ),
    )
]

In [None]:
# Download step: fetch each dataset archive into RAW_DIR with the kaggle CLI,
# skipping archives that are already present under their standard name.
for ds in DATASETS:
    zip_path = RAW_DIR / ds["zip_name"]

    if zip_path.exists():
        print(f"✅ {ds['zip_name']} déjà présent.")
        continue

    print(f"⬇️  Téléchargement de {ds['zip_name']} ...")

    # Only the CLI call sits in the try block, so a failure on one dataset is
    # reported and the loop moves on to the next one.
    try:
        subprocess.run(
            [
                "kaggle", "datasets", "download",
                "-d", ds["slug"],
                "-p", str(RAW_DIR),
            ],
            check=True,
        )
    except FileNotFoundError as e:
        # Raised by subprocess when the "kaggle" executable is not on PATH.
        print(f"❌ Commande 'kaggle' introuvable pour {ds['zip_name']} : {e}")
        continue
    except subprocess.CalledProcessError as e:
        print(f"❌ Erreur lors du téléchargement de {ds['zip_name']} : {e}")
        continue

    # The archive is usually saved under the last slug component, so it is
    # renamed to the standard ZIP name after the download (not during it).
    downloaded_default_name = RAW_DIR / (ds["slug"].split("/")[-1] + ".zip")

    if downloaded_default_name.exists() and downloaded_default_name != zip_path:
        downloaded_default_name.rename(zip_path)
        print(f"✅ Fichier renommé : {downloaded_default_name.name} → {zip_path.name}")

    # Report success only when the expected archive is really on disk.
    if zip_path.exists():
        print(f"✅ Téléchargement terminé : {zip_path.name}")
    else:
        print(
            f"⚠️ Téléchargement effectué mais {zip_path.name} introuvable dans "
            f"{RAW_DIR} — vérifie le nom du fichier téléchargé."
        )

In [None]:
# Extraction step: unpack each dataset ZIP from RAW_DIR into its own folder
# under EXTRACTED_DIR, skipping datasets whose expected file already exists.
for ds in DATASETS:
    zip_path = RAW_DIR / ds["zip_name"]
    dest_dir = EXTRACTED_DIR / ds["extract_dir"]
    expected_file = dest_dir / ds["expected_file"]

    if expected_file.exists():
        print(f"✅ {expected_file.name} déjà extrait dans {dest_dir.name}.")
        continue

    # Guard: without the archive there is nothing to extract. Report it
    # explicitly and avoid leaving an empty target directory behind.
    if not zip_path.exists():
        print(f"❌ Archive introuvable : {zip_path} — téléchargement manquant ou échoué.")
        continue

    # Make sure the target directory exists.
    dest_dir.mkdir(parents=True, exist_ok=True)

    print(f"Extraction de {zip_path.name} vers {dest_dir} ...")

    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(dest_dir)
    except Exception as e:
        # Deliberately broad: one corrupt or unreadable archive must not stop
        # the remaining datasets from being extracted.
        print(f"❌ Erreur lors de l'extraction de {zip_path.name} : {e}")
        continue

    # The archive was unpacked; confirm that the file the pipeline relies on
    # is actually where it is expected.
    if expected_file.exists():
        print(f"✅ Extraction terminée : {expected_file.name}")
    else:
        print(f"⚠️ Extraction faite mais {expected_file.name} introuvable — vérifie le contenu.")

In [None]:
# Folder that holds the EDA notebooks.
NOTEBOOKS_DIR = BASE_DIR / "notebooks"

# One task per EDA notebook: the notebook file name paired with the cleaned
# file that is expected for it in the cleaned-data folder.
EDA_TASKS = [
    {"notebook": _notebook, "cleaned": _cleaned}
    for _notebook, _cleaned in (
        ("EDA_Accident_Traffic.ipynb", "usa_accidents_traffic_cleaned.csv"),
        ("EDA_Airline_Delay_Cause.ipynb", "airline_delay_cause_cleaned.csv"),
        ("EDA_Amazon_Delivery_Dataset.ipynb", "amazon_delivery_cleaned.csv"),
        ("EDA_Railroad_Accident_Incident_Data.ipynb", "railroad_accident_cleaned.csv"),
        ("EDA_Supply_chain_dataset.ipynb", "supply_chain_cleaned.csv"),
        ("EDA_Shipping_Accidents.ipynb", "shipping_accidents_cleaned.csv"),
    )
]

# EDA step: execute each EDA notebook with papermill unless its cleaned output
# already exists, then report an overall status that reflects any problem.
eda_problems = []

for task in EDA_TASKS:
    notebook_path = NOTEBOOKS_DIR / task["notebook"]
    cleaned_path = CLEANED_DIR / task["cleaned"]

    if cleaned_path.exists():
        print(f"✅ {cleaned_path.name} déjà généré. Skip.")
        continue

    print(f"Exécution de {task['notebook']} ...")

    try:
        pm.execute_notebook(
            input_path=str(notebook_path),
            output_path=None,  # do not save the executed notebook
            parameters={},
        )
    except Exception as e:
        # Deliberately broad: a failing notebook is recorded and the remaining
        # notebooks still run.
        print(f"❌ Erreur lors de l'exécution de {task['notebook']}: {e}")
        eda_problems.append(task["notebook"])
        continue

    # The run finished without raising; check that the cleaned file this task
    # expects is present, mirroring the check done after extraction.
    if cleaned_path.exists():
        print(f"✅ EDA terminé pour : {task['notebook']}")
    else:
        print(
            f"⚠️ EDA exécuté mais {cleaned_path.name} introuvable dans "
            f"{CLEANED_DIR} — vérifie le notebook."
        )
        eda_problems.append(task["notebook"])

# Announce full success only when no notebook failed or missed its output.
if eda_problems:
    print(
        f"\n=== ⚠️ EDA TERMINÉS AVEC {len(eda_problems)} PROBLÈME(S) : "
        f"{', '.join(eda_problems)} ==="
    )
else:
    print("\n=== ✅ TOUS LES EDA SONT TERMINÉS ===")