# Quickstart — Pipeline ETL Olist

Ce notebook reproduit les étapes du README pour lancer le pipeline ETL de bout en bout :
1. Vérification de l'environnement
2. Téléchargement des données Kaggle
3. Exécution du pipeline ETL
4. Vérification de la base de données
5. Requêtes exploratoires

**Prérequis** : exécuter `uv venv && uv sync`, puis renseigner les identifiants Kaggle (`KAGGLE_USERNAME` / `KAGGLE_KEY`) dans `.env`.

## 1. Setup & vérifications

In [None]:
import logging
import sqlite3
from pathlib import Path

import pandas as pd

from src.config import CSV_FILES, DATABASE_PATH, RAW_DIR

# Send INFO-level (and above) log records to the notebook output, using a
# compact "LEVEL | logger | message" layout.
logging.basicConfig(format="%(levelname)s | %(name)s | %(message)s", level=logging.INFO)

# Check for the .env file one directory above the current working directory.
env_path = Path("../.env")
print(
    ".env trouve"
    if env_path.exists()
    else ".env manquant — copier .env.example en .env et renseigner KAGGLE_USERNAME / KAGGLE_KEY"
)

# Collect the logical names of the CSV files that are not yet in RAW_DIR.
missing = [name for name in CSV_FILES if not (RAW_DIR / CSV_FILES[name]).exists()]
if not missing:
    print(f"Les {len(CSV_FILES)} CSV sont presents dans {RAW_DIR}")
else:
    print(f"CSV manquants ({len(missing)}/{len(CSV_FILES)}) : {missing}")
    print("Lancer la cellule suivante pour telecharger les donnees.")

## 2. Téléchargement des données

In [None]:
# Telecharger depuis Kaggle (skip si deja present)
missing = [name for name, fname in CSV_FILES.items() if not (RAW_DIR / fname).exists()]
if missing:
    !bash ../scripts/download_dataset.sh
else:
    print("Donnees deja presentes, telechargement ignore.")

## 3. Lancer le pipeline ETL

In [None]:
# Import the pipeline entry point from the project package and call it with no
# arguments. The call is the last expression of the cell, so any non-None
# return value is rendered as the cell output.
# NOTE(review): presumably this builds the SQLite database at DATABASE_PATH
# that the following sections query — confirm in src.etl.pipeline.
from src.etl.pipeline import run_full_pipeline

run_full_pipeline()

## 4. Vérification de la base

In [None]:
# Tables to inspect in the SQLite database at DATABASE_PATH.
TABLES = [
    "dim_dates",
    "dim_geolocation",
    "dim_customers",
    "dim_sellers",
    "dim_products",
    "fact_orders",
]

conn = sqlite3.connect(DATABASE_PATH)
try:
    # Row count per table. The table names come from the constant list above,
    # so interpolating them into the SQL text is safe here.
    counts = {
        t: pd.read_sql(f"SELECT COUNT(*) AS n FROM {t}", conn)["n"].iloc[0]
        for t in TABLES
    }
    df_counts = pd.DataFrame(counts.items(), columns=["table", "lignes"])
    display(df_counts)

    # Preview: the first three rows of each table.
    for t in TABLES:
        print(f"\n--- {t} ---")
        display(pd.read_sql(f"SELECT * FROM {t} LIMIT 3", conn))
finally:
    # Always release the connection, even when a query fails (for example a
    # missing table), so re-running the cell does not leak open handles.
    conn.close()

## 5. Requêtes exploratoires

In [None]:
# Run three read-only exploratory queries against the SQLite database at
# DATABASE_PATH and display each result as a DataFrame.
conn = sqlite3.connect(DATABASE_PATH)
try:
    # Five product categories with the most fact_orders rows.
    print("Top 5 categories produits")
    display(pd.read_sql("""
        SELECT p.category_name_en AS categorie, COUNT(*) AS nb_commandes
        FROM fact_orders f
        JOIN dim_products p ON f.product_key = p.product_key
        GROUP BY categorie
        ORDER BY nb_commandes DESC
        LIMIT 5
    """, conn))

    # Number of fact_orders rows per (year, month), in chronological order.
    print("\nCommandes par mois")
    display(pd.read_sql("""
        SELECT d.year, d.month, COUNT(*) AS nb_commandes
        FROM fact_orders f
        JOIN dim_dates d ON f.date_key = d.date_key
        GROUP BY d.year, d.month
        ORDER BY d.year, d.month
    """, conn))

    # Mean review score over rows that have one, rounded to two decimals.
    print("\nScore review moyen")
    display(pd.read_sql("""
        SELECT ROUND(AVG(review_score), 2) AS score_moyen
        FROM fact_orders
        WHERE review_score IS NOT NULL
    """, conn))
finally:
    # Always release the connection, even when one of the queries fails, so
    # re-running the cell does not leak open handles.
    conn.close()