In [61]:
import os
import json
import sqlite3
from datetime import datetime
from pathlib import Path
import yaml
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.window import Window
from IPython.display import display, Markdown


# -------------------------------------------------------------------
# 0. Racine du projet (attention √† __file__ si tu es en notebook)
# -------------------------------------------------------------------
try:
    PROJECT_ROOT = Path(__file__).resolve().parents[2]
except NameError:
    PROJECT_ROOT = Path("/mnt/c/Users/alexa/Simplon/Esther/Exos/Starter stack pour Data Engineers - Partie 1")

# -------------------------------------------------------------------
# 1. Chargement de settings.yaml (identique √† ta version)
# -------------------------------------------------------------------
def load_settings(path: str = "settings.yaml") -> dict:
    """
    Charge settings.yaml en basant les chemins relatifs sur la racine du
    projet (deux niveaux au-dessus de ce script).
    """
    base_dir = PROJECT_ROOT
    cfg_path = Path(path)
    if not cfg_path.is_absolute():
        cfg_path = (base_dir / cfg_path).resolve()
    if not cfg_path.exists():
        raise FileNotFoundError(f"Settings file not found: {cfg_path}")
    with cfg_path.open("r", encoding="utf-8") as f:
        return yaml.safe_load(f)

def resolve_path(value, default: str) -> Path:
    """
    Si 'value' est d√©fini on l'utilise, sinon 'default'.
    On convertit en chemin absolu bas√© sur PROJECT_ROOT.
    """
    target = Path(value if value is not None else default)
    if not target.is_absolute():
        target = (PROJECT_ROOT / target).resolve()
    return target

# -------------------------------------------------------------------
# 2. Cr√©ation de la SparkSession
# -------------------------------------------------------------------
def create_spark(app_name: str = "StarterStack_PySpark"):
    """
    Cr√©e une SparkSession locale.

    üêº Pandas : rien √† faire, tu manipules des DataFrame en m√©moire.
    üî• PySpark : tu DOIS cr√©er une session Spark pour avoir un contexte distribu√©.
    """
    spark = (
        SparkSession.builder
        .appName(app_name)
        .master("local[*]")  # toutes les cores locales
        # .config("spark.sql.shuffle.partitions", "8")  # optionnel
        .getOrCreate()
    )
    return spark

# -------------------------------------------------------------------
# 3. Lecture CSV avec Spark (remplace pd.read_csv)
# -------------------------------------------------------------------
def read_csv_spark(spark, path: Path, sep: str, enc: str):
    """
    Lecture CSV fa√ßon Spark.

    üêº Pandas : pd.read_csv(path, sep=sep, encoding=enc)
    üî• PySpark : spark.read.option(...).csv(...)
    """
    return (
        spark.read
        .option("header", True)
        .option("sep", sep)
        .option("encoding", enc)
        .csv(str(path))
    )
    
cfg = load_settings()

in_dir = resolve_path(cfg.get("input_dir"), "data/march-input")
out_dir = resolve_path(cfg.get("output_dir"), "data/out")
db_path = resolve_path(cfg.get("db_path"), "data/sales_db.db")

sep = cfg.get("csv_sep",",")
enc = cfg.get("csv_encoding", "utf-8")
ffmt = cfg.get("csv_float_format", "%.2f")  # si tu t'en sers plus tard

In [31]:
spark = create_spark()
# On reste coh√©rent avec ton usage de Path plut√¥t que os.path
customers_path = in_dir / "customers.csv"

if not customers_path.exists():
    display(Markdown(f"Fichier manquant : `{customers_path}`."))
else:
    # üî• Lecture avec Spark (remplace pd.read_csv)
    customers_sdf = read_csv_spark(spark, customers_path, sep=sep, enc=enc)

    # üî• Equivalent de customers.head(30)
    # Spark ne retourne pas directement un DataFrame "affichable" dans Jupyter,
    # donc on prend un √©chantillon limit√© et on le convertit en pandas
    customers_head_pdf = customers_sdf.show(50, truncate=False)
    display(customers_head_pdf)

    # üî• Equivalent de customers.shape
    # - .count() = nombre de lignes
    # - len(df.columns) = nombre de colonnes
    n_rows = customers_sdf.count()          # ‚ö†Ô∏è action ‚Üí d√©clenche un job Spark
    n_cols = len(customers_sdf.columns)

    display(Markdown(f"Taille: ({n_rows}, {n_cols})"))

+-----------+----------+---------+------------------+---------+---------+
|customer_id|first_name|last_name|email             |city     |is_active|
+-----------+----------+---------+------------------+---------+---------+
|C0001      |User1     |Test1    |user1@example.com |Nantes   |yes      |
|C0002      |User2     |Test2    |user2@example.com |Toulouse |yes      |
|C0003      |User3     |Test3    |user3@example.com |Bordeaux |y        |
|C0004      |User4     |Test4    |user4@example.com |Bordeaux |true     |
|C0005      |User5     |Test5    |user5@example.com |Lyon     |true     |
|C0006      |User6     |Test6    |user6@example.com |Marseille|false    |
|C0007      |User7     |Test7    |user7@example.com |Toulouse |true     |
|C0008      |User8     |Test8    |user8@example.com |Marseille|false    |
|C0009      |User9     |Test9    |user9@example.com |Toulouse |false    |
|C0010      |User10    |Test10   |user10@example.com|Bordeaux |true     |
|C0011      |User11    |Test11   |user

None

Taille: (800, 6)

In [32]:
# Ici on utilise Path comme dans tout ton notebook
refunds_path = in_dir / "refunds.csv"

if not refunds_path.exists():
    display(Markdown(f"Fichier manquant : `{refunds_path}`."))
else:
    # üî• Lecture Spark (remplace pd.read_csv)
    refunds_sdf = read_csv_spark(spark, refunds_path, sep=sep, enc=enc)

    # üî• Equivalent de refunds.head()
    refunds_head_pdf = refunds_sdf.show(50, truncate=False)
    display(refunds_head_pdf)

    # üî• Equivalent de refunds.shape
    n_rows = refunds_sdf.count()         # ‚ö†Ô∏è action ‚Üí spark job
    n_cols = len(refunds_sdf.columns)

    display(Markdown(f"Taille: ({n_rows}, {n_cols})"))

+---------+-------------+------+----------+-------------------+
|refund_id|order_id     |amount|reason    |created_at         |
+---------+-------------+------+----------+-------------------+
|R000001  |O202503010089|error |delay     |2025-03-01 14:03:41|
|R000002  |O202503010038|-8.89 |gesture   |2025-03-01 22:16:56|
|R000003  |O202503010008|again |item_issue|2025-03-01 20:06:25|
|R000004  |O202503010073|-2.47 |coupon    |2025-03-01 20:02:46|
|R000005  |O202503010005|-3.83 |gesture   |2025-03-01 09:58:15|
|R000006  |O202503010099|-6.53 |item_issue|2025-03-01 20:32:00|
|R000007  |O202503010003|-3.13 |item_issue|2025-03-01 08:49:52|
|R000008  |O202503010080|-16.96|gesture   |2025-03-01 11:00:56|
|R000009  |O202503010021|-5.93 |item_issue|2025-03-01 12:13:42|
|R000010  |O202503010096|-8.82 |item_issue|2025-03-01 18:18:47|
|R000011  |O202503010005|-8.92 |gesture   |2025-03-01 14:26:54|
|R000012  |O202503010091|-5.72 |coupon    |2025-03-01 12:23:01|
|R000013  |O202503010005|-5.1  |item_iss

None

Taille: (1122, 5)

In [None]:
# Toujours Path pour coh√©rence
order_path = in_dir / "orders_2025-03-01.json"

if not order_path.exists():
    display(Markdown(f"Fichier manquant : `{order_path}`."))
else:
    # üî• Lecture JSON avec Spark (remplace pd.read_json)
    order_sdf = (
        spark.read
             .option("multiline", True)      # si JSON complexe multi-lignes
             .json(str(order_path))
    )

    # üî• Equivalent de df.head()
    order_head_pdf = order_sdf.show(50, truncate=False)
    display(order_head_pdf)

    # üî• Equivalent de df.shape
    n_rows = order_sdf.count()                 # ‚ö†Ô∏è job Spark
    n_cols = len(order_sdf.columns)

    display(Markdown(f"Taille: ({n_rows}, {n_cols})"))

    # (Tr√®s utile !) voir le sch√©ma :
    print("üìå Sch√©ma d√©tect√© par Spark :")
    order_sdf.printSchema()
    #Le sch√©ma en PySpark, c‚Äôest la ‚Äúcarte d‚Äôidentit√©‚Äù des donn√©es : 
    #il d√©crit le type et la nullabilit√© de chaque colonne, 
    # ce qui permet √† Spark d‚Äôoptimiser massivement l‚Äôex√©cution, 
    # de d√©tecter les incoh√©rences et de manipuler des donn√©es complexes 
    # de fa√ßon s√ªre et ultra-performante ‚Äî en bref, c‚Äôest g√©nial 
    # parce que Spark sait exactement quoi tu traites et comment le traiter, 
    # sans jamais tout charger en m√©moire.

+-------+-------------------+-----------+---------------------------------------------------------------------------+-------------+--------------+
|channel|created_at         |customer_id|items                                                                      |order_id     |payment_status|
+-------+-------------------+-----------+---------------------------------------------------------------------------+-------------+--------------+
|app    |2025-03-01 20:36:44|C0793      |[{4, SKU001, 24.9}]                                                        |O202503010001|pending       |
|web    |2025-03-01 11:30:49|C0676      |[{4, SKU042, -7.5}, {4, SKU042, -7.5}, {5, SKU005, 12.5}]                  |O202503010001|paid          |
|web    |2025-03-01 07:27:00|C0642      |[{1, SKU014, 5.0}]                                                         |O202503010003|paid          |
|web    |2025-03-01 14:28:46|C0283      |[{2, SKU024, 4.0}]                                                         |O

None

Taille: (103, 6)

üìå Sch√©ma d√©tect√© par Spark :
root
 |-- channel: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- items: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- qty: long (nullable = true)
 |    |    |-- sku: string (nullable = true)
 |    |    |-- unit_price: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- payment_status: string (nullable = true)



In [51]:
# Pattern pour lire *tous* les orders du mois
# ex : orders_2025-03-01.json, orders_2025-03-02.json, ..., orders_2025-03-31.json
pattern = str(in_dir / "orders_2025-03-*.json")

# üî• Lecture de tous les JSON d'un coup
orders_sdf = spark.read.option("multiline", True).json(pattern)

# Aper√ßu (√©quivalent head)
orders_sdf.show(5, truncate=False)

# Taille
n_rows = orders_sdf.count()
n_cols = len(orders_sdf.columns)

display(Markdown(f"Taille: ({n_rows}, {n_cols})"))


25/11/20 13:39:13 WARN FileStreamSink: Assume no metadata directory. Error while looking for metadata directory in the path: /mnt/c/Users/alexa/Simplon/Esther/Exos/Starter stack pour Data Engineers - Partie 1/data/march-input/orders_2025-03-*.json.
java.io.FileNotFoundException: File /mnt/c/Users/alexa/Simplon/Esther/Exos/Starter stack pour Data Engineers - Partie 1/data/march-input/orders_2025-03-*.json does not exist
	at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:917)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1238)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:907)
	at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:462)
	at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:56)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:381)
	at org.apache.s

+-------+-------------------+-----------+-------------------------------------------------------------------------+-------------+--------------+
|channel|created_at         |customer_id|items                                                                    |order_id     |payment_status|
+-------+-------------------+-----------+-------------------------------------------------------------------------+-------------+--------------+
|web    |2025-03-07 22:25:41|C0636      |[{1, SKU022, 5.0}, {2, SKU023, 4.0}, {3, SKU025, 15.0}, {5, SKU006, 7.5}]|O202503070001|paid          |
|app    |2025-03-07 19:17:49|C0499      |[{5, SKU017, 4.0}]                                                       |O202503070002|paid          |
|app    |2025-03-07 19:39:25|C0417      |[{5, SKU007, 4.0}, {3, SKU022, 7.5}, {4, SKU005, 4.0}, {2, SKU001, 12.5}]|O202503070003|pending       |
|web    |2025-03-07 11:00:36|C0056      |[{4, SKU030, 5.0}, {2, SKU046, 8.0}, {5, SKU029, 7.5}]                   |O202503070004|p

Taille: (3193, 6)

In [50]:

# On part de customers_sdf (DataFrame Spark charg√© avant)
customers_clean = (
    customers_sdf
    # 1Ô∏è‚É£ Normalisation de "is_active" (√©quivalent de apply(controle_bool))
    .withColumn(
        "is_active",
        F.when(F.col("is_active").cast("boolean").isNotNull(), F.col("is_active").cast("boolean"))  # cas bool direct
         .when(F.col("is_active").isin(1, "1"), True)                                              # int/str -> True
         .when(F.lower(F.col("is_active")).isin("true", "yes", "y", "t"), True)                   # strings positives
         .when(F.col("is_active").isin(0, "0"), False)                                             # False explicite
         .when(F.lower(F.col("is_active")).isin("false", "no", "n", "f"), False)                  # strings n√©gatives
         .otherwise(False)                                                                         # fallback = False (comme ton code)
    )

    # 2Ô∏è‚É£ Force le type de customer_id et city
    # Pandas : customers.astype({"customer_id":"string","city":"string"})
    # PySpark : cast sur chaque colonne
    .withColumn("customer_id", F.col("customer_id").cast("string"))
    .withColumn("city", F.col("city").cast("string"))
)

# 3Ô∏è‚É£ Affichage (√©quivalent display + head)
display(Markdown("Affichage clients (apr√®s nettoyage)"))
customers_clean.show(5, truncate=False)

# 4Ô∏è‚É£ Taille √©quivalente
n_rows = customers_clean.count()
n_cols = len(customers_clean.columns)
display(Markdown(f"Taille: ({n_rows}, {n_cols})"))


Affichage clients (apr√®s nettoyage)

+-----------+----------+---------+-----------------+--------+---------+
|customer_id|first_name|last_name|email            |city    |is_active|
+-----------+----------+---------+-----------------+--------+---------+
|C0001      |User1     |Test1    |user1@example.com|Nantes  |true     |
|C0002      |User2     |Test2    |user2@example.com|Toulouse|true     |
|C0003      |User3     |Test3    |user3@example.com|Bordeaux|true     |
|C0004      |User4     |Test4    |user4@example.com|Bordeaux|true     |
|C0005      |User5     |Test5    |user5@example.com|Lyon    |true     |
+-----------+----------+---------+-----------------+--------+---------+
only showing top 5 rows


Taille: (800, 6)

In [49]:
refunds_clean = (
    refunds_sdf
    # 1Ô∏è‚É£ Utiliser try_cast ‚Üí NULL si ce n'est pas convertible, oblig√© 
    #d'utiliser une fonction try_cast qui vient du SQL non pas du python pour
    #que pyspark continue sans soulever d'erreur
    .withColumn(
        "amount",
        F.expr("try_cast(amount as double)")
    )
    # 2Ô∏è‚É£ Remplacer les NULL par 0.0 (comme fillna apr√®s to_numeric)
    .fillna({"amount": 0.0})
    # 3Ô∏è‚É£ Garder created_at en string
    .withColumn("created_at", F.col("created_at").cast("string"))
)

display(Markdown("Aper√ßu remboursements (apr√®s coercition num√©rique)"))
refunds_clean.show(5, truncate=False)
n_rows = refunds_clean.count()
n_cols = len(refunds_clean.columns)
display(Markdown(f"Taille: ({n_rows}, {n_cols})"))


Aper√ßu remboursements (apr√®s coercition num√©rique)

+---------+-------------+------+----------+-------------------+
|refund_id|order_id     |amount|reason    |created_at         |
+---------+-------------+------+----------+-------------------+
|R000001  |O202503010089|0.0   |delay     |2025-03-01 14:03:41|
|R000002  |O202503010038|-8.89 |gesture   |2025-03-01 22:16:56|
|R000003  |O202503010008|0.0   |item_issue|2025-03-01 20:06:25|
|R000004  |O202503010073|-2.47 |coupon    |2025-03-01 20:02:46|
|R000005  |O202503010005|-3.83 |gesture   |2025-03-01 09:58:15|
+---------+-------------+------+----------+-------------------+
only showing top 5 rows


Taille: (1122, 5)

In [53]:
# 1Ô∏è‚É£ Nombre de lignes avant filtrage
ln_initial = orders_sdf.count()   # ‚ö†Ô∏è Spark action (job)

# 2Ô∏è‚É£ Filtrage (√©quivalent Pandas : orders[orders["payment_status"]=="paid"])
orders_filtered = orders_sdf.filter(F.col("payment_status") == "paid")

# 3Ô∏è‚É£ Nombre de lignes apr√®s filtrage
ln_final = orders_filtered.count()  # ‚ö†Ô∏è autre job Spark

# 4Ô∏è‚É£ Affichage
display(Markdown(f"Filtrage pay√©es : {ln_initial} ‚Üí {ln_final}"))

# Equivalent de .head()
orders_filtered.show(5, truncate=False)

Filtrage pay√©es : 3193 ‚Üí 2900

+-------+-------------------+-----------+-------------------------------------------------------------------------+-------------+--------------+
|channel|created_at         |customer_id|items                                                                    |order_id     |payment_status|
+-------+-------------------+-----------+-------------------------------------------------------------------------+-------------+--------------+
|web    |2025-03-07 22:25:41|C0636      |[{1, SKU022, 5.0}, {2, SKU023, 4.0}, {3, SKU025, 15.0}, {5, SKU006, 7.5}]|O202503070001|paid          |
|app    |2025-03-07 19:17:49|C0499      |[{5, SKU017, 4.0}]                                                       |O202503070002|paid          |
|web    |2025-03-07 11:00:36|C0056      |[{4, SKU030, 5.0}, {2, SKU046, 8.0}, {5, SKU029, 7.5}]                   |O202503070004|paid          |
|web    |2025-03-07 12:03:11|C0270      |[{4, SKU025, 19.9}, {2, SKU008, 5.0}]                                    |O202503070005|p

In [55]:
# On part du DataFrame Spark des commandes pay√©es
orders_paid = orders_sdf.filter(F.col("payment_status") == "paid")

display(Markdown("Avant explosion des items"))
orders_paid.show(5, truncate=False)

# 1Ô∏è‚É£ Explosion de la colonne array<struct> "items"
#    √âquivalent de : orders2 = orders2.explode("items")
orders_exploded = orders_paid.withColumn("item", F.explode("items"))

# 2Ô∏è‚É£ Flatten de la struct "item" en colonnes simples (item_sku, item_qty, item_unit_price)
#    En pandas : json_normalize(orders2["items"]).add_prefix("item_")
#    En Spark : on acc√®de directement aux champs de la struct
base_cols = [c for c in orders_paid.columns if c != "items"]

orders_flat = (
    orders_exploded
    .select(
        *[F.col(c) for c in base_cols],               # toutes les colonnes d‚Äôorigine sauf "items"
        F.col("item.sku").alias("item_sku"),          # champs de la struct
        F.col("item.qty").alias("item_qty"),
        F.col("item.unit_price").alias("item_unit_price"),
    )
)

display(Markdown("Apr√®s explosion des items"))
orders_flat.show(5, truncate=False)

display(Markdown(f"Colonnes: {orders_flat.columns[:12]} ..."))


Avant explosion des items

+-------+-------------------+-----------+-------------------------------------------------------------------------+-------------+--------------+
|channel|created_at         |customer_id|items                                                                    |order_id     |payment_status|
+-------+-------------------+-----------+-------------------------------------------------------------------------+-------------+--------------+
|web    |2025-03-07 22:25:41|C0636      |[{1, SKU022, 5.0}, {2, SKU023, 4.0}, {3, SKU025, 15.0}, {5, SKU006, 7.5}]|O202503070001|paid          |
|app    |2025-03-07 19:17:49|C0499      |[{5, SKU017, 4.0}]                                                       |O202503070002|paid          |
|web    |2025-03-07 11:00:36|C0056      |[{4, SKU030, 5.0}, {2, SKU046, 8.0}, {5, SKU029, 7.5}]                   |O202503070004|paid          |
|web    |2025-03-07 12:03:11|C0270      |[{4, SKU025, 19.9}, {2, SKU008, 5.0}]                                    |O202503070005|p

Apr√®s explosion des items

+-------+-------------------+-----------+-------------+--------------+--------+--------+---------------+
|channel|created_at         |customer_id|order_id     |payment_status|item_sku|item_qty|item_unit_price|
+-------+-------------------+-----------+-------------+--------------+--------+--------+---------------+
|web    |2025-03-07 22:25:41|C0636      |O202503070001|paid          |SKU022  |1       |5.0            |
|web    |2025-03-07 22:25:41|C0636      |O202503070001|paid          |SKU023  |2       |4.0            |
|web    |2025-03-07 22:25:41|C0636      |O202503070001|paid          |SKU025  |3       |15.0           |
|web    |2025-03-07 22:25:41|C0636      |O202503070001|paid          |SKU006  |5       |7.5            |
|app    |2025-03-07 19:17:49|C0499      |O202503070002|paid          |SKU017  |5       |4.0            |
+-------+-------------------+-----------+-------------+--------------+--------+--------+---------------+
only showing top 5 rows


Colonnes: ['channel', 'created_at', 'customer_id', 'order_id', 'payment_status', 'item_sku', 'item_qty', 'item_unit_price'] ...

Avec Pandas, tu manipules des objets Python (list + dict) ‚Üí tu dois normaliser √† la main (explode, json_normalize, concat).

Avec PySpark, tu manipules des types fortement typ√©s (array<struct<...>>) ‚Üí tu peux simplement :

explode l‚Äôarray

acc√©der aux champs de la struct (item.sku, etc.)

select les colonnes que tu veux garder

C‚Äôest plus court, plus lisible, et scalable.

In [57]:
# 1Ô∏è‚É£ D√©tection des lignes n√©gatives
neg_sdf = orders_flat.filter(F.col("item_unit_price") < 0)

# 2Ô∏è‚É£ Compter (√©quivalent neg_mask.sum())
n_neg = neg_sdf.count()

display(Markdown(f"Lignes prix n√©gatifs : {n_neg}"))

# 3Ô∏è‚É£ Si rejets ‚Üí √©crire dans un CSV, sinon continuer normalement
if n_neg > 0:
    rejects_path = str(out_dir / "rejects_items.csv")

    # Sauvegarde en CSV (Spark √©crit un dossier ‚Üí on force en fichier unique)
    (neg_sdf
        .coalesce(1)                  # un seul fichier
        .write
        .option("header", True)
        .mode("overwrite")
        .csv(rejects_path)
    )

    display(Markdown(f"Rejets sauvegard√©s : `{rejects_path}`"))

# 4Ô∏è‚É£ Garder uniquement les lignes positives
orders_clean = orders_flat.filter(F.col("item_unit_price") >= 0)

# 5Ô∏è‚É£ Aper√ßu (√©quivalent orders2.head())
orders_clean.show(5, truncate=False)


Lignes prix n√©gatifs : 69

                                                                                

Rejets sauvegard√©s : `/mnt/c/Users/alexa/Simplon/Esther/Exos/Starter stack pour Data Engineers - Partie 1/data/out/rejects_items.csv`

+-------+-------------------+-----------+-------------+--------------+--------+--------+---------------+
|channel|created_at         |customer_id|order_id     |payment_status|item_sku|item_qty|item_unit_price|
+-------+-------------------+-----------+-------------+--------------+--------+--------+---------------+
|web    |2025-03-07 22:25:41|C0636      |O202503070001|paid          |SKU022  |1       |5.0            |
|web    |2025-03-07 22:25:41|C0636      |O202503070001|paid          |SKU023  |2       |4.0            |
|web    |2025-03-07 22:25:41|C0636      |O202503070001|paid          |SKU025  |3       |15.0           |
|web    |2025-03-07 22:25:41|C0636      |O202503070001|paid          |SKU006  |5       |7.5            |
|app    |2025-03-07 19:17:49|C0499      |O202503070002|paid          |SKU017  |5       |4.0            |
+-------+-------------------+-----------+-------------+--------------+--------+--------+---------------+
only showing top 5 rows


In [66]:
# ==========================================================
# üêº Pandas (id√©e g√©n√©rale) :
# before = len(orders2)
#
# orders3 = (
#     orders2
#     .sort_values(["order_id","created_at"])
#     .drop_duplicates(subset=["order_id"], keep="first")
# )
#
# after = len(orders3)
#
# Ici :
#   ‚ñ∂ Pandas peut garder la "premi√®re occurrence" car il a un ordre de lignes
#   ‚ñ∂ Il peut trier, puis supprimer les doublons selon cet ordre
# ==========================================================


# ==========================================================
# üî• PySpark ‚Äî √âtape 1 : compter les lignes avant d√©duplication
#    (√©quivalent de len(orders2) en pandas)
# ==========================================================
before = orders_clean.count()    # ‚ö†Ô∏è Action Spark : d√©clenche un job


# ==========================================================
# üî• PySpark ‚Äî √âtape 2 : d√©finir une fen√™tre
#    üêº En Pandas : .sort_values("order_id","created_at")
#
# ‚ö†Ô∏è Spark ne peut PAS faire "drop_duplicates(... keep='first')"
#     sans qu‚Äôon lui dise ce que signifie "first".
#
# On doit donc d√©finir :
#   - comment grouper : partitionBy("order_id")
#   - comment ordonner : orderBy("created_at")
#
# Spark fera ensuite un ranking (row_number) dans chaque groupe.
# ==========================================================
w = Window.partitionBy("order_id").orderBy("created_at")


# ==========================================================
# üî• PySpark ‚Äî √âtape 3 :
# Num√©roter les lignes dans chaque groupe (ordre croissant dates)
#
# üêº Pandas aurait implicitement utilis√© l'ordre du dataframe tri√©
#     pour savoir quelle est "la premi√®re ligne".
#
# üî• Spark doit calculer explicitement un "num√©ro de ligne".
# ==========================================================
orders_ranked = orders_clean.withColumn(
    "rn",
    F.row_number().over(w)      # rn = 1 ‚Üí premi√®re ligne pour ce order_id
)


# ==========================================================
# üî• PySpark ‚Äî √âtape 4 :
# Garder uniquement la premi√®re ligne (rn == 1)
#
# üêº Pandas : .drop_duplicates(subset=["order_id"], keep="first")
# üî• PySpark : √©quivalent = filtrer rn == 1, puis drop rnk
# ==========================================================
orders_dedup = (
    orders_ranked
    .filter(F.col("rn") == 1)
    .drop("rn")
)


# ==========================================================
# üî• PySpark ‚Äî √âtape 5 :
# Compter les lignes apr√®s d√©duplication
#
# üêº Pandas : len(orders3)
# ==========================================================
after = orders_dedup.count()


# ==========================================================
# üî• PySpark ‚Äî √âtape 6 : afficher un √©chantillon
#
# üêº Pandas : orders3.head()
# üî• PySpark : limit().toPandas() OU show()
# ==========================================================
display(Markdown(f"D√©duplication : **{before} ‚Üí {after}**"))

orders_dedup.show(5, truncate=False)

D√©duplication : **7196 ‚Üí 2811**

+-------+-------------------+-----------+-------------+--------------+--------+--------+---------------+
|channel|created_at         |customer_id|order_id     |payment_status|item_sku|item_qty|item_unit_price|
+-------+-------------------+-----------+-------------+--------------+--------+--------+---------------+
|web    |2025-03-01 11:30:49|C0676      |O202503010001|paid          |SKU005  |5       |12.5           |
|web    |2025-03-01 07:27:00|C0642      |O202503010003|paid          |SKU014  |1       |5.0            |
|web    |2025-03-01 22:29:42|C0571      |O202503010005|paid          |SKU001  |1       |2.5            |
|web    |2025-03-01 09:24:19|C0704      |O202503010006|paid          |SKU039  |1       |9.9            |
|app    |2025-03-01 15:50:48|C0464      |O202503010007|paid          |SKU018  |1       |24.9           |
+-------+-------------------+-----------+-------------+--------------+--------+--------+---------------+
only showing top 5 rows


In [69]:
# On part de orders_dedup (√©quivalent de `orders3` en pandas)
# üêº Pandas : orders3["line_gross"] = orders3["item_qty"] * orders3["item_unit_price"]
# üî• PySpark : on utilise withColumn (les DF Spark sont immuables)
orders_with_gross = orders_dedup.withColumn(
    "line_gross",
    F.col("item_qty") * F.col("item_unit_price")
)

# üêº Pandas :
# per_order = orders3.groupby(
#     ["order_id","customer_id","channel","created_at"], as_index=False
# ).agg(
#     items_sold=("item_qty","sum"),
#     gross_revenue_eur=("line_gross","sum")
# )
#
# üî• PySpark : groupBy + agg(F.sum(...).alias(...))
per_order_sdf = (
    orders_with_gross
    .groupBy("order_id", "customer_id", "channel", "created_at")
    .agg(
        F.sum("item_qty").alias("items_sold"),
        F.sum("line_gross").alias("gross_revenue_eur")
    )
)

display(Markdown("Aper√ßu `per_order` (PySpark)"))

# üêº Pandas : per_order.head()
# üî• PySpark : limit().toPandas() pour avoir un mini DataFrame local affichable dans le notebook
per_order_sdf.show(5, truncate=False)

# üêº Pandas : per_order.shape
# üî• PySpark :
n_rows = per_order_sdf.count()            # action Spark ‚Üí d√©clenche un job
n_cols = len(per_order_sdf.columns)       # cheap, utilise juste le sch√©ma
display(Markdown(f"Taille: ({n_rows}, {n_cols})"))



Aper√ßu `per_order` (PySpark)

+-------------+-----------+-------+-------------------+----------+-----------------+
|order_id     |customer_id|channel|created_at         |items_sold|gross_revenue_eur|
+-------------+-----------+-------+-------------------+----------+-----------------+
|O202503010001|C0676      |web    |2025-03-01 11:30:49|5         |62.5             |
|O202503010003|C0642      |web    |2025-03-01 07:27:00|1         |5.0              |
|O202503010005|C0571      |web    |2025-03-01 22:29:42|1         |2.5              |
|O202503010006|C0704      |web    |2025-03-01 09:24:19|1         |9.9              |
|O202503010007|C0464      |app    |2025-03-01 15:50:48|1         |24.9             |
+-------------+-----------+-------+-------------------+----------+-----------------+
only showing top 5 rows


Taille: (2811, 6)

In [70]:
# =====================================================================
# üêº Pandas : len_init = len(per_order)
# üî• PySpark : on utilise count(), qui d√©clenche un job Spark
# =====================================================================
len_init = per_order_sdf.count()   # ‚ö†Ô∏è action Spark (scan distribu√©)

# =====================================================================
# üêº Pandas :
# per_order = per_order.merge(
#     customers[["customer_id","city","is_active"]],
#     on="customer_id",
#     how="left"
# )
# üî• PySpark :
# - join() au lieu de merge()
# - on travaille sur des DataFrames distribu√©s
# - n√©cessit√© de s√©lectionner les colonnes voulues c√¥t√© customers
# =====================================================================
per_order_joined = (
    per_order_sdf.alias("o")
    .join(
        customers_clean.select("customer_id", "city", "is_active").alias("c"),
        on="customer_id",
        how="left"
    )
)

# =====================================================================
# üêº Pandas :
# per_order = per_order[per_order["is_active"] == True].copy()
#
# - filtre sur les clients actifs
# - .copy() pour √©viter SettingWithCopyWarning
# üî• PySpark :
# - pas de .copy() (DF immuables)
# - on filtre avec filter()/where() sur la colonne bool√©enne "is_active"
# =====================================================================
per_order_active = per_order_joined.filter(F.col("is_active") == True)


# =====================================================================
# üêº Pandas : ln_aft = len(per_order)
# üî• PySpark : count() √† nouveau
# =====================================================================
ln_aft = per_order_active.count()


# =====================================================================
# üêº Pandas :
# display(Markdown(f"Apr√®s jointure+filtre actifs : **{len_init} ‚Üí {ln_aft}**"))
# display(per_order.head())
# üî• PySpark :
# - on garde le m√™me affichage Markdown
# - pour l‚Äôaper√ßu, on utilise limit().toPandas()
# =====================================================================
display(Markdown(f"Apr√®s jointure+filtre actifs : **{len_init} ‚Üí {ln_aft}**"))

per_order_active.show(5, truncate=False)


Apr√®s jointure+filtre actifs : **2811 ‚Üí 2471**

+-----------+-------------+-------+-------------------+----------+-----------------+---------+---------+
|customer_id|order_id     |channel|created_at         |items_sold|gross_revenue_eur|city     |is_active|
+-----------+-------------+-------+-------------------+----------+-----------------+---------+---------+
|C0676      |O202503010001|web    |2025-03-01 11:30:49|5         |62.5             |Marseille|true     |
|C0642      |O202503010003|web    |2025-03-01 07:27:00|1         |5.0              |Toulouse |true     |
|C0571      |O202503010005|web    |2025-03-01 22:29:42|1         |2.5              |Toulouse |true     |
|C0464      |O202503010007|app    |2025-03-01 15:50:48|1         |24.9             |Nantes   |true     |
|C0317      |O202503010008|app    |2025-03-01 20:56:15|2         |30.0             |Marseille|true     |
+-----------+-------------+-------+-------------------+----------+-----------------+---------+---------+
only showing top 5 rows


In [73]:
# ==========================================================================
# üêº Pandas version :
# def to_date(s):
#     s = str(s)
#     for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d"):
#         try:
#             return datetime.strptime(s, fmt).date().isoformat()
#         except ValueError:
#             continue
#     raise ValueError("Format de date non reconnu")
#
# per_order["order_date"] = per_order["created_at"].apply(to_date)
#
# üî• PySpark version :
# Pas d'apply ‚Üí on utilise les fonctions de parsing SQL :
#   - to_timestamp()
#   - to_date()
#   - coalesce() pour tester plusieurs formats
# ==========================================================================

# 1Ô∏è‚É£ Essayer plusieurs formats pour parser created_at
order_date_col = F.to_date(
    F.to_timestamp(F.col("created_at"), "yyyy-MM-dd HH:mm:ss")
)
# 2Ô∏è‚É£ Deuxi√®me format si le premier √©choue
order_date_alt = F.to_date(
    F.to_timestamp(F.col("created_at"), "yyyy-MM-dd")
)

# 3Ô∏è‚É£ Fusionner les deux tentatives (comme ton try/except en Python)
order_date_final = F.coalesce(order_date_col, order_date_alt)

# 4Ô∏è‚É£ Ajouter la colonne order_date au DataFrame Spark
per_order_with_date = per_order_active.withColumn("order_date", order_date_final)

# 5Ô∏è‚É£ Aper√ßu (√©quivalent Pandas : df[["order_id","created_at","order_date"]].head())
per_order_with_date.select("order_id", "created_at", "order_date").show(5, truncate=False)

+-------------+-------------------+----------+
|order_id     |created_at         |order_date|
+-------------+-------------------+----------+
|O202503010001|2025-03-01 11:30:49|2025-03-01|
|O202503010003|2025-03-01 07:27:00|2025-03-01|
|O202503010005|2025-03-01 22:29:42|2025-03-01|
|O202503010007|2025-03-01 15:50:48|2025-03-01|
|O202503010008|2025-03-01 20:56:15|2025-03-01|
+-------------+-------------------+----------+
only showing top 5 rows


In [78]:
# ==========================================================================
# üêº Pandas :
# refunds_sum = refunds.groupby("order_id", as_index=False)["amount"].sum()
# refunds_sum = refunds_sum.rename(columns={"amount":"refunds_eur"})
#
# üî• PySpark :
# groupBy('order_id').agg(sum("amount_num").alias("refunds_eur"))
#
# ‚ö†Ô∏è On suppose que tu avais d√©j√† converti amount ‚Üí amount_num (double)
# ==========================================================================

refunds_sum_sdf = (
    refunds_clean
    .groupBy("order_id")
    .agg(F.sum("amount").alias("refunds_eur"))
)


# ==========================================================================
# üêº Pandas :
# per_order = per_order.merge(refunds_sum, on="order_id", how="left")
#
# üî• PySpark :
# per_order_sdf.join(refunds_sum_sdf, on="order_id", how="left")
#
# Spark join = merge Pandas
# ==========================================================================

per_order_refunded = (
    per_order_with_date
    .join(refunds_sum_sdf, on="order_id", how="left")
)


# ==========================================================================
# üêº Pandas : .fillna({"refunds_eur":0.0})
#
# üî• PySpark : fillna({"refunds_eur": 0.0})
# ==========================================================================

per_order_refunded = per_order_refunded.fillna({"refunds_eur": 0.0})


# ==========================================================================
# üêº Pandas : display(per_order.head())
#
# üî• PySpark : limit().toPandas()
# ==========================================================================

display(Markdown("Aper√ßu `per_order` avec refunds"))
per_order_refunded.show(5, truncate=False)

Aper√ßu `per_order` avec refunds

+-------------+-----------+-------+-------------------+----------+-----------------+---------+---------+----------+-----------+
|order_id     |customer_id|channel|created_at         |items_sold|gross_revenue_eur|city     |is_active|order_date|refunds_eur|
+-------------+-----------+-------+-------------------+----------+-----------------+---------+---------+----------+-----------+
|O202503010001|C0676      |web    |2025-03-01 11:30:49|5         |62.5             |Marseille|true     |2025-03-01|0.0        |
|O202503010003|C0642      |web    |2025-03-01 07:27:00|1         |5.0              |Toulouse |true     |2025-03-01|-3.13      |
|O202503010005|C0571      |web    |2025-03-01 22:29:42|1         |2.5              |Toulouse |true     |2025-03-01|-35.42     |
|O202503010007|C0464      |app    |2025-03-01 15:50:48|1         |24.9             |Nantes   |true     |2025-03-01|0.0        |
|O202503010008|C0317      |app    |2025-03-01 20:56:15|2         |30.0             |Marseille|true     |

In [79]:
# ==========================================================================
# üêº Pandas :
# per_order_save = per_order[["order_id", ...]].copy()
#
# üî• PySpark :
# select() pour choisir les colonnes
# ==========================================================================

per_order_save_sdf = per_order_refunded.select(
    "order_id",
    "customer_id",
    "city",
    "channel",
    "order_date",
    "items_sold",
    "gross_revenue_eur"
)

# ==========================================================================
# üêº Pandas : to_sql(...)
#
# üî• PySpark :
# ‚Üí Spark ne peut PAS √©crire dans SQLite directement
# ‚Üí Donc on convertit en pandas pour r√©utiliser to_sql()
#
# ‚ö†Ô∏è ATTENTION :
# - toPandas() charge tout en m√©moire
# - Assure-toi que le DF final n‚Äôest pas gigantesque
# ==========================================================================

per_order_save_pdf = per_order_save_sdf.toPandas()  # conversion Spark ‚Üí pandas

# connexion SQLite
conn = sqlite3.connect(db_path)

# enregistrement
per_order_save_pdf.to_sql(
    "orders_clean",
    conn,
    if_exists="replace",
    index=False
)

conn.close()

display(Markdown("‚úîÔ∏è Table `orders_clean` sauvegard√©e dans SQLite depuis PySpark"))

‚úîÔ∏è Table `orders_clean` sauvegard√©e dans SQLite depuis PySpark

In [None]:
# ==========================================================================
# üêº Pandas :
# agg = per_order.groupby(["order_date","city","channel"], as_index=False).agg(
#     orders_count=("order_id","nunique"),
#     unique_customers=("customer_id","nunique"),
#     items_sold=("items_sold","sum"),
#     gross_revenue_eur=("gross_revenue_eur","sum"),
#     refunds_eur=("refunds_eur","sum")
# )
#
# üî• PySpark :
# - groupBy(...).agg(...)
# - F.countDistinct(col) = nunique en pandas
# - F.sum(col) = sum
# ==========================================================================

agg_sdf = (
    per_order_refunded
    .groupBy("order_date", "city", "channel")
    .agg(
        F.countDistinct("order_id").alias("orders_count"),                      # nunique(order_id)
        F.countDistinct("customer_id").alias("unique_customers"),               # nunique(customer_id)
        F.bround(F.sum("items_sold"),2).alias("items_sold"),                    # sum(items_sold)
        F.bround(F.sum("gross_revenue_eur"),2).alias("gross_revenue_eur"),      # sum(gross_revenue_eur)
        F.bround(F.sum("refunds_eur"),2).alias("refunds_eur")                   # sum(refunds_eur)
    )
)

# ==========================================================================
# üêº Pandas :
# agg["net_revenue_eur"] = agg["gross_revenue_eur"] + agg["refunds_eur"]
#
# üî• PySpark :
# withColumn("col", expr)
# ==========================================================================

agg_sdf = agg_sdf.withColumn(
    "net_revenue_eur",
    F.col("gross_revenue_eur") + F.col("refunds_eur")
)

# ==========================================================================
# üêº Pandas :
# agg = agg.rename(columns={"order_date": "date"})
#
# üî• PySpark :
# withColumnRenamed("old", "new")
# ==========================================================================

agg_sdf = agg_sdf.withColumnRenamed("order_date", "date")

# ==========================================================================
# üêº Pandas :
# .sort_values(["date","city","channel"]).reset_index(drop=True)
#
# üî• PySpark :
# - .orderBy() pour trier
# - pas d'index ‚Üí rien √† reset
# ==========================================================================

agg_sdf = agg_sdf.orderBy("date", "city", "channel")

# ==========================================================================
# üêº Pandas :
# display(agg.head())
# display(df.shape)
#
# üî• PySpark :
# - limit().toPandas() pour afficher
# - count() + len(columns)
# ==========================================================================

agg_sdf.show(5, truncate=False)

# Taille
rows = agg_sdf.count()
cols = len(agg_sdf.columns)
display(Markdown(f"Taille: ({rows}, {cols})"))

+----------+--------+-------+------------+----------------+----------+-----------------+-----------+------------------+
|date      |city    |channel|orders_count|unique_customers|items_sold|gross_revenue_eur|refunds_eur|net_revenue_eur   |
+----------+--------+-------+------------+----------------+----------+-----------------+-----------+------------------+
|2025-03-01|Bordeaux|app    |4           |4               |8         |136.8            |-25.55     |111.25000000000001|
|2025-03-01|Bordeaux|web    |6           |6               |13        |195.2            |-39.73     |155.47            |
|2025-03-01|Lille   |app    |4           |4               |13        |224.2            |-24.47     |199.73            |
|2025-03-01|Lille   |web    |3           |3               |12        |117.5            |-14.23     |103.27            |
|2025-03-01|Lyon    |app    |7           |7               |16        |159.5            |-10.86     |148.64            |
+----------+--------+-------+-----------

Taille: (491, 9)

In [87]:
# -------------------------------------------------------------------
# On part de :
#   - agg_sdf : DataFrame Spark agr√©g√© (date, city, channel, m√©triques)
#   - db_path : chemin SQLite (Path)
#   - out_dir : dossier de sortie (Path)
#   - sep, enc, ffmt : param√®tres CSV
# -------------------------------------------------------------------

# 1) Spark ‚Üí pandas (petit volume, OK pour la RAM)
agg_pdf = agg_sdf.toPandas()

# 2) √âcriture dans SQLite via pandas.to_sql
conn = sqlite3.connect(db_path)
agg_pdf.to_sql("daily_city_sales", conn, if_exists="replace", index=False)
conn.close()
display(Markdown("‚úîÔ∏è Table `daily_city_sales` √©crite dans SQLite"))

# 3) Exports CSV par date (un fichier par jour)
for d, sub in agg_pdf.groupby("date"):
    # d est un objet date/Timestamp ‚Üí on formate proprement en YYYYMMDD
    if hasattr(d, "strftime"):
        d_str = d.strftime("%Y%m%d")
    else:
        d_str = str(d).replace("-", "")  # fallback si jamais

    out_path = out_dir / f"daily_summary_{d_str}.csv"
    sub[
        [
            "date",
            "city",
            "channel",
            "orders_count",
            "unique_customers",
            "items_sold",
            "gross_revenue_eur",
            "refunds_eur",
            "net_revenue_eur",
        ]
    ].to_csv(
        out_path,
        index=False,
        sep=sep,
        encoding=enc,
        float_format=ffmt,
    )

# 4) Export CSV global (toutes les dates)
all_path = out_dir / "daily_summary_all.csv"
agg_pdf.to_csv(
    all_path,
    index=False,
    sep=sep,
    encoding=enc,
    float_format=ffmt,
)

display(Markdown(f"‚úîÔ∏è Exports CSV √©crits dans `{out_dir}`"))

‚úîÔ∏è Table `daily_city_sales` √©crite dans SQLite

‚úîÔ∏è Exports CSV √©crits dans `/mnt/c/Users/alexa/Simplon/Esther/Exos/Starter stack pour Data Engineers - Partie 1/data/out`