## TRDE703 Atelier Int√©gration des Donn√©es

In [10]:
import sys
import os
from pathlib import Path
from pyspark.shell import spark

current_dir = Path(os.getcwd())

project_root = current_dir.parent if current_dir.name == "etl" else current_dir

if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

%load_ext autoreload
%autoreload 2

print(f"‚úÖ Racine du projet ajout√©e au path : {project_root}")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
‚úÖ Racine du projet ajout√©e au path : /Users/cedricsanchez/Master1/Cours/integration_donnees_TP


In [11]:
from etl.shared.config import SPARK_CONFIG, MYSQL_CONFIG
from pyspark.sql import SparkSession

print("‚öôÔ∏è Configuration charg√©e avec succ√®s.")

‚öôÔ∏è Configuration charg√©e avec succ√®s.


In [12]:
json_filepath = str(project_root / "/Users/cedricsanchez/Master1/Cours/integration_donnees_TP/data/raw/openfoodfacts.csv")

print(f"üìÇ Fichier cible : {json_filepath}")

# Test de lecture
if os.path.exists(json_filepath):
    df = spark.read.csv(json_filepath)
    df.show(5)
else:
    print("‚ùå Fichier introuvable. V√©rifie le dossier data/raw/")

üìÇ Fichier cible : /Users/cedricsanchez/Master1/Cours/integration_donnees_TP/data/raw/openfoodfacts.csv
+--------------------+
|                 _c0|
+--------------------+
|code\turl\tcreato...|
|00000002\thttp://...|
|00000003\thttp://...|
|00000004\thttp://...|
|00000005\thttp://...|
+--------------------+
only showing top 5 rows


In [None]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType, LongType, ArrayType

def get_bronze_schema():
    """
    Sch√©ma permissif pour la lecture (Bronze).
    On force beaucoup de champs en String pour √©viter les rejets √† la lecture.
    On castera proprement dans la couche Silver.
    """
    return StructType([
        # --- Cl√©s Techniques ---
        StructField("code", StringType(), True),           # Le code barre (EAN)
        StructField("url", StringType(), True),
        StructField("creator", StringType(), True),
        StructField("created_t", LongType(), True),        # Timestamp UNIX
        StructField("created_datetime", StringType(), True),
        StructField("last_modified_t", LongType(), True),  # CRUCIAL pour le SCD2
        StructField("last_modified_datetime", StringType(), True),

        # --- Infos Produit ---
        StructField("product_name", StringType(), True),
        StructField("generic_name", StringType(), True),
        StructField("quantity", StringType(), True),

        # --- Dimensions (Marques, Cat√©gories, Lieux) ---
        StructField("packaging", StringType(), True),
        StructField("packaging_tags", StringType(), True),
        StructField("brands", StringType(), True),
        StructField("brands_tags", StringType(), True),
        StructField("categories", StringType(), True),
        StructField("categories_tags", StringType(), True),
        StructField("origins", StringType(), True),
        StructField("manufacturing_places", StringType(), True),
        StructField("labels", StringType(), True),
        StructField("emb_codes", StringType(), True),
        StructField("first_packaging_code_geo", StringType(), True),
        StructField("cities", StringType(), True),
        StructField("purchase_places", StringType(), True),
        StructField("stores", StringType(), True),
        StructField("countries", StringType(), True),
        StructField("countries_tags", StringType(), True), # Utile pour filtrer FR/EN

        # --- Ingr√©dients & Additifs ---
        StructField("ingredients_text", StringType(), True),
        StructField("allergens", StringType(), True),
        StructField("traces", StringType(), True),
        StructField("additives_n", IntegerType(), True),   # Nombre d'additifs
        StructField("additives", StringType(), True),

        # --- Scores Nutritionnels (Qualit√©) ---
        StructField("nutriscore_score", IntegerType(), True),
        StructField("nutriscore_grade", StringType(), True),
        StructField("nova_group", StringType(), True),     # Parfois 1, parfois "1", on g√®re en String
        StructField("pnns_groups_1", StringType(), True),
        StructField("pnns_groups_2", StringType(), True),

        # --- Valeurs Nutritionnelles (100g) ---
        StructField("energy-kcal_100g", FloatType(), True),
        StructField("energy_100g", FloatType(), True),
        StructField("fat_100g", FloatType(), True),
        StructField("saturated-fat_100g", FloatType(), True),
        StructField("carbohydrates_100g", FloatType(), True),
        StructField("sugars_100g", FloatType(), True),
        StructField("fiber_100g", FloatType(), True),
        StructField("proteins_100g", FloatType(), True),
        StructField("salt_100g", FloatType(), True),
        StructField("sodium_100g", FloatType(), True)
    ])

In [None]:
# Chemin vers ton fichier (utilise project_root comme on a vu)
raw_file_path = str(project_root / "data/raw/en.openfoodfacts.org.products.jsonl")
# Note: Si tu utilises le CSV, change l'extension et utilise spark.read.csv(sep='\t')

print(f"‚è≥ Lecture du fichier : {raw_file_path}")

try:
    # Lecture JSONL (Format recommand√© pour OFF car il g√®re mieux les textes complexes)
    df_bronze = spark.read \
        .schema(get_bronze_schema()) \
        .json(raw_file_path)

    # SI tu utilises le CSV, d√©commente ci-dessous :
    # df_bronze = spark.read \
    #    .schema(get_bronze_schema()) \
    #    .option("header", "true") \
    #    .option("sep", "\t") \
    #    .csv(raw_file_path)

    print(f"‚úÖ Ingestion termin√©e. Lignes brutes : {df_bronze.count()}")

except Exception as e:
    print(f"‚ùå Erreur critique : {e}")