In [1]:
import pyspark.sql.functions as F

# Chargement des données

In [95]:
# Raw CSV inputs: table name -> location of the file in the raw_data folder.
# Each pair is (table name, file name without the ".csv" extension).
csv_files = {
    table: f"abfss://projetcloud@datalakecloud.dfs.core.windows.net/raw_data/{stem}.csv"
    for table, stem in [
        ("customers", "customers"),
        ("sellers", "sellers"),
        ("orders", "orders"),
        ("order_items", "order_items"),
        ("order_payments", "order_payments"),
        ("order_reviews", "order_reviews"),
        ("products", "products"),
        # The only table whose key differs from its file name.
        ("products_translated", "product_category_name_translation"),
        ("geolocation", "geolocation"),
        ("states_name", "states_name"),
    ]
}

In [96]:
# Read every CSV listed in csv_files into a Spark DataFrame, stored under the same
# table name. The first line of each file is treated as the header and the column
# types are inferred from the data.
dataframes = {}
for table_name, csv_path in csv_files.items():
    dataframes[table_name] = spark.read.csv(csv_path, header=True, inferSchema=True)
dataframes

# Changement du type des variables

In [97]:
# Parse the two review date columns as timestamps. Each parsed column is first
# added under a temporary "<name>_good" name, the source columns are then dropped,
# and the temporary columns are finally renamed back to the original names.
reviews_ts_format = "yyyy-MM-dd HH:mm:ss"
reviews_ts_columns = ["review_creation_date", "review_answer_timestamp"]

reviews_typed = dataframes["order_reviews"]
for column_name in reviews_ts_columns:
    reviews_typed = reviews_typed.withColumn(
        f"{column_name}_good", F.to_timestamp(column_name, reviews_ts_format)
    )
reviews_typed = reviews_typed.drop(*reviews_ts_columns)
for column_name in reviews_ts_columns:
    reviews_typed = reviews_typed.withColumnRenamed(f"{column_name}_good", column_name)

dataframes["order_reviews"] = reviews_typed

In [98]:
# Print the order_reviews schema to inspect the column types.
dataframes["order_reviews"].printSchema()

# Nettoyage des données dupliquées

In [99]:
# Go through every DataFrame, report how many distinct rows occur more than once
# (rows identical on every column), and display them when there are any.
for key, df in dataframes.items():
    # Group on all columns: a group whose count exceeds 1 is a duplicated row.
    duplicate_rows = df.groupBy(df.columns).count().where(F.col("count") > 1)
    # count() is a Spark action that re-runs the whole aggregation each time it is
    # called, so evaluate it once and reuse the result for the message and the test.
    duplicate_count = duplicate_rows.count()
    print(f"Duplicates lines in {key} DataFrame : {duplicate_count}")
    if duplicate_count > 0:
        duplicate_rows.show(truncate=False)

In [100]:
# Remove exact duplicate rows from the geolocation DataFrame.
dataframes["geolocation"] = dataframes["geolocation"].dropDuplicates()

# Verify the removal by recomputing the duplicated rows on the deduplicated
# geolocation DataFrame. Reusing the `duplicate_rows` variable left over from the
# earlier scan loop would report on whichever DataFrame that loop processed last,
# not on geolocation.
geolocation_df = dataframes["geolocation"]
geolocation_duplicates = (
    geolocation_df.groupBy(geolocation_df.columns).count().where(F.col("count") > 1)
)
print(f"Duplicates lines in geolocation DataFrame : {geolocation_duplicates.count()}")

# Remplacement des noms de catégories du portugais vers l'anglais

In [101]:
# Check that every Portuguese category used in products has an English translation:
# show the distinct product_category_name values of products that do not appear
# in products_translated.
p, t = (
    dataframes[table].select("product_category_name").distinct()
    for table in ("products", "products_translated")
)

untranslated_categories = p.subtract(t)
untranslated_categories.show()

In [102]:
# Left join products ("p") with products_translated ("pt") on product_category_name,
# then swap the Portuguese category name for the English one.
joined_df = (
    dataframes["products"].alias("p")
    .join(
        dataframes["products_translated"].alias("pt"),
        on=F.col("p.product_category_name") == F.col("pt.product_category_name"),
        how="left",
    )
    # Remove the field holding the category name in Portuguese.
    .drop("product_category_name")
    # The English name takes over the product_category_name column name.
    .withColumnRenamed("product_category_name_english", "product_category_name")
)

# Reorder the columns: first column, then product_category_name, then every other
# column except the last one.
# NOTE(review): this relies on the renamed category column being the last column
# of joined_df — presumably true right after the join; confirm.
joined_columns = joined_df.columns
new_columns = [joined_columns[0], "product_category_name", *joined_columns[1:-1]]

# Store the translated, reordered DataFrame back into the dictionary.
dataframes["products"] = joined_df.select(new_columns)

# Preview the result.
dataframes["products"].show(5)


# Nettoyage des valeurs nulles

In [103]:
# Helper counting the null values of each column
def count_nulls(df):
    """Return a DataFrame giving, for each column of ``df``, its number of nulls.

    Every column of ``df`` is turned into a 0/1 "is null" flag and summed; each
    resulting column keeps the name of the column it summarises.
    """
    null_totals = []
    for column_name in df.columns:
        is_null_flag = F.col(column_name).isNull().cast("int")
        null_totals.append(F.sum(is_null_flag).alias(column_name))
    return df.select(null_totals)

In [104]:
# Display the per-column null counts of every DataFrame in the dictionary.
for table_name in dataframes:
    null_counts = count_nulls(dataframes[table_name])
    print(f"Null values in {table_name} DataFrame :")
    null_counts.show(truncate=False)

In [105]:
# Keep only the order_reviews rows where both review_creation_date and
# review_answer_timestamp are populated.
has_creation_date = F.col("review_creation_date").isNotNull()
has_answer_timestamp = F.col("review_answer_timestamp").isNotNull()
dataframes["order_reviews"] = dataframes["order_reviews"].where(
    has_creation_date & has_answer_timestamp
)

# Display the per-column null counts of order_reviews after the filter.
null_counts = count_nulls(dataframes["order_reviews"])
print("Null values in order_reviews DataFrame :")
null_counts.show(truncate=False)

In [106]:
# Replace the null values of products:
#   product_category_name -> "unknown", every other listed column -> -1.
products_sentinel_columns = [
    "product_name_length",
    "product_description_length",
    "product_photos_qty",
    "product_weight_g",
    "product_length_cm",
    "product_height_cm",
    "product_width_cm",
]

products_filled = dataframes["products"].withColumn(
    "product_category_name",
    F.when(F.col("product_category_name").isNull(), "unknown")
    .otherwise(F.col("product_category_name")),
)
for column_name in products_sentinel_columns:
    current_value = F.col(column_name)
    products_filled = products_filled.withColumn(
        column_name,
        F.when(current_value.isNull(), -1).otherwise(current_value),
    )
dataframes["products"] = products_filled

# Display the per-column null counts of products after the replacement.
null_counts = count_nulls(dataframes["products"])
print("Null values in products DataFrame :")
null_counts.show(truncate=False)

# Ajout de champs

In [107]:
# Add three derived columns to order_items:
#   total_items_value   = price * order_item_id
#   total_freight_value = freight_value * order_item_id
#   total_order_value   = total_items_value + total_freight_value
# NOTE(review): order_item_id is used as the multiplier — confirm it really
# represents a quantity and not a line index within the order.
item_multiplier = F.col("order_item_id")
dataframes["order_items"] = (
    dataframes["order_items"]
    .withColumn("total_items_value", F.col("price") * item_multiplier)
    .withColumn("total_freight_value", F.col("freight_value") * item_multiplier)
    .withColumn(
        "total_order_value",
        F.col("total_items_value") + F.col("total_freight_value"),
    )
)

# Preview the first rows with the new columns.
dataframes["order_items"].show(10)

# Chargement des Dataframes nettoyés en Parquet

In [108]:
# Destination folders in ADLS Gen2: table name -> location under cleaned_data.
output_path = {
    table: f"abfss://projetcloud@datalakecloud.dfs.core.windows.net/cleaned_data/{table}"
    for table in (
        "customers",
        "sellers",
        "orders",
        "order_items",
        "order_payments",
        "order_reviews",
        "products",
        "products_translated",
        "geolocation",
        "states_name",
    )
}

# Write each DataFrame as Parquet to its destination folder, overwriting any
# output already present there.
for table_name in dataframes:
    destination = output_path[table_name]
    dataframes[table_name].write.mode("overwrite").parquet(destination)