In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    avg as _avg,
    coalesce,
    col,
    count as _count,
    countDistinct,
    current_date,
    datediff,
    dayofweek,
    hour,
    lit,
    max as _max,
    month,
    round as _round,
    row_number,
    split,
    sum as _sum,
    when,
    year,
)
from pyspark.sql.window import Window

In [3]:

# Create (or reuse) the Spark session; the resource settings are applied in
# the order listed before the session is requested.
spark_settings = [
    ("spark.driver.memory", "4g"),
    ("spark.executor.memory", "4g"),
    ("spark.sql.shuffle.partitions", "200"),
    ("spark.driver.cores", "2"),
    ("spark.executor.cores", "2"),
    ("spark.memory.fraction", "0.8"),
    ("spark.memory.storageFraction", "0.2"),
    ("spark.memory.offHeap.enabled", True),
    ("spark.memory.offHeap.size", "4g"),
]

session_builder = SparkSession.builder.appName("E-commerce Amazing Analysis")
for setting_name, setting_value in spark_settings:
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/23 15:22:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:



# Path of the Parquet dataset holding the pre-filtered events
output_path = "/home/jovyan/work/filtered_df_output.parquet"

# Read the events and derive the calendar 'month' and 'year' of each event
# from its 'event_time' column.
filtered_df = (
    spark.read.parquet(output_path)
    .withColumn("month", month(col("event_time")))
    .withColumn("year", year(col("event_time")))
)

# Keep only purchase events.
purchase_df = filtered_df.filter(col("event_type") == "purchase")

# Build a category_id -> category_code lookup from the purchases that carry a code.
# NULL codes are dropped (they cannot back-fill anything) and every category_id is
# reduced to exactly one code. With one row per category_id the left join below
# can never multiply purchase rows, which a plain distinct() over
# (category_id, category_code) pairs does not guarantee.
# max() is an arbitrary but deterministic choice when an id has several codes.
category_mapping_df = (
    purchase_df
    .filter(col("category_code").isNotNull())
    .groupBy("category_id")
    .agg(_max("category_code").alias("category_code"))
)

# Persist the lookup as Parquet so it can be reused later.
mapping_output_path = "/home/jovyan/work/category_mapping.parquet"
category_mapping_df.write.mode("overwrite").parquet(mapping_output_path)

# Reload the lookup from disk and rename its code column so it does not clash
# with the 'category_code' column already present on the purchases.
category_mapping_df = (
    spark.read.parquet(mapping_output_path)
    .withColumnRenamed("category_code", "mapped_category_code")
)

# Attach the looked-up code to every purchase (purchases without a match keep NULL).
purchase_df_with_mapping = purchase_df.join(category_mapping_df, on="category_id", how="left")

# Back-fill missing 'category_code' values from the lookup, then drop the helper
# column so purchase_df keeps the same columns it had before the join.
purchase_df = purchase_df_with_mapping.withColumn(
    "category_code",
    coalesce(purchase_df_with_mapping["category_code"], purchase_df_with_mapping["mapped_category_code"])
).drop("mapped_category_code")

# Derive qualitative (categorical) features from the event timestamp and price.
#
# Every bucket is matched by an explicit condition instead of a catch-all
# otherwise(): a comparison against NULL is never true, so with otherwise() a
# missing price would be labelled "high" and a missing timestamp "weekday" /
# "evening". Without otherwise(), rows with a missing input get NULL instead.
filtered_df = (
    filtered_df
    .withColumn("event_day_of_week", dayofweek(col("event_time")))
    .withColumn("event_hour", hour(col("event_time")))
    # dayofweek() numbers days 1 (Sunday) to 7 (Saturday)
    .withColumn(
        "event_weekend",
        when(col("event_day_of_week").isin([1, 7]), "weekend")
        .when(col("event_day_of_week").isNotNull(), "weekday"),
    )
    # price < 50 -> low, 50 <= price < 200 -> medium, price >= 200 -> high
    .withColumn(
        "price_category",
        when(col("price") < 50, "low")
        .when((col("price") >= 50) & (col("price") < 200), "medium")
        .when(col("price") >= 200, "high"),
    )
    # hours 0-6 -> night, 7-12 -> morning, 13-18 -> afternoon, 19 and later -> evening
    .withColumn(
        "time_of_day",
        when(col("event_hour").between(0, 6), "night")
        .when(col("event_hour").between(7, 12), "morning")
        .when(col("event_hour").between(13, 18), "afternoon")
        .when(col("event_hour") >= 19, "evening"),
    )
)

# 1. Number of "view" events per user per month
view_events = filtered_df.where(col("event_type") == "view")
number_of_views_per_month = view_events.groupBy("user_id", "year", "month").agg(
    _count("event_type").alias("number_of_views_per_month")
)

# 2. Number of "cart" events (products added to the cart) per user per month
cart_events = filtered_df.where(col("event_type") == "cart")
number_of_carts_per_month = cart_events.groupBy("user_id", "year", "month").agg(
    _count("event_type").alias("number_of_carts_per_month")
)

# 3. Number of "purchase" events per user per month
purchase_events = filtered_df.where(col("event_type") == "purchase")
previous_purchases_per_month = purchase_events.groupBy("user_id", "year", "month").agg(
    _count("event_type").alias("user_previous_purchases_per_month")
)

# 4. Mean purchase price per user per month, rounded to 2 decimals
average_purchase_value_per_month = purchase_events.groupBy("user_id", "year", "month").agg(
    _round(_avg("price"), 2).alias("user_average_purchase_value_per_month")
)

# 5. Time elapsed since each user's most recent purchase
purchases_only = filtered_df.where(col("event_type") == "purchase")

# Latest purchase timestamp per user
last_purchase_date = purchases_only.groupBy("user_id").agg(
    _max("event_time").alias("last_purchase_date")
)

# Whole days between that timestamp and the date on which the job runs
elapsed_days = datediff(current_date(), col("last_purchase_date"))
days_since_last_purchase = last_purchase_date.withColumn("days_since_last_purchase", elapsed_days)

# 6. Cart abandonments per user per month: products added to the cart that the
#    same user did not purchase in the same month.
#
# This is an anti-join: keep the carted (user, product, month) combinations that
# have no matching purchase. A left join followed by a blanket na.fill(0) would
# also overwrite every other numeric column, including NULL user_id / product_id
# keys, inventing an id of 0. The anti-join needs no NULL filling at all.
abandonment_keys = ["user_id", "product_id", "year", "month"]

# One row per product a user put in the cart in a given month
carted_products = (
    filtered_df
    .filter(col("event_type") == "cart")
    .select(*abandonment_keys)
    .distinct()
)

# One row per product a user purchased in a given month
purchased_products = purchase_df.select(*abandonment_keys).distinct()

cart_abandonments_per_month = (
    carted_products
    .join(purchased_products, on=abandonment_keys, how="left_anti")
    .groupBy("user_id", "year", "month")
    .agg(_count("product_id").alias("cart_abandonments_per_month"))
)

# 7. Total amount spent on purchases per user per month, rounded to 2 decimals
monthly_purchases = filtered_df.where(col("event_type") == "purchase").groupBy("user_id", "year", "month")
total_purchase_value_per_month = monthly_purchases.agg(
    _round(_sum("price"), 2).alias("total_purchase_value_per_month")
)

# 8. Number of sessions per user per month.
# A session spans many events, so the session ids must be counted distinctly:
# a plain count("user_session") returns the number of events, not of sessions.
number_of_sessions_per_month = filtered_df.groupBy("user_id", "year", "month") \
                                          .agg(countDistinct("user_session").alias("number_of_sessions_per_month"))

# Mean purchase price per user, used to assign the user segment:
# an average of 100 or more -> "high spender", anything else -> "regular buyer".
segment_label = when(col("avg_purchase_value") >= 100, "high spender").otherwise("regular buyer")

average_purchase_value = (
    filtered_df
    .where(col("event_type") == "purchase")
    .groupBy("user_id")
    .agg(_avg("price").alias("avg_purchase_value"))
    .withColumn("user_segment", segment_label)
)

# Preferred product category per user: the category_code with the most purchases.
preferred_category = filtered_df.filter(col("event_type") == "purchase") \
                                .groupBy("user_id", "category_code") \
                                .agg(_count("category_code").alias("category_count"))

# row_number() keeps exactly one row per user. Ordering by the count alone leaves
# ties in an unspecified order, so the chosen category could change between runs;
# the category_code tie-breaker makes the pick deterministic.
window_spec = Window.partitionBy("user_id").orderBy(col("category_count").desc(), col("category_code").asc())
preferred_category = preferred_category.withColumn("rank", row_number().over(window_spec)) \
                                       .filter(col("rank") == 1) \
                                       .select("user_id", "category_code") \
                                       .withColumnRenamed("category_code", "preferred_category")

# Brand loyalty indicator: the brand each user purchased most often.
brand_loyalty = filtered_df.filter(col("event_type") == "purchase") \
                           .groupBy("user_id", "brand") \
                           .agg(_count("brand").alias("brand_count"))

# row_number() keeps exactly one row per user. Ordering by the count alone leaves
# ties in an unspecified order, so the chosen brand could change between runs;
# the brand tie-breaker makes the pick deterministic.
window_spec = Window.partitionBy("user_id").orderBy(col("brand_count").desc(), col("brand").asc())
brand_loyalty = brand_loyalty.withColumn("rank", row_number().over(window_spec)) \
                             .filter(col("rank") == 1) \
                             .select("user_id", "brand") \
                             .withColumnRenamed("brand", "preferred_brand")

# Assemble the final DataFrame: left-join every feature table onto the events.
# The joins run in the order listed, which determines the output column order.
monthly_keys = ["user_id", "year", "month"]
feature_joins = [
    (number_of_views_per_month, monthly_keys),
    (number_of_carts_per_month, monthly_keys),
    (previous_purchases_per_month, monthly_keys),
    (average_purchase_value_per_month, monthly_keys),
    (days_since_last_purchase, "user_id"),
    (cart_abandonments_per_month, monthly_keys),
    (total_purchase_value_per_month, monthly_keys),
    (number_of_sessions_per_month, monthly_keys),
    (average_purchase_value.select("user_id", "user_segment"), "user_id"),
    (preferred_category, "user_id"),
    (brand_loyalty, "user_id"),
]

final_df = filtered_df
for feature_table, join_keys in feature_joins:
    final_df = final_df.join(feature_table, on=join_keys, how="left")

# Remove fully identical rows
final_df = final_df.distinct()

# Default values substituted for NULLs in the aggregated feature columns
# (rows that had no match in the corresponding feature table).
feature_defaults = {
    "number_of_views_per_month": 0,
    "number_of_carts_per_month": 0,
    "user_previous_purchases_per_month": 0,
    "user_average_purchase_value_per_month": 0.0,
    # 9999 is a sentinel meaning "no previous purchase"
    "days_since_last_purchase": 9999,
    "cart_abandonments_per_month": 0,
    "total_purchase_value_per_month": 0.0,
    "number_of_sessions_per_month": 0,
}
final_df = final_df.na.fill(feature_defaults)

# Persist the final DataFrame first. Spark evaluates lazily and nothing here is
# cached, so every action on final_df (show, write, a later show) would re-run
# the whole join + distinct pipeline from scratch. Writing once and inspecting
# the saved result means the pipeline is computed a single time.
final_output_path = "/home/jovyan/work/final_df_output.parquet"
final_df.write.mode("overwrite").parquet(final_output_path)

# Re-point final_df at the materialised Parquet data so this and any later
# inspection only reads the saved files.
final_df = spark.read.parquet(final_output_path)

# Print the column names to check the new variables
print("Les noms des colonnes du DataFrame final sont :")
for column in final_df.columns:
    print(column)

# Show a sample of the data to check the new variables
final_df.show(10)




                                                                                

Les noms des colonnes du DataFrame final sont :
user_id
year
month
event_time
event_type
product_id
category_id
category_code
brand
price
user_session
event_day_of_week
event_hour
event_weekend
price_category
time_of_day
number_of_views_per_month
number_of_carts_per_month
user_previous_purchases_per_month
user_average_purchase_value_per_month
last_purchase_date
days_since_last_purchase
cart_abandonments_per_month
total_purchase_value_per_month
number_of_sessions_per_month
user_segment
preferred_category
preferred_brand


24/06/23 15:23:10 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+---------+----+-----+-------------------+----------+----------+-------------------+--------------------+-------+------+--------------------+-----------------+----------+-------------+--------------+-----------+-------------------------+-------------------------+---------------------------------+-------------------------------------+-------------------+------------------------+---------------------------+------------------------------+----------------------------+------------+--------------------+---------------+
|  user_id|year|month|         event_time|event_type|product_id|        category_id|       category_code|  brand| price|        user_session|event_day_of_week|event_hour|event_weekend|price_category|time_of_day|number_of_views_per_month|number_of_carts_per_month|user_previous_purchases_per_month|user_average_purchase_value_per_month| last_purchase_date|days_since_last_purchase|cart_abandonments_per_month|total_purchase_value_per_month|number_of_sessions_per_month|user_segment|

                                                                                

In [5]:
# Display the first 10 rows of the final DataFrame to check the new variables
final_df.show(n=10, truncate=True)

ERROR:root:Exception while sending command.+ 0) / 2][Stage 168:>  (0 + 0) / 2]1]
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/py4j/clientserver.py", line 516, in send_comman

Py4JError: An error occurred while calling o408.showString

In [None]:
# # Arrêter la session Spark
# spark.stop()