In [1]:
output_path = "/home/jovyan/work/filtered_df_output.parquet"

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("E-commerce Amazing Analysis") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.sql.shuffle.partitions", "4") \
    .config("spark.driver.cores", "2") \
    .config("spark.executor.cores", "2") \
    .getOrCreate()


# Augmenter la fraction de mémoire allouée aux tâches d'exécution
spark.conf.set("spark.memory.fraction", "0.8")

# Ajuster la fraction de mémoire réservée pour la gestion interne
spark.conf.set("spark.memory.storageFraction", "0.2")

from pyspark.sql.functions import row_number, col, split, month, year, datediff, sum as _sum,\
    avg as _avg, count as _count, max as _max, current_date, round as _round, coalesce
from pyspark.sql.window import Window

# Lire le fichier Parquet
filtered_df = spark.read.parquet(output_path)

# Exemple de repartitionnement des données
filtered_df = filtered_df.repartition(100)






# Ajouter les colonnes 'month' et 'year'
filtered_df = filtered_df.withColumn("month", month(col("event_time"))) \
                         .withColumn("year", year(col("event_time")))

# Filtrer les événements d'achat
purchase_df = filtered_df.filter(col("event_type") == "purchase")

# Extraire les paires uniques category_id et category_code
category_mapping_df = purchase_df.select("category_id", "category_code").distinct()

# Sauvegarder ce mapping dans un fichier Parquet pour une utilisation ultérieure
mapping_output_path = "/home/jovyan/work/category_mapping.parquet"
category_mapping_df.write.mode("overwrite").parquet(mapping_output_path)

# Charger le mapping depuis le fichier Parquet
category_mapping_df = spark.read.parquet(mapping_output_path)

# Renommer la colonne 'category_code' dans le DataFrame de mapping pour éviter l'ambiguïté
category_mapping_df = category_mapping_df.withColumnRenamed("category_code", "mapped_category_code")

# Joindre purchase_df avec category_mapping_df pour ajouter la colonne 'mapped_category_code'
purchase_df_with_mapping = purchase_df.join(category_mapping_df, on="category_id", how="left")

# Remplacer les valeurs NULL dans 'category_code' par les valeurs correspondantes de la jointure
purchase_df = purchase_df_with_mapping.withColumn(
    "category_code",
    coalesce(purchase_df_with_mapping["category_code"], purchase_df_with_mapping["mapped_category_code"])
)

# Calcul des variables explicatives supplémentaires
# 1. Nombre de vues par utilisateur
number_of_views = filtered_df.filter(col("event_type") == "view") \
                             .groupBy("user_id") \
                             .agg(_count("event_type").alias("number_of_views"))

# 2. Nombre de produits ajoutés au panier par utilisateur
number_of_carts = filtered_df.filter(col("event_type") == "cart") \
                             .groupBy("user_id") \
                             .agg(_count("event_type").alias("number_of_carts"))

# 3. Nombre total d'achats précédents par utilisateur
previous_purchases = filtered_df.filter(col("event_type") == "purchase") \
                                .groupBy("user_id") \
                                .agg(_count("event_type").alias("user_previous_purchases"))

# 4. Valeur moyenne des achats précédents par utilisateur
average_purchase_value = filtered_df.filter(col("event_type") == "purchase") \
                                    .groupBy("user_id") \
                                    .agg(_round(_avg("price"), 2).alias("user_average_purchase_value"))

# 5. Temps écoulé depuis le dernier achat
last_purchase_date = filtered_df.filter(col("event_type") == "purchase") \
                                .groupBy("user_id") \
                                .agg(_max("event_time").alias("last_purchase_date"))

days_since_last_purchase = last_purchase_date.withColumn("days_since_last_purchase", 
                                                         datediff(current_date(), col("last_purchase_date")))

# 6. Nombre de produits ajoutés au panier mais non achetés (abandons de panier)
cart_abandonments = filtered_df.filter(col("event_type") == "cart") \
                               .groupBy("user_id", "product_id") \
                               .agg(_count("event_type").alias("cart_count")) \
                               .join(purchase_df.groupBy("user_id", "product_id").agg(_count("event_type").alias("purchase_count")),
                                     on=["user_id", "product_id"], how="left") \
                               .withColumn("purchase_count", col("purchase_count").cast("int")) \
                               .na.fill(0) \
                               .filter(col("purchase_count") == 0) \
                               .groupBy("user_id") \
                               .agg(_count("product_id").alias("cart_abandonments"))

# 7. Valeur totale des achats par utilisateur
total_purchase_value = filtered_df.filter(col("event_type") == "purchase") \
                                  .groupBy("user_id") \
                                  .agg(_round(_sum("price"), 2).alias("total_purchase_value"))

# 8. Nombre total de sessions par utilisateur
number_of_sessions = filtered_df.groupBy("user_id") \
                                .agg(_count("user_session").alias("number_of_sessions"))

# Joindre les DataFrames pour créer le DataFrame final
final_df = number_of_views.join(number_of_carts, "user_id", "left") \
                          .join(previous_purchases, "user_id", "left") \
                          .join(average_purchase_value, "user_id", "left") \
                          .join(days_since_last_purchase, "user_id", "left") \
                          .join(cart_abandonments, "user_id", "left") \
                          .join(total_purchase_value, "user_id", "left") \
                          .join(number_of_sessions, "user_id", "left")

# Remplacer les valeurs NULL par des valeurs par défaut si nécessaire
final_df = final_df.fillna({
    "number_of_views": 0,
    "number_of_carts": 0,
    "user_previous_purchases": 0,
    "user_average_purchase_value": 0.0,
    "days_since_last_purchase": 9999,  # Utiliser une valeur par défaut pour indiquer une absence de précédent achat
    "cart_abandonments": 0,
    "total_purchase_value": 0.0,
    "number_of_sessions": 0
})

# Sauvegarder le DataFrame final dans un fichier Parquet
final_output_path = "/home/jovyan/work/final_df_output.parquet"
final_df.write.mode("overwrite").parquet(final_output_path)




Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/20 05:28:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [2]:
final_df.show()

                                                                                

+---------+---------------+---------------+-----------------------+---------------------------+-------------------+------------------------+-----------------+--------------------+------------------+
|  user_id|number_of_views|number_of_carts|user_previous_purchases|user_average_purchase_value| last_purchase_date|days_since_last_purchase|cart_abandonments|total_purchase_value|number_of_sessions|
+---------+---------------+---------------+-----------------------+---------------------------+-------------------+------------------------+-----------------+--------------------+------------------+
|512609101|             39|              1|                      0|                        0.0|               NULL|                    9999|                1|                 0.0|                40|
|515230351|            123|              1|                      0|                        0.0|               NULL|                    9999|                1|                 0.0|               124|
|5196

In [3]:
# Lire le fichier Parquet
filtered_df = spark.read.parquet(output_path)

# Afficher les noms des colonnes
columns = filtered_df.columns
print("Les noms des colonnes du DataFrame sont :")
for column in columns:
    print(column)

Les noms des colonnes du DataFrame sont :
event_time
event_type
product_id
category_id
category_code
brand
price
user_id
user_session


In [4]:


from pyspark.sql.functions import col, year, month, dayofweek, hour, when, count, max as _max, lit



# Lire le fichier Parquet
filtered_df = spark.read.parquet(output_path)

# Extraire les colonnes nécessaires
filtered_df = filtered_df.select("event_time", "event_type", "product_id", "category_id", "category_code", "brand", "price", "user_id", "user_session")

# Créer de nouvelles variables qualitatives
filtered_df = filtered_df.withColumn("event_year", year(col("event_time"))) \
                         .withColumn("event_month", month(col("event_time"))) \
                         .withColumn("event_day_of_week", dayofweek(col("event_time"))) \
                         .withColumn("event_hour", hour(col("event_time"))) \
                         .withColumn("event_weekend", when(col("event_day_of_week").isin([1, 7]), "weekend").otherwise("weekday")) \
                         .withColumn("price_category", when(col("price") < 50, "low") \
                                                        .when((col("price") >= 50) & (col("price") < 200), "medium") \
                                                        .otherwise("high")) \
                         .withColumn("time_of_day", when(col("event_hour").between(0, 6), "night") \
                                                    .when(col("event_hour").between(7, 12), "morning") \
                                                    .when(col("event_hour").between(13, 18), "afternoon") \
                                                    .otherwise("evening"))

# Calculer la valeur moyenne des achats pour déterminer le segment utilisateur
average_purchase_value = filtered_df.filter(col("event_type") == "purchase") \
                                    .groupBy("user_id") \
                                    .agg(_avg("price").alias("avg_purchase_value"))

average_purchase_value = average_purchase_value.withColumn("user_segment", when(col("avg_purchase_value") >= 100, "high spender").otherwise("regular buyer"))

# Identifier la catégorie de produit préférée de l'utilisateur
preferred_category = filtered_df.filter(col("event_type") == "purchase") \
                                .groupBy("user_id", "category_code") \
                                .agg(count("category_code").alias("category_count"))

window_spec = Window.partitionBy("user_id").orderBy(col("category_count").desc())
preferred_category = preferred_category.withColumn("rank", row_number().over(window_spec)) \
                                       .filter(col("rank") == 1) \
                                       .select("user_id", "category_code") \
                                       .withColumnRenamed("category_code", "preferred_category")

# Indicateur de fidélité à une marque spécifique
brand_loyalty = filtered_df.filter(col("event_type") == "purchase") \
                           .groupBy("user_id", "brand") \
                           .agg(count("brand").alias("brand_count"))

window_spec = Window.partitionBy("user_id").orderBy(col("brand_count").desc())
brand_loyalty = brand_loyalty.withColumn("rank", row_number().over(window_spec)) \
                             .filter(col("rank") == 1) \
                             .select("user_id", "brand") \
                             .withColumnRenamed("brand", "preferred_brand")

# Ajouter une colonne factice pour le type d'appareil (exemple)
# Note: Cela nécessite d'avoir une colonne dans les données initiales qui identifie le type d'appareil.
# Pour l'exemple, nous allons générer une colonne aléatoire.
import random
device_types = ["mobile", "desktop", "tablet"]
filtered_df = filtered_df.withColumn("device_type", lit(random.choice(device_types)))

# Joindre les nouvelles variables qualitatives au DataFrame final
final_df = filtered_df.join(average_purchase_value.select("user_id", "user_segment"), on="user_id", how="left") \
                      .join(preferred_category, on="user_id", how="left") \
                      .join(brand_loyalty, on="user_id", how="left") \
                      .distinct()  # Suppression des duplicatas si nécessaire

# Afficher les noms des colonnes pour vérifier les nouvelles variables
print("Les noms des colonnes du DataFrame modifié sont :")
for column in final_df.columns:
    print(column)

# Afficher un échantillon des données pour vérifier les nouvelles variables
final_df.show(10)



Les noms des colonnes du DataFrame modifié sont :
user_id
event_time
event_type
product_id
category_id
category_code
brand
price
user_session
event_year
event_month
event_day_of_week
event_hour
event_weekend
price_category
time_of_day
device_type
user_segment
preferred_category
preferred_brand


24/06/20 05:51:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/06/20 05:51:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/06/20 05:51:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/06/20 05:51:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/06/20 05:52:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/06/20 05:52:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/06/20 05:52:10 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/06/20 05:52:10 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/06/20 05:52:15 WARN RowBasedKeyValueBatch: Calling spill() on

+---------+-------------------+----------+----------+-------------------+--------------------+--------+-------+--------------------+----------+-----------+-----------------+----------+-------------+--------------+-----------+-----------+------------+------------------+---------------+
|  user_id|         event_time|event_type|product_id|        category_id|       category_code|   brand|  price|        user_session|event_year|event_month|event_day_of_week|event_hour|event_weekend|price_category|time_of_day|device_type|user_segment|preferred_category|preferred_brand|
+---------+-------------------+----------+----------+-------------------+--------------------+--------+-------+--------------------+----------+-----------+-----------------+----------+-------------+--------------+-----------+-----------+------------+------------------+---------------+
|220134341|2020-01-21 14:26:08|      view|  12300304|2053013563743667055|appliances.kitche...|  alteco|  43.24|d393e798-57d7-4f1...|      2020

                                                                                

In [None]:
# Arrêter la session Spark
spark.stop()