# Initialiser la Session Spark et Lire les Données

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import row_number, split, month, year, dayofweek, hour, col, sum as _sum, avg as _avg, count as _count, max as _max, round as _round, when, coalesce, datediff, current_date
from pyspark.sql.window import Window

# Créer une session Spark avec des configurations optimisées
spark = SparkSession.builder \
    .appName("E-commerce Amazing Analysis") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.sql.shuffle.partitions", "200") \
    .config("spark.driver.cores", "2") \
    .config("spark.executor.cores", "2") \
    .config("spark.memory.fraction", "0.8") \
    .config("spark.memory.storageFraction", "0.2") \
    .config("spark.memory.offHeap.enabled", True) \
    .config("spark.memory.offHeap.size", "4g") \
    .getOrCreate()

# Définir le chemin du fichier Parquet
output_path = "/home/jovyan/work/filtered_df_output.parquet"

# Lire le fichier Parquet
filtered_df = spark.read.parquet(output_path)


ModuleNotFoundError: No module named 'pyspark'

# Prétraitement des Données

In [4]:
# Ajouter les colonnes 'month' et 'year'
filtered_df = filtered_df.withColumn("month", month(col("event_time"))) \
                         .withColumn("year", year(col("event_time")))

# Filtrer les événements d'achat
purchase_df = filtered_df.filter(col("event_type") == "purchase")

# Extraire les paires uniques category_id et category_code
category_mapping_df = purchase_df.select("category_id", "category_code").distinct()

# Sauvegarder ce mapping dans un fichier Parquet pour une utilisation ultérieure
mapping_output_path = "/home/jovyan/work/category_mapping.parquet"
category_mapping_df.write.mode("overwrite").parquet(mapping_output_path)

# Charger le mapping depuis le fichier Parquet
category_mapping_df = spark.read.parquet(mapping_output_path)

# Renommer la colonne 'category_code' dans le DataFrame de mapping pour éviter l'ambiguïté
category_mapping_df = category_mapping_df.withColumnRenamed("category_code", "mapped_category_code")

# Joindre purchase_df avec category_mapping_df pour ajouter la colonne 'mapped_category_code'
purchase_df_with_mapping = purchase_df.join(category_mapping_df, on="category_id", how="left")

# Remplacer les valeurs NULL dans 'category_code' par les valeurs correspondantes de la jointure
purchase_df = purchase_df_with_mapping.withColumn(
    "category_code",
    coalesce(purchase_df_with_mapping["category_code"], purchase_df_with_mapping["mapped_category_code"])
)


                                                                                

# Calculer les Variables Quantitatives et Qualitatives

## Créer des variables qualitatives à partir de la colonne 'event_time'

In [None]:
filtered_df = filtered_df.withColumn("event_day_of_week", dayofweek(col("event_time"))) \
                         .withColumn("event_hour", hour(col("event_time"))) \
                         .withColumn("event_weekend", when(col("event_day_of_week").isin([1, 7]), "weekend").otherwise("weekday")) \
                         .withColumn("price_category", when(col("price") < 50, "low") \
                                                        .when((col("price") >= 50) & (col("price") < 200), "medium") \
                                                        .otherwise("high")) \
                         .withColumn("time_of_day", when(col("event_hour").between(0, 6), "night") \
                                                    .when(col("event_hour").between(7, 12), "morning") \
                                                    .when(col("event_hour").between(13, 18), "afternoon") \
                                                    .otherwise("evening"))

### 1. Nombre total de vues par utilisateur

In [None]:
number_of_views = filtered_df.filter(col("event_type") == "view") \
                             .groupBy("user_id") \
                             .agg(_count("event_type").alias("number_of_views"))

### 2. Nombre total de produits ajoutés au panier par utilisateur

In [None]:
number_of_carts = filtered_df.filter(col("event_type") == "cart") \
                             .groupBy("user_id") \
                             .agg(_count("event_type").alias("number_of_carts"))


### 3. Nombre total d'achats précédents par utilisateur


In [None]:
previous_purchases = filtered_df.filter(col("event_type") == "purchase") \
                                .groupBy("user_id") \
                                .agg(_count("event_type").alias("user_previous_purchases"))

### 4. Valeur moyenne des achats précédents par utilisateur

In [None]:
average_purchase_value = filtered_df.filter(col("event_type") == "purchase") \
                                    .groupBy("user_id") \
                                    .agg(_round(_avg("price"), 2).alias("user_average_purchase_value"))

### 5. Temps écoulé depuis le dernier achat

In [None]:

last_purchase_date = filtered_df.filter(col("event_type") == "purchase") \
                                .groupBy("user_id") \
                                .agg(_max("event_time").alias("last_purchase_date"))
days_since_last_purchase = last_purchase_date.withColumn("days_since_last_purchase", 
                                                         datediff(current_date(), col("last_purchase_date")))

### 6. Nombre de produits ajoutés au panier mais non achetés (abandons de panier)

In [None]:

cart_abandonments = filtered_df.filter(col("event_type") == "cart") \
                               .groupBy("user_id", "product_id") \
                               .agg(_count("event_type").alias("cart_count")) \
                               .join(purchase_df.groupBy("user_id", "product_id").agg(_count("event_type").alias("purchase_count")),
                                     on=["user_id", "product_id"], how="left") \
                               .withColumn("purchase_count", col("purchase_count").cast("int")) \
                               .na.fill(0) \
                               .filter(col("purchase_count") == 0) \
                               .groupBy("user_id") \
                               .agg(_count("product_id").alias("cart_abandonments"))

### 7. Valeur totale des achats par utilisateur

In [None]:

total_purchase_value = filtered_df.filter(col("event_type") == "purchase") \
                                  .groupBy("user_id") \
                                  .agg(_round(_sum("price"), 2).alias("total_purchase_value"))


### 8. Nombre total de sessions par utilisateur

In [None]:
number_of_sessions = filtered_df.groupBy("user_id") \
                                .agg(_count("user_session").alias("number_of_sessions"))

### 9. Calculer la valeur moyenne des achats pour déterminer le segment utilisateur

In [None]:
average_purchase_value = average_purchase_value.withColumn("user_segment", when(col("user_average_purchase_value") >= 100, "high spender").otherwise("regular buyer"))


### 10. Identifier la catégorie de produit préférée de l'utilisateur

In [None]:
preferred_category = filtered_df.filter(col("event_type") == "purchase") \
                                .groupBy("user_id", "category_code") \
                                .agg(_count("category_code").alias("category_count"))

window_spec = Window.partitionBy("user_id").orderBy(col("category_count").desc())
preferred_category = preferred_category.withColumn("rank", row_number().over(window_spec)) \
                                       .filter(col("rank") == 1) \
                                       .select("user_id", "category_code") \
                                       .withColumnRenamed("category_code", "preferred_category")

### 11. Indicateur de fidélité à une marque spécifique

In [5]:
brand_loyalty = filtered_df.filter(col("event_type") == "purchase") \
                           .groupBy("user_id", "brand") \
                           .agg(_count("brand").alias("brand_count"))

window_spec = Window.partitionBy("user_id").orderBy(col("brand_count").desc())
brand_loyalty = brand_loyalty.withColumn("rank", row_number().over(window_spec)) \
                             .filter(col("rank") == 1) \
                             .select("user_id", "brand") \
                             .withColumnRenamed("brand", "preferred_brand")


# Joindre les Variables Quantitatives et Qualitatives

## Joindre les DataFrames pour créer le DataFrame final

In [None]:
final_df = number_of_views.join(number_of_carts, "user_id", "left") \
                          .join(previous_purchases, "user_id", "left") \
                          .join(average_purchase_value.select("user_id", "user_average_purchase_value", "user_segment"), "user_id", "left") \
                          .join(days_since_last_purchase, "user_id", "left") \
                          .join(cart_abandonments, "user_id", "left") \
                          .join(total_purchase_value, "user_id", "left") \
                          .join(number_of_sessions, "user_id", "left") \
                          .join(preferred_category, "user_id", "left") \
                          .join(brand_loyalty, "user_id", "left") \
                          .distinct()  # Suppression des duplicatas si nécessaire


## Remplacer les valeurs NULL par des valeurs par défaut si nécessaire

In [None]:
final_df = final_df.fillna({
    "number_of_views": 0,
    "number_of_carts": 0,
    "user_previous_purchases": 0,
    "user_average_purchase_value": 0.0,
    "days_since_last_purchase": 9999,  # Utiliser une valeur par défaut pour indiquer une absence de précédent achat
    "cart_abandonments": 0,
    "total_purchase_value": 0.0,
    "number_of_sessions": 0,
    "preferred_category": "unknown",
    "preferred_brand": "unknown"
})

In [6]:
# Afficher les noms des colonnes pour vérifier les nouvelles variables
# print("Les noms des colonnes du DataFrame final sont :")
# for column in final_df.columns:
#     print(column)

# Afficher un échantillon des données pour vérifier les nouvelles variables
final_df.show(10)

Les noms des colonnes du DataFrame final sont :
user_id
number_of_views
number_of_carts
user_previous_purchases
user_average_purchase_value
user_segment
last_purchase_date
days_since_last_purchase
cart_abandonments
total_purchase_value
number_of_sessions
preferred_category
preferred_brand


                                                                                

+---------+---------------+---------------+-----------------------+---------------------------+-------------+-------------------+------------------------+-----------------+--------------------+------------------+--------------------+---------------+
|  user_id|number_of_views|number_of_carts|user_previous_purchases|user_average_purchase_value| user_segment| last_purchase_date|days_since_last_purchase|cart_abandonments|total_purchase_value|number_of_sessions|  preferred_category|preferred_brand|
+---------+---------------+---------------+-----------------------+---------------------------+-------------+-------------------+------------------------+-----------------+--------------------+------------------+--------------------+---------------+
|346998191|            207|              2|                      1|                       96.5|regular buyer|2020-03-13 14:25:20|                    1564|                1|                96.5|               210|appliances.enviro...|          huter|


# Sauvegarder le DataFrame final dans un fichier Parquet

In [None]:
final_output_path = "/home/jovyan/work/final_df_output.parquet"
final_df.write.mode("overwrite").parquet(final_output_path)

## Arrêter la session Spark

In [None]:
spark.stop()