In [38]:
from pyspark.sql import Row
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType

In [39]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [40]:
import pandas as pd
import matplotlib.pyplot as plt

# Chargement des données

In [41]:
def _read_olist_table(table_name):
    """Return every row of one `Database_Olist` table as a Spark DataFrame."""
    return spark.sql(f"SELECT * FROM `Database_Olist`.`{table_name}`")


# One DataFrame per source table used further down in the notebook.
customers = _read_olist_table("customers")
orders = _read_olist_table("orders")
products = _read_olist_table("products")
order_items = _read_olist_table("order_items")
states_name = _read_olist_table("states_name")

# Jointure des Dataframes

In [42]:
# Build one wide DataFrame: every order item, left-joined to its product, its
# order, the ordering customer and the customer's state name, with the purchase
# timestamp additionally split into year / month / day-of-month columns.
purchase_ts = F.col("o.order_purchase_timestamp")

# Columns kept in the final frame; the prefixes are the join aliases below.
sales_columns = [
    "i.order_id",
    "i.product_id",
    "o.customer_id",
    "i.seller_id",
    "i.order_item_id",
    "p.product_category_name",
    "i.price",
    "i.freight_value",
    "i.total_items_value",
    "i.total_freight_value",
    "i.total_order_value",
    "p.product_description_length",
    "p.product_photos_qty",
    "o.order_status",
    "o.order_purchase_timestamp",
    "order_purchase_year",
    "order_purchase_month",
    "order_purchase_day",
    "c.customer_city",
    "s.state_name",
]

sales = (
    order_items.alias("i")
    .join(products.alias("p"), F.col("i.product_id") == F.col("p.product_id"), "left")
    .join(orders.alias("o"), F.col("i.order_id") == F.col("o.order_id"), "left")
    .join(customers.alias("c"), F.col("o.customer_id") == F.col("c.customer_id"), "left")
    .join(states_name.alias("s"), F.col("c.customer_state") == F.col("s.geolocation_state"), "left")
    .withColumn("order_purchase_year", F.year(purchase_ts))
    .withColumn("order_purchase_month", F.month(purchase_ts))
    .withColumn("order_purchase_day", F.dayofmonth(purchase_ts))
    .select(*sales_columns)
)

sales.printSchema()

# Régression linéaire

In [43]:
# Linear regression: empty, typed DataFrame that the per-category forecasts
# are appended to later on.
# (column name, Spark type, nullable)
prediction_fields = [
    ("category", StringType(), False),
    ("sales", IntegerType(), False),
    ("year", IntegerType(), False),
    ("month", IntegerType(), False),
    ("date", TimestampType(), True),
]

schema = StructType([
    StructField(field_name, field_type, is_nullable)
    for field_name, field_type, is_nullable in prediction_fields
])

price_predictions = spark.createDataFrame(sc.emptyRDD(), schema)

price_predictions.show()

In [44]:
# Collect every distinct product category, sorted in ascending order.
# The sort key is the aliased output column ("category"): after the select and
# distinct, "product_category_name" is no longer a column of the frame, so
# sorting on it only worked through the analyzer's missing-attribute fallback.
categories = (
    sales.select(F.col("product_category_name").alias("category"))
    .distinct()
    .orderBy(F.col("category").asc())
    .collect()
)

# Keep only the columns the linear regression needs, under short names
sales_reg = sales.select(
    F.col("order_purchase_year").alias("year"),
    F.col("order_purchase_month").alias("month"),
    F.col("product_category_name").alias("category"),
    F.col("total_order_value").alias("sales")
)

In [45]:
# Linear-regression helper, called once per product category
def regression_lineaire(df_train, future_date, category):
    """Fit a linear model of monthly sales on (year, month) and forecast future months.

    Args:
        df_train: Spark DataFrame with ``year``, ``month`` and ``sales`` columns.
            Rows are summed per (year, month) before fitting.
        future_date: list of ``[year, month]`` pairs to predict.
        category: value written into the ``category`` column of every returned row.

    Returns:
        Spark DataFrame with columns ``category``, ``sales``, ``year``, ``month``,
        ``date``. It holds the observed monthly totals together with one predicted
        row per entry of ``future_date``; ``date`` is the first day of the month.

    Uses the notebook-level ``spark`` session.
    """
    # Monthly totals are the training points.
    monthly_sales = df_train.groupBy("year", "month") \
        .agg(F.sum("sales").alias("sales"))

    # Pack the two predictors into the single vector column Spark ML expects.
    assembler = VectorAssembler(
        inputCols=["year", "month"],
        outputCol="features"
    )
    training_data = assembler.transform(monthly_sales)

    # Fit the model; extra columns (year, month) are ignored by the estimator.
    lr = LinearRegression(featuresCol="features", labelCol="sales")
    lr_model = lr.fit(training_data)

    # Score the requested future months with the same assembler.
    future_months = spark.createDataFrame(future_date, ["year", "month"])
    forecast = lr_model.transform(assembler.transform(future_months))

    def monthly_rows(frame, value_column):
        # Normalise both sides to (year: long, month: long, sales: double) so the
        # union below has one unambiguous schema.
        return frame.select(
            F.col("year").cast("long").alias("year"),
            F.col("month").cast("long").alias("month"),
            F.col(value_column).cast("double").alias("sales"),
        )

    # year and month are carried through as ordinary columns, so the rows never
    # have to be collected to the driver to unpack the feature vector.
    combined = monthly_rows(training_data, "sales") \
        .unionByName(monthly_rows(forecast, "prediction"))

    df = combined \
        .withColumn("category", F.lit(category)) \
        .withColumn("date", F.to_date(F.concat(F.col("year"), F.lit("-"), F.col("month"), F.lit("-01")))) \
        .select("category", "sales", "year", "month", "date")

    return df

In [46]:
# [year, month] pairs to forecast: Sep-Dec 2018, then every month of 2019-2021
future_date = [[2018,9], [2018, 10], [2018, 11], [2018, 12]] + \
    [[year, month] for year in range(2019, 2022) for month in range(1, 13)]

# Start again from an empty frame so that re-running this cell does not append
# a second copy of every forecast to the result of the previous run.
price_predictions = spark.createDataFrame(sc.emptyRDD(), schema)

# Forecast sales for each category
for c in categories:
    category = c.category

    # A null category cannot be selected with `==` (the comparison is never
    # true in Spark), which would leave the regression with no training rows.
    if category is None:
        continue

    df_train = sales_reg.filter(F.col("category") == category)
    df = regression_lineaire(df_train, future_date, category)
    price_predictions = price_predictions.union(df)


price_predictions.show()

In [47]:
# Number of categories to keep
n = 10

# Latest date present in the predictions
max_date_row = price_predictions.agg(F.max("date")).head()
max_date = max_date_row[0]

# The n categories with the highest sales value on that date.
# Filter and sort while "date" and "sales" are still columns of the frame,
# and only then project down to "category".
top_category_rows = (
    price_predictions
    .filter(F.col("date") == max_date)
    .orderBy(F.col("sales").desc())
    .select("category")
    .head(n)
)

# Plain Python list of category names
best_categories_predicted = [row.category for row in top_category_rows]
best_categories_predicted

In [48]:
# Restrict the predictions to the n best categories selected above
is_top_category = F.col("category").isin(best_categories_predicted)
price_predictions_top_category = price_predictions.where(is_top_category)

price_predictions_top_category.show()

# Représentation graphique des prédictions

In [49]:
# Bring the predictions to pandas once, already sorted by category and date,
# instead of running one Spark job per category inside the loop.
all_predictions_pd = price_predictions.orderBy("category", "date").toPandas()

# Create the figure
plt.figure(figsize=(10, 6))

for c in categories:
    category = c.category

    # Rows of this category; the boolean mask keeps the date ordering
    predictions_pd = all_predictions_pd[all_predictions_pd["category"] == category]

    # Draw one line per category that has data
    if not predictions_pd.empty:
        plt.plot(predictions_pd['date'], predictions_pd['sales'], label=category)


plt.xlabel('Date')
plt.ylabel('Sales')
plt.title('Linear Regression Predictions')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.grid(True)
plt.show()

In [50]:
# Bring the top-category predictions to pandas once, already sorted by
# category and date, instead of running one Spark job per category.
top_predictions_pd = price_predictions_top_category.orderBy("category", "date").toPandas()

# Create the figure
plt.figure(figsize=(10, 6))

for category in best_categories_predicted:
    # Rows of this category; the boolean mask keeps the date ordering
    predictions_pd = top_predictions_pd[top_predictions_pd["category"] == category]

    # Draw one line per category that has data
    if not predictions_pd.empty:
        plt.plot(predictions_pd['date'], predictions_pd['sales'], label=category)


plt.xlabel('Date')
plt.ylabel('Sales')
plt.title('Linear Regression Predictions Top Categories')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.grid(True)
plt.show()

# Écriture des DataFrames au format Parquet

In [51]:
# Destination folders in ADLS Gen2
predictions_path = "abfss://projetcloud@datalakecloud.dfs.core.windows.net/linear_regression/price_predictions"
top_category_predictions_path = "abfss://projetcloud@datalakecloud.dfs.core.windows.net/linear_regression/price_predictions_top_category"

# Write each DataFrame as Parquet, replacing whatever a previous run left there
for frame, destination in (
    (price_predictions, predictions_path),
    (price_predictions_top_category, top_category_predictions_path),
):
    frame.write.mode("overwrite").parquet(destination)