In [1]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
import pyspark.sql.functions as F

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [2]:
import os
from io import StringIO

import boto3
import matplotlib.pyplot as plt
import pandas as pd

# Initialisation de Spark

In [3]:
# Create (or reuse) the Spark session: local master (`local[*]`), with the
# hadoop-cloud package and the S3A filesystem options listed below.
spark = (
    SparkSession.builder
    .appName("Projet de Fin d'Etude")
    .master("local[*]")
    .config("spark.jars.packages", "org.apache.spark:spark-hadoop-cloud_2.12:3.3.0")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.path.style.access", True)
    .config("spark.hadoop.fs.s3a.fast.upload", True)
    .config("spark.hadoop.fs.s3a.fast.upload.buffer", "bytebuffer")
    .config("spark.driver.host", "localhost")
    .config("spark.driver.port", "9000")
    .getOrCreate()
)

In [4]:
# Handle on the session's SparkContext; used further down to build RDDs
# (e.g. `sc.emptyRDD()` for the empty predictions frame).
sc = spark.sparkContext

# Chargement des données

In [5]:
# RDS connection settings.
jdbc_url = "jdbc:mysql://database-grp6.chcmvnn5gnpv.us-east-1.rds.amazonaws.com:3306/cleaned_data?serverTimezone=UTC"


def load_db_properties(env=os.environ):
    """Build the JDBC connection properties passed to `spark.read.jdbc`.

    Credentials are looked up in `env` (the process environment by default)
    so that they do not have to be written in the notebook:

    * ``RDS_USER``     -- database user, defaults to ``"admin"``
    * ``RDS_PASSWORD`` -- database password, defaults to the ``"xxx"`` placeholder

    Args:
        env: mapping to read the two variables from.

    Returns:
        dict with the keys ``user``, ``password`` and ``driver``.
    """
    return {
        "user": env.get("RDS_USER", "admin"),
        "password": env.get("RDS_PASSWORD", "xxx"),
        "driver": "com.mysql.jdbc.Driver",
    }


db_properties = load_db_properties()

In [6]:
# Names of the RDS tables to read; the order here is the order in which
# they are inserted in the `dataframes` dictionary.
table_names = [
    "orders", "order_items", "order_payments", "order_reviews",
    "products", "products_translated",
    "geolocation", "states_name",
    "customers", "sellers",
]

In [7]:
# Read every listed table over JDBC and index the resulting Spark DataFrames
# by table name.
dataframes = {
    name: spark.read.jdbc(url=jdbc_url, table=name, properties=db_properties)
    for name in table_names
}
dataframes

{'orders': DataFrame[order_id: string, customer_id: string, order_status: string, order_purchase_timestamp: timestamp, order_approved_at: timestamp, order_delivered_carrier_date: timestamp, order_delivered_customer_date: timestamp, order_estimated_delivery_date: timestamp],
 'order_items': DataFrame[order_id: string, order_item_id: int, product_id: string, seller_id: string, shipping_limit_date: timestamp, price: double, freight_value: double, total_items_value: double, total_freight_value: double, total_order_value: double],
 'order_payments': DataFrame[order_id: string, payment_sequential: int, payment_type: string, payment_installments: int, payment_value: double],
 'order_reviews': DataFrame[review_id: string, order_id: string, review_score: int, review_comment_title: string, review_comment_message: string, review_creation_date: timestamp, review_answer_timestamp: timestamp],
 'products': DataFrame[product_id: string, product_category_name: string, product_name_length: int, product

# Analyse des données

In [8]:
# Build the global DataFrame: one row per order item, left-joined with its
# product, order, customer and customer-state name, plus the purchase
# timestamp broken down into year / month / day-of-month columns.
sales = (
    dataframes["order_items"].alias("i")
    .join(
        dataframes["products"].alias("p"),
        on=F.col("i.product_id") == F.col("p.product_id"),
        how="left",
    )
    .join(
        dataframes["orders"].alias("o"),
        on=F.col("i.order_id") == F.col("o.order_id"),
        how="left",
    )
    .join(
        dataframes["customers"].alias("c"),
        on=F.col("o.customer_id") == F.col("c.customer_id"),
        how="left",
    )
    .join(
        dataframes["states_name"].alias("s"),
        on=F.col("c.customer_state") == F.col("s.geolocation_state"),
        how="left",
    )
    .withColumn("order_purchase_year", F.year(F.col("o.order_purchase_timestamp")))
    .withColumn("order_purchase_month", F.month(F.col("o.order_purchase_timestamp")))
    .withColumn("order_purchase_day", F.dayofmonth(F.col("o.order_purchase_timestamp")))
)

# Keep only the columns used downstream; the table alias prefix picks the
# side of the join each column is taken from.
sales = sales.select(
    # identifiers
    "i.order_id", "i.product_id", "o.customer_id", "i.seller_id", "i.order_item_id",
    "p.product_category_name",
    # amounts
    "i.price", "i.freight_value",
    "i.total_items_value", "i.total_freight_value", "i.total_order_value",
    # product attributes
    "p.product_description_length", "p.product_photos_qty",
    # order status and purchase date
    "o.order_status", "o.order_purchase_timestamp",
    "order_purchase_year", "order_purchase_month", "order_purchase_day",
    # customer location
    "c.customer_city", "s.state_name",
)

sales.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- product_category_name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)
 |-- total_items_value: double (nullable = true)
 |-- total_freight_value: double (nullable = true)
 |-- total_order_value: double (nullable = true)
 |-- product_description_length: integer (nullable = true)
 |-- product_photos_qty: integer (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_purchase_year: integer (nullable = true)
 |-- order_purchase_month: integer (nullable = true)
 |-- order_purchase_day: integer (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- state_name: string (nullable = true)



# Régression Linéaire

In [9]:
# Linear regression: price forecasting.
# Schema of the (initially empty) DataFrame that the per-category results
# are unioned into further down.
# NOTE(review): the rows unioned in later show fractional `sales` values
# (see the output of the prediction loop), so Spark appears to widen this
# column on union -- confirm that IntegerType is what is intended here.
schema = (
    StructType()
    .add("category", StringType(), False)
    .add("sales", IntegerType(), False)
    .add("year", IntegerType(), False)
    .add("month", IntegerType(), False)
    .add("date", TimestampType(), True)
)

price_predictions = spark.createDataFrame(sc.emptyRDD(), schema)

price_predictions.show()

+--------+-----+----+-----+----+
|category|sales|year|month|date|
+--------+-----+----+-----+----+
+--------+-----+----+-----+----+



In [10]:
# Every distinct product category, sorted by name and collected to the
# driver as a list of Rows with a single `category` field.
categories = (
    sales
    .select(F.col("product_category_name").alias("category"))
    .distinct()
    .orderBy(F.col("product_category_name").asc())
    .collect()
)

# Columns needed by the linear regression, renamed to short names.
sales_reg = sales.select(
    [
        F.col(source).alias(target)
        for source, target in (
            ("order_purchase_year", "year"),
            ("order_purchase_month", "month"),
            ("product_category_name", "category"),
            ("total_order_value", "sales"),
        )
    ]
)

In [11]:
# Linear regression over monthly sales, extended to future months.
def regression_lineaire(df_train, future_date, category):
    """Fit a linear regression on monthly sales and predict future months.

    The training rows are summed per (year, month); a `LinearRegression`
    with features `[year, month]` and label `sales` is fitted on those
    totals and then applied to the requested future months.

    Everything stays in Spark DataFrames: `year` and `month` are carried
    next to the assembled feature vector, so nothing is collected to the
    driver to unpack the vector.

    Args:
        df_train: Spark DataFrame with (at least) the columns `year`,
            `month` and `sales`.
        future_date: list of `[year, month]` pairs to predict.
        category: value written in the `category` column of the result.

    Returns:
        Spark DataFrame with the columns `category`, `sales`, `year`,
        `month`, `date`: the observed monthly totals unioned with the
        predicted months, `date` being the first day of each month.
    """
    # Monthly totals used both as training set and as the "observed" rows.
    monthly_sales = df_train.groupBy("year", "month") \
        .agg(F.sum("sales").alias("sales"))

    # Assemble the features
    assembler = VectorAssembler(
        inputCols=["year", "month"],
        outputCol="features"
    )

    # Fit the model on the assembled training data
    lr = LinearRegression(featuresCol="features", labelCol="sales")
    lr_model = lr.fit(assembler.transform(monthly_sales).select("features", "sales"))

    # Predict the future months; year/month stay available as plain columns
    future_months = spark.createDataFrame(future_date, ["year", "month"])
    predicted = lr_model.transform(assembler.transform(future_months)) \
        .select("year", "month", F.col("prediction").alias("sales"))

    # Same explicit types on both sides of the union (long year/month and
    # double sales) so the union does not rely on implicit widening.
    output_columns = [
        F.col("year").cast("long").alias("year"),
        F.col("month").cast("long").alias("month"),
        F.col("sales").cast("double").alias("sales"),
    ]
    combined = monthly_sales.select(*output_columns) \
        .union(predicted.select(*output_columns))

    # Add the category and a date set to the first day of each month
    return combined \
        .withColumn("category", F.lit(category)) \
        .withColumn("date", F.to_date(F.concat(F.col("year"), F.lit("-"), F.col("month"), F.lit("-01")))) \
        .select("category", "sales", "year", "month", "date")

In [12]:
# [year, month] pairs to predict: September-December 2018, then every month
# of 2019, 2020 and 2021.
future_date = [[2018, month] for month in range(9, 13)] + \
    [[year, month] for year in range(2019, 2022) for month in range(1, 13)]


# Start from a fresh empty frame so that re-running this cell does not append
# a second copy of every category to an already-filled `price_predictions`.
# Keeping the empty frame first in the unions keeps its column names.
price_predictions = spark.createDataFrame(sc.emptyRDD(), schema)

# Price predictions for each category, appended one category at a time
for c in categories:
    category = c.category

    df_train = sales_reg.filter(F.col("category") == category)
    df = regression_lineaire(df_train, future_date, category)
    price_predictions = price_predictions.union(df)


price_predictions.show()

+--------------------+------------------+----+-----+-------------------+
|            category|             sales|year|month|               date|
+--------------------+------------------+----+-----+-------------------+
|agro_industry_and...|            110.69|2017|    3|2017-03-01 00:00:00|
|agro_industry_and...|             224.6|2017|    8|2017-08-01 00:00:00|
|agro_industry_and...|3558.4299999999994|2017|   10|2017-10-01 00:00:00|
|agro_industry_and...|           6296.51|2018|    1|2018-01-01 00:00:00|
|agro_industry_and...| 6567.089999999999|2018|    3|2018-03-01 00:00:00|
|agro_industry_and...| 5543.740000000001|2018|    8|2018-08-01 00:00:00|
|agro_industry_and...|           1199.47|2017|    7|2017-07-01 00:00:00|
|agro_industry_and...|           3433.22|2018|    5|2018-05-01 00:00:00|
|agro_industry_and...| 5664.040000000001|2017|   12|2017-12-01 00:00:00|
|agro_industry_and...|           2119.69|2017|    9|2017-09-01 00:00:00|
|agro_industry_and...| 8735.280000000002|2018|    7

In [13]:
n = 10

# Latest date present in the predictions
max_date_row = price_predictions.agg(F.max("date")).first()
max_date = max_date_row[0]

# Names of the n categories with the highest sales at that date, as a plain list
best_categories_predicted = [
    row.category
    for row in (
        price_predictions
        .select("category")
        .where(F.col("date") == max_date)
        .sort(F.desc("sales"))
        .take(n)
    )
]
best_categories_predicted

['health_beauty',
 'watches_gifts',
 'bed_bath_table',
 'sports_leisure',
 'housewares',
 'computers_accessories',
 'furniture_decor',
 'auto',
 'baby',
 'telephony']

In [14]:
# Predictions restricted to the n best categories selected above
price_predictions_top_category = price_predictions.where(
    F.col("category").isin(best_categories_predicted)
)

price_predictions_top_category.show()

+--------+------------------+----+-----+-------------------+
|category|             sales|year|month|               date|
+--------+------------------+----+-----+-------------------+
|    auto|16971.079999999994|2017|    3|2017-03-01 00:00:00|
|    auto| 24000.62000000001|2017|    8|2017-08-01 00:00:00|
|    auto|24917.620000000003|2017|   10|2017-10-01 00:00:00|
|    auto| 41256.23000000002|2018|    1|2018-01-01 00:00:00|
|    auto|52723.470000000045|2018|    3|2018-03-01 00:00:00|
|    auto|52783.059999999976|2018|    8|2018-08-01 00:00:00|
|    auto|16261.740000000002|2017|    7|2017-07-01 00:00:00|
|    auto| 47979.92000000003|2018|    5|2018-05-01 00:00:00|
|    auto|           2257.56|2016|   10|2016-10-01 00:00:00|
|    auto|          44979.33|2017|   12|2017-12-01 00:00:00|
|    auto|16940.350000000006|2017|    9|2017-09-01 00:00:00|
|    auto|          17785.29|2017|    4|2017-04-01 00:00:00|
|    auto| 50830.56999999997|2018|    7|2018-07-01 00:00:00|
|    auto| 51816.6500000

# Export des DataFrames en CSV vers S3

In [15]:
# Register both prediction DataFrames in the dictionary, next to the RDS tables
dataframes.update(
    price_predictions=price_predictions,
    price_predictions_top_category=price_predictions_top_category,
)

In [16]:
# S3 client used for the uploads
s3 = boto3.client('s3')

# Upload every DataFrame of the dictionary to S3 as a single CSV object
for table_name, df in dataframes.items():
    print(f"Chargement du Dataframe {table_name}")

    # Spark DataFrame -> pandas DataFrame -> CSV text held in memory
    pdf = df.toPandas()
    csv_buffer = StringIO()
    pdf.to_csv(csv_buffer, index=False)

    # Object key inside the bucket
    s3_path = f"output_data/{table_name}.csv"

    # Write the CSV text to the bucket
    s3.put_object(
        Bucket='grp6-pfe-data-engineering',
        Key=s3_path,
        Body=csv_buffer.getvalue(),
    )

  if not is_datetime64tz_dtype(pser.dtype):
  if is_datetime64tz_dtype(s.dtype):
  if not is_datetime64tz_dtype(pser.dtype):
  if is_datetime64tz_dtype(s.dtype):
  if not is_datetime64tz_dtype(pser.dtype):
  if is_datetime64tz_dtype(s.dtype):
  if not is_datetime64tz_dtype(pser.dtype):
  if is_datetime64tz_dtype(s.dtype):
  if not is_datetime64tz_dtype(pser.dtype):
  if is_datetime64tz_dtype(s.dtype):
  if not is_datetime64tz_dtype(pser.dtype):
  if is_datetime64tz_dtype(s.dtype):
  if not is_datetime64tz_dtype(pser.dtype):
  if is_datetime64tz_dtype(s.dtype):
  if not is_datetime64tz_dtype(pser.dtype):
  if is_datetime64tz_dtype(s.dtype):
  if not is_datetime64tz_dtype(pser.dtype):
  if is_datetime64tz_dtype(s.dtype):
  if not is_datetime64tz_dtype(pser.dtype):
  if is_datetime64tz_dtype(s.dtype):
