In [30]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("LeituraCSV") \
    .master("local[*]") \
    .getOrCreate()

print("Spark iniciado com sucesso! Versão:", spark.version)

Spark iniciado com sucesso! Versão: 4.0.1


In [31]:
customers = spark.read.csv(r"C:\Users\Visitante\Desktop\Bootcamp\Data-Engineering-and-Big-Data-Project\Dataset Principal\olist\olist_customers_dataset.csv", header=True, inferSchema=True)
geolocation = spark.read.csv(r"C:\Users\Visitante\Desktop\Bootcamp\Data-Engineering-and-Big-Data-Project\Dataset Principal\olist\olist_geolocation_dataset.csv", header=True, inferSchema=True)
order_items = spark.read.csv(r"C:\Users\Visitante\Desktop\Bootcamp\Data-Engineering-and-Big-Data-Project\Dataset Principal\olist\olist_order_items_dataset.csv", header=True, inferSchema=True)
order_payments = spark.read.csv(r"C:\Users\Visitante\Desktop\Bootcamp\Data-Engineering-and-Big-Data-Project\Dataset Principal\olist\olist_order_payments_dataset.csv", header=True, inferSchema=True)
orders = spark.read.csv(r"C:\Users\Visitante\Desktop\Bootcamp\Data-Engineering-and-Big-Data-Project\Dataset Principal\olist\olist_orders_dataset.csv", header=True, inferSchema=True)
products = spark.read.csv(r"C:\Users\Visitante\Desktop\Bootcamp\Data-Engineering-and-Big-Data-Project\Dataset Principal\olist\olist_products_dataset.csv", header=True, inferSchema=True)


In [42]:
from pyspark.sql.functions import col

df_base = (
    orders.alias("O")
    # Join com customers
    .join(
        customers.alias("C"),
        on=col("O.customer_id") == col("C.customer_id"),
        how="left"
    )
    # Join com order_items
    .join(
        order_items.alias("I"),
        on=col("O.order_id") == col("I.order_id"),
        how="left"
    )
    # Join com order_payments
    .join(
        order_payments.alias("P"),
        on=col("O.order_id") == col("P.order_id"),
        how="left"
    )
    # Join com products
    .join(
        products.alias("PR"),
        on=col("I.product_id") == col("PR.product_id"),
        how="left"
    )
    # Join com geolocation (ligação por prefixo do CEP do cliente)
    .join(
        geolocation.alias("G"),
        on=col("C.customer_zip_code_prefix") == col("G.geolocation_zip_code_prefix"),
        how="left"
    )
    .select(
        # ---- Pedidos (orders) ----
        col("O.order_id"),
        col("O.customer_id"),
        col("O.order_status"),
        col("O.order_purchase_timestamp"),
        col("O.order_approved_at"),

        # Clientes 
        col("C.customer_unique_id"),
        col("C.customer_city"),
        col("C.customer_state"),

        # Itens do pedido 
        col("I.order_item_id"),
        col("I.product_id"),
        col("I.price"),
        col("I.freight_value"),

        # Pagamentos
        col("P.payment_type"),
        col("P.payment_installments"),
        col("P.payment_value"),

        # Produtos
        col("PR.product_category_name"),
        col("PR.product_weight_g"),
        col("PR.product_length_cm"),
        col("PR.product_height_cm"),
        col("PR.product_width_cm"),

        #Geolocation
        col("G.geolocation_lat"),
        col("G.geolocation_lng"),
        col("G.geolocation_city"),
        col("G.geolocation_state")

    )
)



In [43]:
df_base.show(20, truncate=False)


+--------------------------------+--------------------------------+------------+------------------------+-------------------+--------------------------------+-------------+--------------+-------------+--------------------------------+-----+-------------+------------+--------------------+-------------+---------------------+----------------+-----------------+-----------------+----------------+-------------------+-------------------+----------------+-----------------+
|order_id                        |customer_id                     |order_status|order_purchase_timestamp|order_approved_at  |customer_unique_id              |customer_city|customer_state|order_item_id|product_id                      |price|freight_value|payment_type|payment_installments|payment_value|product_category_name|product_weight_g|product_length_cm|product_height_cm|product_width_cm|geolocation_lat    |geolocation_lng    |geolocation_city|geolocation_state|
+--------------------------------+--------------------------

In [38]:
from pyspark.sql.functions import col

df_base = (
    customers.alias("C")
    .join(orders.alias("O"), col("C.customer_id") == col("O.customer_id"), "right")
    .join(order_items.alias("I"), col("O.order_id") == col("I.order_id"), "left")
    .join(products.alias("P"), col("I.product_id") == col("P.product_id"), "left")
    .join(order_payments.alias("PY"), col("O.order_id") == col("PY.order_id"), "left")
    .select(
        col("O.order_id"),
        col("O.order_status"),
        col("O.order_purchase_timestamp"),
        col("C.customer_city"),
        col("C.customer_state"),
        col("P.product_id"),
        col("P.product_category_name"),
        col("I.price"),
        col("I.freight_value"),
        col("PY.payment_value")
        
    )
)

df_base.show(20, truncate=False)


+--------------------------------+------------+------------------------+-----------------------+--------------+--------------------------------+----------------------+------+-------------+-------------+
|order_id                        |order_status|order_purchase_timestamp|customer_city          |customer_state|product_id                      |product_category_name |price |freight_value|payment_value|
+--------------------------------+------------+------------------------+-----------------------+--------------+--------------------------------+----------------------+------+-------------+-------------+
|e481f51cbdc54678b7cc49136f2d6af7|delivered   |2017-10-02 10:56:33     |sao paulo              |SP            |87285b34884572647811a353c7ac498a|utilidades_domesticas |29.99 |8.72         |18.59        |
|e481f51cbdc54678b7cc49136f2d6af7|delivered   |2017-10-02 10:56:33     |sao paulo              |SP            |87285b34884572647811a353c7ac498a|utilidades_domesticas |29.99 |8.72         |

In [65]:
from pyspark.sql.functions import col, avg

# Agrega geolocalização por cidade e estado
geo_base = (
    geolocation
    .groupBy("geolocation_city", "geolocation_state")
    .agg(
        avg("geolocation_lat").alias("avg_lat"),
        avg("geolocation_lng").alias("avg_lng")
    )
)

# Faz os joins
df_base = (
    customers.alias("C")
    .join(orders.alias("O"), col("C.customer_id") == col("O.customer_id"), "right")
    .join(order_items.alias("I"), col("O.order_id") == col("I.order_id"), "left")
    .join(products.alias("P"), col("I.product_id") == col("P.product_id"), "left")
    .join(order_payments.alias("PY"), col("O.order_id") == col("PY.order_id"), "left")
    .join(
        geo_base.alias("G"),
        (col("C.customer_city") == col("G.geolocation_city")) &
        (col("C.customer_state") == col("G.geolocation_state")),
        "left"
    )
    .select(
        col("O.order_id"),
        col("O.order_status"),
        col("O.order_purchase_timestamp"),
        col("C.customer_city"),
        col("C.customer_state"),
        col("G.avg_lat").alias("latitude_media"),
        col("G.avg_lng").alias("longitude_media"),
        col("P.product_id"),
        col("P.product_category_name"),
        col("I.price"),
        col("I.freight_value"),
        col("PY.payment_value")
    )
)

df_base.show(20, truncate=False)


+--------------------------------+------------+------------------------+-----------------------+--------------+-------------------+-------------------+--------------------------------+----------------------+------+-------------+-------------+
|order_id                        |order_status|order_purchase_timestamp|customer_city          |customer_state|latitude_media     |longitude_media    |product_id                      |product_category_name |price |freight_value|payment_value|
+--------------------------------+------------+------------------------+-----------------------+--------------+-------------------+-------------------+--------------------------------+----------------------+------+-------------+-------------+
|e481f51cbdc54678b7cc49136f2d6af7|delivered   |2017-10-02 10:56:33     |sao paulo              |SP            |-23.570860168124305|-46.63324168698368 |87285b34884572647811a353c7ac498a|utilidades_domesticas |29.99 |8.72         |18.59        |
|e481f51cbdc54678b7cc49136f2

In [66]:
df_base = df_base.dropDuplicates(["order_id", "product_id"])

df_base.show(20, truncate=False)

+--------------------------------+------------+------------------------+--------------+--------------+-------------------+-------------------+--------------------------------+---------------------------------+------+-------------+-------------+
|order_id                        |order_status|order_purchase_timestamp|customer_city |customer_state|latitude_media     |longitude_media    |product_id                      |product_category_name            |price |freight_value|payment_value|
+--------------------------------+------------+------------------------+--------------+--------------+-------------------+-------------------+--------------------------------+---------------------------------+------+-------------+-------------+
|00048cc3ae777c65dbb7d2a0634bc1ea|delivered   |2017-05-15 21:42:34     |uberaba       |MG            |-19.752823866371905|-47.935837449992846|ef92defde845ab8450f9d70c526ef70f|utilidades_domesticas            |21.9  |12.69        |34.59        |
|000576fe39319847cbb

In [67]:
#Aqui começa a limpeza dos dados, pessoal
#Vendo quantas linhas aparece "NULO"

from pyspark.sql.functions import col, when, count

df_base.select([count(when(col(c).isNull(), c)).alias(c) for c in df_base.columns]).show()


+--------+------------+------------------------+-------------+--------------+--------------+---------------+----------+---------------------+-----+-------------+-------------+
|order_id|order_status|order_purchase_timestamp|customer_city|customer_state|latitude_media|longitude_media|product_id|product_category_name|price|freight_value|payment_value|
+--------+------------+------------------------+-------------+--------------+--------------+---------------+----------+---------------------+-----+-------------+-------------+
|       0|           0|                       0|            0|             0|            76|             76|       775|                 2235|  775|          775|            1|
+--------+------------+------------------------+-------------+--------------+--------------+---------------+----------+---------------------+-----+-------------+-------------+



In [68]:
#Como algumas linhas nao fazem sentido estar nulo, precisa tirar essas linhas da tabela pra nao interferir

from pyspark.sql.functions import col

# Limpeza geral da tabela
df_base = (
    df_base
    # Remove linhas essenciais que não podem ser nulas
    .dropna(subset=["product_id", "price", "payment_value", "latitude_media", "longitude_media"])
    
    # Substitui valores ausentes em colunas opcionais
    .fillna({
        "product_category_name": "desconhecido",
        "freight_value": 0
    })
)



In [69]:
df_base.select([count(when(col(c).isNull(), c)).alias(c) for c in df_base.columns]).show()


+--------+------------+------------------------+-------------+--------------+--------------+---------------+----------+---------------------+-----+-------------+-------------+
|order_id|order_status|order_purchase_timestamp|customer_city|customer_state|latitude_media|longitude_media|product_id|product_category_name|price|freight_value|payment_value|
+--------+------------+------------------------+-------------+--------------+--------------+---------------+----------+---------------------+-----+-------------+-------------+
|       0|           0|                       0|            0|             0|             0|              0|         0|                    0|    0|            0|            0|
+--------+------------+------------------------+-------------+--------------+--------------+---------------+----------+---------------------+-----+-------------+-------------+



In [70]:
from pyspark.sql.functions import to_date

# Converte o timestamp para apenas data
df_base = df_base.withColumn("order_date", to_date("order_purchase_timestamp"))

# Mostra resultado
df_base.select("order_purchase_timestamp", "order_date").show(10, truncate=False)


+------------------------+----------+
|order_purchase_timestamp|order_date|
+------------------------+----------+
|2017-05-14 20:28:25     |2017-05-14|
|2017-06-13 21:11:26     |2017-06-13|
|2017-11-28 21:00:44     |2017-11-28|
|2018-06-04 08:33:09     |2018-06-04|
|2017-07-12 18:45:24     |2017-07-12|
|2017-12-07 15:40:14     |2017-12-07|
|2017-03-09 16:18:47     |2017-03-09|
|2017-11-26 21:21:07     |2017-11-26|
|2017-10-24 11:34:33     |2017-10-24|
|2017-10-07 00:16:57     |2017-10-07|
+------------------------+----------+
only showing top 10 rows


In [71]:
from pyspark.sql.functions import lower, trim, col, upper

df_base = (
    df_base
    .withColumn("order_status", lower(trim(col("order_status"))))
    .withColumn("customer_state", upper(trim(col("customer_state"))))
    .withColumn("product_category_name", lower(trim(col("product_category_name"))))
    .withColumn("price", col("price").cast("double"))
    .withColumn("freight_value", col("freight_value").cast("double"))
    .withColumn("payment_value", col("payment_value").cast("double"))

)

df_base.printSchema()
df_base.show(15, truncate=False)


root
 |-- order_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)
 |-- latitude_media: double (nullable = true)
 |-- longitude_media: double (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_category_name: string (nullable = false)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = false)
 |-- payment_value: double (nullable = true)
 |-- order_date: date (nullable = true)

+--------------------------------+------------+------------------------+--------------+--------------+-------------------+-------------------+--------------------------------+---------------------------------+-----+-------------+-------------+----------+
|order_id                        |order_status|order_purchase_timestamp|customer_city |customer_state|latitude_media     |longitude_media    |produc

In [72]:
df_base.describe(["price", "freight_value", "payment_value"]).show()
df_base.groupBy("order_status").count().show()
df_base.groupBy("customer_state").count().show()


+-------+------------------+------------------+------------------+
|summary|             price|     freight_value|     payment_value|
+-------+------------------+------------------+------------------+
|  count|            102349|            102349|            102349|
|   mean|124.40438001347657|20.104408543316925|161.60666982578988|
| stddev|188.91316646363524|15.870970831547204| 221.0484858186441|
|    min|              0.85|               0.0|               0.0|
|    max|            6735.0|            409.68|          13664.08|
+-------+------------------+------------------+------------------+

+------------+------+
|order_status| count|
+------------+------+
|     shipped|  1126|
|    canceled|   465|
|    approved|     2|
|    invoiced|   322|
|   delivered|100122|
| unavailable|     7|
|  processing|   305|
+------------+------+

+--------------+-----+
|customer_state|count|
+--------------+-----+
|            SC| 3737|
|            RO|  258|
|            PI|  504|
|            AM

In [73]:
output_path = r"C:\Users\Visitante\Desktop\Bootcamp\ml-data-engineering-project\notebooks\dataset_pronto"
df_base.write.mode("overwrite").parquet(output_path)



In [74]:
df_base.coalesce(1).write.csv(
    r"C:\Users\Visitante\Desktop\Bootcamp\ml-data-engineering-project\notebooks\dataset_pronto",
    header=True,
    mode="overwrite"
)


In [75]:
df_base.show(15, truncate=False)

+--------------------------------+------------+------------------------+--------------+--------------+-------------------+-------------------+--------------------------------+---------------------------------+-----+-------------+-------------+----------+
|order_id                        |order_status|order_purchase_timestamp|customer_city |customer_state|latitude_media     |longitude_media    |product_id                      |product_category_name            |price|freight_value|payment_value|order_date|
+--------------------------------+------------+------------------------+--------------+--------------+-------------------+-------------------+--------------------------------+---------------------------------+-----+-------------+-------------+----------+
|00048cc3ae777c65dbb7d2a0634bc1ea|delivered   |2017-05-15 21:42:34     |uberaba       |MG            |-19.752823866371905|-47.935837449992846|ef92defde845ab8450f9d70c526ef70f|utilidades_domesticas            |21.9 |12.69        |34.59 