In [1]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already
# exists, running in local mode (`local[*]`).
spark = (
    SparkSession.builder
    .appName("LeituraCSV")
    .master("local[*]")
    .getOrCreate()
)

# Confirm the session is available and report the Spark version in use.
print("Spark iniciado com sucesso! Versão:", spark.version)

Spark iniciado com sucesso! Versão: 4.0.1


In [2]:
# Folder holding the raw Olist CSV files. Kept in one place so the notebook
# can be pointed at another machine/location by editing a single line.
OLIST_DIR = r"C:\Users\Visitante\Desktop\Bootcamp\Data-Engineering-and-Big-Data-Project\Dataset Principal\olist"


def read_olist_csv(table_name):
    """Load one Olist CSV file into a Spark DataFrame.

    Args:
        table_name: Middle part of the file name, e.g. ``"customers"`` for
            ``olist_customers_dataset.csv``.

    Returns:
        The DataFrame read from ``OLIST_DIR``, using the first row as the
        header and letting Spark infer the column types.
    """
    csv_path = rf"{OLIST_DIR}\olist_{table_name}_dataset.csv"
    return spark.read.csv(csv_path, header=True, inferSchema=True)


customers = read_olist_csv("customers")
geolocation = read_olist_csv("geolocation")
order_items = read_olist_csv("order_items")
order_payments = read_olist_csv("order_payments")
orders = read_olist_csv("orders")
products = read_olist_csv("products")


In [3]:
from pyspark.sql.functions import col

# Keep a single geolocation row per zip-code prefix before joining.
# The join key is only the prefix, so every geolocation row sharing a prefix
# would otherwise duplicate each matching order row (the public Olist
# geolocation file has many rows per prefix). No geolocation column is
# selected below, so which row survives the de-duplication does not matter.
geolocation_by_prefix = geolocation.dropDuplicates(["geolocation_zip_code_prefix"])

# NOTE(review): order_items and order_payments are both joined on order_id,
# so an order with several items AND several payments yields one row per
# (item, payment) pair. Confirm this granularity is intended before
# aggregating price or payment_value.
df_base = (
    orders.alias("O")
    # Join with customers
    .join(
        customers.alias("C"),
        on=col("O.customer_id") == col("C.customer_id"),
        how="left"
    )
    # Join with order_items
    .join(
        order_items.alias("I"),
        on=col("O.order_id") == col("I.order_id"),
        how="left"
    )
    # Join with order_payments
    .join(
        order_payments.alias("P"),
        on=col("O.order_id") == col("P.order_id"),
        how="left"
    )
    # Join with products
    .join(
        products.alias("PR"),
        on=col("I.product_id") == col("PR.product_id"),
        how="left"
    )
    # Join with geolocation (linked through the customer's zip-code prefix),
    # at most one geolocation row per prefix.
    .join(
        geolocation_by_prefix.alias("G"),
        on=col("C.customer_zip_code_prefix") == col("G.geolocation_zip_code_prefix"),
        how="left"
    )
    .select(
        # ---- Orders ----
        col("O.order_id"),
        col("O.customer_id"),
        col("O.order_status"),
        col("O.order_purchase_timestamp"),
        col("O.order_approved_at"),
        col("O.order_delivered_carrier_date"),
        col("O.order_delivered_customer_date"),
        col("O.order_estimated_delivery_date"),

        # ---- Customers ----
        col("C.customer_unique_id"),
        col("C.customer_city"),
        col("C.customer_state"),

        # ---- Order items ----
        col("I.order_item_id"),
        col("I.product_id"),
        col("I.price"),
        col("I.freight_value"),

        # ---- Payments ----
        col("P.payment_type"),
        col("P.payment_installments"),
        col("P.payment_value"),

        # ---- Products ----
        col("PR.product_category_name"),
        col("PR.product_weight_g"),
        col("PR.product_length_cm"),
        col("PR.product_height_cm"),
        col("PR.product_width_cm"),
    )
)



In [5]:
# Preview the first 20 joined rows without truncating long values.
df_base.show(n=20, truncate=False)


+--------------------------------+--------------------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+--------------------------------+-------------+--------------+-------------+--------------------------------+-----+-------------+------------+--------------------+-------------+---------------------+----------------+-----------------+-----------------+----------------+
|order_id                        |customer_id                     |order_status|order_purchase_timestamp|order_approved_at  |order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|customer_unique_id              |customer_city|customer_state|order_item_id|product_id                      |price|freight_value|payment_type|payment_installments|payment_value|product_category_name|product_weight_g|product_length_cm|product_height_cm|product_width_cm|
+-------------------------------

In [6]:
from pyspark.sql.functions import col

# Columns kept in the analysis table, qualified by the table alias they
# come from (O = orders, C = customers, P = products, I = order items,
# PY = payments).
selected_columns = [
    col("O.order_id"),
    col("O.order_status"),
    col("O.order_purchase_timestamp"),
    col("C.customer_state"),
    col("P.product_id"),
    col("P.product_category_name"),
    col("I.price"),
    col("I.freight_value"),
    col("PY.payment_value"),
]

# NOTE(review): items and payments are both joined on order_id, so an order
# with several payments produces one row per (item, payment) pair; confirm
# this granularity before aggregating price or payment_value.
df_base = (
    customers.alias("C")
    # The right join keeps every order, even one without a matching customer.
    .join(
        orders.alias("O"),
        on=col("C.customer_id") == col("O.customer_id"),
        how="right",
    )
    .join(
        order_items.alias("I"),
        on=col("O.order_id") == col("I.order_id"),
        how="left",
    )
    .join(
        products.alias("P"),
        on=col("I.product_id") == col("P.product_id"),
        how="left",
    )
    .join(
        order_payments.alias("PY"),
        on=col("O.order_id") == col("PY.order_id"),
        how="left",
    )
    .select(*selected_columns)
)

df_base.show(n=20, truncate=False)


+--------------------------------+------------+------------------------+--------------+--------------------------------+----------------------+------+-------------+-------------+
|order_id                        |order_status|order_purchase_timestamp|customer_state|product_id                      |product_category_name |price |freight_value|payment_value|
+--------------------------------+------------+------------------------+--------------+--------------------------------+----------------------+------+-------------+-------------+
|e481f51cbdc54678b7cc49136f2d6af7|delivered   |2017-10-02 10:56:33     |SP            |87285b34884572647811a353c7ac498a|utilidades_domesticas |29.99 |8.72         |18.59        |
|e481f51cbdc54678b7cc49136f2d6af7|delivered   |2017-10-02 10:56:33     |SP            |87285b34884572647811a353c7ac498a|utilidades_domesticas |29.99 |8.72         |2.0          |
|e481f51cbdc54678b7cc49136f2d6af7|delivered   |2017-10-02 10:56:33     |SP            |87285b348845726478

In [7]:
# Data cleaning starts here.
# First step: count how many NULL values each column holds.

from pyspark.sql.functions import col, when, count

# One aggregation per column: `when` yields a value only for NULL cells and
# `count` ignores the remaining (NULL) results, giving the NULL total.
null_counts = [
    count(when(col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in df_base.columns
]

df_base.select(null_counts).show()


+--------+------------+------------------------+--------------+----------+---------------------+-----+-------------+-------------+
|order_id|order_status|order_purchase_timestamp|customer_state|product_id|product_category_name|price|freight_value|payment_value|
+--------+------------+------------------------+--------------+----------+---------------------+-----+-------------+-------------+
|       0|           0|                       0|             0|       830|                 2528|  830|          830|            3|
+--------+------------+------------------------+--------------+----------+---------------------+-----+-------------+-------------+



In [8]:
from pyspark.sql.functions import lit

# Some columns make no sense as NULL, so rows missing any of them are
# removed to keep them from distorting the analysis.
required_columns = ["product_id", "price", "payment_value"]

# For the remaining columns a NULL is replaced by a default value instead.
default_values = {
    "product_category_name": "desconhecido",
    "freight_value": 0,
}

df_base = (
    df_base
    .na.drop(subset=required_columns)
    .na.fill(default_values)
)


In [9]:
# Count the NULL values per column again on the current df_base.
remaining_nulls = [
    count(when(col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in df_base.columns
]
df_base.select(remaining_nulls).show()


+--------+------------+------------------------+--------------+----------+---------------------+-----+-------------+-------------+
|order_id|order_status|order_purchase_timestamp|customer_state|product_id|product_category_name|price|freight_value|payment_value|
+--------+------------+------------------------+--------------+----------+---------------------+-----+-------------+-------------+
|       0|           0|                       0|             0|         0|                    0|    0|            0|            0|
+--------+------------+------------------------+--------------+----------+---------------------+-----+-------------+-------------+



In [22]:
# Add extra columns that split the purchase date into year, month and day,
# which makes it easier to join the orders with the API data by period.

from pyspark.sql.functions import year, month, dayofmonth

purchase_ts = "order_purchase_timestamp"

# Derive the three date-part columns in one step.
df_base = df_base.withColumns({
    "order_year": year(purchase_ts),
    "order_month": month(purchase_ts),
    "order_day": dayofmonth(purchase_ts),
})

df_base.select(purchase_ts, "order_year", "order_month", "order_day").show(10)


+------------------------+----------+-----------+---------+
|order_purchase_timestamp|order_year|order_month|order_day|
+------------------------+----------+-----------+---------+
|     2017-10-02 10:56:33|      2017|         10|        2|
|     2017-10-02 10:56:33|      2017|         10|        2|
|     2017-10-02 10:56:33|      2017|         10|        2|
|     2018-07-24 20:41:37|      2018|          7|       24|
|     2018-08-08 08:38:49|      2018|          8|        8|
|     2017-11-18 19:28:06|      2017|         11|       18|
|     2018-02-13 21:18:39|      2018|          2|       13|
|     2017-07-09 21:57:05|      2017|          7|        9|
|     2017-04-11 12:22:08|      2017|          4|       11|
|     2017-05-16 13:10:30|      2017|          5|       16|
+------------------------+----------+-----------+---------+
only showing top 10 rows


In [24]:
from pyspark.sql.functions import lower, trim, col, upper

# Standardise the text columns: strip surrounding whitespace and fix the case
# (state codes are conventionally written in upper case).
text_cleanup = {
    "order_status": lower(trim(col("order_status"))),
    "customer_state": upper(trim(col("customer_state"))),
    "product_category_name": lower(trim(col("product_category_name"))),
}

# Make the numeric column types explicit.
type_casts = {
    name: col(name).cast("double")
    for name in ("price", "freight_value", "payment_value")
}
type_casts.update({
    name: col(name).cast("int")
    for name in ("order_year", "order_month")
})

# Every expression only reads the column it replaces, so applying them all
# at once gives the same result as chaining one withColumn per column.
df_base = df_base.withColumns({**text_cleanup, **type_casts})

df_base.printSchema()
df_base.show(n=15, truncate=False)


root
 |-- order_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- customer_state: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_category_name: string (nullable = false)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = false)
 |-- payment_value: double (nullable = true)
 |-- order_year: integer (nullable = true)
 |-- order_month: integer (nullable = true)

+--------------------------------+------------+------------------------+--------------+--------------------------------+---------------------+------+-------------+-------------+----------+-----------+
|order_id                        |order_status|order_purchase_timestamp|customer_state|product_id                      |product_category_name|price |freight_value|payment_value|order_year|order_month|
+--------------------------------+------------+------------------------+--------------+-----

In [25]:
# Summary statistics (count, mean, stddev, min, max) for the value columns.
value_columns = ["price", "freight_value", "payment_value"]
df_base.describe(value_columns).show()

# Row counts per order status and per customer state.
for grouping_column in ("order_status", "customer_state"):
    df_base.groupBy(grouping_column).count().show()


+-------+------------------+------------------+-----------------+
|summary|             price|     freight_value|    payment_value|
+-------+------------------+------------------+-----------------+
|  count|            117601|            117601|           117601|
|   mean| 120.8247832926656| 20.04599034021739|172.6867518983651|
| stddev|184.47932270072533|15.861314907587618|267.5922896141393|
|    min|              0.85|               0.0|              0.0|
|    max|            6735.0|            409.68|         13664.08|
+-------+------------------+------------------+-----------------+

+------------+------+
|order_status| count|
+------------+------+
|     shipped|  1244|
|    canceled|   566|
|    invoiced|   371|
|   delivered|115035|
|  processing|   375|
|    approved|     3|
| unavailable|     7|
+------------+------+

+--------------+-----+
|customer_state|count|
+--------------+-----+
|            SC| 4302|
|            RO|  286|
|            PI|  573|
|            AM|  171|
|

In [27]:
# Destination folder for the cleaned dataset.
output_path = r"C:\Users\Visitante\Desktop\Bootcamp\Data-Engineering-and-Big-Data-Project\Dataset tratado\dataset_pronto"

# Persist the cleaned table as Parquet, replacing any previous export.
(
    df_base.write
    .format("parquet")
    .mode("overwrite")
    .save(output_path)
)



In [29]:
# CSV export of the cleaned table.
# It must go to its own folder: the Parquet export already lives in
# "...\Dataset tratado\dataset_pronto", and writing here with
# mode="overwrite" to that same folder would erase the Parquet files.
csv_output_path = r"C:\Users\Visitante\Desktop\Bootcamp\Data-Engineering-and-Big-Data-Project\Dataset tratado\dataset_pronto_csv"

# coalesce(1) collapses the data into one partition so the export is written
# as a single CSV part file (with a header row).
df_base.coalesce(1).write.csv(
    csv_output_path,
    header=True,
    mode="overwrite"
)
