In [1]:
%pip uninstall delta_spark -y
%pip install unidecode delta_spark -U

Found existing installation: delta-spark 2.4.0
Uninstalling delta-spark-2.4.0:
  Successfully uninstalled delta-spark-2.4.0
Note: you may need to restart the kernel to use updated packages.
Collecting delta_spark
  Using cached delta_spark-2.4.0-py3-none-any.whl (20 kB)
Installing collected packages: delta_spark
Successfully installed delta_spark-2.4.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
from typing import List
from pathlib import Path

from unidecode import unidecode
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession, DataFrame, Row
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType, FloatType, TimestampType

In [3]:
JAR_PACKAGES = ",".join([str(x) for x in Path("../../jars").glob("*.jar")])
JAR_PACKAGES

'../../jars/antlr4-runtime-4.9.3.jar,../../jars/aws-java-sdk-bundle-1.12.392.jar,../../jars/delta-core_2.12-2.4.0.jar,../../jars/delta-storage-2.4.0.jar,../../jars/hadoop-aws-3.3.1.jar,../../jars/wildfly-openssl-1.0.7.Final.jar'

## Read The Datasets

## Overview of Each Dataset


### Dataset High Level Overview

For each Dataset get the ff. (tabular)
- Number of Rows
- Number of Columns
- Number of Null Values
- Number Columns with Null Values
- List of Columns with Null

In [4]:
SPARK_URI = "spark://spark:7077"
HIVE_URI = "thrift://hive-metastore:9083"
MINIO_URI = "http://minio:9000"

builder = SparkSession.builder.appName("olist_silver_transformer").master(SPARK_URI) \
    .config("spark.jars", JAR_PACKAGES) \
    .config("spark.sql.warehouse.dir", "s3a:///") \
    .config("spark.hadoop.hive.metastore.uris", HIVE_URI) \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.hadoop.fs.s3a.access.key", "datalake") \
    .config("spark.hadoop.fs.s3a.secret.key", "datalake") \
    .config("spark.hadoop.fs.s3a.endpoint", MINIO_URI) \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
spark = configure_spark_with_delta_pip(builder).enableHiveSupport().getOrCreate()

In [5]:
bronze_container_path = "s3a://bronze"

In [6]:
customers_df = spark.read.format("csv") \
    .option("path", f"{bronze_container_path}/olist/olist_customers_dataset.csv") \
    .option("header", True) \
    .option("inferSchema", True) \
    .load()
customers_df.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: integer (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)



In [7]:
geolocation_df = spark.read.format("csv") \
    .option("path", f"{bronze_container_path}/olist/olist_geolocation_dataset.csv") \
    .option("header", True) \
    .option("inferSchema", True) \
    .load()
geolocation_df.printSchema()

root
 |-- geolocation_zip_code_prefix: integer (nullable = true)
 |-- geolocation_lat: double (nullable = true)
 |-- geolocation_lng: double (nullable = true)
 |-- geolocation_city: string (nullable = true)
 |-- geolocation_state: string (nullable = true)



In [8]:
order_items_df = spark.read.format("csv") \
    .option("path", f"{bronze_container_path}/olist/olist_order_items_dataset.csv") \
    .option("header", True) \
    .option("inferSchema", True) \
    .load()
order_items_df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)



In [9]:
order_payments_df = spark.read.format("csv") \
    .option("path", f"{bronze_container_path}/olist/olist_order_payments_dataset.csv") \
    .option("header", True) \
    .option("inferSchema", True) \
    .load()
order_payments_df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- payment_sequential: integer (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- payment_installments: integer (nullable = true)
 |-- payment_value: double (nullable = true)



In [10]:
def get_order_reviews_schema() -> StructType:
    return StructType(
        [
            StructField("review_id", StringType()),
            StructField("order_id", StringType()),
            StructField("review_score", FloatType()),
            StructField("review_comment_title", StringType()),
            StructField("review_comment_message", StringType()),
            StructField("review_creation_date", TimestampType()),
            StructField("review_answer_timestamp", TimestampType())
        ]
    )

In [11]:
order_reviews_df = spark.read.format("csv") \
    .option("path", f"{bronze_container_path}/olist/olist_order_reviews_dataset.csv") \
    .option("header", True) \
    .schema(get_order_reviews_schema()) \
    .load()
order_reviews_df.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- review_score: float (nullable = true)
 |-- review_comment_title: string (nullable = true)
 |-- review_comment_message: string (nullable = true)
 |-- review_creation_date: timestamp (nullable = true)
 |-- review_answer_timestamp: timestamp (nullable = true)



In [12]:
orders_df = spark.read.format("csv") \
    .option("path", f"{bronze_container_path}/olist/olist_orders_dataset.csv") \
    .option("header", True) \
    .option("inferSchema", True) \
    .load()
orders_df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)



In [13]:
products_df = spark.read.format("csv") \
    .option("path", f"{bronze_container_path}/olist/olist_products_dataset.csv") \
    .option("header", True) \
    .option("inferSchema", True) \
    .load()
products_df.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- product_category_name: string (nullable = true)
 |-- product_name_lenght: integer (nullable = true)
 |-- product_description_lenght: integer (nullable = true)
 |-- product_photos_qty: integer (nullable = true)
 |-- product_weight_g: integer (nullable = true)
 |-- product_length_cm: integer (nullable = true)
 |-- product_height_cm: integer (nullable = true)
 |-- product_width_cm: integer (nullable = true)



In [14]:
sellers_df = spark.read.format("csv") \
    .option("path", f"{bronze_container_path}/olist/olist_sellers_dataset.csv") \
    .option("header", True) \
    .option("inferScema", True) \
    .load()
sellers_df.printSchema()

root
 |-- seller_id: string (nullable = true)
 |-- seller_zip_code_prefix: string (nullable = true)
 |-- seller_city: string (nullable = true)
 |-- seller_state: string (nullable = true)



In [15]:
product_category_name_translation_df = spark.read.format("csv") \
    .option("path", f"{bronze_container_path}/olist/product_category_name_translation.csv") \
    .option("header", True) \
    .option("inferScema", True) \
    .load()
product_category_name_translation_df.printSchema()

root
 |-- product_category_name: string (nullable = true)
 |-- product_category_name_english: string (nullable = true)



In [16]:
orders_df.where("order_id = '8272b63d03f5f79c56e9e4120aec44ef'").show()

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|8272b63d03f5f79c5...|fc3d1daec319d62d4...|   delivered|     2017-07-16 18:19:25|2017-07-17 18:25:23|         2017-07-20 15:45:53|          2017-07-31 18:03:02|          2017-07-28 00:00:00|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+



In [17]:
def get_timestamp_fields(df):
    return [c for c in df.columns if dict(df.dtypes).get(c) == "timestamp"]

def get_date_df(df_list):
    fields = get_timestamp_fields(df_list[0])
    base_df = df_list[0].select(F.explode(F.array(*fields)).alias("dates"))

    for df in df_list[1:]:
        fields = [c for c in df.columns if dict(df.dtypes).get(c) == "timestamp"]
        new_df = df.select(F.explode(F.array(*fields)).alias("dates"))
        base_df = base_df.union(new_df)
    return base_df.distinct()

my_df = get_date_df(df_list=[orders_df, order_items_df, order_reviews_df])

In [18]:
x = orders_df.select(F.explode(F.array('order_purchase_timestamp',
 'order_approved_at',
 'order_delivered_carrier_date',
 'order_delivered_customer_date',
 'order_estimated_delivery_date')).alias('dates'),
                )

In [19]:
y = order_items_df.selectExpr("shipping_limit_date AS dates")

In [20]:
z = order_reviews_df.select(F.explode(F.array('review_creation_date',
 'review_answer_timestamp')).alias('dates'),
                )

In [21]:
x.union(y).union(z).distinct().count()

548037

In [22]:
my_df.count()

548037

### Dataset Detailed Info

Create a tabular data of the ff.

- Column name
- Null amount
- Null percentage among the respective dataset
- Data type
- total categorical entries

In [23]:
def show_detailed_stats(df_list: List[DataFrame]):
    columns = ["dataset_name", "column_name", "row_count", "col_count", "null_amount", "null_percentage", "data_type", "unique_count"]
    all_df = []
    
    for df in df_list:
        data = [
            (
                df.select(F.element_at(F.split(F.input_file_name(), "/"), -1)).first()[0],
                c, 
                df.count(), 
                len(df.columns), 
                df.where(df[c].isNull()).count(), 
                round(100 * df.where(df[c].isNull()).count() / df.count(), 2),
                dict(df.dtypes)[c], 
                df.select(c).distinct().count()
             ) 
                 for c in df.columns
        ]
        all_df.extend(data)
        all_df.append(("-", "-", "-", "-", "-", "-", "-", "-"))
    rdd = spark.sparkContext.parallelize(all_df)
    result_df = rdd.toDF(columns)
    return result_df

stats_df = show_detailed_stats([
    customers_df,
    geolocation_df,
    order_items_df,
    order_payments_df,
    order_reviews_df,
    orders_df,
    products_df,
    sellers_df,
    product_category_name_translation_df,
])
stats_df.show(n=100, truncate=False)

+-------------------------------------+-----------------------------+---------+---------+-----------+---------------+---------+------------+
|dataset_name                         |column_name                  |row_count|col_count|null_amount|null_percentage|data_type|unique_count|
+-------------------------------------+-----------------------------+---------+---------+-----------+---------------+---------+------------+
|olist_customers_dataset.csv          |customer_id                  |99441    |5        |0          |0.0            |string   |99441       |
|olist_customers_dataset.csv          |customer_unique_id           |99441    |5        |0          |0.0            |string   |96096       |
|olist_customers_dataset.csv          |customer_zip_code_prefix     |99441    |5        |0          |0.0            |int      |14994       |
|olist_customers_dataset.csv          |customer_city                |99441    |5        |0          |0.0            |string   |4119        |
|olist_custom

In [24]:
order_id = "8272b63d03f5f79c56e9e4120aec44ef"
order_payments_df.where(F.col("order_id") == order_id).show(n=100, truncate=False)
order_items_df.where(F.col("order_id") == order_id).show(n=100, truncate=False)

+--------------------------------+------------------+------------+--------------------+-------------+
|order_id                        |payment_sequential|payment_type|payment_installments|payment_value|
+--------------------------------+------------------+------------+--------------------+-------------+
|8272b63d03f5f79c56e9e4120aec44ef|1                 |credit_card |2                   |196.11       |
+--------------------------------+------------------+------------+--------------------+-------------+

+--------------------------------+-------------+--------------------------------+--------------------------------+-------------------+-----+-------------+
|order_id                        |order_item_id|product_id                      |seller_id                       |shipping_limit_date|price|freight_value|
+--------------------------------+-------------+--------------------------------+--------------------------------+-------------------+-----+-------------+
|8272b63d03f5f79c56e9e41

In [25]:
order_payments_df.groupBy("order_id").agg(F.count("order_id").alias("cnt")).where("cnt > 2").show(truncate=False)


+--------------------------------+---+
|order_id                        |cnt|
+--------------------------------+---+
|8ca5bdac5ebe8f2d6fc9171d5ebc906a|9  |
|251f0a3981c4a8cb853a9cc9d6ba49ad|5  |
|ac3b0c224349e4ca9a0b0f2e8fbc4c75|3  |
|0fa927b252421189a0e0d5725fb3832d|4  |
|1eed3691b23af50a487c4f8828287734|3  |
|db97652cf517d2cd03db63dec489ca62|3  |
|6064862631581009b8eb676bc264d91f|4  |
|0cd93455c51655ebb590c07a06d584a1|3  |
|27a940efdd448db29463b53ea0cfa2f4|11 |
|54ba74414bed49a8b13864b3e8666f57|3  |
|f1d6313ed30c82ba1523aad41c60685e|4  |
|b293f468141dc24b16033560c8d008a6|3  |
|465c2e1bee4561cb39e0db8c5993aafc|12 |
|54e52a2c76449b9a634705a5ae9684a9|6  |
|f8a8d05d951ce7e359dfd0c67f1de017|10 |
|77f0ae10c13cc7c78a5393b27d121044|3  |
|4d20a88009a1ccbe54db2c4c7e184228|3  |
|591083bc42b589c7052118aa83118e76|6  |
|b7f12e29e55ecae289d444db518400ed|3  |
|2dc89150e25e0d36a6b80cea3c8a1290|3  |
+--------------------------------+---+
only showing top 20 rows



In [26]:
order_items_df.groupBy("order_id").agg(F.count("order_id").alias("total_orders")).orderBy(F.desc("total_orders")).show(truncate=False)

+--------------------------------+------------+
|order_id                        |total_orders|
+--------------------------------+------------+
|8272b63d03f5f79c56e9e4120aec44ef|21          |
|1b15974a0141d54e36626dca3fdc731a|20          |
|ab14fdcfbe524636d65ee38360e22ce8|20          |
|428a2f660dc84138d969ccd69a0ab6d5|15          |
|9ef13efd6949e4573a18964dd1bbe7f5|15          |
|9bdc4d4c71aa1de4606060929dee888c|14          |
|73c8ab38f07dc94389065f7eba4f297a|14          |
|37ee401157a3a0b28c9c6d0ed8c3b24b|13          |
|2c2a19b5703863c908512d135aa6accc|12          |
|3a213fcdfe7d98be74ea0dc05a8b31ae|12          |
|637617b3ffe9e2f7a2411243829226d0|12          |
|af822dacd6f5cff7376413c03a388bb7|12          |
|c05d6a79e55da72ca780ce90364abed9|12          |
|6c355e2913545fa6f72c40cbca57729e|11          |
|5a3b1c29a49756e75f1ef513383c0c12|11          |
|71dab1155600756af6de79de92e712e3|11          |
|7f2c22c54cbae55091a09a9653fd2b8a|11          |
|9aec4e1ae90b23c7bf2d2b3bfafbd943|10    

In [27]:
# order_items_df.groupBy("order_id").agg(F.countDistinct("seller_id").alias("cnt")).where("cnt > 3").show(truncate=False)

order_items_df.where("order_id = '8c2b13adf3f377c8f2b06b04321b0925'").show()

+--------------------+-------------+--------------------+--------------------+-------------------+------+-------------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_date| price|freight_value|
+--------------------+-------------+--------------------+--------------------+-------------------+------+-------------+
|8c2b13adf3f377c8f...|            1|6f59fe49d85eb1353...|977f9f63dd360c2a3...|2017-11-23 20:31:40|  90.9|        21.08|
|8c2b13adf3f377c8f...|            2|5c818ca21204caf8c...|54965bbe3e4f07ae0...|2017-11-23 20:31:40| 160.0|        21.08|
|8c2b13adf3f377c8f...|            3|b75ad41bddb7dc94c...|1dfe5347016252a78...|2017-11-23 20:31:40|  61.0|        21.08|
|8c2b13adf3f377c8f...|            4|601a360bd2a916ece...|7a67c85e85bb2ce85...|2017-11-23 20:31:40|129.99|        42.16|
+--------------------+-------------+--------------------+--------------------+-------------------+------+-------------+



## Exploratory Data Analysis (EDA)

### How many orders we have for each status?

In [29]:
orders_df.select(F.col("order_estimated_delivery_date"), F.date_format(F.col("order_estimated_delivery_date"), "yyyy-MM-dd").alias("dt")).show()

+-----------------------------+----------+
|order_estimated_delivery_date|        dt|
+-----------------------------+----------+
|          2017-10-18 00:00:00|2017-10-18|
|          2018-08-13 00:00:00|2018-08-13|
|          2018-09-04 00:00:00|2018-09-04|
|          2017-12-15 00:00:00|2017-12-15|
|          2018-02-26 00:00:00|2018-02-26|
|          2017-08-01 00:00:00|2017-08-01|
|          2017-05-09 00:00:00|2017-05-09|
|          2017-06-07 00:00:00|2017-06-07|
|          2017-03-06 00:00:00|2017-03-06|
|          2017-08-23 00:00:00|2017-08-23|
|          2017-06-07 00:00:00|2017-06-07|
|          2017-08-08 00:00:00|2017-08-08|
|          2018-07-18 00:00:00|2018-07-18|
|          2018-08-08 00:00:00|2018-08-08|
|          2018-03-21 00:00:00|2018-03-21|
|          2018-07-04 00:00:00|2018-07-04|
|          2018-02-06 00:00:00|2018-02-06|
|          2018-01-29 00:00:00|2018-01-29|
|          2017-12-11 00:00:00|2017-12-11|
|          2017-11-23 00:00:00|2017-11-23|
+----------

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 43394)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.9/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.9/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.9/socketserver.py", line 747, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 281, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 253, in poll
    if func():
  File "/usr/local/spark/python/pyspark/accumulators.py", line 257, in accum_updates
    num_updates = read_int(self.rfile)
  File "/usr/local/spark/python/pysp

In [28]:
orders_df.groupBy("order_status") \
    .agg(F.count(F.col("order_id")).alias("total_orders")) \
    .orderBy(F.desc("total_orders")) \
    .show()

+------------+------------+
|order_status|total_orders|
+------------+------------+
|   delivered|       96478|
|     shipped|        1107|
|    canceled|         625|
| unavailable|         609|
|    invoiced|         314|
|  processing|         301|
|     created|           5|
|    approved|           2|
+------------+------------+



### What is the total number of orders per month, per week, per time of the day (Dawn, Morning, Afternoon, Night)?

Line Chart

1. Is there any growing trend on brazilian e-commerce?
2. On what day of week brazilians customers tend to do online purchasing?
3. What time brazilians customers tend do buy (Dawn, Morning, Afternoon or Night)?

#### Transform the order_purchase_timestamp to get the following:

Date Dimension:

- Year Month
- Day of Week
- Time of Day

In [None]:
orders_processed_df = orders_df.withColumn("year", F.date_format(F.col("order_purchase_timestamp"), "yyyy")) \
    .withColumn("year_month", F.date_format(F.col("order_purchase_timestamp"), "yyyyMM")) \
    .withColumn("day_of_week", F.date_format(F.col("order_purchase_timestamp"), "EEE")) \
    .withColumn(
        "time_of_day",
        F.when(
            F.date_format(F.col("order_purchase_timestamp"), "HH:mm").between("00:01", "05:00"), "Dawn"
        ).when(
            F.date_format(F.col("order_purchase_timestamp"), "HH:mm").between("05:01", "11:59"), "Morning"
        ).when(
            F.date_format(F.col("order_purchase_timestamp"), "HH:mm").between("12:00", "18:00"), "Afternoon"
        ).when(
            F.date_format(F.col("order_purchase_timestamp"), "HH:mm").between("18:01", "24:00"), "Night"
        )
    )

### Create a Comparison Between 2017 and 2018 Total Orders (Month by Month and YoY)

In [None]:
orders_processed_df.groupBy("year_month") \
    .agg(F.count("order_id").alias("total_orders")) \
    .orderBy("year_month") \
    .show()

In [None]:
orders_processed_df.groupBy("year") \
    .agg(F.count("order_id").alias("total_orders")) \
    .orderBy("year") \
    .show()

### Geospatial Visualization

#### Data Processing Required
1. Merge the orders data to order_items data;
2. Use an API (brazilian government) to return the region of each customer_state;
3. Replace portuguese phonetic with regular english letters.
4. Purpose useful charts to answear business questions.

In [None]:
regions = {'AC': 'North', 'RO': 'North', 'AM': 'North', 'RR': 'North', 'PA': 'North', 'AP': 'North', 'TO': 'North', 'MT': 'Center-West', 'GO': 'Center-West', 'DF': 'Center-West', 'MS': 'Center-West', 'MA': 'Northeast', 'PI': 'Northeast', 'BA': 'Northeast', 'SE': 'Northeast', 'AL': 'Northeast', 'PE': 'Northeast', 'PB': 'Northeast', 'RN': 'Northeast', 'CE': 'Northeast', 'SP': 'Southeast', 'MG': 'Southeast', 'ES': 'Southeast', 'RJ': 'Southeast', 'PR': 'South', 'SC': 'South', 'RS': 'South'}

def map_region(state: str) -> str:
    return regions.get(state)

In [None]:
udf_map_region = F.udf(map_region, StringType())
udf_normalize_phonetic = F.udf(unidecode, StringType())

In [None]:
geolocation_df = geolocation_df.withColumn("region", udf_map_region(F.col("geolocation_state"))) \
    .withColumn("geolocation_city", udf_normalize_phonetic(F.col("geolocation_city")))

#### Total Number of Customer Order by Region, State, City

- Comparison of Each Region Orders per Month
- Top 10 Brazilian Cities with More Orders
- Total of Customers Orders by State

In [None]:
orders_processed_df.selectExpr("year_month", "order_id", "customer_id AS order_customer_id") \
    .join(customers_df, F.col("customer_id") == F.col("order_customer_id"), "inner") \
    .join(geolocation_df, F.col("customer_zip_code_prefix") == F.col("geolocation_zip_code_prefix"), "inner") \
    .groupBy("year_month", "region") \
    .agg(F.count("order_id").alias("total_orders")).show()


In [None]:
orders_processed_df.selectExpr("year_month", "order_id", "customer_id AS order_customer_id") \
    .join(customers_df, F.col("customer_id") == F.col("order_customer_id"), "inner") \
    .join(geolocation_df, F.col("customer_zip_code_prefix") == F.col("geolocation_zip_code_prefix"), "inner") \
    .groupBy("year_month", "geolocation_state") \
    .agg(F.count("order_id").alias("total_orders")).show()


In [None]:
orders_processed_df.selectExpr("year_month", "order_id", "customer_id AS order_customer_id") \
    .join(customers_df, F.col("customer_id") == F.col("order_customer_id"), "inner") \
    .join(geolocation_df, F.col("customer_zip_code_prefix") == F.col("geolocation_zip_code_prefix"), "inner") \
    .groupBy("year_month", "geolocation_city") \
    .agg(F.count("order_id").alias("total_orders")).show()


#### How customers are distributed in Brazil? (Map and Heatmap)

In [None]:
### Dimensional Modeling

#### Total Orders vs. Total Amount Sold
- Per Month (Line)
- Per year (Number)

In [None]:
orders_processed_df.selectExpr("year_month", "year", "order_id", "customer_id AS order_customer_id") \
    .join(order_items_df, order_items_df["order_id"] == orders_df["order_id"], "inner") \
    .groupBy("year_month") \
    .agg(F.round(F.sum(order_items_df["price"]), 2).alias("total_amount_sold")) \
    .show()


In [None]:
orders_processed_df.selectExpr("year_month", "year", "order_id", "customer_id AS order_customer_id") \
    .join(order_items_df, order_items_df["order_id"] == orders_df["order_id"], "inner") \
    .groupBy("year") \
    .agg(F.round(F.sum(order_items_df["price"]), 2).alias("total_amount_sold")) \
    .show()


#### How the total sales (sum of price) are concentraded in brazilian states?

- Average price per Customer State
- Total price by Customer State

In [None]:
orders_processed_df.selectExpr("year_month", "order_id", "customer_id AS order_customer_id") \
    .join(customers_df, F.col("customer_id") == F.col("order_customer_id"), "inner") \
    .join(geolocation_df, F.col("customer_zip_code_prefix") == F.col("geolocation_zip_code_prefix"), "inner") \
    .join(order_items_df, order_items_df["order_id"] == orders_df["order_id"], "inner") \
    .groupBy("geolocation_state") \
    .agg(F.round(F.sum(order_items_df["price"]), 2).alias("total_price"), F.round(F.mean(order_items_df["price"]), 2).alias("average_price")).show()


#### What are the best states to buy in Brazil? An analysis on sales, freight and delivery time

- Average Freight Paid for Online Shopping
- Top 5 States with Highest Avg Freight Value
- Top 5 States with Lowest Average Freight Value

- Average Delay (days) for delivery for online shopping
- Top 5 States with Highest Average Time to Delivery
- Top 5 States with Lowest Average Time to Delivery
- Average Difference between delivery and estimated date
- Top 5 States Delivery is Really Fast
- Top 5 States Delivery is not so Fast


In [None]:
order_items_df.agg(F.mean("freight_value").alias("average_freight_paid")).show()

In [None]:
order_items_df.join(orders_df, orders_df["order_id"] == order_items_df["order_id"], "inner") \
    .join(customers_df, customers_df["customer_id"] == orders_df["customer_id"], "inner") \
    .join(geolocation_df, F.col("customer_zip_code_prefix") == F.col("geolocation_zip_code_prefix"), "inner") \
    .groupBy("geolocation_state") \
    .agg(F.mean("freight_value").alias("average_freight_paid")) \
    .orderBy(F.desc("average_freight_paid")).show(n=5)

In [None]:
order_items_df.join(orders_df, orders_df["order_id"] == order_items_df["order_id"], "inner") \
    .join(customers_df, customers_df["customer_id"] == orders_df["customer_id"], "inner") \
    .join(geolocation_df, F.col("customer_zip_code_prefix") == F.col("geolocation_zip_code_prefix"), "inner") \
    .groupBy("geolocation_state") \
    .agg(F.mean("freight_value").alias("average_freight_paid")) \
    .orderBy("average_freight_paid").show(n=5)

In [None]:
orders_df.withColumn("time_to_deliver", F.datediff(F.col("order_delivered_customer_date"), F.col("order_purchase_timestamp"))) \
    .agg(F.mean("time_to_deliver").alias("avg_time_to_deliver")) \
    .show()

In [None]:
orders_df.withColumn("time_to_deliver", F.datediff(F.col("order_delivered_customer_date"), F.col("order_purchase_timestamp"))) \
    .join(customers_df, customers_df["customer_id"] == orders_df["customer_id"], "inner") \
    .join(geolocation_df, geolocation_df["geolocation_zip_code_prefix"] == customers_df["customer_zip_code_prefix"], "inner") \
    .groupBy("geolocation_state") \
    .agg(F.round(F.mean("time_to_deliver"), 2).alias("avg_time_to_deliver")) \
    .orderBy("avg_time_to_deliver") \
    .show(n=5)

In [None]:
orders_df.withColumn("time_to_deliver", F.datediff(F.col("order_delivered_customer_date"), F.col("order_purchase_timestamp"))) \
    .join(customers_df, customers_df["customer_id"] == orders_df["customer_id"], "inner") \
    .join(geolocation_df, geolocation_df["geolocation_zip_code_prefix"] == customers_df["customer_zip_code_prefix"], "inner") \
    .groupBy("geolocation_state") \
    .agg(F.round(F.mean("time_to_deliver"), 2).alias("avg_time_to_deliver")) \
    .orderBy(F.desc("avg_time_to_deliver")) \
    .show(n=5)

In [None]:
orders_df.withColumn("delivery_delay", F.datediff(F.col("order_delivered_customer_date"), F.col("order_estimated_delivery_date"))) \
    .where(F.expr("delivery_delay > 0")) \
    .agg(F.mean("delivery_delay").alias("avg_days_delay")) \
    .show()

In [None]:
orders_df.withColumn("delivery_delay", F.datediff(F.col("order_delivered_customer_date"), F.col("order_estimated_delivery_date"))) \
    .join(customers_df, customers_df["customer_id"] == orders_df["customer_id"], "inner") \
    .join(geolocation_df, geolocation_df["geolocation_zip_code_prefix"] == customers_df["customer_zip_code_prefix"], "inner") \
    .where(F.expr("delivery_delay > 0")) \
    .groupBy("geolocation_state") \
    .agg(F.round(F.mean("delivery_delay"), 2).alias("avg_days_delay")) \
    .orderBy(F.desc("avg_days_delay")) \
    .show(n=5)

In [None]:
orders_df.withColumn("delivery_delay", F.datediff(F.col("order_delivered_customer_date"), F.col("order_estimated_delivery_date"))) \
    .join(customers_df, customers_df["customer_id"] == orders_df["customer_id"], "inner") \
    .join(geolocation_df, geolocation_df["geolocation_zip_code_prefix"] == customers_df["customer_zip_code_prefix"], "inner") \
    .where(F.expr("delivery_delay > 0")) \
    .groupBy("geolocation_state") \
    .agg(F.round(F.mean("delivery_delay"), 2).alias("avg_days_delay")) \
    .orderBy("avg_days_delay") \
    .show(n=5)

## Payment Type Analysis

- Total Transactions by Payment Type
- Distribution of Payment Installments
- Total Orders Purcharsed By Month

In [None]:
order_payments_df.groupBy("payment_type") \
    .agg(F.count("order_id").alias("total_orders")) \
    .orderBy(F.desc("total_orders")).show()

In [None]:
order_payments_df.groupBy("payment_installments") \
    .agg(F.count("order_id").alias("total_orders")) \
    .orderBy("payment_installments").show()

In [None]:
orders_processed_df.join(order_payments_df, order_payments_df["order_id"] == orders_df["order_id"], "inner") \
    .groupBy("year_month") \
    .agg(F.round(F.sum("payment_value"), 2).alias("total_payment")) \
    .orderBy(F.desc("year_month")).show()