In [68]:
from typing import List
from pathlib import Path
from pyspark.sql import SparkSession, DataFrame, Row
import pyspark.sql.functions as F
from pyspark.sql.types import StringType

In [69]:
JAR_PACKAGES = ",".join([str(x) for x in Path("../../jars").glob("*.jar")])
JAR_PACKAGES

'../../jars/antlr4-runtime-4.9.3.jar,../../jars/aws-java-sdk-bundle-1.12.392.jar,../../jars/delta-core_2.12-2.4.0.jar,../../jars/delta-storage-2.4.0.jar,../../jars/hadoop-aws-3.3.1.jar,../../jars/wildfly-openssl-1.0.7.Final.jar'

## Read The Datasets

## Overview of Each Dataset


### Dataset High Level Overview

For each Dataset get the ff. (tabular)
- Number of Rows
- Number of Columns
- Number of Null Values
- Number Columns with Null Values
- List of Columns with Null

In [70]:
spark = SparkSession.builder.appName("olist_data_profiling").master("spark://spark:7077") \
            .config("spark.jars", JAR_PACKAGES) \
            .config("spark.hadoop.fs.s3a.access.key","datalake") \
            .config("spark.hadoop.fs.s3a.secret.key","datalake") \
            .config("spark.hadoop.fs.s3a.endpoint","http://minio:9000") \
            .config("spark.hadoop.fs.s3a.path.style.access", "true") \
            .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
            .getOrCreate()

In [71]:
bronze_container_path = "s3a://bronze"

In [72]:
customers_df = spark.read.format("csv") \
    .option("path", f"{bronze_container_path}/olist/olist_customers_dataset.csv") \
    .option("header", True) \
    .option("inferSchema", True) \
    .load()
customers_df.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: integer (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)



In [73]:
geolocation_df = spark.read.format("csv") \
    .option("path", f"{bronze_container_path}/olist/olist_geolocation_dataset.csv") \
    .option("header", True) \
    .option("inferSchema", True) \
    .load()
geolocation_df.printSchema()

root
 |-- geolocation_zip_code_prefix: integer (nullable = true)
 |-- geolocation_lat: double (nullable = true)
 |-- geolocation_lng: double (nullable = true)
 |-- geolocation_city: string (nullable = true)
 |-- geolocation_state: string (nullable = true)



In [74]:
order_items_df = spark.read.format("csv") \
    .option("path", f"{bronze_container_path}/olist/olist_order_items_dataset.csv") \
    .option("header", True) \
    .option("inferSchema", True) \
    .load()
order_items_df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)



In [75]:
order_payments_df = spark.read.format("csv") \
    .option("path", f"{bronze_container_path}/olist/olist_order_payments_dataset.csv") \
    .option("header", True) \
    .option("inferSchema", True) \
    .load()
order_payments_df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- payment_sequential: integer (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- payment_installments: integer (nullable = true)
 |-- payment_value: double (nullable = true)



In [76]:
order_reviews_df = spark.read.format("csv") \
    .option("path", f"{bronze_container_path}/olist/olist_order_reviews_dataset.csv") \
    .option("header", True) \
    .option("inferSchema", True) \
    .load()
order_reviews_df.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- review_score: string (nullable = true)
 |-- review_comment_title: string (nullable = true)
 |-- review_comment_message: string (nullable = true)
 |-- review_creation_date: string (nullable = true)
 |-- review_answer_timestamp: string (nullable = true)



In [77]:
orders_df = spark.read.format("csv") \
    .option("path", f"{bronze_container_path}/olist/olist_orders_dataset.csv") \
    .option("header", True) \
    .option("inferSchema", True) \
    .load()
orders_df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)



In [78]:
products_df = spark.read.format("csv") \
    .option("path", f"{bronze_container_path}/olist/olist_products_dataset.csv") \
    .option("header", True) \
    .option("inferSchema", True) \
    .load()
products_df.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- product_category_name: string (nullable = true)
 |-- product_name_lenght: integer (nullable = true)
 |-- product_description_lenght: integer (nullable = true)
 |-- product_photos_qty: integer (nullable = true)
 |-- product_weight_g: integer (nullable = true)
 |-- product_length_cm: integer (nullable = true)
 |-- product_height_cm: integer (nullable = true)
 |-- product_width_cm: integer (nullable = true)



In [79]:
sellers_df = spark.read.format("csv") \
    .option("path", f"{bronze_container_path}/olist/olist_sellers_dataset.csv") \
    .option("header", True) \
    .option("inferScema", True) \
    .load()
sellers_df.printSchema()

root
 |-- seller_id: string (nullable = true)
 |-- seller_zip_code_prefix: string (nullable = true)
 |-- seller_city: string (nullable = true)
 |-- seller_state: string (nullable = true)



In [80]:
product_category_name_translation_df = spark.read.format("csv") \
    .option("path", f"{bronze_container_path}/olist/product_category_name_translation.csv") \
    .option("header", True) \
    .option("inferScema", True) \
    .load()
product_category_name_translation_df.printSchema()

root
 |-- product_category_name: string (nullable = true)
 |-- product_category_name_english: string (nullable = true)



### Dataset Detailed Info

Create a tabular data of the ff.

- Column name
- Null amount
- Null percentage among the respective dataset
- Data type
- total categorical entries

In [81]:
def show_detailed_stats(df_list: List[DataFrame]):
    columns = ["dataset_name", "column_name", "row_count", "col_count", "null_amount", "null_percentage", "data_type", "unique_count"]
    all_df = []
    
    for df in df_list:
        data = [
            (
                df.select(F.element_at(F.split(F.input_file_name(), "/"), -1)).first()[0],
                c, 
                df.count(), 
                len(df.columns), 
                df.where(df[c].isNull()).count(), 
                round(100 * df.where(df[c].isNull()).count() / df.count(), 2),
                dict(df.dtypes)[c], 
                df.select(c).distinct().count()
             ) 
                 for c in df.columns
        ]
        all_df.extend(data)
        all_df.append(("-", "-", "-", "-", "-", "-", "-", "-"))
    rdd = spark.sparkContext.parallelize(all_df)
    result_df = rdd.toDF(columns)
    return result_df

stats_df = show_detailed_stats([
    customers_df,
    geolocation_df,
    order_items_df,
    order_payments_df,
    order_reviews_df,
    orders_df,
    products_df,
    sellers_df,
    product_category_name_translation_df,
])
stats_df.show(n=100, truncate=False)

+-------------------------------------+-----------------------------+---------+---------+-----------+---------------+---------+------------+
|dataset_name                         |column_name                  |row_count|col_count|null_amount|null_percentage|data_type|unique_count|
+-------------------------------------+-----------------------------+---------+---------+-----------+---------------+---------+------------+
|olist_customers_dataset.csv          |customer_id                  |99441    |5        |0          |0.0            |string   |99441       |
|olist_customers_dataset.csv          |customer_unique_id           |99441    |5        |0          |0.0            |string   |96096       |
|olist_customers_dataset.csv          |customer_zip_code_prefix     |99441    |5        |0          |0.0            |int      |14994       |
|olist_customers_dataset.csv          |customer_city                |99441    |5        |0          |0.0            |string   |4119        |
|olist_custom

## Exploratory Data Analysis (EDA)

### How many orders we have for each status?

### What is the total number of orders per month, per week, per time of the day (Dawn, Morning, Afternoon, Night)?

Line Chart

1. Is there any growing trend on brazilian e-commerce?
2. On what day of week brazilians customers tend to do online purchasing?
3. What time brazilians customers tend do buy (Dawn, Morning, Afternoon or Night)?

#### Transform the order_purchase_timestamp to get the following:

Date Dimension:

- Year Month
- Day of Week
- Time of Day

### Create a Comparison Between 2017 and 2018 Total Orders (Month by Month and YoY)

### Geospatial Visualization

#### Data Processing Required
1. Merge the orders data to order_items data;
2. Use an API (brazilian government) to return the region of each customer_state;
3. Purpose useful charts to answear business questions.

#### Total Number of Customer Order by Region, State, City

- Comparison of Each Region Orders per Month
- Top 10 Brazilian Cities with More Orders
- Total of Customers Orders by State

#### How customers are distributed in Brazil? (Map and Heatmap)

#### Total Orders vs. Total Amount Sold
- Per Month (Line)
- Per year (Number)

#### How the total sales (sum of price) are concentraded in brazilian states?

- Average price per Customer State
- Total price by Customer State

#### What are the best states to buy in Brazil? An analysis on sales, freight and delivery time

- Average Freight Paid for Online Shopping
- Top 5 States with Highest Avg Freight Value
- Top 5 States with Lowest Average Freight Value

- Average Delay (days) for delivery for online shopping
- Top 5 States with Highest Average Time to Delivery
- Top 5 States with Lowest Average Time to Delivery
- Average Difference between delivery and estimated date
- Top 5 States Delivery is Really Fast
- Top 5 States Delivery is not so Fast


## Payment Type Analysis

- Total Transactions by Payment Type
- Distribution of Payment Installments
- Total Orders Purcharsed By Month