## Import Packages

In [46]:
from pathlib import Path
import os

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import count, col, when

In [3]:
# Comma-separated list of every jar under ../../jars, passed to "spark.jars" below.
# Path.glob yields entries in filesystem order, which is not guaranteed to be
# stable across machines, so the paths are sorted to keep the list reproducible.
JAR_PACKAGES = ",".join(sorted(str(jar_path) for jar_path in Path("../../jars").glob("*.jar")))
JAR_PACKAGES

'../../jars/antlr4-runtime-4.9.3.jar,../../jars/aws-java-sdk-bundle-1.12.392.jar,../../jars/delta-core_2.12-2.4.0.jar,../../jars/delta-storage-2.4.0.jar,../../jars/hadoop-aws-3.3.1.jar,../../jars/wildfly-openssl-1.0.7.Final.jar'

In [4]:
# Connection settings for the S3-compatible object store. Each value can be
# overridden through an environment variable so that real credentials never need
# to be committed with the notebook; the fallbacks are the previously hard-coded
# values, so behaviour is unchanged when the variables are unset.
# NOTE(review): the MINIO_* variable names are introduced here — align them with
# whatever the deployment already exports.
s3a_endpoint = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
s3a_access_key = os.environ.get("MINIO_ACCESS_KEY", "datalake")
s3a_secret_key = os.environ.get("MINIO_SECRET_KEY", "datalake")

# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("olist_data_profiling")
    .master("spark://spark:7077")
    .config("spark.jars", JAR_PACKAGES)
    .config("spark.hadoop.fs.s3a.access.key", s3a_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", s3a_secret_key)
    .config("spark.hadoop.fs.s3a.endpoint", s3a_endpoint)
    # Path-style access addresses buckets as <endpoint>/<bucket> instead of <bucket>.<endpoint>.
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)

In [None]:
# Root URI of the bronze bucket; the dataset paths loaded below are built from it.
bronze_container_path = "s3a://bronze"

## Product Category

In [7]:
# Load the category-name translation CSV from the bronze container path, using
# the first row as the header and letting Spark infer the column types.
product_category_df = (
    spark.read.format("csv")
    .options(header=True, inferSchema=True)
    .load(f"{bronze_container_path}/olist/product_category_name_translation.csv")
)

In [14]:
# Inspect the column names and the types Spark inferred from the CSV.
product_category_df.printSchema()

root
 |-- product_category_name: string (nullable = true)
 |-- product_category_name_english: string (nullable = true)



In [10]:
# Preview the rows with full, untruncated cell values.
product_category_df.show(truncate=False)

+---------------------------+-----------------------------+
|product_category_name      |product_category_name_english|
+---------------------------+-----------------------------+
|beleza_saude               |health_beauty                |
|informatica_acessorios     |computers_accessories        |
|automotivo                 |auto                         |
|cama_mesa_banho            |bed_bath_table               |
|moveis_decoracao           |furniture_decor              |
|esporte_lazer              |sports_leisure               |
|perfumaria                 |perfumery                    |
|utilidades_domesticas      |housewares                   |
|telefonia                  |telephony                    |
|relogios_presentes         |watches_gifts                |
|alimentos_bebidas          |food_drink                   |
|bebes                      |baby                         |
|papelaria                  |stationery                   |
|tablets_impressao_imagem   |tablets_pri

In [11]:
# Summary statistics for every column: count, mean, stddev, min and max.
product_category_df.describe().show()

+-------+---------------------+-----------------------------+
|summary|product_category_name|product_category_name_english|
+-------+---------------------+-----------------------------+
|  count|                   71|                           71|
|   mean|                 null|                         null|
| stddev|                 null|                         null|
|    min| agro_industria_e_...|         agro_industry_and...|
|    max| utilidades_domest...|                watches_gifts|
+-------+---------------------+-----------------------------+



In [12]:
# Count the rows whose English category name is missing.
product_category_df.filter(col("product_category_name_english").isNull()).count()

0

In [13]:
# Count the rows whose original category name is missing.
product_category_df.filter(col("product_category_name").isNull()).count()

0

## Closed Deals

In [15]:
# Load the closed-deals CSV from the bronze container path, using the first row
# as the header and letting Spark infer the column types.
closed_deals_df = (
    spark.read.format("csv")
    .options(header=True, inferSchema=True)
    .load(f"{bronze_container_path}/olist/olist_closed_deals_dataset.csv")
)

In [16]:
# Inspect the column names and the types Spark inferred from the CSV.
closed_deals_df.printSchema()

root
 |-- mql_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- sdr_id: string (nullable = true)
 |-- sr_id: string (nullable = true)
 |-- won_date: timestamp (nullable = true)
 |-- business_segment: string (nullable = true)
 |-- lead_type: string (nullable = true)
 |-- lead_behaviour_profile: string (nullable = true)
 |-- has_company: boolean (nullable = true)
 |-- has_gtin: boolean (nullable = true)
 |-- average_stock: string (nullable = true)
 |-- business_type: string (nullable = true)
 |-- declared_product_catalog_size: double (nullable = true)
 |-- declared_monthly_revenue: double (nullable = true)



In [27]:
# Summary statistics, printed one column per line because this table is wide.
# Note: won_date, has_company and has_gtin do not appear in the describe() output below.
closed_deals_df.describe().show(vertical=True)

-RECORD 0---------------------------------------------
 summary                       | count                
 mql_id                        | 842                  
 seller_id                     | 842                  
 sdr_id                        | 842                  
 sr_id                         | 842                  
 business_segment              | 841                  
 lead_type                     | 836                  
 lead_behaviour_profile        | 665                  
 average_stock                 | 66                   
 business_type                 | 832                  
 declared_product_catalog_size | 69                   
 declared_monthly_revenue      | 842                  
-RECORD 1---------------------------------------------
 summary                       | mean                 
 mql_id                        | null                 
 seller_id                     | null                 
 sdr_id                        | null                 
 sr_id    

In [29]:
# Peek at the first 3 records, one column per line.
closed_deals_df.show(3, vertical=True)

-RECORD 0---------------------------------------------
 mql_id                        | 5420aad7fec3549a8... 
 seller_id                     | 2c43fb513632d29b3... 
 sdr_id                        | a8387c01a09e99ce0... 
 sr_id                         | 4ef15afb4b2723d8f... 
 won_date                      | 2018-02-26 19:58:54  
 business_segment              | pet                  
 lead_type                     | online_medium        
 lead_behaviour_profile        | cat                  
 has_company                   | null                 
 has_gtin                      | null                 
 average_stock                 | null                 
 business_type                 | reseller             
 declared_product_catalog_size | null                 
 declared_monthly_revenue      | 0.0                  
-RECORD 1---------------------------------------------
 mql_id                        | a555fb36b9368110e... 
 seller_id                     | bbb7d7893a4506604... 
 sdr_id   

In [55]:
def print_null_count_per_column(df: DataFrame) -> None:
    """Print how many null values each column of ``df`` contains.

    Runs a single aggregation over ``df`` that produces one row holding, for
    every column, the number of rows in which that column is null, and shows
    that row in vertical layout (one column name per line).

    Args:
        df: The Spark DataFrame to profile.

    Returns:
        None. The counts are printed by ``DataFrame.show``.
    """
    null_counts = []
    for column_name in df.columns:
        # Wrap the name in backticks (doubling any embedded backtick) so that a
        # column literally named e.g. "a.b" is resolved as one column instead of
        # being parsed as field "b" of a struct column "a".
        quoted_name = "`" + column_name.replace("`", "``") + "`"
        # `when` without `otherwise` yields null for non-matching rows and
        # `count` skips nulls, so only the rows where the column is null are counted.
        null_counts.append(count(when(col(quoted_name).isNull(), 1)).alias(column_name))
    df.select(null_counts).show(vertical=True)

In [56]:
# Null counts for every closed-deals column.
print_null_count_per_column(df=closed_deals_df)

-RECORD 0----------------------------
 mql_id                        | 0   
 seller_id                     | 0   
 sdr_id                        | 0   
 sr_id                         | 0   
 won_date                      | 0   
 business_segment              | 1   
 lead_type                     | 6   
 lead_behaviour_profile        | 177 
 has_company                   | 779 
 has_gtin                      | 778 
 average_stock                 | 776 
 business_type                 | 10  
 declared_product_catalog_size | 773 
 declared_monthly_revenue      | 0   

