### Step 1: Setting Up the Spark Environment

 we will :

1.  **1\. Deploy a Spark Cluster** (like AWS EMR, GCP Dataproc, or an on-prem Hadoop cluster, Azure HD Insight).
2.  **2\. Store Data in HDFS** instead of local storage.

*   Load data from Kaggle i.e. Data Source (#!/bin/bash curl -L -o ~/olist/brazilian-ecommerce.zip\\ https://www.kaggle.com/api/v1/datasets/download/olistbr/brazilian-ecommerce)
*     !unzip brazilian-ecommerce.zip -d ~/olist/data/
    

3.  **3\. Use PySpark** to interact with data.


In [8]:
from pyspark.sql import SparkSession 
spark = SparkSession.builder\
    .appName('OlistData') \
    .getOrCreate()
    

25/09/15 10:12:00 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [9]:
spark

In [10]:
!hadoop fs -ls  /data/olist/

Found 9 items
-rw-r--r--   2 amanpathak8802 hadoop    9033957 2025-09-10 10:38 /data/olist/olist_customers_dataset.csv
-rw-r--r--   2 amanpathak8802 hadoop   61273883 2025-09-10 10:38 /data/olist/olist_geolocation_dataset.csv
-rw-r--r--   2 amanpathak8802 hadoop   15438671 2025-09-10 10:38 /data/olist/olist_order_items_dataset.csv
-rw-r--r--   2 amanpathak8802 hadoop    5777138 2025-09-10 10:38 /data/olist/olist_order_payments_dataset.csv
-rw-r--r--   2 amanpathak8802 hadoop   14451670 2025-09-10 10:38 /data/olist/olist_order_reviews_dataset.csv
-rw-r--r--   2 amanpathak8802 hadoop   17654914 2025-09-10 10:38 /data/olist/olist_orders_dataset.csv
-rw-r--r--   2 amanpathak8802 hadoop    2379446 2025-09-10 10:38 /data/olist/olist_products_dataset.csv
-rw-r--r--   2 amanpathak8802 hadoop     174703 2025-09-10 10:38 /data/olist/olist_sellers_dataset.csv
-rw-r--r--   2 amanpathak8802 hadoop       2613 2025-09-10 10:38 /data/olist/product_category_name_translation.csv


In [11]:
hdfs_path = '/data/olist/'

In [12]:
customer_df = spark.read.csv(hdfs_path +'olist_customers_dataset.csv',header = True , inferSchema = True)

In [13]:
customer_df.show(5)

+--------------------+--------------------+------------------------+--------------------+--------------+
|         customer_id|  customer_unique_id|customer_zip_code_prefix|       customer_city|customer_state|
+--------------------+--------------------+------------------------+--------------------+--------------+
|06b8999e2fba1a1fb...|861eff4711a542e4b...|                   14409|              franca|            SP|
|18955e83d337fd6b2...|290c77bc529b7ac93...|                    9790|sao bernardo do c...|            SP|
|4e7b3e00288586ebd...|060e732b5b29e8181...|                    1151|           sao paulo|            SP|
|b2b6027bc5c5109e5...|259dac757896d24d7...|                    8775|     mogi das cruzes|            SP|
|4f2d8ab171c80ec83...|345ecd01c38d18a90...|                   13056|            campinas|            SP|
+--------------------+--------------------+------------------------+--------------------+--------------+
only showing top 5 rows



In [14]:
customers_df = spark.read.csv(hdfs_path + 'olist_customers_dataset.csv',header=True,inferSchema=True)
orders_df = spark.read.csv(hdfs_path + 'olist_orders_dataset.csv',header=True,inferSchema=True)
order_item_df = spark.read.csv(hdfs_path + 'olist_order_items_dataset.csv',header=True,inferSchema=True)
payments_df = spark.read.csv(hdfs_path + 'olist_order_payments_dataset.csv',header=True,inferSchema=True)
reviews_df = spark.read.csv(hdfs_path + 'olist_order_reviews_dataset.csv',header=True,inferSchema=True)
products_df = spark.read.csv(hdfs_path + 'olist_products_dataset.csv',header=True,inferSchema=True)
sellers_df = spark.read.csv(hdfs_path + 'olist_sellers_dataset.csv',header=True,inferSchema=True)
geolocation_df = spark.read.csv(hdfs_path + 'olist_geolocation_dataset.csv',header=True,inferSchema=True)
category_translation_df = spark.read.csv(hdfs_path + 'product_category_name_translation.csv',header=True,inferSchema=True)

                                                                                

In [15]:
customer_df.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: integer (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)



In [16]:
orders_df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)



In [17]:
orders_df.show(5)


+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|e481f51cbdc54678b...|9ef432eb625129730...|   delivered|     2017-10-02 10:56:33|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 00:00:00|
|53cdb2fc8bc7dce0b...|b0830fb4747a6c6d2...|   delivered|     2018-07-24 20:41:37|2018-07-26 03:24:27|         2018-07-26 14:31:00|          2018-08-07 15:27:45|          2018-08-13 00:00:00|
|47770eb9100c2d0c4...|41ce2a54c0b03bf34...|  

In [18]:
# cheking data lekage or drop 
print(f'customers:{customers_df.count()} rows')
print(f'orders :{orders_df.count()} rows')
print(f'order_item :{order_item_df.count()}rows')
print(f'payments :{payments_df.count()}rows')
print(f'reviews :{reviews_df.count()}rows')
print(f'products :{products_df.count()}rows')
print(f'sellers :{sellers_df.count()}rows')
print(f'geolocation :{geolocation_df.count()}rows')
print(f'category_translation :{sellers_df.count()}rows')

customers:99441 rows
orders :99441 rows
order_item :112650rows
payments :103886rows
reviews :104162rows
products :32951rows
sellers :3095rows
geolocation :1000163rows
category_translation :3095rows


In [19]:
#checking nulls 
# Check for nulls in critical fields


from pyspark.sql.functions import col , count , when 

customer_df.select([count(when(col(c).isNull(),1)).alias(c) for c in customer_df.columns ]).show()

+-----------+------------------+------------------------+-------------+--------------+
|customer_id|customer_unique_id|customer_zip_code_prefix|customer_city|customer_state|
+-----------+------------------+------------------------+-------------+--------------+
|          0|                 0|                       0|            0|             0|
+-----------+------------------+------------------------+-------------+--------------+



In [20]:
# for  duplicate values 
customer_df.groupBy('customer_id').count().filter('count >1').show()

[Stage 55:>                                                         (0 + 2) / 2]

+-----------+-----+
|customer_id|count|
+-----------+-----+
+-----------+-----+



                                                                                

In [28]:
#customer Distribution by state 

customer_df.groupBy('customer_state').count().orderBy('count', ascending = False ).show()


+--------------+-----+
|customer_state|count|
+--------------+-----+
|            SP|41746|
|            RJ|12852|
|            MG|11635|
|            RS| 5466|
|            PR| 5045|
|            SC| 3637|
|            BA| 3380|
|            DF| 2140|
|            ES| 2033|
|            GO| 2020|
|            PE| 1652|
|            CE| 1336|
|            PA|  975|
|            MT|  907|
|            MA|  747|
|            MS|  715|
|            PB|  536|
|            PI|  495|
|            RN|  485|
|            AL|  413|
+--------------+-----+
only showing top 20 rows



In [33]:
orders_df.show(5)

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|e481f51cbdc54678b...|9ef432eb625129730...|   delivered|     2017-10-02 10:56:33|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 00:00:00|
|53cdb2fc8bc7dce0b...|b0830fb4747a6c6d2...|   delivered|     2018-07-24 20:41:37|2018-07-26 03:24:27|         2018-07-26 14:31:00|          2018-08-07 15:27:45|          2018-08-13 00:00:00|
|47770eb9100c2d0c4...|41ce2a54c0b03bf34...|  

In [32]:
#order - order data distribution 
orders_df.groupBy('order_status').count().orderBy('count',ascending = False).show()

+------------+-----+
|order_status|count|
+------------+-----+
|   delivered|96478|
|     shipped| 1107|
|    canceled|  625|
| unavailable|  609|
|    invoiced|  314|
|  processing|  301|
|     created|    5|
|    approved|    2|
+------------+-----+



In [37]:
#payments  payment data distributed 

payments_df.groupBy('payment_type').count().orderBy('count',ascending = False ).show()


+------------+-----+
|payment_type|count|
+------------+-----+
| credit_card|76795|
|      boleto|19784|
|     voucher| 5775|
|  debit_card| 1529|
| not_defined|    3|
+------------+-----+



In [38]:
reviews_df.show(5)

+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|           review_id|            order_id|review_score|review_comment_title|review_comment_message|review_creation_date|review_answer_timestamp|
+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|7bc2406110b926393...|73fc7af87114b3971...|           4|                NULL|                  NULL| 2018-01-18 00:00:00|    2018-01-18 21:46:59|
|80e641a11e56f04c1...|a548910a1c6147796...|           5|                NULL|                  NULL| 2018-03-10 00:00:00|    2018-03-11 03:05:13|
|228ce5500dc1d8e02...|f9e4b658b201a9f2e...|           5|                NULL|                  NULL| 2018-02-17 00:00:00|    2018-02-18 14:36:24|
|e64fb393e7b32834b...|658677c97b385a9be...|           5|                NULL|  Recebi bem antes ...| 2017-04-21 00:00:00|   

In [42]:
#Top selling Products
from pyspark.sql.functions import sum 
top_product = order_item_df.groupBy('product_id').agg(sum('price').alias('total_sales'))
top_product.orderBy('total_sales',ascending = False).show()

+--------------------+------------------+
|          product_id|       total_sales|
+--------------------+------------------+
|bb50f2e236e5eea01...|           63885.0|
|6cdd53843498f9289...| 54730.20000000005|
|d6160fb7873f18409...|48899.340000000004|
|d1c427060a0f73f6b...| 47214.51000000006|
|99a4788cb24856965...|43025.560000000085|
|3dd2a17168ec895c7...| 41082.60000000005|
|25c38557cf793876c...| 38907.32000000001|
|5f504b3a1c75b73d6...|37733.899999999994|
|53b36df67ebb7c415...| 37683.42000000001|
|aca2eb7d00ea1a7b8...| 37608.90000000007|
|e0d64dcfaa3b6db5c...|          31786.82|
|d285360f29ac7fd97...|31623.809999999983|
|7a10781637204d8d1...|           30467.5|
|f1c7f353075ce59d8...|          29997.36|
|f819f0c84a64f02d3...|29024.479999999996|
|588531f8ec37e7d5f...|28291.989999999998|
|422879e10f4668299...|26577.219999999972|
|16c4e87b98a9370a9...|           25034.0|
|5a848e4ab52fd5445...|24229.029999999962|
|a62e25e09e05e6faf...|           24051.0|
+--------------------+------------

In [44]:
orders_df.show(4)

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|e481f51cbdc54678b...|9ef432eb625129730...|   delivered|     2017-10-02 10:56:33|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 00:00:00|
|53cdb2fc8bc7dce0b...|b0830fb4747a6c6d2...|   delivered|     2018-07-24 20:41:37|2018-07-26 03:24:27|         2018-07-26 14:31:00|          2018-08-07 15:27:45|          2018-08-13 00:00:00|
|47770eb9100c2d0c4...|41ce2a54c0b03bf34...|  

In [61]:
#average Delivery Time Analysis 

delivery_df = orders_df.select('order_id','order_purchase_timestamp','order_delivered_customer_date')

In [69]:
from pyspark.sql.functions import datediff,to_date

delivery_detail_df = delivery_df\
.withColumn('delivery_time',datediff(col('order_delivered_customer_date'),col('order_purchase_timestamp')))
                                    
delivery_detail_df.orderBy('delivery_time',ascending = False).show()

+--------------------+------------------------+-----------------------------+-------------+
|            order_id|order_purchase_timestamp|order_delivered_customer_date|delivery_time|
+--------------------+------------------------+-----------------------------+-------------+
|ca07593549f1816d2...|     2017-02-21 23:31:27|          2017-09-19 14:36:39|          210|
|1b3190b2dfa9d789e...|     2018-02-23 14:57:35|          2018-09-19 23:24:07|          208|
|440d0d17af552815d...|     2017-03-07 23:59:51|          2017-09-19 15:12:50|          196|
|2fb597c2f772eca01...|     2017-03-08 18:09:02|          2017-09-19 14:33:17|          195|
|285ab9426d6982034...|     2017-03-08 22:47:40|          2017-09-19 14:00:04|          195|
|0f4519c5f1c541dde...|     2017-03-09 13:26:57|          2017-09-19 14:38:21|          194|
|47b40429ed8cce3ae...|     2018-01-03 09:44:01|          2018-07-13 20:51:31|          191|
|2fe324febf907e3ea...|     2017-03-13 20:17:10|          2017-09-19 17:00:07|   