# 1. Data Extraction

In [0]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName('olist_project') \
    .getOrCreate()

spark

<pyspark.sql.connect.session.SparkSession at 0x7fd76adf8df0>

## Linking Azure Data Lake Gen2

In [0]:

spark.conf.set(f"fs.azure.account.auth.type.{storage_account}.dfs.core.windows.net", "OAuth")
spark.conf.set(f"fs.azure.account.oauth.provider.type.{storage_account}.dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")
spark.conf.set(f"fs.azure.account.oauth2.client.id.{storage_account}.dfs.core.windows.net", application_id)
spark.conf.set(f"fs.azure.account.oauth2.client.secret.{storage_account}.dfs.core.windows.net", service_credential)
spark.conf.set(f"fs.azure.account.oauth2.client.endpoint.{storage_account}.dfs.core.windows.net", f"https://login.microsoftonline.com/{directory_id}/oauth2/token")

## Reading Datasets From ADLS Gen2

In [0]:
dbutils.fs.ls("abfss://olistdataset@olistprojectstorage.dfs.core.windows.net/bronze/")


[FileInfo(path='abfss://olistdataset@olistprojectstorage.dfs.core.windows.net/bronze/customers.csv', name='customers.csv', size=9033957, modificationTime=1751715768000),
 FileInfo(path='abfss://olistdataset@olistprojectstorage.dfs.core.windows.net/bronze/items.csv', name='items.csv', size=15438671, modificationTime=1751715787000),
 FileInfo(path='abfss://olistdataset@olistprojectstorage.dfs.core.windows.net/bronze/location.csv', name='location.csv', size=21471832, modificationTime=1751715807000),
 FileInfo(path='abfss://olistdataset@olistprojectstorage.dfs.core.windows.net/bronze/orders.csv', name='orders.csv', size=17654914, modificationTime=1751715824000),
 FileInfo(path='abfss://olistdataset@olistprojectstorage.dfs.core.windows.net/bronze/payments.csv', name='payments.csv', size=5777138, modificationTime=1751715843000),
 FileInfo(path='abfss://olistdataset@olistprojectstorage.dfs.core.windows.net/bronze/products.csv', name='products.csv', size=2379446, modificationTime=1751715863000

In [0]:

storage_path = f"abfss://{container_name}@{account_name}.dfs.core.windows.net/bronze/"

In [0]:
customer_df = spark.read.format("csv").option('inferSchema',True).option('header',True).load(storage_path + 'customers.csv')
orders_df = spark.read.format("csv").option('inferSchema',True).option('header',True).load(storage_path + 'orders.csv')
items_df = spark.read.format("csv").option('inferSchema',True).option('header',True).load(storage_path + 'items.csv')
payments_df = spark.read.format("csv").option('inferSchema',True).option('header',True).load(storage_path + 'payments.csv')
location_df = spark.read.format("csv").option('inferSchema',True).option('header',True).load(storage_path + 'location.csv') 
products_df = spark.read.format("csv").option('inferSchema',True).option('header',True).load(storage_path + 'products.csv') 
sellers_df = spark.read.format("csv").option('inferSchema',True).option('header',True).load(storage_path + 'sellers.csv')
review_df = spark.read.format("csv").option('inferSchema',True).option('header',True).load(storage_path + 'reviews.csv')


# 2. Data Exploration / EDA

## Data Model

![Data Model](https://i.imgur.com/HRhd2Y0.png)

In [0]:
def print_schema(df, name):
    if df is None:
        print(f"{name} is None. Cannot print schema.")
    else:
        print(f"Schema for {name}:")
        df.printSchema()

In [0]:
print_schema(customer_df,'customer dataset')
print_schema(orders_df,'orders dataset')
print_schema(items_df,'items dataset')
print_schema(payments_df,'payments dataset')
print_schema(location_df,'location dataset')
print_schema(products_df,'products dataset') 
print_schema(sellers_df,'sellers dataset')
print_schema(review_df,'review dataset')
print_schema(payments_df,'payments dataset')

Schema for customer dataset:
root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: integer (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)

Schema for orders dataset:
root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)

Schema for items dataset:
root
 |-- order_id: string (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- pri

# 3. Data Validation / Quality Analysis

In [0]:
from pyspark.sql.functions import *

def data_validation(df,name):
    total_count = df.count()
    unique = df.dropDuplicates().count()
    dedup = total_count-unique

    print(name)
    print(f"Total Count : {total_count}")
    print(f"Total Duplicates : {dedup}")
    print(f"Total Nulls : ")
    
    null_count = df.select([count(when(col(c).isNull(),1)).alias(c) for c in df.columns]).show(vertical=True)

In [0]:
data_validation(customer_df,'customer dataset')
data_validation(orders_df,'orders dataset')
data_validation(items_df,'items dataset')
data_validation(payments_df,'payments dataset')
data_validation(location_df,'location dataset')
data_validation(products_df,'products dataset') 
data_validation(sellers_df,'sellers dataset')
data_validation(review_df,'review dataset')
data_validation(payments_df,'payments dataset')

customer dataset
Total Count : 99441
Total Duplicates : 0
Total Nulls : 
-RECORD 0-----------------------
 customer_id              | 0   
 customer_unique_id       | 0   
 customer_zip_code_prefix | 0   
 customer_city            | 0   
 customer_state           | 0   

orders dataset
Total Count : 99441
Total Duplicates : 0
Total Nulls : 
-RECORD 0-----------------------------
 order_id                      | 0    
 customer_id                   | 0    
 order_status                  | 0    
 order_purchase_timestamp      | 0    
 order_approved_at             | 160  
 order_delivered_carrier_date  | 1783 
 order_delivered_customer_date | 2965 
 order_estimated_delivery_date | 0    

items dataset
Total Count : 112650
Total Duplicates : 0
Total Nulls : 
-RECORD 0------------------
 order_id            | 0   
 order_item_id       | 0   
 product_id          | 0   
 seller_id           | 0   
 shipping_limit_date | 0   
 price               | 0   
 freight_value       | 0   

payments 

# 4. Data Curation

## Analysing Required Columns

In [0]:
order = orders_df.select('order_id','order_status','order_purchase_timestamp','order_delivered_customer_date','order_estimated_delivery_date','customer_id')

customer = customer_df.select('customer_id','customer_city','customer_state')

item = items_df.select("order_id","product_id","seller_id","price","freight_value")

payment = payments_df.select('order_id','payment_type','payment_value')

product = products_df.select('product_id','product_category_name')

review = review_df.select("order_id",'review_score')

## Joining Dataset

In [0]:
from pyspark.sql.functions import broadcast

final_df = order.join(broadcast(customer),on='customer_id',how='left') \
                .join(broadcast(review),on='order_id',how='left') \
                .join(payment,on='order_id',how='left') \
                .join(item,on='order_id',how='left') \
                .join(broadcast(product),on='product_id',how='left')

# 5. Optimization : Caching on final_df

In [0]:
final_df.cache()

DataFrame[product_id: string, order_id: string, customer_id: string, order_status: string, order_purchase_timestamp: timestamp, order_delivered_customer_date: timestamp, order_estimated_delivery_date: timestamp, customer_city: string, customer_state: string, review_score: string, payment_type: string, payment_value: double, seller_id: string, price: double, freight_value: double, product_category_name: string]

In [0]:
final_df.columns

['product_id',
 'order_id',
 'customer_id',
 'order_status',
 'order_purchase_timestamp',
 'order_delivered_customer_date',
 'order_estimated_delivery_date',
 'customer_city',
 'customer_state',
 'review_score',
 'payment_type',
 'payment_value',
 'seller_id',
 'price',
 'freight_value',
 'product_category_name']

# 6. Re-Validation on Merged Dataset

In [0]:
data_validation(final_df,"Merged Dataset")

Merged Dataset
Total Count : 119143
Total Duplicates : 11696
Total Nulls : 
-RECORD 0-----------------------------
 product_id                    | 833  
 order_id                      | 0    
 customer_id                   | 0    
 order_status                  | 0    
 order_purchase_timestamp      | 0    
 order_delivered_customer_date | 3421 
 order_estimated_delivery_date | 0    
 customer_city                 | 0    
 customer_state                | 0    
 review_score                  | 997  
 payment_type                  | 3    
 payment_value                 | 3    
 seller_id                     | 833  
 price                         | 833  
 freight_value                 | 833  
 product_category_name         | 2542 



# 7. Data Transformation

## Renaming Columns 

In [0]:
final_df = final_df.withColumnRenamed("order_purchase_timestamp","purchase_ts")
final_df = final_df.withColumnRenamed("order_delivered_customer_date","delivered_date")
final_df = final_df.withColumnRenamed("order_estimated_delivery_date","est_delivery")
final_df = final_df.withColumnRenamed("product_category_name","category")
final_df = final_df.withColumnRenamed("payment_type","pay_type")
final_df = final_df.withColumnRenamed("seller_id","seller")
final_df = final_df.withColumnRenamed("product_id","product")
final_df = final_df.withColumnRenamed("freight_value","shipping_fee")

In [0]:
final_df.printSchema()

root
 |-- product: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- purchase_ts: timestamp (nullable = true)
 |-- delivered_date: timestamp (nullable = true)
 |-- est_delivery: timestamp (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)
 |-- review_score: string (nullable = true)
 |-- pay_type: string (nullable = true)
 |-- payment_value: double (nullable = true)
 |-- seller: string (nullable = true)
 |-- price: double (nullable = true)
 |-- shipping_fee: double (nullable = true)
 |-- category: string (nullable = true)



## Structuring Columns

In [0]:
final_df.select('review_score','payment_value','price','shipping_fee').show(10)

+------------+-------------+-----+------------+
|review_score|payment_value|price|shipping_fee|
+------------+-------------+-----+------------+
|           4|        18.59|29.99|        8.72|
|           4|          2.0|29.99|        8.72|
|           4|        18.12|29.99|        8.72|
|           4|       141.46|118.7|       22.76|
|           5|       179.12|159.9|       19.22|
|           5|         72.2| 45.0|        27.2|
|           5|        28.62| 19.9|        8.72|
|           4|       175.26|147.9|       27.36|
|           2|        65.95| 49.9|       16.05|
|           5|        75.16|59.99|       15.17|
+------------+-------------+-----+------------+
only showing top 10 rows



In [0]:
final_df = final_df.withColumn('review_score',col('review_score').cast('Integer'))

In [0]:
final_df.printSchema()

root
 |-- product: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- purchase_ts: timestamp (nullable = true)
 |-- delivered_date: timestamp (nullable = true)
 |-- est_delivery: timestamp (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)
 |-- review_score: integer (nullable = true)
 |-- pay_type: string (nullable = true)
 |-- payment_value: double (nullable = true)
 |-- seller: string (nullable = true)
 |-- price: double (nullable = true)
 |-- shipping_fee: double (nullable = true)
 |-- category: string (nullable = true)



## Data Cleaning

In [0]:
final_df = final_df.dropDuplicates()

In [0]:
data_validation(final_df,'Deduped Data')

Deduped Data
Total Count : 107447
Total Duplicates : 0
Total Nulls : 
-RECORD 0--------------
 product        | 812  
 order_id       | 0    
 customer_id    | 0    
 order_status   | 0    
 purchase_ts    | 0    
 delivered_date | 0    
 est_delivery   | 0    
 customer_city  | 0    
 customer_state | 0    
 review_score   | 0    
 pay_type       | 1    
 payment_value  | 0    
 seller         | 812  
 price          | 0    
 shipping_fee   | 0    
 category       | 2341 
 delivery_gap   | 0    



# 8 Feature Engineering

In [0]:
# filling nulls with mean values on price , payment_value and shipping_fee

from pyspark.sql.functions import col

def impute_with_mean(df, columns):
    for c in columns:
        mean_val = df.selectExpr(f"avg({c}) as mean").collect()[0]['mean']
        df = df.fillna({c: mean_val})
    return df

final_df = impute_with_mean(final_df, ['price', 'payment_value', 'shipping_fee','review_score'])

In [0]:
from pyspark.sql.functions import when, col

final_df = final_df.withColumn(
    "delivered_date",
    when(col("delivered_date").isNull(), col("est_delivery")).otherwise(col("delivered_date"))
)

In [0]:
# Calculating Average Order Value

final_df = final_df.groupBy('customer_id')\
    .agg(
        count('order_id').alias('total_orders'),
        sum('price').alias('total_spent')
    ) \
    .withColumn('AOV'\
        , col('total_spent')/col('total_orders')
        )

In [0]:
data_validation(final_df,"Transfromed Data")

Transfromed Data
Total Count : 119143
Total Duplicates : 11696
Total Nulls : 
-RECORD 0--------------
 product        | 833  
 order_id       | 0    
 customer_id    | 0    
 order_status   | 0    
 purchase_ts    | 0    
 delivered_date | 0    
 est_delivery   | 0    
 customer_city  | 0    
 customer_state | 0    
 review_score   | 0    
 pay_type       | 3    
 payment_value  | 0    
 seller         | 833  
 price          | 0    
 shipping_fee   | 0    
 category       | 2542 
 delivery_gap   | 0    



# 9. Business Insights

In [0]:
# finding Delivery Gap (delivery_gap = delivered_date - purchase_date)

final_df = final_df.withColumn(
    "delivery_gap",
    col("delivered_date").cast("date") - col("purchase_ts").cast("date")
)

final_df.select('delivered_date','purchase_ts','delivery_gap').show(5)

+-------------------+-------------------+-----------------+
|     delivered_date|        purchase_ts|     delivery_gap|
+-------------------+-------------------+-----------------+
|2017-10-10 21:25:13|2017-10-02 10:56:33| INTERVAL '8' DAY|
|2017-10-10 21:25:13|2017-10-02 10:56:33| INTERVAL '8' DAY|
|2017-10-10 21:25:13|2017-10-02 10:56:33| INTERVAL '8' DAY|
|2018-08-07 15:27:45|2018-07-24 20:41:37|INTERVAL '14' DAY|
|2018-08-17 18:06:29|2018-08-08 08:38:49| INTERVAL '9' DAY|
+-------------------+-------------------+-----------------+
only showing top 5 rows



In [0]:
# AOV Segmentation

final_df.select(min('AOV'),max('AOV'),avg('AOV')).show()

+--------+--------+------------------+
|min(AOV)|max(AOV)|          avg(AOV)|
+--------+--------+------------------+
|    0.85|  6735.0|125.90113394832466|
+--------+--------+------------------+



In [0]:
# Customer Segmentation based on AOV

final_df = final_df.withColumn(
    "customer_segmentation" \
    , when(col("AOV") < 125, "Low Value") \
    .when((col("AOV") > 125) & (col("AOV") <= 300), "Medium Value") \
    .when((col("AOV") > 300) & (col("AOV") <= 500), "High Value") \
    .otherwise("Premium Customer")
)

In [0]:
final_df.groupBy('customer_segmentation').count().show()

+---------------------+-----+
|customer_segmentation|count|
+---------------------+-----+
|     Premium Customer| 3234|
|         Medium Value|22141|
|           High Value| 3513|
|            Low Value|70553|
+---------------------+-----+



# 10. Data Serving

In [0]:
# Storing Transformed dataset into Silver layer

final_df.write \
    .mode("overwrite") \
    .format("delta") \
    .partitionBy("customer_segmentation") \
    .save(silver_path)