In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
.appName('OlistData') \
.getOrCreate()

25/09/19 08:28:59 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
hdfs_path = '/data/olist/'

In [3]:
spark

In [4]:
customers_df = spark.read.csv(hdfs_path + 'olist_customers_dataset.csv',header=True,inferSchema=True)
orders_df = spark.read.csv(hdfs_path + 'olist_orders_dataset.csv',header=True,inferSchema=True)
order_item_df = spark.read.csv(hdfs_path + 'olist_order_items_dataset.csv',header=True,inferSchema=True)
payments_df = spark.read.csv(hdfs_path + 'olist_order_payments_dataset.csv',header=True,inferSchema=True)
reviews_df = spark.read.csv(hdfs_path + 'olist_order_reviews_dataset.csv',header=True,inferSchema=True)
products_df = spark.read.csv(hdfs_path + 'olist_products_dataset.csv',header=True,inferSchema=True)
sellers_df = spark.read.csv(hdfs_path + 'olist_sellers_dataset.csv',header=True,inferSchema=True)
geolocation_df = spark.read.csv(hdfs_path + 'olist_geolocation_dataset.csv',header=True,inferSchema=True)
category_translation_df = spark.read.csv(hdfs_path + 'product_category_name_translation.csv',header=True,inferSchema=True)

                                                                                

In [5]:
# Cache Frequently used Data for Better Performance

orders_df.cache()
customers_df.cache()
order_item_df.cache()

DataFrame[order_id: string, order_item_id: int, product_id: string, seller_id: string, shipping_limit_date: timestamp, price: double, freight_value: double]

In [6]:
orders_items_joined_df = orders_df.join(order_item_df,'order_id','inner')

In [7]:
orders_items_products_df = orders_items_joined_df.join(products_df,'product_id','inner')

In [8]:
orders_items_products_sellers_df = orders_items_products_df.join(sellers_df,'seller_id','inner')

In [9]:
full_orders_df = orders_items_products_sellers_df.join(customers_df,'customer_id','inner')

In [10]:
# GEolocation Data

full_orders_df = full_orders_df.join(geolocation_df,full_orders_df.customer_zip_code_prefix == geolocation_df.geolocation_zip_code_prefix,'left')

In [11]:
full_orders_df = full_orders_df.join(reviews_df,'order_id','left')

In [12]:
full_orders_df = full_orders_df.join(payments_df,'order_id','left')

In [13]:
full_orders_df.cache()

25/09/19 08:29:43 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


DataFrame[order_id: string, customer_id: string, seller_id: string, product_id: string, order_status: string, order_purchase_timestamp: timestamp, order_approved_at: timestamp, order_delivered_carrier_date: timestamp, order_delivered_customer_date: timestamp, order_estimated_delivery_date: timestamp, order_item_id: int, shipping_limit_date: timestamp, price: double, freight_value: double, product_category_name: string, product_name_lenght: int, product_description_lenght: int, product_photos_qty: int, product_weight_g: int, product_length_cm: int, product_height_cm: int, product_width_cm: int, seller_zip_code_prefix: int, seller_city: string, seller_state: string, customer_unique_id: string, customer_zip_code_prefix: int, customer_city: string, customer_state: string, geolocation_zip_code_prefix: int, geolocation_lat: double, geolocation_lng: double, geolocation_city: string, geolocation_state: string, review_id: string, review_score: string, review_comment_title: string, review_commen

In [14]:
from pyspark.sql.functions import *

In [15]:
full_orders_df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)
 |-- product_category_name: string (nullable = true)
 |-- product_name_lenght: integer (nullable = true)
 |-- product_description_lenght: integer (nullable = true)
 |-- product_photos_qty: integer (nullable = true)
 |-- product_weight_g: integer (nullable = true)
 |-- product_length_cm: integer (null

# Total Revenues Per Seller

In [16]:
# Total Revenues Per Seller


seller_revenue_df = full_orders_df.groupBy('seller_id').agg(sum('price'))

In [17]:
seller_revenue_df.show(5)



+--------------------+------------------+
|           seller_id|        sum(price)|
+--------------------+------------------+
|d650b663c3b5f6fb3...|         2253869.1|
|cd06602b43d8800bd...| 353150.9800000004|
|3c487ae8f8d7542be...| 1618845.700000005|
|d354c38a7182125a7...|318455.87000000104|
|e9779976487b77c6d...|  6293200.69000001|
+--------------------+------------------+
only showing top 5 rows



                                                                                

In [18]:
# total orders per unique customer

orders_by_customer = (
    full_orders_df.groupBy("customer_unique_id")
      .agg(countDistinct("order_id").alias("total_orders"))
      .orderBy(desc("total_orders"))
)

orders_by_customer.show(4)




+--------------------+------------+
|  customer_unique_id|total_orders|
+--------------------+------------+
|8d50f5eadf50201cc...|          16|
|3e43e6105506432c9...|           9|
|6469f99c1f9dfae77...|           7|
|ca77025e7201e3b30...|           7|
+--------------------+------------+
only showing top 4 rows



                                                                                

In [19]:
# Average Review Score Per Seller
avg_review_per_seller = (
    full_orders_df.groupBy("seller_id")
      .agg(avg("review_score").alias("avg_review_score"))
      .orderBy(desc("avg_review_score"))
)
# Most Sold Products ( Top 10 )
# Top Custoemrs By Spending

In [30]:
avg_review_per_seller.show()



+--------------------+----------------+
|           seller_id|avg_review_score|
+--------------------+----------------+
|50a7e551a7b3f8d37...|             5.0|
|4f1f1c8666cdb6442...|             5.0|
|74871d19219c7d518...|             5.0|
|0af2ab31141893d26...|             5.0|
|9d69ba0a7a626a64d...|             5.0|
|97e50a621f8e801f4...|             5.0|
|48e5ee06fb2dc74df...|             5.0|
|404e1ba01358af4cd...|             5.0|
|702835e4b785b67a0...|             5.0|
|10cdd491d1dbea8fb...|             5.0|
|a663d9c3797e90eac...|             5.0|
|a3b0df0065e264a91...|             5.0|
|ce3ad9de960102d06...|             5.0|
|6c17baf138731a4aa...|             5.0|
|c3e2398fcc7e581cd...|             5.0|
|bbe87dce25ba8b38b...|             5.0|
|98dddbc4601dd4443...|             5.0|
|9c1c0c36cd23c2089...|             5.0|
|5721089ba9214e800...|             5.0|
|6561d6bf844e464b4...|             5.0|
+--------------------+----------------+
only showing top 20 rows



                                                                                

# Optimized Joins For Data integration

In [20]:
from pyspark.sql.functions import *

In [21]:
orders_items_joined_df = orders_df.join(order_item_df,'order_id','inner')

In [22]:
orders_items_products_df = orders_items_joined_df.join(products_df,'product_id','inner')

In [23]:
orders_items_products_sellers_df = orders_items_products_df.join(broadcast(sellers_df),'seller_id','inner')

In [24]:
full_orders_df = orders_items_products_sellers_df.join(customers_df,'customer_id','inner')

In [25]:
# GEolocation Data

full_orders_df = full_orders_df.join(broadcast(geolocation_df),full_orders_df.customer_zip_code_prefix == geolocation_df.geolocation_zip_code_prefix,'left')

In [26]:
full_orders_df = full_orders_df.join(broadcast(reviews_df),'order_id','left')

In [28]:
full_orders_df = full_orders_df.join(payments_df,'order_id','left')

In [29]:
full_orders_df.cache()

DataFrame[order_id: string, customer_id: string, seller_id: string, product_id: string, order_status: string, order_purchase_timestamp: timestamp, order_approved_at: timestamp, order_delivered_carrier_date: timestamp, order_delivered_customer_date: timestamp, order_estimated_delivery_date: timestamp, order_item_id: int, shipping_limit_date: timestamp, price: double, freight_value: double, product_category_name: string, product_name_lenght: int, product_description_lenght: int, product_photos_qty: int, product_weight_g: int, product_length_cm: int, product_height_cm: int, product_width_cm: int, seller_zip_code_prefix: int, seller_city: string, seller_state: string, customer_unique_id: string, customer_zip_code_prefix: int, customer_city: string, customer_state: string, geolocation_zip_code_prefix: int, geolocation_lat: double, geolocation_lng: double, geolocation_city: string, geolocation_state: string, review_id: string, review_score: string, review_comment_title: string, review_commen

# Aggregation

In [30]:
# Total Orders Per Customer

customer_order_count_df = full_orders_df.groupBy('customer_id')\
.agg(count('order_id').alias('total_orders'))\
.orderBy(desc('total_orders'))
         
customer_order_count_df.show(5)



+--------------------+------------+
|         customer_id|total_orders|
+--------------------+------------+
|270c23a11d024a44c...|      168021|
|351e40989da90e704...|      148551|
|13aa59158da63ba0e...|       98914|
|7bb57d182bdc11653...|       87612|
|eed931d3a5222a9a5...|       80142|
+--------------------+------------+
only showing top 5 rows



                                                                                

In [41]:
#Average Review Score Per Seller

seller_review_df = full_orders_df.groupBy('seller_id')\
.agg(avg('review_score').alias('avg_review_score'))\
.orderBy(desc('avg_review_score'))

seller_review_df.show()



+--------------------+----------------+
|           seller_id|avg_review_score|
+--------------------+----------------+
|436bf27f2f18474fc...|             5.0|
|c3f5668699a1b04c0...|             5.0|
|173d56ffbd4c56cb8...|             5.0|
|585175ec331ea177f...|             5.0|
|e439f7176d763f92d...|             5.0|
|9e94fa26a70ede4d2...|             5.0|
|bd43e172d599bed47...|             5.0|
|9dd459b29a14bf89d...|             5.0|
|d7f1cfc638473be93...|             5.0|
|ea846a0e7ad98a741...|             5.0|
|a9ae440659f48b784...|             5.0|
|8fb67a334bacec338...|             5.0|
|5e25c632f9db6cfb8...|             5.0|
|13e85aac53340018b...|             5.0|
|70f7e8018e3d46c74...|             5.0|
|10264f60a8f0a4d2f...|             5.0|
|702835e4b785b67a0...|             5.0|
|258447aad3f7b01e2...|             5.0|
|4dc2e811a1760953c...|             5.0|
|d8b8f2cf9ff6ba038...|             5.0|
+--------------------+----------------+
only showing top 20 rows



                                                                                

In [42]:
# Top 10 Most Sold Products

top_products_df = full_orders_df.groupBy('product_id')\
.agg(count('order_id').alias('total_sold'))\
.orderBy(desc('total_sold'))\
.limit(10)

top_products_df.show()



+--------------------+----------+
|          product_id|total_sold|
+--------------------+----------+
|aca2eb7d00ea1a7b8...|     86740|
|422879e10f4668299...|     81110|
|99a4788cb24856965...|     78775|
|389d119b48cf3043d...|     60248|
|d1c427060a0f73f6b...|     59274|
|368c6c730842d7801...|     58358|
|53759a2ecddad2bb8...|     52654|
|53b36df67ebb7c415...|     52105|
|154e7e31ebfa09220...|     42700|
|3dd2a17168ec895c7...|     40787|
+--------------------+----------+



                                                                                

In [60]:
# top 10 customer by spending 
from pyspark.sql.functions import sum

total_customer_spending = full_orders_df.groupby('customer_id').agg(sum('payment_value').alias('total_spending'))

In [61]:
total_customer_spending.orderBy(desc('total_spending')).show()



+--------------------+--------------------+
|         customer_id|      total_spending|
+--------------------+--------------------+
|1ff773612ab8934db...| 1.756825199999893E7|
|05455dfa7cd02f13d...|1.3282083359999327E7|
|ec5b2ba62e5743423...|1.0388528640000112E7|
|0c792d32a3251b4f6...|   8254681.600000529|
|78fc46047c4a639e8...|   7488519.999999339|
|1617b1357756262bf...|   7433259.520000033|
|1dbc055ccab23ed89...|   7216273.400000708|
|d5f2b3f597c7ccafb...|   6800018.119998923|
|dd3f1762eb601f41c...|  6746388.4800006235|
|10de381f8a8d23fff...|   5184499.500000076|
|30bb84b541c96af98...|   4740404.549999884|
|d72181923840c8895...|   4513322.700000229|
|e7d6802668de6e74d...|  4000041.4000000926|
|cb87122c4871e2027...|  3957404.0000001485|
|a972623b3481cbfd9...|  3716577.3600002443|
|df55c14d1476a9a34...|   3678102.619999968|
|f7622098214b4634b...|   3657923.519999952|
|cc803a2c412833101...|           3387471.0|
|f4d64a735d1f90f13...|   3224595.419999714|
|6361b9f3b85d41860...|   3207389

                                                                                

# Window Function and Ranking

In [50]:
from pyspark.sql.window import Window

In [51]:

# Dense Rank for Sellers Based on Revenue

window_spec = Window.partitionBy('seller_id').orderBy(desc('price'))

In [52]:
# Rank Top Selling Products Per seller

top_seller_products_df = full_orders_df.withColumn('rank',rank().over(window_spec)).filter(col('rank')<=5)

top_seller_products_df.select('seller_id','price','rank').show(5)

[Stage 95:>                                                         (0 + 1) / 1]

+--------------------+-----+----+
|           seller_id|price|rank|
+--------------------+-----+----+
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
+--------------------+-----+----+
only showing top 5 rows



                                                                                

In [None]:

# Dense Rank for Sellers Based on Revenue


In [69]:
top_dense_rank_seller = full_orders_df.withColumn('dense_rank',dense_rank().over(window_spec))

In [70]:
top_dense_rank_seller.select('seller_id','price','dense_rank').show()



+--------------------+-----+----------+
|           seller_id|price|dense_rank|
+--------------------+-----+----------+
|0015a82c2db000af6...|895.0|         1|
|0015a82c2db000af6...|895.0|         1|
|0015a82c2db000af6...|895.0|         1|
|0015a82c2db000af6...|895.0|         1|
|0015a82c2db000af6...|895.0|         1|
|0015a82c2db000af6...|895.0|         1|
|0015a82c2db000af6...|895.0|         1|
|0015a82c2db000af6...|895.0|         1|
|0015a82c2db000af6...|895.0|         1|
|0015a82c2db000af6...|895.0|         1|
|0015a82c2db000af6...|895.0|         1|
|0015a82c2db000af6...|895.0|         1|
|0015a82c2db000af6...|895.0|         1|
|0015a82c2db000af6...|895.0|         1|
|0015a82c2db000af6...|895.0|         1|
|0015a82c2db000af6...|895.0|         1|
|0015a82c2db000af6...|895.0|         1|
|0015a82c2db000af6...|895.0|         1|
|0015a82c2db000af6...|895.0|         1|
|0015a82c2db000af6...|895.0|         1|
+--------------------+-----+----------+
only showing top 20 rows



                                                                                

# Advance Aggregation and Enrichment

In [32]:
full_orders_df

DataFrame[order_id: string, customer_id: string, seller_id: string, product_id: string, order_status: string, order_purchase_timestamp: timestamp, order_approved_at: timestamp, order_delivered_carrier_date: timestamp, order_delivered_customer_date: timestamp, order_estimated_delivery_date: timestamp, order_item_id: int, shipping_limit_date: timestamp, price: double, freight_value: double, product_category_name: string, product_name_lenght: int, product_description_lenght: int, product_photos_qty: int, product_weight_g: int, product_length_cm: int, product_height_cm: int, product_width_cm: int, seller_zip_code_prefix: int, seller_city: string, seller_state: string, customer_unique_id: string, customer_zip_code_prefix: int, customer_city: string, customer_state: string, geolocation_zip_code_prefix: int, geolocation_lat: double, geolocation_lng: double, geolocation_city: string, geolocation_state: string, review_id: string, review_score: string, review_comment_title: string, review_commen

In [46]:
# Total Revenue & Average Order Value (AOV) per Customer

customer_spending_df = full_orders_df.groupBy('customer_id')\
.agg(
    count('order_id').alias('total_orders'),
    sum('price').alias('total_spent'),
    round(avg('price'),2).alias('AOV')
)\
.orderBy(desc('total_spent'))


customer_spending_df.show()



+--------------------+------------+--------------------+-------+
|         customer_id|total_orders|         total_spent|    AOV|
+--------------------+------------+--------------------+-------+
|d3e82ccec3cb5f956...|       41256|         3.9977064E7|  969.0|
|9af2372a1e4934027...|       79054| 3.103264770004017E7| 392.55|
|63b964e79dee32a35...|       72864|         3.0019968E7|  412.0|
|de832e8dbb1f588a4...|       32850| 2.377485899998866E7| 723.74|
|351e40989da90e704...|      148551|1.2773900490027543E7|  85.99|
|d1ea705f2fdd8f98e...|       23324| 1.002908676000408E7| 429.99|
|13aa59158da63ba0e...|       98914|   7912130.860016024|  79.99|
|7bb57d182bdc11653...|       87612|   7613482.800011485|   86.9|
|fe5113a38e3575c04...|        4584|           6587208.0| 1437.0|
|7edaf2c4de16b0353...|       21296|           6367504.0|  299.0|
|270c23a11d024a44c...|      168021|   6148448.459990426|  36.59|
|155716e7d7dbee888...|       35478|   6030905.220005066| 169.99|
|2ba91e12e5e4c9f56...|   

                                                                                

In [52]:
6662844/6876

969.0

In [33]:
# Seller Performance Metrics ( Revenue, Average Review, Order Count)

seller_performance_df = full_orders_df.groupBy('seller_id') \
.agg(
    count('order_id').alias('total_orders'),
    sum('price').alias('total_revenue'),
    round(avg('review_score'),2).alias('avg_review_score'),
    round(stddev('price'),2).alias('price_variability')
)\
.orderBy(desc('total_revenue'))

In [34]:
seller_performance_df.show()



+--------------------+------------+--------------------+----------------+-----------------+
|           seller_id|total_orders|       total_revenue|avg_review_score|price_variability|
+--------------------+------------+--------------------+----------------+-----------------+
|4a3ca9315b744ce9f...|      552121| 5.189229274001529E7|             3.6|            48.74|
|53243585a1d6dc264...|       76498| 4.489675170999993E7|            4.08|            506.2|
|7c67e1448b00f6e96...|      355604| 4.433624773001587E7|            3.87|            51.85|
|ccb83a794700270fd...|       42901|         4.1477135E7|            4.94|            12.15|
|da8622b14eb17ae28...|      352453| 3.912570772999701E7|            3.94|             64.8|
|4869f7a5dfa277a7d...|      194273|3.8509463320000015E7|            4.06|           116.29|
|e59aa562b9f8076dd...|       88595|        3.71827091E7|            4.85|           119.85|
|fa1c13f2614d7b5c4...|       94340|3.2552205930000003E7|            4.37|       

                                                                                

In [55]:
# Product Popularity Metrics

product_metrics_df = full_orders_df.groupBy('product_id')\
.agg(
    count('order_id').alias('total_sales'),
    sum('price').alias('total_revenue'),
    round(avg('price'),2).alias('avg_price'),
    round(stddev('price'),2).alias('price_volatility'),\
    collect_set('seller_id').alias('unique_sellers')    
)\
.orderBy(desc('total_sales'))

In [56]:
product_metrics_df.show()



+--------------------+-----------+------------------+---------+----------------+--------------------+
|          product_id|total_sales|     total_revenue|avg_price|price_volatility|      unique_sellers|
+--------------------+-----------+------------------+---------+----------------+--------------------+
|aca2eb7d00ea1a7b8...|      86740| 6164630.300000014|    71.07|            3.17|[955fee9216a65b61...|
|422879e10f4668299...|      81110| 4442791.510000011|    54.77|            4.46|[1f50f920176fa81d...|
|99a4788cb24856965...|      78775| 6921762.710000014|    87.87|            4.08|[4a3ca9315b744ce9...|
|389d119b48cf3043d...|      60248|3280533.1300000115|    54.45|            4.37|[1f50f920176fa81d...|
|d1c427060a0f73f6b...|      59274|  8220103.32999999|   138.68|           16.58|[a1043bafd471dff5...|
|368c6c730842d7801...|      58358|3181698.8999999925|    54.52|            4.59|[1f50f920176fa81d...|
|53759a2ecddad2bb8...|      52654|2893017.4999999977|    54.94|            4.52|[1

                                                                                

In [57]:
# Monthly Revenue and Order Count Trend ----> HW

order_purchase_timestamp ---> month

monthly_revenue_df = full_orders_df.groupBy('total_orders')

total_orders 
total_revenue
avg_order_value
min_order_value
max_orderValues

SyntaxError: invalid syntax (2460152509.py, line 3)

In [43]:
# Customer Retention Analysis ( First & Last Order )

customer_retention_df = full_orders_df.groupBy('customer_id')\
.agg(
    first('order_purchase_timestamp').alias('first_order_date'),
    last('order_purchase_timestamp').alias('last_order_date'),
    count('order_id').alias('total_orders'),
    round(avg('price'),2).alias('aov')
)\
.orderBy(desc('total_orders'))

In [36]:
customer_retention_df.show()



+--------------------+-------------------+-------------------+------------+------+
|         customer_id|   first_order_date|    last_order_date|total_orders|   aov|
+--------------------+-------------------+-------------------+------------+------+
|270c23a11d024a44c...|2017-08-08 20:26:31|2017-08-08 20:26:31|      168021| 36.59|
|351e40989da90e704...|2017-07-13 10:42:37|2017-07-13 10:42:37|      148551| 85.99|
|13aa59158da63ba0e...|2017-09-23 14:56:45|2017-09-23 14:56:45|       98914| 79.99|
|7bb57d182bdc11653...|2018-04-02 17:11:30|2018-04-02 17:11:30|       87612|  86.9|
|eed931d3a5222a9a5...|2017-05-12 16:24:51|2017-05-12 16:24:51|       80142|  69.9|
|9af2372a1e4934027...|2017-04-20 12:45:34|2017-04-20 12:45:34|       79054|392.55|
|63b964e79dee32a35...|2018-02-14 16:34:27|2018-02-14 16:34:27|       72864| 412.0|
|24e7dc2ff8c071263...|2017-11-24 16:16:45|2017-11-24 16:16:45|       59373|  59.2|
|2ba91e12e5e4c9f56...|2017-11-25 13:54:39|2017-11-25 13:54:39|       57024|  99.9|
|9b4

                                                                                

# Extended Enrichment 

In [37]:
# Order Status Flags

full_orders_df = full_orders_df.withColumn('is_delivered', when(col('order_status')== 'delivered',lit(1)).otherwise(lit(0)))\
.withColumn('is_canceled', when(col('order_status')== 'canceled',lit(1)).otherwise(lit(0)))

In [38]:
full_orders_df.where(full_orders_df['order_status']=='canceled').select('order_status','is_delivered','is_canceled').show(100)

+------------+------------+-----------+
|order_status|is_delivered|is_canceled|
+------------+------------+-----------+
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|


In [39]:
# Order Revenue Calcualtion

full_orders_df = full_orders_df.withColumn('order_revenue',col('price')+col('freight_value'))

In [40]:
full_orders_df.select('price','freight_value','order_revenue').show()

+-----+-------------+------------------+
|price|freight_value|     order_revenue|
+-----+-------------+------------------+
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
+-----+-------------+------------------+
only showing top

In [47]:
customer_spending_df.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- total_orders: long (nullable = false)
 |-- total_spent: double (nullable = true)
 |-- AOV: double (nullable = true)



In [48]:
# Customer Segmentation based on spending

customer_spending_df = customer_spending_df.withColumn(
    'customer_segment',
    when(col('AOV') >=1200,"High-Value")
    .when( (col('AOV')<1200) & (col('AOV') >=700),'Medium_Value')
    .otherwise('Low-Value'))


In [65]:
customer_spending_df.show()



+--------------------+------------+------------------+-------+----------------+
|         customer_id|total_orders|       total_spent|    AOV|customer_segment|
+--------------------+------------+------------------+-------+----------------+
|d3e82ccec3cb5f956...|        6876|         6662844.0|  969.0|    Medium_Value|
|df55c14d1476a9a34...|         743|         3565657.0| 4799.0|      High-Value|
|fe5113a38e3575c04...|        2292|         3293604.0| 1437.0|      High-Value|
|ec5b2ba62e5743423...|        1428|         2556120.0| 1790.0|      High-Value|
|63b964e79dee32a35...|        6072|         2501664.0|  412.0|       Low-Value|
|46bb3c0b1a65c8399...|         748|         2336752.0| 3124.0|      High-Value|
|05455dfa7cd02f13d...|        2184| 2160194.400000087|  989.1|    Medium_Value|
|3690e975641f01bd0...|         802|         2124498.0| 2649.0|      High-Value|
|349509b216bd5ec11...|         743|         1923627.0| 2589.0|      High-Value|
|695476b5848d64ba0...|         687|18205

                                                                                

In [53]:
full_orders_df = full_orders_df.join(customer_spending_df.select('customer_id','customer_segment'),'customer_id',how='left')

In [54]:
full_orders_df.select('customer_id','customer_segment').show()

                                                                                

+--------------------+----------------+
|         customer_id|customer_segment|
+--------------------+----------------+
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
+--------------------+----------------+
only showing top 20 rows



In [55]:
full_orders_df.select('order_purchase_timestamp').show()

+------------------------+
|order_purchase_timestamp|
+------------------------+
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
+------------------------+
only showing top 20 rows



In [56]:
#Hourly Order Distribution
full_orders_df = full_orders_df.withColumn('hour_of_day',expr('hour(order_purchase_timestamp)'))


In [57]:
full_orders_df.select('order_purchase_timestamp','hour_of_day').show()

+------------------------+-----------+
|order_purchase_timestamp|hour_of_day|
+------------------------+-----------+
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
+------------------------+-----------+
only showing top 20 rows



In [58]:
# Weekday vs Weekend Order

full_orders_df = full_orders_df.withColumn('order_day_type',\
                                           when(dayofweek('order_purchase_timestamp').isin(1,7),lit('Weekend')).otherwise(lit('weekday')))

In [59]:
full_orders_df.select('order_purchase_timestamp','order_day_type').show()

+------------------------+--------------+
|order_purchase_timestamp|order_day_type|
+------------------------+--------------+
|     2018-08-07 23:19:16|       weekday|
|     2018-08-07 23:19:16|       weekday|
|     2018-08-07 23:19:16|       weekday|
|     2018-08-07 23:19:16|       weekday|
|     2018-08-07 23:19:16|       weekday|
|     2018-08-07 23:19:16|       weekday|
|     2018-08-07 23:19:16|       weekday|
|     2018-08-07 23:19:16|       weekday|
|     2018-08-07 23:19:16|       weekday|
|     2018-08-07 23:19:16|       weekday|
|     2018-08-07 23:19:16|       weekday|
|     2018-08-07 23:19:16|       weekday|
|     2018-08-07 23:19:16|       weekday|
|     2018-08-07 23:19:16|       weekday|
|     2018-08-07 23:19:16|       weekday|
|     2018-08-07 23:19:16|       weekday|
|     2018-08-07 23:19:16|       weekday|
|     2018-08-07 23:19:16|       weekday|
|     2018-08-07 23:19:16|       weekday|
|     2018-08-07 23:19:16|       weekday|
+------------------------+--------

In [60]:
full_orders_df.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)
 |-- product_category_name: string (nullable = true)
 |-- product_name_lenght: integer (nullable = true)
 |-- product_description_lenght: integer (nullable = true)
 |-- product_photos_qty: integer (nullable = true)
 |-- product_weight_g: integer (nullable = true)
 |-- product_length_cm: integer (null

In [None]:
# a new column frieght category  based on freight_value --> low, med or high

In [None]:
# Order Volume by Customer State

In [66]:
!hadoop fs -mkdir /data/olist/processed/

In [None]:
full_orders_df.write.mode('overwrite').parquet('/data/olist/processed')

In [72]:
!hadoop fs -ls -h /data/olist/processed/