In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder\
.appName('OlistData')\
.getOrCreate()

25/12/14 20:02:47 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
hdfs_path = '/data/olist/'

In [3]:
customers_df = spark.read.csv(hdfs_path + 'olist_customers_dataset.csv', header = True, inferSchema = True)
geolocation_df = spark.read.csv(hdfs_path + 'olist_geolocation_dataset.csv', header = True, inferSchema = True)
order_items_df = spark.read.csv(hdfs_path + 'olist_order_items_dataset.csv', header = True, inferSchema = True)
payments_df = spark.read.csv(hdfs_path + 'olist_order_payments_dataset.csv', header = True, inferSchema = True)
reviews_df = spark.read.csv(hdfs_path + 'olist_order_reviews_dataset.csv', header = True, inferSchema = True)
orders_df = spark.read.csv(hdfs_path + 'olist_orders_dataset.csv', header = True, inferSchema = True)
products_df = spark.read.csv(hdfs_path + 'olist_products_dataset.csv', header = True, inferSchema = True)
sellers_df = spark.read.csv(hdfs_path + 'olist_sellers_dataset.csv', header = True, inferSchema = True)
category_translation_df = spark.read.csv(hdfs_path + 'product_category_name_translation.csv', header = True, inferSchema = True)

                                                                                

In [4]:
# Cache Frequently Used Data For Better Performance

orders_df.cache()
customers_df.cache()
order_items_df.cache()

DataFrame[order_id: string, order_item_id: int, product_id: string, seller_id: string, shipping_limit_date: timestamp, price: double, freight_value: double]

In [5]:
order_items_joined_df = orders_df.join(order_items_df,'order_id','inner')

In [6]:
order_items_products_df = order_items_joined_df.join(products_df,'product_id','inner')

In [7]:
order_items_products_sellers_df = order_items_products_df.join(sellers_df,'seller_id','inner')

In [8]:
full_orders_df = order_items_products_sellers_df.join(customers_df,'customer_id','inner')

In [9]:
geolocation_df.show()

+---------------------------+-------------------+-------------------+----------------+-----------------+
|geolocation_zip_code_prefix|    geolocation_lat|    geolocation_lng|geolocation_city|geolocation_state|
+---------------------------+-------------------+-------------------+----------------+-----------------+
|                       1037| -23.54562128115268| -46.63929204800168|       sao paulo|               SP|
|                       1046|-23.546081127035535| -46.64482029837157|       sao paulo|               SP|
|                       1046| -23.54612896641469| -46.64295148361138|       sao paulo|               SP|
|                       1041|  -23.5443921648681| -46.63949930627844|       sao paulo|               SP|
|                       1035|-23.541577961711493| -46.64160722329613|       sao paulo|               SP|
|                       1012|-23.547762303364266| -46.63536053788448|       são paulo|               SP|
|                       1047|-23.546273112412678| -46.6

In [10]:
full_orders_df = full_orders_df.join(geolocation_df,full_orders_df.customer_zip_code_prefix == geolocation_df.geolocation_zip_code_prefix, 'left')

In [11]:
full_orders_df = full_orders_df.join(reviews_df,'order_id','left')

In [12]:
full_orders_df = full_orders_df.join(payments_df,'order_id','left')

In [13]:
full_orders_df.cache()

25/12/14 19:13:04 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


DataFrame[order_id: string, customer_id: string, seller_id: string, product_id: string, order_status: string, order_purchase_timestamp: timestamp, order_approved_at: timestamp, order_delivered_carrier_date: timestamp, order_delivered_customer_date: timestamp, order_estimated_delivery_date: timestamp, order_item_id: int, shipping_limit_date: timestamp, price: double, freight_value: double, product_category_name: string, product_name_lenght: int, product_description_lenght: int, product_photos_qty: int, product_weight_g: int, product_length_cm: int, product_height_cm: int, product_width_cm: int, seller_zip_code_prefix: int, seller_city: string, seller_state: string, customer_unique_id: string, customer_zip_code_prefix: int, customer_city: string, customer_state: string, geolocation_zip_code_prefix: int, geolocation_lat: double, geolocation_lng: double, geolocation_city: string, geolocation_state: string, review_id: string, review_score: string, review_comment_title: string, review_commen

In [5]:
from pyspark.sql.functions import *


In [6]:
# Total Revenue Per Seller

seller_revenue_df = full_orders_df.groupBy('seller_id').agg(sum('price'))

NameError: name 'full_orders_df' is not defined

In [16]:
seller_revenue_df.show(5)



+--------------------+-------------------+
|           seller_id|         sum(price)|
+--------------------+-------------------+
|7a67c85e85bb2ce85...|2.031279489000002E7|
|9d213f303afae4983...|  2321.400000000004|
|d2374cbcbb3ca4ab1...| 3375517.5500000133|
|1835b56ce799e6a4d...|  6097995.110000008|
|d650b663c3b5f6fb3...|          2253869.1|
+--------------------+-------------------+
only showing top 5 rows



                                                                                

In [17]:
# Total Orders Per Customer
# Average Review Score Per Seller
# Most Sold Products (Top 10)
# Top Customers By Spending

## Optimized Joins for Data Integration

In [7]:
from pyspark.sql.functions import *

In [8]:
order_items_joined_df = orders_df.join(order_items_df,'order_id','inner')

In [9]:
order_items_products_df = order_items_joined_df.join(products_df,'product_id','inner')

In [10]:
order_items_products_sellers_df = order_items_products_df.join(broadcast(sellers_df),'seller_id','inner')

In [11]:
full_orders_df = order_items_products_sellers_df.join(customers_df,'customer_id','inner')

In [12]:
full_orders_df = full_orders_df.join(broadcast(geolocation_df),full_orders_df.customer_zip_code_prefix == geolocation_df.geolocation_zip_code_prefix, 'left')

In [13]:
full_orders_df = full_orders_df.join(broadcast(reviews_df),'order_id','left')

In [14]:
full_orders_df = full_orders_df.join(payments_df,'order_id','left')

In [15]:
full_orders_df.cache()

25/12/14 20:03:49 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


DataFrame[order_id: string, customer_id: string, seller_id: string, product_id: string, order_status: string, order_purchase_timestamp: timestamp, order_approved_at: timestamp, order_delivered_carrier_date: timestamp, order_delivered_customer_date: timestamp, order_estimated_delivery_date: timestamp, order_item_id: int, shipping_limit_date: timestamp, price: double, freight_value: double, product_category_name: string, product_name_lenght: int, product_description_lenght: int, product_photos_qty: int, product_weight_g: int, product_length_cm: int, product_height_cm: int, product_width_cm: int, seller_zip_code_prefix: int, seller_city: string, seller_state: string, customer_unique_id: string, customer_zip_code_prefix: int, customer_city: string, customer_state: string, geolocation_zip_code_prefix: int, geolocation_lat: double, geolocation_lng: double, geolocation_city: string, geolocation_state: string, review_id: string, review_score: string, review_comment_title: string, review_commen

In [16]:
# Total Orders Per Customer

customer_order_count_df = full_orders_df.groupBy('customer_id')\
.agg(count('order_id').alias('total_orders'))\
.orderBy(desc('total_orders'))

In [17]:
customer_order_count_df.show(5)

[Stage 31:>                                                         (0 + 3) / 3]

+--------------------+------------+
|         customer_id|total_orders|
+--------------------+------------+
|351e40989da90e704...|       11427|
|50920f8cd0681fd86...|       10752|
|9b43e2a62de9bab3a...|        8556|
|270c23a11d024a44c...|        8001|
|5c87184371002d49e...|        6876|
+--------------------+------------+
only showing top 5 rows



                                                                                

In [18]:
# Average Review Score Per Seller

seller_review_df = full_orders_df.groupBy('seller_id')\
.agg(avg('review_score').alias('avg_review_score'))\
.orderBy(desc('avg_review_score'))

In [19]:
seller_review_df.show(10)



+--------------------+----------------+
|           seller_id|avg_review_score|
+--------------------+----------------+
|20a7efa9721046319...|             5.0|
|2b2fed75b8e5ea3a0...|             5.0|
|f26d70155c2b6326f...|             5.0|
|58c851d1a3c7cd3da...|             5.0|
|f1fdf2d1318657575...|             5.0|
|33ab10be054370c25...|             5.0|
|fd312b6bf05efac6c...|             5.0|
|0c7f30ae9b147eca0...|             5.0|
|57a834594e1e9e921...|             5.0|
|43753b27d77860f16...|             5.0|
+--------------------+----------------+
only showing top 10 rows



                                                                                

In [20]:
# Most Sold Products (Top 10)

top_products_df = full_orders_df.groupBy('product_id')\
.agg(count('order_id').alias('total_sold'))\
.orderBy(desc('total_sold'))\
.limit(10)

In [21]:
top_products_df.show()



+--------------------+----------+
|          product_id|total_sold|
+--------------------+----------+
|aca2eb7d00ea1a7b8...|     86740|
|422879e10f4668299...|     81110|
|99a4788cb24856965...|     78775|
|389d119b48cf3043d...|     60248|
|d1c427060a0f73f6b...|     59274|
|368c6c730842d7801...|     58358|
|53759a2ecddad2bb8...|     52654|
|53b36df67ebb7c415...|     52105|
|154e7e31ebfa09220...|     42700|
|3dd2a17168ec895c7...|     40787|
+--------------------+----------+



                                                                                

In [22]:
# Top Customers By Spending

top_customers_df = full_orders_df.groupBy('customer_id')\
.agg(count('price').alias('top_customers'))\
.orderBy(desc('top_customers'))

In [23]:
top_customers_df.show()



+--------------------+-------------+
|         customer_id|top_customers|
+--------------------+-------------+
|351e40989da90e704...|        11427|
|50920f8cd0681fd86...|        10752|
|9b43e2a62de9bab3a...|         8556|
|270c23a11d024a44c...|         8001|
|5c87184371002d49e...|         6876|
|d3e82ccec3cb5f956...|         6876|
|d5f2b3f597c7ccafb...|         6706|
|c2f18647725395af4...|         6612|
|24e7dc2ff8c071263...|         6597|
|7bb57d182bdc11653...|         6258|
|63b964e79dee32a35...|         6072|
|d22f25a9fadfb1abb...|         6072|
|1ff773612ab8934db...|         5820|
|13aa59158da63ba0e...|         5206|
|78fc46047c4a639e8...|         5200|
|dd3f1762eb601f41c...|         4992|
|a193aa8d905b8e246...|         4896|
|9eb3d566e87289dcb...|         4872|
|2ba91e12e5e4c9f56...|         4752|
|55e7cfd6e28d2fbfb...|         4728|
+--------------------+-------------+
only showing top 20 rows



                                                                                

In [24]:
full_orders_df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)
 |-- product_category_name: string (nullable = true)
 |-- product_name_lenght: integer (nullable = true)
 |-- product_description_lenght: integer (nullable = true)
 |-- product_photos_qty: integer (nullable = true)
 |-- product_weight_g: integer (nullable = true)
 |-- product_length_cm: integer (null

## Window Function and Ranking

In [25]:
# Rank Top Selling Products Per Seller
# Dense Rank For Sellers Based On Revenue

In [26]:
from pyspark.sql.window import Window

In [27]:
# Rank Top Selling Products Per Seller

window_spec = Window.partitionBy('seller_id').orderBy(desc('price'))

In [28]:
top_seller_products_df = full_orders_df.withColumn('rank', rank().over(window_spec))

In [29]:
top_seller_products_df.select('seller_id','price','rank').show()

[Stage 43:>                                                         (0 + 1) / 1]

+--------------------+-----+----+
|           seller_id|price|rank|
+--------------------+-----+----+
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
+--------------------+-----+----+
only showing top 20 rows



                                                                                

In [30]:
# Dense Rank For Sellers Based On Revenue

