In [58]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [59]:
# Build (or reuse) the Spark session shared by every cell in this notebook.
spark = (
    SparkSession.builder
    .master("local")  # run Spark locally instead of on a cluster
    .config("spark.sql.autoBroadcastJoinThreshold", -1)  # -1 disables automatic broadcast joins
    # NOTE(review): with a local master the executor memory setting may have no effect — confirm.
    .config("spark.executor.memory", "500mb")
    .appName("Exercise1")
    .getOrCreate()
)

# Warm-up #1

Find out how many orders, how many products and how many
sellers are in the data.

In [60]:
# Root folder of the exercise dataset. Change this single line to run the
# notebook against a copy of the data stored somewhere else.
DATA_DIR = "/Users/allenc/PyCharmProjects/JupyterProjects/DatasetToCompleteTheSixSparkExercises"

# Read each table's parquet files into a DataFrame. The trailing "/*/" glob
# is kept from the original paths and matches the sub-directories of each table folder.
sales_df = spark.read.parquet(f"{DATA_DIR}/sales_parquet/*/")
products_df = spark.read.parquet(f"{DATA_DIR}/products_parquet/*/")
sellers_df = spark.read.parquet(f"{DATA_DIR}/sellers_parquet/*/")

                                                                                

In [65]:
# Preview the sales table: at most 20 rows, long string values truncated.
sales_df.show(n=20, truncate=True)

+--------+----------+---------+----------+---------------+--------------------+
|order_id|product_id|seller_id|      date|num_pieces_sold|       bill_raw_text|
+--------+----------+---------+----------+---------------+--------------------+
|       1|         0|        0|2020-07-10|             26|kyeibuumwlyhuwksx...|
|       2|         0|        0|2020-07-08|             13|jfyuoyfkeyqkckwbu...|
|       3|         0|        0|2020-07-05|             38|uyjihlzhzcswxcccx...|
|       4|         0|        0|2020-07-05|             56|umnxvoqbdzpbwjqmz...|
|       5|         0|        0|2020-07-05|             11|zmqexmaawmvdpqhih...|
|       6|         0|        0|2020-07-01|             82|lmuhhkpyuoyslwmvX...|
|       7|         0|        0|2020-07-04|             15|zoqweontumefxbgvu...|
|       8|         0|        0|2020-07-08|             79|sgldfgtcxufasnvsc...|
|       9|         0|        0|2020-07-10|             25|jnykelwjjebgkwgmu...|
|      10|         0|        0|2020-07-0

In [66]:
# Preview the products table: at most 20 rows, long string values truncated.
products_df.show(n=20, truncate=True)

+----------+------------+-----+
|product_id|product_name|price|
+----------+------------+-----+
|         0|   product_0|   22|
|         1|   product_1|   30|
|         2|   product_2|   91|
|         3|   product_3|   37|
|         4|   product_4|  145|
|         5|   product_5|  128|
|         6|   product_6|   66|
|         7|   product_7|  145|
|         8|   product_8|   51|
|         9|   product_9|   44|
|        10|  product_10|   53|
|        11|  product_11|   13|
|        12|  product_12|  104|
|        13|  product_13|  102|
|        14|  product_14|   24|
|        15|  product_15|   14|
|        16|  product_16|   38|
|        17|  product_17|   72|
|        18|  product_18|   16|
|        19|  product_19|   46|
+----------+------------+-----+
only showing top 20 rows



In [67]:
# Preview the sellers table: at most 20 rows, long string values truncated.
sellers_df.show(n=20, truncate=True)

+---------+-----------+------------+
|seller_id|seller_name|daily_target|
+---------+-----------+------------+
|        0|   seller_0|     2500000|
|        1|   seller_1|      257237|
|        2|   seller_2|      754188|
|        3|   seller_3|      310462|
|        4|   seller_4|     1532808|
|        5|   seller_5|     1199693|
|        6|   seller_6|     1055915|
|        7|   seller_7|     1946998|
|        8|   seller_8|      547320|
|        9|   seller_9|     1318051|
+---------+-----------+------------+



In [61]:
# Total number of rows in the sales table.
# NOTE(review): this equals the number of orders only if every row is a
# distinct order — confirm that order_id is unique.
sales_df.count()

                                                                                

20000040

In [62]:
# Total number of rows in the products table.
# NOTE(review): this equals the number of products only if every row is a
# distinct product — confirm that product_id is unique.
products_df.count()

75000000

In [63]:
# Total number of rows in the sellers table.
# NOTE(review): this equals the number of sellers only if every row is a
# distinct seller — confirm that seller_id is unique.
sellers_df.count()

10

How many products have been sold at least once? Which product appears in the most orders?

In [64]:
# Products sold at least once = number of distinct product_id values
# that appear in the sales table.
(
    sales_df
    .agg(countDistinct("product_id"))
    .show()
)



+-----------------+
|count(product_id)|
+-----------------+
|           993429|
+-----------------+





In [78]:
# Count the sales rows for each product and list the products with the
# highest counts first; the first row answers "most ordered product".
(
    sales_df
    .groupBy("product_id")
    .agg(count("*").alias("count"))
    .orderBy(col("count").desc())
    .show()
)



+----------+--------+
|product_id|   count|
+----------+--------+
|         0|19000000|
|   3534470|       3|
|  19986717|       3|
|  28592106|       3|
|  28183035|       3|
|  40193396|       3|
|  73385513|       3|
|  72017876|       3|
|  17944574|       3|
|  61475460|       3|
|   2316238|       3|
|  36269838|       3|
|  40496308|       3|
|  57735075|       3|
|  67723231|       3|
|  31136332|       3|
|   2839667|       3|
|  14542470|       3|
|  26915351|       3|
|  40579633|       3|
+----------+--------+
only showing top 20 rows



                                                                                

# Warm-up #2

How many distinct products were sold on each day?

In [85]:
# For every date, count the distinct products sold that day, then list
# the days chronologically (orderBy sorts ascending by default).
(
    sales_df
    .groupBy("date")
    .agg(countDistinct("product_id"))
    .orderBy("date")
    .show()
)



+----------+-----------------+
|      date|count(product_id)|
+----------+-----------------+
|2020-07-01|           100337|
|2020-07-02|            99807|
|2020-07-03|           100017|
|2020-07-04|            99791|
|2020-07-05|            99796|
|2020-07-06|           100765|
|2020-07-07|            99756|
|2020-07-08|            99662|
|2020-07-09|           100501|
|2020-07-10|            98973|
+----------+-----------------+



