In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

# Cluster connection and resource settings for this lab's Spark application.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("Lab6_DF_EX1")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine;
# getOrCreate() reuses an already running session if there is one.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Sales records. The first CSV row is treated as the header; no schema is
# inferred, so every column is read as a string.
sales_df = spark.read.csv("/home/jovyan/data/sales_e1.csv", header=True)
sales_df.printSchema()
sales_df.show(5)

# Product catalogue, loaded the same way (header row, all-string columns).
products_df = spark.read.csv("/home/jovyan/data/products_e1.csv", header=True)
products_df.printSchema()
products_df.show(5)

root
 |-- order_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- quantity: string (nullable = true)

+--------+----------+----------+--------+
|order_id|product_id|      date|quantity|
+--------+----------+----------+--------+
|       1|        31|2021-08-10|      92|
|       2|        38|2021-08-02|      46|
|       3|        47|2021-08-01|      48|
|       4|        33|2021-08-09|      18|
|       5|        29|2021-08-05|      39|
+--------+----------+----------+--------+
only showing top 5 rows

root
 |-- product_id: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- unit_price: string (nullable = true)

+----------+------------+----------+
|product_id|product_name|unit_price|
+----------+------------+----------+
|         0|   product_0|        22|
|         1|   product_1|         2|
|         2|   product_2|         6|
|         3|   product_3|         3|
|         4|   product_4|        12|
+--

**Find the best-performing product in terms of the total price of the sold items across all dates**

In [2]:
from pyspark.sql.functions import *
from pyspark.sql import Row, Window

# Calculate the total number of pieces sold for each product, over all dates.
# `quantity` is read from the CSV as a string, so cast it explicitly instead of
# relying on Spark's implicit string -> double coercion inside sum().
sales_df_total = sales_df.groupBy("product_id").agg(
    sum(col("quantity").cast("double")).alias("total_quantity")
)

# Joining on a list of column names (instead of an equality expression) keeps a
# single `product_id` column in the result:
# https://kb.databricks.com/data/join-two-dataframes-duplicated-columns.html
# https://stackoverflow.com/questions/35258506/how-to-avoid-duplicate-columns-after-join
joinExpression = ["product_id"]

# total_price = total quantity sold * unit price (unit_price is also a string column).
merged_df = sales_df_total.join(products_df, joinExpression, "left").withColumn(
    "total_price", col("total_quantity") * col("unit_price").cast("double")
)
merged_df.show(10)

# Best-performing product: the row with the highest total_price.
# first() fetches only the top row instead of collecting the whole sorted
# DataFrame to the driver, and returns None (rather than raising IndexError)
# when there are no rows.
best_product = (
    merged_df.orderBy(col("total_price").desc())
    .select("product_name", "total_price")
    .first()
)
print(best_product)


+----------+--------------+------------+----------+-----------+
|product_id|total_quantity|product_name|unit_price|total_price|
+----------+--------------+------------+----------+-----------+
|         7|        1924.0|   product_7|        15|    28860.0|
|        15|        3075.0|  product_15|        15|    46125.0|
|        11|        2422.0|  product_11|         4|     9688.0|
|        29|        2868.0|  product_29|         1|     2868.0|
|        42|        2414.0|  product_42|         3|     7242.0|
|         3|        1859.0|   product_3|         3|     5577.0|
|        30|        2175.0|  product_30|        12|    26100.0|
|        34|        3582.0|  product_34|         9|    32238.0|
|         8|        2775.0|   product_8|         5|    13875.0|
|        28|        2270.0|  product_28|         5|    11350.0|
+----------+--------------+------------+----------+-----------+
only showing top 10 rows

Row(product_name='product_19', total_price=47264.0)


**Find the best-performing product and the worst-performing product in terms of the total price of the sold items for each date**

In [3]:
# Parse the string `date` column into a DateType column named `sdate` and drop
# the original string column. withColumn avoids selecting `date` twice.
sales_df_d = sales_df.withColumn("sdate", to_date(col("date"), "yyyy-MM-dd")).drop("date")

# Calculate the total number of pieces sold for each product on each date.
# `quantity` is a string column, so cast it explicitly before summing.
sales_df_d_total = sales_df_d.groupBy("product_id", "sdate").agg(
    sum(col("quantity").cast("double")).alias("total_quantity")
)

# Join on the shared column name so the result has a single `product_id` column.
joinExpression = ["product_id"]
sales_df_d_total_price = sales_df_d_total.join(products_df, joinExpression, "left").withColumn(
    "total_price", col("total_quantity") * col("unit_price").cast("double")
)

# Rank the products by total price within each date, in both directions.
# The left join yields a NULL total_price for any product_id missing from
# products_df; NULLs are placed last in both orderings so such a row is not
# reported as the "worst" product just because ascending order puts NULLs first.
windowdesc = Window.partitionBy("sdate").orderBy(col("total_price").desc_nulls_last())
windowasc = Window.partitionBy("sdate").orderBy(col("total_price").asc_nulls_last())

sales_df_d_total_price.printSchema()
sales_df_d_total_price.show(4)

sales_df_d_total_windowed = (
    sales_df_d_total_price
    .withColumn("rank_desc", dense_rank().over(windowdesc))
    .withColumn("rank_asc", dense_rank().over(windowasc))
)

# Get the best (rank_desc == 1) and worst (rank_asc == 1) performing products
# per date. dense_rank keeps ties, so a date may list several products.
# Sorting makes the displayed order deterministic: per date, best first.
sales_df_d_total_windowed.where(
    (col("rank_desc") == 1) | (col("rank_asc") == 1)
).orderBy("sdate", "rank_desc").show(100)

root
 |-- product_id: string (nullable = true)
 |-- sdate: date (nullable = true)
 |-- total_quantity: double (nullable = true)
 |-- product_name: string (nullable = true)
 |-- unit_price: string (nullable = true)
 |-- total_price: double (nullable = true)

+----------+----------+--------------+------------+----------+-----------+
|product_id|     sdate|total_quantity|product_name|unit_price|total_price|
+----------+----------+--------------+------------+----------+-----------+
|         5|2021-08-05|          95.0|   product_5|        10|      950.0|
|         2|2021-08-05|         242.0|   product_2|         6|     1452.0|
|        33|2021-08-04|         419.0|  product_33|         9|     3771.0|
|        10|2021-08-11|         106.0|  product_10|        10|     1060.0|
+----------+----------+--------------+------------+----------+-----------+
only showing top 4 rows

+----------+----------+--------------+------------+----------+-----------+---------+--------+
|product_id|     sdate|

A useful reference for a common need: how to keep the other columns when using `groupBy` on a Spark DataFrame.
https://stackoverflow.com/questions/34409875/how-to-get-other-columns-when-using-spark-dataframe-groupby

In [5]:
# Stop the Spark session (and its underlying SparkContext) now that the
# analysis is finished; `spark` cannot be used after this call.
spark.stop()