You own a small online store, and want to analyze customer ratings for the products that you're selling. After doing a data pull, you have a list of products and a log of purchases. Within the purchase log, each record includes the number of stars (from 1 to 5) as a customer rating for the product.

For each category, find the lowest price among all products that received at least one 4-star or above rating from customers.
If a product category did not have any products that received at least one 4-star or above rating, the lowest price is considered to be 0. The final output should be sorted by product category in alphabetical order.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions  import *


# Initialize Spark session
spark = SparkSession.builder.appName("Products and Purchases Tables").getOrCreate()

# Define schema for the products table
products_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("category", StringType(), True),
    StructField("price", IntegerType(), True)
])

# Define data for the products table
products_data = [
    (1, "Cripps Pink", "apple", 10),
    (2, "Navel Orange", "orange", 12),
    (3, "Golden Delicious", "apple", 6),
    (4, "Clementine", "orange", 14),
    (5, "Pinot Noir", "grape", 20),
    (6, "Bing Cherries", "cherry", 36),
    (7, "Sweet Cherries", "cherry", 40)
]

# Create DataFrame for the products table
products_df = spark.createDataFrame(data=products_data, schema=products_schema)

# Define schema for the purchases table
purchases_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("product_id", IntegerType(), True),
    StructField("stars", IntegerType(), True)
])

# Define data for the purchases table
purchases_data = [
    (1, 1, 2),
    (2, 3, 3),
    (3, 2, 2),
    (4, 4, 4),
    (5, 6, 5),
    (6, 6, 4),
    (7, 7, 5)
]

# Create DataFrame for the purchases table
purchases_df = spark.createDataFrame(data=purchases_data, schema=purchases_schema)

# Show the tables
print("Products Table:")
products_df.show()

print("Purchases Table:")
purchases_df.show()


Products Table:
+---+----------------+--------+-----+
| id|            name|category|price|
+---+----------------+--------+-----+
|  1|     Cripps Pink|   apple|   10|
|  2|    Navel Orange|  orange|   12|
|  3|Golden Delicious|   apple|    6|
|  4|      Clementine|  orange|   14|
|  5|      Pinot Noir|   grape|   20|
|  6|   Bing Cherries|  cherry|   36|
|  7|  Sweet Cherries|  cherry|   40|
+---+----------------+--------+-----+

Purchases Table:
+---+----------+-----+
| id|product_id|stars|
+---+----------+-----+
|  1|         1|    2|
|  2|         3|    3|
|  3|         2|    2|
|  4|         4|    4|
|  5|         6|    5|
|  6|         6|    4|
|  7|         7|    5|
+---+----------+-----+



In [0]:
from pyspark.sql.functions import col, when, lit, min, coalesce

result_df = products_df.join(
    purchases_df,
    on=[(products_df.id == purchases_df.product_id) & ((purchases_df.stars == 4) | (purchases_df.stars == 5))],
    how="left"
).select("product_id", "category", "price", "stars") \
  .orderBy(col("category")) \
    .withColumn("price_", when(col("stars").isNotNull(), col("price")).otherwise(lit(None))) \
      .select("category", "price_") \
        .groupBy(col("category")).agg(min(col("price_")).alias("lowest_price")) \
          .select("category",coalesce("lowest_price",lit(0)).alias("lowest_price"))
# Display the result
result_df.show()


+--------+------------+
|category|lowest_price|
+--------+------------+
|   apple|           0|
|  orange|          14|
|   grape|           0|
|  cherry|          36|
+--------+------------+

