Amazon wants to find the trending products for each month. A product is trending in a given month if its sales for that month are greater than the sum of its sales over the previous 2 months.

Please note that for the first 2 months of operations this metric does not make sense, so the output should start from the 3rd month only. Assume that each product has at least 1 sale each month. Display the order month and product id, sorted by order month.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import *

# Obtain (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("CreateOrdersDataFrame")
    .getOrCreate()
)

# Sample sales rows, kept as compact tuples in (order_month, product_id, sales)
# order and expanded into dictionaries below.
# NOTE(review): 202304 appears twice for p2 and not at all for p1 -- confirm
# whether that is intentional test data or a typo.
_ORDER_FIELDS = ("order_month", "product_id", "sales")
_ORDER_ROWS = [
    ("202301", "p1", 100),
    ("202301", "p2", 500),
    ("202302", "p1", 700),
    ("202302", "p2", 300),
    ("202303", "p1", 900),
    ("202303", "p2", 700),
    ("202304", "p2", 2000),
    ("202305", "p1", 1500),
    ("202305", "p2", 1300),
    ("202306", "p1", 1700),
    ("202306", "p2", 1200),
    ("202304", "p2", 1100),
    ("202307", "p1", 1900),
    ("202307", "p2", 1400),
    ("202308", "p1", 2100),
    ("202308", "p2", 1600),
    ("202309", "p1", 2300),
    ("202309", "p2", 1800),
    ("202310", "p1", 5000),
    ("202310", "p2", 2000),
]

# One dictionary per row, keyed by column name.
data = [dict(zip(_ORDER_FIELDS, row)) for row in _ORDER_ROWS]

# Build the DataFrame (schema is inferred from the dictionaries) and print it.
orders = spark.createDataFrame(data)
orders.show()


+-----------+----------+-----+
|order_month|product_id|sales|
+-----------+----------+-----+
|     202301|        p1|  100|
|     202301|        p2|  500|
|     202302|        p1|  700|
|     202302|        p2|  300|
|     202303|        p1|  900|
|     202303|        p2|  700|
|     202304|        p2| 2000|
|     202305|        p1| 1500|
|     202305|        p2| 1300|
|     202306|        p1| 1700|
|     202306|        p2| 1200|
|     202304|        p2| 1100|
|     202307|        p1| 1900|
|     202307|        p2| 1400|
|     202308|        p1| 2100|
|     202308|        p2| 1600|
|     202309|        p1| 2300|
|     202309|        p2| 1800|
|     202310|        p1| 5000|
|     202310|        p2| 2000|
+-----------+----------+-----+



In [0]:
# Use an explicit namespace so the aggregate `sum` is unambiguously Spark's,
# rather than relying on the star import shadowing Python's builtin `sum`.
from pyspark.sql import functions as F

# A product may have several sales rows in the same month (the sample data has
# two p2 rows for 202304). Collapse to one row per (product_id, order_month)
# first; otherwise `rowsBetween(-2, -1)` sums the previous two *rows* instead
# of the previous two *months*, and the answer depends on how tied rows happen
# to be ordered.
monthly_sales = orders.groupBy("product_id", "order_month").agg(
    F.sum("sales").alias("sales")
)

# One window per product, months in ascending order. `order_month` holds
# fixed-width "yyyyMM" strings, so string ordering is chronological.
window_spec = Window.partitionBy(F.col("product_id")).orderBy(F.col("order_month").asc())

# prev_2_sales: total sales of the two preceding months for the same product.
# rn: 1-based position of the month within the product's history; rn > 2 drops
#     the first two months, for which the metric is undefined.
# NOTE: this treats consecutive rows as consecutive months, which holds under
# the stated assumption that every product has sales in every month.
monthly_sales.withColumn("prev_2_sales", F.sum(F.col("sales")).over(window_spec.rowsBetween(-2, -1))) \
    .withColumn("rn", F.row_number().over(window_spec)) \
    .filter((F.col("rn") > 2) & (F.col("sales") > F.col("prev_2_sales"))) \
    .select("order_month", "product_id") \
    .orderBy(F.col("order_month")) \
    .display()



In [0]:

# `from pyspark.sql.functions import *` exports `sum`, not `spark_sum`, so the
# alias has to be imported explicitly or this cell fails with a NameError.
from pyspark.sql.functions import sum as spark_sum

# Collapse to one row per (product_id, order_month). A product can have more
# than one sales row in a month (the sample data has two p2 rows for 202304);
# without this step the row-based frame below would span the previous two
# rows rather than the previous two months, and tied rows would make the
# result depend on Spark's arbitrary ordering.
monthly_orders = orders.groupBy("product_id", "order_month").agg(
    spark_sum("sales").alias("sales")
)

# Windows partitioned by `product_id` and ordered by `order_month`
# ("yyyyMM" strings sort chronologically). The sum window covers the two
# preceding rows only; the row-number window uses the default frame.
window_spec_sum = Window.partitionBy("product_id").orderBy("order_month").rowsBetween(-2, -1)
window_spec_row_number = Window.partitionBy("product_id").orderBy("order_month")

# Add the calculated columns `prev_2` (sales of the two preceding months) and
# `rn` (1-based month position within the product's history).
# NOTE: consecutive rows are treated as consecutive months, per the stated
# assumption that every product has sales in every month.
orders_with_windows = monthly_orders.withColumn(
    "prev_2", spark_sum("sales").over(window_spec_sum)
).withColumn(
    "rn", row_number().over(window_spec_row_number)
)

# Keep months from the third one onward (`rn > 2`) whose sales exceed the
# combined sales of the two preceding months.
filtered_orders = orders_with_windows.filter((col("rn") > 2) & (col("sales") > col("prev_2")))

# Select the required columns and order by `order_month`
result = filtered_orders.select("order_month", "product_id").orderBy("order_month")

# Show the result
result.show()


+-----------+----------+
|order_month|product_id|
+-----------+----------+
|     202303|        p1|
|     202304|        p2|
|     202310|        p1|
+-----------+----------+

