In [0]:
%sql
-- Number of sales3 rows recorded for each OrderId, largest count first.
SELECT OrderId,
       COUNT(*) AS orderID_count
  FROM sales3
 GROUP BY OrderId
 ORDER BY COUNT(*) DESC

# Q11. Get the total number of products for each order quantity

In [0]:
%sql
-- For every distinct Quantity value, count the rows that carry a non-null
-- Product (COUNT(column) skips NULLs), listed by ascending Quantity.
SELECT Quantity,
       COUNT(Product) AS total_products
  FROM sales3
 GROUP BY Quantity
 ORDER BY Quantity ASC

# Q12. Get the total number of orders with quantity 1 that were placed on consecutive order days


In [0]:
from pyspark.sql.functions import col, to_date, count, lag, lead, datediff, sum as spark_sum
from pyspark.sql.window import Window

# Q12: total number of Quantity=1 orders placed on days that belong to a run
# of consecutive order days (a day counts when the day before OR the day
# after it also has at least one Quantity=1 order).

# Load the sales3 table
sales_df = spark.table("sales3")

# Step 1: Filter rows where Quantity is 1
df_q1 = sales_df.filter(col("Quantity") == 1)

# Step 2: Extract date only from OrderDate
# NOTE(review): assumes OrderDate has a type/format that to_date can parse
# with its default pattern -- TODO confirm against the table schema.
df_q1_date = df_q1.withColumn("OrderDay", to_date(col("OrderDate")))

# Step 3: Count number of Quantity=1 orders per day
orders_per_day = (
    df_q1_date.groupBy("OrderDay")
    .agg(count("*").alias("OrdersCount"))
    .orderBy("OrderDay")
)

# Step 4: Attach the previous and the next order day to every day.
# The window has no partitionBy, so Spark evaluates it on a single partition;
# the input here is already reduced to one row per day.
windowSpec = Window.orderBy("OrderDay")
orders_with_lag = (
    orders_per_day
    .withColumn("PrevDay", lag("OrderDay").over(windowSpec))
    .withColumn("NextDay", lead("OrderDay").over(windowSpec))
)

# Step 5: Keep days adjacent to another order day on either side.
# Checking only PrevDay would drop the first day of every consecutive run
# (e.g. for Jan 1 + Jan 2 only Jan 2 would be counted).
consecutive_orders = orders_with_lag.filter(
    (datediff(col("OrderDay"), col("PrevDay")) == 1)
    | (datediff(col("NextDay"), col("OrderDay")) == 1)
)

# Step 6: Sum the number of Quantity=1 orders that fall on consecutive days.
# sum() over zero rows yields NULL/None, so fall back to 0 in that case.
total_orders = consecutive_orders.agg(spark_sum("OrdersCount")).collect()[0][0] or 0

# Step 7: Print the result
print(f"✅ Total number of Quantity=1 orders on consecutive days: {total_orders}")


# Q13. Find the top 3 pairs of products that have been sold together


In [0]:
%sql
-- Q13: the three product pairs most often bought in the same order.
-- The self-join keeps only rows with a.Product < b.Product, so each unordered
-- pair appears once per matching row combination, is already in
-- (smaller, larger) order, and a product is never paired with itself.
-- NOTE(review): this cell reads from `sales` while the other cells read from
-- `sales3` -- confirm which table is intended.
WITH product_pairs AS (
  SELECT
    a.Product AS Product1,
    b.Product AS Product2
  FROM sales a
  JOIN sales b
    ON a.OrderId = b.OrderId
    AND a.Product < b.Product
)
SELECT Product1, Product2, COUNT(*) AS pair_count
FROM product_pairs
GROUP BY Product1, Product2
-- Product names break ties so the limited result is deterministic.
ORDER BY pair_count DESC, Product1, Product2
-- The question asks for the top 3 pairs.
LIMIT 3;