You're analyzing the efficiency of food delivery on Zomato, focusing on the time taken by restaurants to prepare orders. Total food delivery time for an order is a combination of food preparation time + time taken by rider to deliver the order. 
Write an SQL to calculate average food preparation time(in minutes) for each restaurant . Round the average to 2 decimal points and sort the output in increasing order of average time.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import *
from pyspark.sql.window import *

# Create a SparkSession
spark = SparkSession.builder.appName("OrdersTable").getOrCreate()

# Define the schema
schema = StructType([
    StructField("order_id", IntegerType(), True),
    StructField("restaurant_id", IntegerType(), True),
    StructField("order_time", StringType(), True),
    StructField("expected_delivery_time", StringType(), True),
    StructField("actual_delivery_time", StringType(), True),
    StructField("rider_delivery_mins", IntegerType(), True)
])

# Sample data from the table
data = [
    (1, 101, "12:00:00", "12:30:00", "12:45:00", 15),
    (2, 102, "12:15:00", "12:45:00", "12:55:00", 10),
    (3, 101, "12:30:00", "13:00:00", "13:10:00", 15),
    (4, 101, "12:45:00", "13:15:00", "13:21:00", 5),
    (5, 102, "13:00:00", "13:30:00", "13:36:00", 10),
    (6, 103, "13:15:00", "13:45:00", "13:58:00", 10),
    (7, 101, "13:30:00", "14:00:00", "14:12:00", 20),
    (8, 102, "13:45:00", "14:15:00", "14:25:00", 10),
    (9, 103, "14:00:00", "14:30:00", "14:30:00", 5),
    (10, 101, "14:15:00", "14:45:00", "15:05:00", 15),
]

# Create the DataFrame using the defined schema
orders_df = spark.createDataFrame(data, schema=schema)

# Show the DataFrame
orders_df.show()


+--------+-------------+----------+----------------------+--------------------+-------------------+
|order_id|restaurant_id|order_time|expected_delivery_time|actual_delivery_time|rider_delivery_mins|
+--------+-------------+----------+----------------------+--------------------+-------------------+
|       1|          101|  12:00:00|              12:30:00|            12:45:00|                 15|
|       2|          102|  12:15:00|              12:45:00|            12:55:00|                 10|
|       3|          101|  12:30:00|              13:00:00|            13:10:00|                 15|
|       4|          101|  12:45:00|              13:15:00|            13:21:00|                  5|
|       5|          102|  13:00:00|              13:30:00|            13:36:00|                 10|
|       6|          103|  13:15:00|              13:45:00|            13:58:00|                 10|
|       7|          101|  13:30:00|              14:00:00|            14:12:00|                 20|


In [0]:
# Convert string columns to timestamp
orders_df = orders_df.withColumn("order_time", to_timestamp("order_time", "HH:mm:ss")) \
                     .withColumn("actual_delivery_time", to_timestamp("actual_delivery_time", "HH:mm:ss"))

# Calculate food preparation time 
orders_df = orders_df.withColumn(
    "food_process_time",
    abs(expr("(unix_timestamp(actual_delivery_time) - unix_timestamp(order_time)) / 60.0") - col("rider_delivery_mins").cast("double"))
)

# Calculate the average food preparation time per restaurant_id
#group by restaurant_id avg of food_process_time and in ascending order
result_df = orders_df.groupBy("restaurant_id") \
    .agg(avg("food_process_time").alias("avg_food_prep_mins")) \
    .orderBy("restaurant_id") \
    .select(
        col("restaurant_id"),
        col("avg_food_prep_mins").cast("decimal(10,6)").alias("avg_food_prep_mins") 
    )

# Show the final result
result_df.display()

restaurant_id,avg_food_prep_mins
101,28.6
102,28.666667
103,29.0
