In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# NOTE: the wildcard import above is what puts `col`, `when`, `count` and
# `sum` into the notebook namespace. As a consequence `sum` in the cells
# below is the Spark aggregate function, not the Python builtin.

# Reuse the active Spark session if there is one, otherwise start a new one
# under the application name "Food Delivery".
spark = (
    SparkSession.builder
    .appName("Food Delivery")
    .getOrCreate()
)

#Creating DataFrame

In [2]:
# Sample food-delivery orders used by every cell below.
# Each tuple holds, in order: order_id, customer_name, city, restaurant,
# cuisine, order_amount, delivery_time_minutes, payment_mode, order_status
# (the same order as the `columns` list that follows).
data = [
("O001","Amit","Hyderabad","Spice Hub","Indian",450,35,"UPI","Delivered"),
("O002","Neha","Bangalore","Pizza Town","Italian",650,40,"Card","Delivered"),
("O003","Rahul","Delhi","Burger Zone","American",520,30,"Cash","Delivered"),
("O004","Pooja","Mumbai","Sushi Bar","Japanese",1200,55,"UPI","Cancelled"),
("O005","Arjun","Chennai","Curry Leaf","Indian",380,28,"UPI","Delivered"),
("O006","Sneha","Hyderabad","Pasta Street","Italian",700,45,"Card","Delivered"),
("O007","Karan","Delhi","Taco Bell","Mexican",540,33,"UPI","Delivered"),
("O008","Riya","Bangalore","Dragon Bowl","Chinese",600,38,"Wallet","Delivered"),
("O009","Vikas","Mumbai","BBQ Nation","Indian",1500,60,"Card","Delivered"),
("O010","Anjali","Chennai","Burger Zone","American",480,32,"Cash","Delivered"),
("O011","Farhan","Delhi","Biryani House","Indian",520,36,"UPI","Delivered"),
# NOTE(review): "Canccelled" below differs from the "Cancelled" spelling used
# for O004 — looks like a typo; confirm whether this dirty value is intentional.
("O012","Megha","Hyderabad","Sushi Bar","Japanese",1100,58,"Card","Canccelled"),
("O013","Suresh","Bangalore","Curry Leaf","Indian",420,29,"UPI","Delivered"),
("O014","Divya","Mumbai","Pizza Town","Italian",780,42,"Wallet","Delivered"),
("O015","Nikhil","Delhi","Pasta Street","Italian",690,47,"UPI","Delivered"),
("O016","Kavya","Chennai","Dragon Bowl","Chinese",560,34,"UPI","Delivered"),
("O017","Rohit","Hyderabad","BBQ Nation","Indian",1400,62,"Card","Delivered"),
("O018","Simran","Bangalore","Burger Zone","American",510,31,"Cash","Delivered"),
("O019","Ayesha","Mumbai","Taco Bell","Mexican",570,35,"UPI","Delivered"),
("O020","Manish","Delhi","Curry Leaf","Indian",390,27,"Wallet","Delivered"),
("O021","Priya","Hyderabad","Pizza Town","Italian",720,41,"Card","Delivered"),
("O022","Yash","Chennai","Sushi Bar","Japanese",1150,57,"UPI","Delivered"),
("O023","Naina","Bangalore","Pasta Street","Italian",680,44,"UPI","Delivered"),
("O024","Sameer","Mumbai","Dragon Bowl","Chinese",610,39,"Wallet","Delivered"),
("O025","Ritika","Delhi","Burger Zone","American",500,30,"Cash","Delivered"),
("O026","Gopal","Hyderabad","Curry Leaf","Indian",410,28,"UPI","Delivered"),
("O027","Tina","Bangalore","Pizza Town","Italian",760,43,"Card","Delivered"),
("O028","Irfan","Mumbai","BBQ Nation","Indian",1550,65,"Card","Delivered"),
("O029","Sahil","Chennai","Taco Bell","Mexican",590,37,"UPI","Delivered"),
("O030","Lavanya","Delhi","Dragon Bowl","Chinese",630,40,"Wallet","Delivered"),
("O031","Deepak","Hyderabad","Burger Zone","American",520,33,"Cash","Delivered"),
("O032","Shweta","Bangalore","Curry Leaf","Indian",450,31,"UPI","Delivered"),
("O033","Aman","Mumbai","Pizza Town","Italian",810,46,"Card","Delivered"),
("O034","Rekha","Chennai","Pasta Street","Italian",700,45,"UPI","Delivered"),
("O035","Zubin","Delhi","BBQ Nation","Indian",1480,63,"Card","Delivered"),
("O036","Pallavi","Hyderabad","Dragon Bowl","Chinese",580,36,"Wallet","Delivered"),
("O037","Naveen","Bangalore","Taco Bell","Mexican",560,34,"UPI","Delivered"),
("O038","Sonia","Mumbai","Sushi Bar","Japanese",1180,59,"Card","Delivered"),
("O039","Harish","Chennai","Burger Zone","American",490,29,"Cash","Delivered"),
("O040","Kriti","Delhi","Curry Leaf","Indian",420,26,"UPI","Delivered")
]
# Column names, in the same order as the fields of each tuple in `data`.
columns = [
    "order_id",
    "customer_name",
    "city",
    "restaurant",
    "cuisine",
    "order_amount",
    "delivery_time_minutes",
    "payment_mode",
    "order_status",
]

# Build the orders DataFrame from the in-memory rows, then preview the
# rows and print the resulting schema.
df = spark.createDataFrame(data, schema=columns)
df.show()
df.printSchema()

+--------+-------------+---------+-------------+--------+------------+---------------------+------------+------------+
|order_id|customer_name|     city|   restaurant| cuisine|order_amount|delivery_time_minutes|payment_mode|order_status|
+--------+-------------+---------+-------------+--------+------------+---------------------+------------+------------+
|    O001|         Amit|Hyderabad|    Spice Hub|  Indian|         450|                   35|         UPI|   Delivered|
|    O002|         Neha|Bangalore|   Pizza Town| Italian|         650|                   40|        Card|   Delivered|
|    O003|        Rahul|    Delhi|  Burger Zone|American|         520|                   30|        Cash|   Delivered|
|    O004|        Pooja|   Mumbai|    Sushi Bar|Japanese|        1200|                   55|         UPI|   Cancelled|
|    O005|        Arjun|  Chennai|   Curry Leaf|  Indian|         380|                   28|         UPI|   Delivered|
|    O006|        Sneha|Hyderabad| Pasta Street|

#Section A - CSV

In [3]:
# Write the full dataset to CSV with the header row enabled; any previous
# output in "orders_csv" is replaced.
df.write.csv("orders_csv", mode="overwrite", header=True)

In [4]:
# Read the CSV back (first line is the header, column types are inferred)
# and show only the orders with order_amount > 700.
df_csv = spark.read.csv("orders_csv", header=True, inferSchema=True)

df_csv.where(col("order_amount") > 700).show()

+--------+-------------+---------+----------+--------+------------+---------------------+------------+------------+
|order_id|customer_name|     city|restaurant| cuisine|order_amount|delivery_time_minutes|payment_mode|order_status|
+--------+-------------+---------+----------+--------+------------+---------------------+------------+------------+
|    O021|        Priya|Hyderabad|Pizza Town| Italian|         720|                   41|        Card|   Delivered|
|    O022|         Yash|  Chennai| Sushi Bar|Japanese|        1150|                   57|         UPI|   Delivered|
|    O027|         Tina|Bangalore|Pizza Town| Italian|         760|                   43|        Card|   Delivered|
|    O028|        Irfan|   Mumbai|BBQ Nation|  Indian|        1550|                   65|        Card|   Delivered|
|    O033|         Aman|   Mumbai|Pizza Town| Italian|         810|                   46|        Card|   Delivered|
|    O035|        Zubin|    Delhi|BBQ Nation|  Indian|        1480|     

In [5]:
"""From CSV, show only:
order_id
city
cuisine
order_amount"""

# Project the CSV-backed DataFrame down to the four requested columns.
(
    df_csv
    .select("order_id", "city", "cuisine", "order_amount")
    .show()
)

+--------+---------+--------+------------+
|order_id|     city| cuisine|order_amount|
+--------+---------+--------+------------+
|    O021|Hyderabad| Italian|         720|
|    O022|  Chennai|Japanese|        1150|
|    O023|Bangalore| Italian|         680|
|    O024|   Mumbai| Chinese|         610|
|    O025|    Delhi|American|         500|
|    O026|Hyderabad|  Indian|         410|
|    O027|Bangalore| Italian|         760|
|    O028|   Mumbai|  Indian|        1550|
|    O029|  Chennai| Mexican|         590|
|    O030|    Delhi| Chinese|         630|
|    O031|Hyderabad|American|         520|
|    O032|Bangalore|  Indian|         450|
|    O033|   Mumbai| Italian|         810|
|    O034|  Chennai| Italian|         700|
|    O035|    Delhi|  Indian|        1480|
|    O036|Hyderabad| Chinese|         580|
|    O037|Bangalore| Mexican|         560|
|    O038|   Mumbai|Japanese|        1180|
|    O039|  Chennai|American|         490|
|    O040|    Delhi|  Indian|         420|
+--------+-

In [9]:
# Sort orders by delivery_time_minutes (longest first) and write the result
# to CSV with a header row.
(
    df_csv
    .orderBy("delivery_time_minutes", ascending=False)
    .write
    .csv("order_sorted", mode="overwrite", header=True)
)

# Load the sorted output again to inspect what was written.
df2_csv = spark.read.csv("order_sorted", header=True, inferSchema=True)
df2_csv.show()

+--------+-------------+---------+------------+--------+------------+---------------------+------------+------------+
|order_id|customer_name|     city|  restaurant| cuisine|order_amount|delivery_time_minutes|payment_mode|order_status|
+--------+-------------+---------+------------+--------+------------+---------------------+------------+------------+
|    O028|        Irfan|   Mumbai|  BBQ Nation|  Indian|        1550|                   65|        Card|   Delivered|
|    O035|        Zubin|    Delhi|  BBQ Nation|  Indian|        1480|                   63|        Card|   Delivered|
|    O017|        Rohit|Hyderabad|  BBQ Nation|  Indian|        1400|                   62|        Card|   Delivered|
|    O009|        Vikas|   Mumbai|  BBQ Nation|  Indian|        1500|                   60|        Card|   Delivered|
|    O038|        Sonia|   Mumbai|   Sushi Bar|Japanese|        1180|                   59|        Card|   Delivered|
|    O012|        Megha|Hyderabad|   Sushi Bar|Japanese|

#Section B - JSON

In [12]:
# Write only the orders whose status is exactly "Delivered" to JSON,
# then read that output back for the following cells.
(
    df.where(col("order_status") == "Delivered")
    .write
    .json("delivered_orders_json", mode="overwrite")
)
df_json = spark.read.json("delivered_orders_json")
df_json.show()

+---------+--------+-------------+---------------------+------------+--------+------------+------------+------------+
|     city| cuisine|customer_name|delivery_time_minutes|order_amount|order_id|order_status|payment_mode|  restaurant|
+---------+--------+-------------+---------------------+------------+--------+------------+------------+------------+
|Hyderabad| Italian|        Priya|                   41|         720|    O021|   Delivered|        Card|  Pizza Town|
|  Chennai|Japanese|         Yash|                   57|        1150|    O022|   Delivered|         UPI|   Sushi Bar|
|Bangalore| Italian|        Naina|                   44|         680|    O023|   Delivered|         UPI|Pasta Street|
|   Mumbai| Chinese|       Sameer|                   39|         610|    O024|   Delivered|      Wallet| Dragon Bowl|
|    Delhi|American|       Ritika|                   30|         500|    O025|   Delivered|        Cash| Burger Zone|
|Hyderabad|  Indian|        Gopal|                   28|

In [13]:
"""Read JSON and filter:
city = "Mumbai"
payment_mode = "Card" """

# Both conditions must hold: the two chained filters are ANDed together.
(
    df_json
    .filter(col("city") == "Mumbai")
    .filter(col("payment_mode") == "Card")
    .show()
)


+------+--------+-------------+---------------------+------------+--------+------------+------------+----------+
|  city| cuisine|customer_name|delivery_time_minutes|order_amount|order_id|order_status|payment_mode|restaurant|
+------+--------+-------------+---------------------+------------+--------+------------+------------+----------+
|Mumbai|  Indian|        Irfan|                   65|        1550|    O028|   Delivered|        Card|BBQ Nation|
|Mumbai| Italian|         Aman|                   46|         810|    O033|   Delivered|        Card|Pizza Town|
|Mumbai|Japanese|        Sonia|                   59|        1180|    O038|   Delivered|        Card| Sushi Bar|
|Mumbai|  Indian|        Vikas|                   60|        1500|    O009|   Delivered|        Card|BBQ Nation|
+------+--------+-------------+---------------------+------------+--------+------------+------------+----------+



In [15]:
"""Add a column:
delivery_category
Logic:
delivery_time_minutes > 45 → "Late"
else → "OnTime"
Write output to JSON."""

# Tag each row of the JSON-backed DataFrame: strictly more than 45 minutes
# is "Late", everything else is "OnTime".
df_with_category = df_json.withColumn(
    "delivery_category",
    when(col("delivery_time_minutes") > 45, "Late").otherwise("OnTime"),
)
df_with_category.write.json("orders_with_category_json", mode="overwrite")

# Read the categorised output back to verify the new column.
df_delivery_json = spark.read.json("orders_with_category_json")
df_delivery_json.show()

+---------+--------+-------------+-----------------+---------------------+------------+--------+------------+------------+------------+
|     city| cuisine|customer_name|delivery_category|delivery_time_minutes|order_amount|order_id|order_status|payment_mode|  restaurant|
+---------+--------+-------------+-----------------+---------------------+------------+--------+------------+------------+------------+
|Hyderabad| Italian|        Priya|           OnTime|                   41|         720|    O021|   Delivered|        Card|  Pizza Town|
|  Chennai|Japanese|         Yash|             Late|                   57|        1150|    O022|   Delivered|         UPI|   Sushi Bar|
|Bangalore| Italian|        Naina|           OnTime|                   44|         680|    O023|   Delivered|         UPI|Pasta Street|
|   Mumbai| Chinese|       Sameer|           OnTime|                   39|         610|    O024|   Delivered|      Wallet| Dragon Bowl|
|    Delhi|American|       Ritika|           OnT

In [17]:
# Force the JSON output into a single partition before writing.
# (This cell only performs the write; the number of files created is not
# inspected by the code itself.)
(
    df_with_category
    .coalesce(1)
    .write
    .json("single_file_json", mode="overwrite")
)

#Section C - Parquet

In [18]:
# Convert the full dataset to Parquet, replacing any earlier output.
df.write.parquet("orders_parquet", mode="overwrite")

In [19]:
"""Read Parquet and filter:
cuisine = "Indian"
order_amount > 500"""

# Load the Parquet copy of the orders; later cells reuse `df_parquet`.
df_parquet = spark.read.parquet("orders_parquet")

# Keep rows that are Indian cuisine AND have order_amount above 500.
(
    df_parquet
    .where(col("cuisine") == "Indian")
    .where(col("order_amount") > 500)
    .show()
)

+--------+-------------+---------+-------------+-------+------------+---------------------+------------+------------+
|order_id|customer_name|     city|   restaurant|cuisine|order_amount|delivery_time_minutes|payment_mode|order_status|
+--------+-------------+---------+-------------+-------+------------+---------------------+------------+------------+
|    O009|        Vikas|   Mumbai|   BBQ Nation| Indian|        1500|                   60|        Card|   Delivered|
|    O011|       Farhan|    Delhi|Biryani House| Indian|         520|                   36|         UPI|   Delivered|
|    O017|        Rohit|Hyderabad|   BBQ Nation| Indian|        1400|                   62|        Card|   Delivered|
|    O028|        Irfan|   Mumbai|   BBQ Nation| Indian|        1550|                   65|        Card|   Delivered|
|    O035|        Zubin|    Delhi|   BBQ Nation| Indian|        1480|                   63|        Card|   Delivered|
+--------+-------------+---------+-------------+-------+

In [22]:
# Sort the Parquet data by order_amount (largest first), keep the first
# 10 rows and write them back to Parquet.
(
    df_parquet
    .orderBy("order_amount", ascending=False)
    .limit(10)
    .write
    .parquet("top10_orders_parquet", mode="overwrite")
)

# Read the top-10 output back for inspection.
df2_parquet = spark.read.parquet("top10_orders_parquet")
df2_parquet.show()

+--------+-------------+---------+----------+--------+------------+---------------------+------------+------------+
|order_id|customer_name|     city|restaurant| cuisine|order_amount|delivery_time_minutes|payment_mode|order_status|
+--------+-------------+---------+----------+--------+------------+---------------------+------------+------------+
|    O028|        Irfan|   Mumbai|BBQ Nation|  Indian|        1550|                   65|        Card|   Delivered|
|    O009|        Vikas|   Mumbai|BBQ Nation|  Indian|        1500|                   60|        Card|   Delivered|
|    O035|        Zubin|    Delhi|BBQ Nation|  Indian|        1480|                   63|        Card|   Delivered|
|    O017|        Rohit|Hyderabad|BBQ Nation|  Indian|        1400|                   62|        Card|   Delivered|
|    O004|        Pooja|   Mumbai| Sushi Bar|Japanese|        1200|                   55|         UPI|   Cancelled|
|    O038|        Sonia|   Mumbai| Sushi Bar|Japanese|        1180|     

#SECTION D — FORMAT CONVERSION

In [23]:
# CSV -> Parquet: persist the DataFrame that was read from CSV as Parquet.
df_csv.write.parquet("orders_csv_to_parquet", mode="overwrite")

In [24]:
# JSON -> Parquet: persist the DataFrame that was read from the
# delivered-orders JSON as Parquet.
df_json.write.parquet("delivered_orders_json_to_parquet", mode="overwrite")

In [34]:
# Read Parquet and write it back as CSV using delimiter |
# (`df_parquet` is the DataFrame loaded from "orders_parquet" in an earlier cell.)

# The pipe has to be set explicitly on the writer; the previous version passed
# "," here, which produced an ordinary comma-separated file.
df_parquet.write.mode("overwrite").option("header",True).option("delimiter","|").csv("orders_parquet_to_csv")

# Read back with the same delimiter; without it every pipe-separated line
# would be parsed as a single column.
dfx_csv = spark.read.option("header",True).option("inferSchema",True).option("delimiter","|").csv("orders_parquet_to_csv")
dfx_csv.show()

+--------+-------------+---------+------------+--------+------------+---------------------+------------+------------+
|order_id|customer_name|     city|  restaurant| cuisine|order_amount|delivery_time_minutes|payment_mode|order_status|
+--------+-------------+---------+------------+--------+------------+---------------------+------------+------------+
|    O021|        Priya|Hyderabad|  Pizza Town| Italian|         720|                   41|        Card|   Delivered|
|    O022|         Yash|  Chennai|   Sushi Bar|Japanese|        1150|                   57|         UPI|   Delivered|
|    O023|        Naina|Bangalore|Pasta Street| Italian|         680|                   44|         UPI|   Delivered|
|    O024|       Sameer|   Mumbai| Dragon Bowl| Chinese|         610|                   39|      Wallet|   Delivered|
|    O025|       Ritika|    Delhi| Burger Zone|American|         500|                   30|        Cash|   Delivered|
|    O026|        Gopal|Hyderabad|  Curry Leaf|  Indian|

#THINKING / ANALYTICS QUESTIONS

In [26]:
# Which cuisine generates the highest order_amount overall?
# Total the order amounts per cuisine and show only the largest total.
# (`sum` is the Spark aggregate brought in by the wildcard functions import.)
(
    df.groupBy("cuisine")
    .agg(sum("order_amount").alias("total_amount"))
    .orderBy("total_amount", ascending=False)
    .show(1)
)

+-------+------------+
|cuisine|total_amount|
+-------+------------+
| Indian|        9370|
+-------+------------+
only showing top 1 row


In [27]:
# Which city has the highest number of orders?
# Count rows per city and show only the city with the largest count.
(
    df.groupBy("city")
    .count()
    .orderBy("count", ascending=False)
    .show(1)
)

+-----+-----+
| city|count|
+-----+-----+
|Delhi|    9|
+-----+-----+
only showing top 1 row


In [28]:
# Which payment mode is most frequently used?
# Count rows per payment mode and show only the most common one.
(
    df.groupBy("payment_mode")
    .count()
    .orderBy("count", ascending=False)
    .show(1)
)

+------------+-----+
|payment_mode|count|
+------------+-----+
|         UPI|   17|
+------------+-----+
only showing top 1 row


#Optional Challenge

In [29]:
# Repartition the dataset into 4 partitions and write it to Parquet.
(
    df.repartition(4)
    .write
    .parquet("orders_4_partitioned", mode="overwrite")
)

In [31]:
"""Create a report dataset containing:
city
total_orders
total_revenue
Write it to Parquet."""

# One row per city: number of orders and summed order_amount.
# (`count` and `sum` are the Spark aggregates from the wildcard import.)
city_report = df.groupBy("city").agg(
    count("*").alias("total_orders"),
    sum("order_amount").alias("total_revenue"),
)
city_report.write.parquet("city_report", mode="overwrite")

# Read the report back from Parquet to confirm what was stored.
df3_parquet = spark.read.parquet("city_report")
df3_parquet.show()

+---------+------------+-------------+
|     city|total_orders|total_revenue|
+---------+------------+-------------+
|Bangalore|           8|         4630|
|  Chennai|           7|         4350|
|   Mumbai|           8|         8200|
|    Delhi|           9|         5690|
|Hyderabad|           8|         5880|
+---------+------------+-------------+

