#Spark SQL Exercise Set – Product Orders Analytics
Preparation Instructions
1. Create a PySpark DataFrame with the following schema:
OrderID (int)
CustomerName (string)
Product (string)
Category (string)
Quantity (int)
UnitPrice (int)
OrderDate (string in YYYY-MM-DD format)
2. Sample at least 12 rows across multiple categories:
"Electronics" , "Clothing" , "Furniture" , "Books"
3. Create:
A local temporary view: "orders_local"
A global temporary view: "orders_global"

In [42]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("ProductOrders").getOrCreate()
data = [
    (101, "Abinaya", "Smartphone", "Electronics", 2, 15000, "2025-07-01"),
    (102, "Sereesha", "T-Shirt", "Clothing", 3, 500, "2025-06-02"),
    (103, "Charu", "Laptop", "Electronics", 1, 60000, "2025-05-03"),
    (104, "Harish", "Sofa", "Furniture", 1, 25000, "2025-07-04"),
    (105, "Elakkiya", "Novel", "Books", 2, 300, "2025-07-05"),
    (106, "Kashifa", "Tablet", "Electronics", 1, 20000, "2025-06-06"),
    (107, "Roja", "Dress", "Clothing", 2, 1500, "2023-01-07"),
    (108, "Harish", "Bookshelf", "Furniture", 1, 8000, "2025-07-08"),
    (109, "Varshini", "Headphones", "Electronics", 2, 2500, "2025-06-09"),
    (110, "Lavanya", "Comic Book", "Books", 4, 200, "2023-01-10"),
    (111, "Sereesha", "Shirt", "Clothing", 1, 700, "2025-05-11"),
    (112, "Elakkiya", "Dining Table", "Furniture", 2, 8000, "2025-07-12")
]
columns = ["OrderID", "CustomerName", "Product", "Category", "Quantity", "UnitPrice", "OrderDate"]
df = spark.createDataFrame(data, columns)
df.show()

+-------+------------+------------+-----------+--------+---------+----------+
|OrderID|CustomerName|     Product|   Category|Quantity|UnitPrice| OrderDate|
+-------+------------+------------+-----------+--------+---------+----------+
|    101|     Abinaya|  Smartphone|Electronics|       2|    15000|2025-07-01|
|    102|    Sereesha|     T-Shirt|   Clothing|       3|      500|2025-06-02|
|    103|       Charu|      Laptop|Electronics|       1|    60000|2025-05-03|
|    104|      Harish|        Sofa|  Furniture|       1|    25000|2025-07-04|
|    105|    Elakkiya|       Novel|      Books|       2|      300|2025-07-05|
|    106|     Kashifa|      Tablet|Electronics|       1|    20000|2025-06-06|
|    107|        Roja|       Dress|   Clothing|       2|     1500|2023-01-07|
|    108|      Harish|   Bookshelf|  Furniture|       1|     8000|2025-07-08|
|    109|    Varshini|  Headphones|Electronics|       2|     2500|2025-06-09|
|    110|     Lavanya|  Comic Book|      Books|       4|      20

In [43]:
df.createOrReplaceTempView("orders_local")
df.createOrReplaceGlobalTempView("orders_global")

#Part A: Local View – orders_local

In [44]:
# 1. List all orders placed for "Electronics" with a Quantity of 2 or more.
spark.sql("select * from orders_local where Category = 'Electronics' and Quantity >= 2 ").show()

+-------+------------+----------+-----------+--------+---------+----------+
|OrderID|CustomerName|   Product|   Category|Quantity|UnitPrice| OrderDate|
+-------+------------+----------+-----------+--------+---------+----------+
|    101|     Abinaya|Smartphone|Electronics|       2|    15000|2025-07-01|
|    109|    Varshini|Headphones|Electronics|       2|     2500|2025-06-09|
+-------+------------+----------+-----------+--------+---------+----------+



In [45]:
# 2. Calculate TotalAmount (Quantity × UnitPrice) for each order.
spark.sql("select OrderID, (Quantity * UnitPrice) as TotalAmount from orders_local ").show()

+-------+-----------+
|OrderID|TotalAmount|
+-------+-----------+
|    101|      30000|
|    102|       1500|
|    103|      60000|
|    104|      25000|
|    105|        600|
|    106|      20000|
|    107|       3000|
|    108|       8000|
|    109|       5000|
|    110|        800|
|    111|        700|
|    112|      16000|
+-------+-----------+



In [46]:
# 3. Show the total number of orders per Category .
spark.sql("select Category, count(*) as Total_no_orders from orders_local group by Category ").show()

+-----------+---------------+
|   Category|Total_no_orders|
+-----------+---------------+
|Electronics|              4|
|   Clothing|              3|
|      Books|              2|
|  Furniture|              3|
+-----------+---------------+



In [56]:
# 4. List orders placed in "January 2023" only.
spark.sql("select * from orders_local where OrderDate like '2023-01%'").show()

+-------+------------+----------+--------+--------+---------+----------+
|OrderID|CustomerName|   Product|Category|Quantity|UnitPrice| OrderDate|
+-------+------------+----------+--------+--------+---------+----------+
|    107|        Roja|     Dress|Clothing|       2|     1500|2023-01-07|
|    110|     Lavanya|Comic Book|   Books|       4|      200|2023-01-10|
+-------+------------+----------+--------+--------+---------+----------+



In [55]:
# 5. Show the average UnitPrice per category.
spark.sql("select Category, round(avg(UnitPrice),2) from orders_local group by Category").show()

+-----------+------------------------+
|   Category|round(avg(UnitPrice), 2)|
+-----------+------------------------+
|Electronics|                 24375.0|
|   Clothing|                   900.0|
|      Books|                   250.0|
|  Furniture|                13666.67|
+-----------+------------------------+



In [57]:
# 6. Find the order with the highest total amount.
spark.sql("select *,(Quantity * UnitPrice) as TotalAmount from orders_local order by TotalAmount Desc limit 1").show()

+-------+------------+-------+-----------+--------+---------+----------+-----------+
|OrderID|CustomerName|Product|   Category|Quantity|UnitPrice| OrderDate|TotalAmount|
+-------+------------+-------+-----------+--------+---------+----------+-----------+
|    103|       Charu| Laptop|Electronics|       1|    60000|2025-05-03|      60000|
+-------+------------+-------+-----------+--------+---------+----------+-----------+



In [58]:
# 7. Drop the local view and try querying it again.
spark.catalog.dropTempView("orders_local")

True

In [60]:
# running querry after froppinf local view gives error
spark.sql("select * from orders_local ").show()

AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `orders_local` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [orders_local], [], false


#Part B: Global View – orders_global

In [61]:
# 1. Display all "Furniture" orders with TotalAmount above 10,000.
spark.sql("select * from global_temp.orders_global where Category = 'Furniture' and Quantity * UnitPrice > 10000 ").show()

+-------+------------+------------+---------+--------+---------+----------+
|OrderID|CustomerName|     Product| Category|Quantity|UnitPrice| OrderDate|
+-------+------------+------------+---------+--------+---------+----------+
|    104|      Harish|        Sofa|Furniture|       1|    25000|2025-07-04|
|    112|    Elakkiya|Dining Table|Furniture|       2|     8000|2025-07-12|
+-------+------------+------------+---------+--------+---------+----------+



In [62]:
# 2. Create a column called DiscountFlag :
# Mark "Yes" if Quantity > 3
# Otherwise "No"
from pyspark.sql.functions import when
discount_df = df.withColumn('DiscountFlag', when(df.Quantity >= 3, 'Yes').otherwise('NO'))
discount_df.show()

+-------+------------+------------+-----------+--------+---------+----------+------------+
|OrderID|CustomerName|     Product|   Category|Quantity|UnitPrice| OrderDate|DiscountFlag|
+-------+------------+------------+-----------+--------+---------+----------+------------+
|    101|     Abinaya|  Smartphone|Electronics|       2|    15000|2025-07-01|          NO|
|    102|    Sereesha|     T-Shirt|   Clothing|       3|      500|2025-06-02|         Yes|
|    103|       Charu|      Laptop|Electronics|       1|    60000|2025-05-03|          NO|
|    104|      Harish|        Sofa|  Furniture|       1|    25000|2025-07-04|          NO|
|    105|    Elakkiya|       Novel|      Books|       2|      300|2025-07-05|          NO|
|    106|     Kashifa|      Tablet|Electronics|       1|    20000|2025-06-06|          NO|
|    107|        Roja|       Dress|   Clothing|       2|     1500|2023-01-07|          NO|
|    108|      Harish|   Bookshelf|  Furniture|       1|     8000|2025-07-08|          NO|

In [63]:
# 3. List customers who ordered more than 1 product type (Hint: use GROUP BY and HAVING).
spark.sql("select CustomerName,count(*) from global_temp.orders_global group by CustomerName having count(*) > 1").show()

+------------+--------+
|CustomerName|count(1)|
+------------+--------+
|      Harish|       2|
|    Sereesha|       2|
|    Elakkiya|       2|
+------------+--------+



In [64]:
# 4. Count number of orders per month across the dataset.
spark.sql("select substring(OrderDate, 1, 7) as Month, count(*) OrderCount from global_temp.orders_global group by Month order by Month").show()

+-------+----------+
|  Month|OrderCount|
+-------+----------+
|2023-01|         2|
|2025-05|         2|
|2025-06|         3|
|2025-07|         5|
+-------+----------+



In [65]:
# 5. Rank all products by total quantity sold across all orders using a window function.
from pyspark.sql.functions import sum, rank
from pyspark.sql.window import Window
product_totals = df.groupBy("Product").agg(sum("Quantity").alias("TotalQuantity"))
ranked = product_totals.withColumn("Rank",rank().over(Window.orderBy(product_totals["TotalQuantity"].desc())))
ranked.show()

+------------+-------------+----+
|     Product|TotalQuantity|Rank|
+------------+-------------+----+
|  Comic Book|            4|   1|
|     T-Shirt|            3|   2|
|       Novel|            2|   3|
|  Smartphone|            2|   3|
|       Dress|            2|   3|
|Dining Table|            2|   3|
|  Headphones|            2|   3|
|      Laptop|            1|   8|
|        Sofa|            1|   8|
|      Tablet|            1|   8|
|   Bookshelf|            1|   8|
|       Shirt|            1|   8|
+------------+-------------+----+



In [67]:
# 6. Run a query using a new SparkSession and the global view.
new_spark = SparkSession.builder.appName("ProductOrdersNew").getOrCreate()

In [68]:
# running querry after creating new session
spark.sql("select * from global_temp.orders_global ").show()

+-------+------------+------------+-----------+--------+---------+----------+
|OrderID|CustomerName|     Product|   Category|Quantity|UnitPrice| OrderDate|
+-------+------------+------------+-----------+--------+---------+----------+
|    101|     Abinaya|  Smartphone|Electronics|       2|    15000|2025-07-01|
|    102|    Sereesha|     T-Shirt|   Clothing|       3|      500|2025-06-02|
|    103|       Charu|      Laptop|Electronics|       1|    60000|2025-05-03|
|    104|      Harish|        Sofa|  Furniture|       1|    25000|2025-07-04|
|    105|    Elakkiya|       Novel|      Books|       2|      300|2025-07-05|
|    106|     Kashifa|      Tablet|Electronics|       1|    20000|2025-06-06|
|    107|        Roja|       Dress|   Clothing|       2|     1500|2023-01-07|
|    108|      Harish|   Bookshelf|  Furniture|       1|     8000|2025-07-08|
|    109|    Varshini|  Headphones|Electronics|       2|     2500|2025-06-09|
|    110|     Lavanya|  Comic Book|      Books|       4|      20

#Bonus Challenges

In [72]:
# 1. Save a filtered subset (only "Books" category) as a new global temp view.
df.filter(df.Category == "Books").createOrReplaceGlobalTempView("books_orders")
spark.sql("select * from global_temp.books_orders").show()

+-------+------------+----------+--------+--------+---------+----------+
|OrderID|CustomerName|   Product|Category|Quantity|UnitPrice| OrderDate|
+-------+------------+----------+--------+--------+---------+----------+
|    105|    Elakkiya|     Novel|   Books|       2|      300|2025-07-05|
|    110|     Lavanya|Comic Book|   Books|       4|      200|2023-01-10|
+-------+------------+----------+--------+--------+---------+----------+



In [76]:
# 2. Find the most purchased product per category.
product_totals = df.groupBy("Category", "Product").agg(sum("Quantity").alias("TotalQuantity"))
windowSpec = Window.partitionBy("Category").orderBy(product_totals["TotalQuantity"].desc())
ranked = product_totals.withColumn("Rank", rank().over(windowSpec))
top_products = ranked.filter(ranked.Rank == 1)
top_products.select("Category", "Product", "TotalQuantity").show()

+-----------+------------+-------------+
|   Category|     Product|TotalQuantity|
+-----------+------------+-------------+
|      Books|  Comic Book|            4|
|   Clothing|     T-Shirt|            3|
|Electronics|  Smartphone|            2|
|Electronics|  Headphones|            2|
|  Furniture|Dining Table|            2|
+-----------+------------+-------------+



In [78]:
# 3. Create a view that excludes all "Clothing" orders and call it "filtered_orders" .
df.filter(df.Category != "Clothing").createOrReplaceTempView("filtered_orders")
spark.sql("select * from filtered_orders").show()

+-------+------------+------------+-----------+--------+---------+----------+
|OrderID|CustomerName|     Product|   Category|Quantity|UnitPrice| OrderDate|
+-------+------------+------------+-----------+--------+---------+----------+
|    101|     Abinaya|  Smartphone|Electronics|       2|    15000|2025-07-01|
|    103|       Charu|      Laptop|Electronics|       1|    60000|2025-05-03|
|    104|      Harish|        Sofa|  Furniture|       1|    25000|2025-07-04|
|    105|    Elakkiya|       Novel|      Books|       2|      300|2025-07-05|
|    106|     Kashifa|      Tablet|Electronics|       1|    20000|2025-06-06|
|    108|      Harish|   Bookshelf|  Furniture|       1|     8000|2025-07-08|
|    109|    Varshini|  Headphones|Electronics|       2|     2500|2025-06-09|
|    110|     Lavanya|  Comic Book|      Books|       4|      200|2023-01-10|
|    112|    Elakkiya|Dining Table|  Furniture|       2|     8000|2025-07-12|
+-------+------------+------------+-----------+--------+--------