Preparation Instructions

In [1]:
from pyspark.sql import SparkSession
# Build the notebook's Spark session (getOrCreate reuses one if it already exists).
spark = (
    SparkSession.builder
    .appName("Orders DataFrame")
    .getOrCreate()
)


In [2]:
from pyspark.sql import Row

# Column names, listed in the same order as the values of every record below.
columns = ["OrderID", "CustomerName", "Product", "Category", "Quantity", "UnitPrice", "OrderDate"]

# One tuple per order; OrderDate is kept as an ISO-formatted string.
order_records = [
    (101, "Alice", "Laptop", "Electronics", 1, 70000, "2025-07-01"),
    (102, "Bob", "T-Shirt", "Clothing", 3, 500, "2025-07-02"),
    (103, "Carol", "Bookshelf", "Furniture", 1, 3500, "2025-07-03"),
    (104, "David", "Smartphone", "Electronics", 2, 30000, "2025-07-04"),
    (105, "Eve", "Dress", "Clothing", 2, 1500, "2025-07-05"),
    (106, "Frank", "Chair", "Furniture", 4, 1000, "2025-07-06"),
    (107, "Grace", "Fiction Novel", "Books", 5, 300, "2025-07-07"),
    (108, "Heidi", "Tablet", "Electronics", 1, 25000, "2025-07-08"),
    (109, "Ivan", "Jeans", "Clothing", 1, 1200, "2025-07-09"),
    (110, "Judy", "Dining Table", "Furniture", 1, 8000, "2025-07-10"),
    (111, "Mallory", "Textbook", "Books", 3, 600, "2025-07-11"),
    (112, "Oscar", "Monitor", "Electronics", 2, 15000, "2025-07-12"),
]

# Wrap every record in a positional Row; the names come from `columns`.
data = [Row(*record) for record in order_records]

df = spark.createDataFrame(data, schema=columns)
df.show()


+-------+------------+-------------+-----------+--------+---------+----------+
|OrderID|CustomerName|      Product|   Category|Quantity|UnitPrice| OrderDate|
+-------+------------+-------------+-----------+--------+---------+----------+
|    101|       Alice|       Laptop|Electronics|       1|    70000|2025-07-01|
|    102|         Bob|      T-Shirt|   Clothing|       3|      500|2025-07-02|
|    103|       Carol|    Bookshelf|  Furniture|       1|     3500|2025-07-03|
|    104|       David|   Smartphone|Electronics|       2|    30000|2025-07-04|
|    105|         Eve|        Dress|   Clothing|       2|     1500|2025-07-05|
|    106|       Frank|        Chair|  Furniture|       4|     1000|2025-07-06|
|    107|       Grace|Fiction Novel|      Books|       5|      300|2025-07-07|
|    108|       Heidi|       Tablet|Electronics|       1|    25000|2025-07-08|
|    109|        Ivan|        Jeans|   Clothing|       1|     1200|2025-07-09|
|    110|        Judy| Dining Table|  Furniture|    

In [3]:
# Register the orders DataFrame under two view names:
#   - orders_global: a global temp view, queried later as global_temp.orders_global
#   - orders_local:  a temp view, queried later by its bare name
df.createOrReplaceGlobalTempView("orders_global")
df.createOrReplaceTempView("orders_local")

Part A: Local View – orders_local

In [4]:
# Electronics orders with a quantity of at least two.
electronics_bulk = spark.sql(
    "select * from orders_local where Category='Electronics' and Quantity >= 2"
)
electronics_bulk.show()

+-------+------------+----------+-----------+--------+---------+----------+
|OrderID|CustomerName|   Product|   Category|Quantity|UnitPrice| OrderDate|
+-------+------------+----------+-----------+--------+---------+----------+
|    104|       David|Smartphone|Electronics|       2|    30000|2025-07-04|
|    112|       Oscar|   Monitor|Electronics|       2|    15000|2025-07-12|
+-------+------------+----------+-----------+--------+---------+----------+



In [5]:
# Every order plus its total amount (Quantity * UnitPrice).
orders_with_amount = spark.sql(
    "select *, Quantity * UnitPrice as Tot_amount from orders_local"
)
orders_with_amount.show()

+-------+------------+-------------+-----------+--------+---------+----------+----------+
|OrderID|CustomerName|      Product|   Category|Quantity|UnitPrice| OrderDate|Tot_amount|
+-------+------------+-------------+-----------+--------+---------+----------+----------+
|    101|       Alice|       Laptop|Electronics|       1|    70000|2025-07-01|     70000|
|    102|         Bob|      T-Shirt|   Clothing|       3|      500|2025-07-02|      1500|
|    103|       Carol|    Bookshelf|  Furniture|       1|     3500|2025-07-03|      3500|
|    104|       David|   Smartphone|Electronics|       2|    30000|2025-07-04|     60000|
|    105|         Eve|        Dress|   Clothing|       2|     1500|2025-07-05|      3000|
|    106|       Frank|        Chair|  Furniture|       4|     1000|2025-07-06|      4000|
|    107|       Grace|Fiction Novel|      Books|       5|      300|2025-07-07|      1500|
|    108|       Heidi|       Tablet|Electronics|       1|    25000|2025-07-08|     25000|
|    109| 

In [7]:
# Number of orders per category.
# count(*) counts every row in the group; count(Quantity) would silently skip
# any order whose Quantity is NULL and under-report the order count.
spark.sql("select Category, count(*) as total_Orders from orders_local group by Category").show()

+-----------+------------+
|   Category|total_Orders|
+-----------+------------+
|Electronics|           4|
|   Clothing|           3|
|  Furniture|           3|
|      Books|           2|
+-----------+------------+



In [8]:
# Orders whose OrderDate string starts with "2023-01-" (January 2023).
# Every sample order is dated July 2025, so the result below is empty.
january_2023_orders = spark.sql(
    "select * from orders_local where OrderDate like '2023-01-%'"
)
january_2023_orders.show()

+-------+------------+-------+--------+--------+---------+---------+
|OrderID|CustomerName|Product|Category|Quantity|UnitPrice|OrderDate|
+-------+------------+-------+--------+--------+---------+---------+
+-------+------------+-------+--------+--------+---------+---------+



In [11]:
# Average UnitPrice per category, rounded to two decimals.
avg_price_by_category = spark.sql(
    "select Category, round(avg(UnitPrice),2) as avg_unitPrice from orders_local group by Category"
)
avg_price_by_category.show()

+-----------+-------------+
|   Category|avg_unitPrice|
+-----------+-------------+
|Electronics|      35000.0|
|   Clothing|      1066.67|
|  Furniture|      4166.67|
|      Books|        450.0|
+-----------+-------------+



In [15]:
# Step 1: register a temp view that adds each order's total amount.
total_view_ddl = """
    create or replace temp view orders_with_total as
    select *, quantity * unitprice as tot_amount
    from orders_local
"""
spark.sql(total_view_ddl)

# Step 2: the single order with the highest total amount.
top_order_query = """
    select * from orders_with_total
    order by tot_amount desc
    limit 1
"""
spark.sql(top_order_query).show()

+-------+------------+-------+-----------+--------+---------+----------+----------+
|OrderID|CustomerName|Product|   Category|Quantity|UnitPrice| OrderDate|tot_amount|
+-------+------------+-------+-----------+--------+---------+----------+----------+
|    101|       Alice| Laptop|Electronics|       1|    70000|2025-07-01|     70000|
+-------+------------+-------+-----------+--------+---------+----------+----------+



In [16]:
# Drop the orders_local temp view; the call's boolean result is the cell output.
spark.catalog.dropTempView(viewName="orders_local")


True

In [None]:
# orders_local was dropped in the previous step, so this query is expected to
# fail. Catch the error and display it, so the demonstration does not abort a
# "Run All" of the notebook.
from pyspark.sql.utils import AnalysisException

try:
    spark.sql("select * from orders_local").show()
except AnalysisException as err:
    print(f"orders_local is no longer available: {err}")


Part B: Global View – orders_global

In [28]:
# (Re-)register the orders DataFrame as the global temp view used throughout Part B.
df.createOrReplaceGlobalTempView(name="orders_global")


In [37]:
# Step 1: global temp view that adds each order's total amount.
global_total_view_ddl = """
    create or replace global temporary view orders_global_with_total as
    select *, quantity * unitprice as tot_amount
    from global_temp.orders_global
"""
spark.sql(global_total_view_ddl)

# Step 2: Furniture orders whose total amount exceeds 10000.
expensive_furniture_query = """
    select *
    from global_temp.orders_global_with_total
    where category = 'Furniture' and tot_amount > 10000
"""
spark.sql(expensive_furniture_query).show()


+-------+------------+-------+--------+--------+---------+---------+----------+
|OrderID|CustomerName|Product|Category|Quantity|UnitPrice|OrderDate|tot_amount|
+-------+------------+-------+--------+--------+---------+---------+----------+
+-------+------------+-------+--------+--------+---------+---------+----------+



In [38]:
from pyspark.sql.functions import when, col

# Flag orders of more than three units as eligible for a discount.
# Keep the DataFrame and the display separate: show() returns None, so chaining
# it onto the assignment would leave df_status holding None, not the DataFrame.
df_status = df.withColumn(
    "DiscountFlag",
    when(col("Quantity") > 3, "Yes").otherwise("No"),
)
df_status.show()

+-------+------------+-------------+-----------+--------+---------+----------+------------+
|OrderID|CustomerName|      Product|   Category|Quantity|UnitPrice| OrderDate|DiscountFlag|
+-------+------------+-------------+-----------+--------+---------+----------+------------+
|    101|       Alice|       Laptop|Electronics|       1|    70000|2025-07-01|          No|
|    102|         Bob|      T-Shirt|   Clothing|       3|      500|2025-07-02|          No|
|    103|       Carol|    Bookshelf|  Furniture|       1|     3500|2025-07-03|          No|
|    104|       David|   Smartphone|Electronics|       2|    30000|2025-07-04|          No|
|    105|         Eve|        Dress|   Clothing|       2|     1500|2025-07-05|          No|
|    106|       Frank|        Chair|  Furniture|       4|     1000|2025-07-06|         Yes|
|    107|       Grace|Fiction Novel|      Books|       5|      300|2025-07-07|         Yes|
|    108|       Heidi|       Tablet|Electronics|       1|    25000|2025-07-08|  

In [None]:
# Customers who ordered more than one distinct product.
multi_product_query = """
    select CustomerName, count(distinct Product) as product_types_ordered
    from global_temp.orders_global
    group by CustomerName
    having count(distinct Product) > 1
"""
multi_product_customers = spark.sql(multi_product_query)
multi_product_customers.show()


In [41]:
# Order count per month; the first seven characters of OrderDate are "YYYY-MM".
orders_per_month_query = """
    select
        substr(OrderDate, 1, 7) as year_month,
        count(*) as orders_count
    from global_temp.orders_global
    group by substr(OrderDate, 1, 7)
    order by year_month
"""
orders_per_month = spark.sql(orders_per_month_query)
orders_per_month.show()


+----------+------------+
|year_month|orders_count|
+----------+------------+
|   2025-07|          12|
+----------+------------+



In [46]:
# Rank products by total quantity sold; products with equal totals share a rank.
product_rank_query = """
    select
        product,
        sum(quantity) as total_quantity,
        rank() over (order by sum(quantity) desc) as rank
    from global_temp.orders_global
    group by product
    order by rank
"""
product_ranking = spark.sql(product_rank_query)
product_ranking.show()



+-------------+--------------+----+
|      product|total_quantity|rank|
+-------------+--------------+----+
|Fiction Novel|             5|   1|
|        Chair|             4|   2|
|      T-Shirt|             3|   3|
|     Textbook|             3|   3|
|        Dress|             2|   5|
|   Smartphone|             2|   5|
|      Monitor|             2|   5|
|       Laptop|             1|   8|
|    Bookshelf|             1|   8|
|       Tablet|             1|   8|
| Dining Table|             1|   8|
|        Jeans|             1|   8|
+-------------+--------------+----+



In [47]:
from pyspark.sql import SparkSession

# SparkSession.builder.getOrCreate() hands back the session that is already
# running, so it cannot demonstrate cross-session visibility of a global view.
# newSession() creates a separate session on the same SparkContext, with its
# own temp views, which is what the global-view check below needs.
new_spark = spark.newSession()


In [48]:
# Read the global temp view through the second session handle.
global_preview = new_spark.sql("select * from global_temp.orders_global limit 5")
global_preview.show()


+-------+------------+----------+-----------+--------+---------+----------+
|OrderID|CustomerName|   Product|   Category|Quantity|UnitPrice| OrderDate|
+-------+------------+----------+-----------+--------+---------+----------+
|    101|       Alice|    Laptop|Electronics|       1|    70000|2025-07-01|
|    102|         Bob|   T-Shirt|   Clothing|       3|      500|2025-07-02|
|    103|       Carol| Bookshelf|  Furniture|       1|     3500|2025-07-03|
|    104|       David|Smartphone|Electronics|       2|    30000|2025-07-04|
|    105|         Eve|     Dress|   Clothing|       2|     1500|2025-07-05|
+-------+------------+----------+-----------+--------+---------+----------+



Bonus Challenges

In [62]:
# Create the Books-only global view before querying it, so this cell works on a
# fresh kernel instead of relying on a definition that appears further down.
spark.sql("""
    create or replace global temporary view books_only as
    select *
    from global_temp.orders_global
    where category = 'Books'
""")

spark.sql("select * from global_temp.books_only").show()


+-------+------------+-------------+--------+--------+---------+----------+
|OrderID|CustomerName|      Product|Category|Quantity|UnitPrice| OrderDate|
+-------+------------+-------------+--------+--------+---------+----------+
|    107|       Grace|Fiction Novel|   Books|       5|      300|2025-07-07|
|    111|     Mallory|     Textbook|   Books|       3|      600|2025-07-11|
+-------+------------+-------------+--------+--------+---------+----------+



In [None]:
# Global temp view containing only the Books orders.
# The CREATE VIEW statement yields no rows, so calling show() on it displays
# nothing useful; run the DDL on its own and then display the view's contents.
spark.sql("""
    create or replace global temporary view books_only as
    select *
    from global_temp.orders_global
    where category = 'Books'
""")

spark.sql("select * from global_temp.books_only").show()


In [65]:
# Best-selling product per category.
# Products can tie on total quantity (Smartphone and Monitor both total 2 in
# Electronics), and row_number() would then pick one arbitrarily. Break ties by
# the earliest OrderID so the winner is the same on every run, and sort the
# final rows by category so the displayed order is stable as well.
spark.sql("""
    select category, product, total_quantity from (
        select category,product,
            sum(quantity) as total_quantity,
            row_number() over (
                partition by category
                order by sum(quantity) desc, min(OrderID)
            ) as rn
        from global_temp.orders_global
        group by category, product
    ) tmp
    where rn = 1
    order by category
""").show()


+-----------+-------------+--------------+
|   category|      product|total_quantity|
+-----------+-------------+--------------+
|      Books|Fiction Novel|             5|
|   Clothing|      T-Shirt|             3|
|Electronics|   Smartphone|             2|
|  Furniture|        Chair|             4|
+-----------+-------------+--------------+



In [None]:
# Temp view with every order except Clothing.
# The comparison is case-sensitive and the data stores the category as
# 'Clothing', so the literal must match that spelling; with 'clothing' no row
# is excluded. The CREATE VIEW statement itself yields no rows, so the view is
# queried separately to display the filtered result.
spark.sql("""
    create or replace temp view filtered_orders as
    select *
    from global_temp.orders_global
    where category != 'Clothing'
""")

spark.sql("select * from filtered_orders").show()
