## Dataset: sales_data.json (nested JSON)


In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
                    .appName("Exercise_1") \
                .getOrCreate()
spark

<pyspark.sql.connect.session.SparkSession at 0x7f94a98d5f10>

In [0]:
from pyspark.sql import Row 
data = [ 
        Row(OrderID=101, Customer="Ali", Items=[{"Product":"Laptop", "Qty":1}, 
        {"Product":"Mouse", "Qty":2}], Region="Asia", Amount=1200.0), 
        Row(OrderID=102, Customer="Zara", Items=[{"Product":"Tablet", "Qty":1}], 
        Region="Europe", Amount=650.0), 
        Row(OrderID=103, Customer="Mohan", Items=[{"Product":"Phone", "Qty":2}, 
        {"Product":"Charger", "Qty":1}], Region="Asia", Amount=890.0), 
        Row(OrderID=104, Customer="Sara", Items=[{"Product":"Desk", "Qty":1}], 
        Region="US", Amount=450.0) 
        ] 
df_sales = spark.createDataFrame(data) 
df_sales.show(truncate=False)

+-------+--------+--------------------------------------------------------------+------+------+
|OrderID|Customer|Items                                                         |Region|Amount|
+-------+--------+--------------------------------------------------------------+------+------+
|101    |Ali     |[{Product -> Laptop, Qty -> 1}, {Product -> Mouse, Qty -> 2}] |Asia  |1200.0|
|102    |Zara    |[{Product -> Tablet, Qty -> 1}]                               |Europe|650.0 |
|103    |Mohan   |[{Product -> Phone, Qty -> 2}, {Product -> Charger, Qty -> 1}]|Asia  |890.0 |
|104    |Sara    |[{Product -> Desk, Qty -> 1}]                                 |US    |450.0 |
+-------+--------+--------------------------------------------------------------+------+------+



## PySpark Exercises – Set 4 (SQL, JSON, Advanced Functions)
### Working with JSON & Nested Fields


1. Flatten the Items array using explode() to create one row per product .

In [0]:
from pyspark.sql.functions import col, explode
sales_df = df_sales.withColumn("Item", explode("Items"))
sales_df.show()


+-------+--------+--------------------+------+------+--------------------+
|OrderID|Customer|               Items|Region|Amount|                Item|
+-------+--------+--------------------+------+------+--------------------+
|    101|     Ali|[{Product -> Lapt...|  Asia|1200.0|{Product -> Lapto...|
|    101|     Ali|[{Product -> Lapt...|  Asia|1200.0|{Product -> Mouse...|
|    102|    Zara|[{Product -> Tabl...|Europe| 650.0|{Product -> Table...|
|    103|   Mohan|[{Product -> Phon...|  Asia| 890.0|{Product -> Phone...|
|    103|   Mohan|[{Product -> Phon...|  Asia| 890.0|{Product -> Charg...|
|    104|    Sara|[{Product -> Desk...|    US| 450.0|{Product -> Desk,...|
+-------+--------+--------------------+------+------+--------------------+



2. Count total quantity sold per product.

In [0]:
from pyspark.sql.types import IntegerType
sales_df = sales_df.withColumn("Product", col("Item.Product")) \
                   .withColumn("Quantity", col("Item.Qty").cast("int"))

sales_df.groupBy("Product") \
        .sum("Quantity") \
        .withColumnRenamed("sum(Quantity)", "TotalQuantity") \
        .show()


+-------+-------------+
|Product|TotalQuantity|
+-------+-------------+
|  Mouse|            2|
| Laptop|            1|
| Tablet|            1|
|Charger|            1|
|  Phone|            2|
|   Desk|            1|
+-------+-------------+



3. Count number of orders per region.

In [0]:
sales_df.groupBy(col('Region')) \
        .count() \
        .withColumnRenamed("count", "TotalOrders")\
        .show()

+-------+--------+--------------------+------+------+--------------------+-------+--------+
|OrderID|Customer|               Items|Region|Amount|                Item|Product|Quantity|
+-------+--------+--------------------+------+------+--------------------+-------+--------+
|    101|     Ali|[{Product -> Lapt...|  Asia|1200.0|{Product -> Lapto...| Laptop|       1|
|    101|     Ali|[{Product -> Lapt...|  Asia|1200.0|{Product -> Mouse...|  Mouse|       2|
|    102|    Zara|[{Product -> Tabl...|Europe| 650.0|{Product -> Table...| Tablet|       1|
|    103|   Mohan|[{Product -> Phon...|  Asia| 890.0|{Product -> Phone...|  Phone|       2|
|    103|   Mohan|[{Product -> Phon...|  Asia| 890.0|{Product -> Charg...|Charger|       1|
|    104|    Sara|[{Product -> Desk...|    US| 450.0|{Product -> Desk,...|   Desk|       1|
+-------+--------+--------------------+------+------+--------------------+-------+--------+

+------+-----------+
|Region|TotalOrders|
+------+-----------+
|  Asia|        

### Using when and otherwise

4.  Create a new column \
HighValueOrder : \
"Yes" if Amount > 1000
"No" otherwise

In [0]:
from pyspark.sql.functions import when
sales_df = sales_df.withColumn("HighValueOrder", 
                               when(col('Amount') > 1000, "Yes")
                               .otherwise("No")
                               )
sales_df.select(
                "OrderID",
                "Customer",
                "Amount",
                "HighValueOrder"
).show()

+-------+--------+------+--------------+
|OrderID|Customer|Amount|HighValueOrder|
+-------+--------+------+--------------+
|    101|     Ali|1200.0|           Yes|
|    101|     Ali|1200.0|           Yes|
|    102|    Zara| 650.0|            No|
|    103|   Mohan| 890.0|            No|
|    103|   Mohan| 890.0|            No|
|    104|    Sara| 450.0|            No|
+-------+--------+------+--------------+



5.  Add a column ShippingZone : \
 Asia → "Zone A", Europe → "Zone B", US → "Zone C"

In [0]:
sales_df = sales_df.withColumn("ShippingZone", 
                               when(col('Region') == "Asia", "Zone A") \
                               .when(col('Region') == "Europe", "Zone B") \
                               .when(col('Region') == "US", "Zone C"))
sales_df.select(
                "Customer",
                "Region",
                "ShippingZone"
).show()

+--------+------+------------+
|Customer|Region|ShippingZone|
+--------+------+------------+
|     Ali|  Asia|      Zone A|
|     Ali|  Asia|      Zone A|
|    Zara|Europe|      Zone B|
|   Mohan|  Asia|      Zone A|
|   Mohan|  Asia|      Zone A|
|    Sara|    US|      Zone C|
+--------+------+------------+



### Temporary & Permanent Views

6. Register df_sales as a temporary view named sales_view .

In [0]:
sales_df.createOrReplaceTempView("sales_view")

7. Write a SQL query to: \
o count orders by Region \
o Find average amount per region \

In [0]:
spark.sql("""
    SELECT
        OrderID,
        Region,
        Product,
        Quantity,
        Amount,
        COUNT(OrderID) OVER (PARTITION BY Region) AS OrdersByRegion,
        AVG(Amount) OVER (PARTITION BY Region) AS AvgAmountByRegion
    FROM sales_view
""").show()


+-------+------+-------+--------+------+--------------+-----------------+
|OrderID|Region|Product|Quantity|Amount|OrdersByRegion|AvgAmountByRegion|
+-------+------+-------+--------+------+--------------+-----------------+
|    101|  Asia| Laptop|       1|1200.0|             4|           1045.0|
|    101|  Asia|  Mouse|       2|1200.0|             4|           1045.0|
|    103|  Asia|  Phone|       2| 890.0|             4|           1045.0|
|    103|  Asia|Charger|       1| 890.0|             4|           1045.0|
|    102|Europe| Tablet|       1| 650.0|             1|            650.0|
|    104|    US|   Desk|       1| 450.0|             1|            450.0|
+-------+------+-------+--------+------+--------------+-----------------+



8. Create a permanent view using saveAsTable() 

In [0]:
sales_df.write.mode("overwrite").saveAsTable("sales_permanent_table")

spark.sql("select * from sales_permanent_table").show()

+-------+--------+--------------------+------+------+--------------------+-------+--------+--------------+------------+
|OrderID|Customer|               Items|Region|Amount|                Item|Product|Quantity|HighValueOrder|ShippingZone|
+-------+--------+--------------------+------+------+--------------------+-------+--------+--------------+------------+
|    101|     Ali|[{Product -> Lapt...|  Asia|1200.0|{Product -> Lapto...| Laptop|       1|           Yes|      Zone A|
|    101|     Ali|[{Product -> Lapt...|  Asia|1200.0|{Product -> Mouse...|  Mouse|       2|           Yes|      Zone A|
|    102|    Zara|[{Product -> Tabl...|Europe| 650.0|{Product -> Table...| Tablet|       1|            No|      Zone B|
|    103|   Mohan|[{Product -> Phon...|  Asia| 890.0|{Product -> Phone...|  Phone|       2|            No|      Zone A|
|    103|   Mohan|[{Product -> Phon...|  Asia| 890.0|{Product -> Charg...|Charger|       1|            No|      Zone A|
|    104|    Sara|[{Product -> Desk...| 

###  SQL Queries via Spark

In [0]:
 spark.sql("""
            SELECT 
                Region, 
                COUNT(*) as OrderCount 
            FROM sales_view 
            GROUP BY Region
""").show()

+------+----------+
|Region|OrderCount|
+------+----------+
|  Asia|         4|
|Europe|         1|
|    US|         1|
+------+----------+



9. Use SQL to filter all orders with more than 1 item.

In [0]:
spark.sql("""
            WITH OrderInfo AS(
                            SELECT
                                OrderID,
                                Product,
                                Quantity,
                                ROW_NUMBER() OVER(PARTITION BY OrderID ORDER BY Quantity DESC) as row_num
                            FROM sales_view)
            SELECT * 
            FROM OrderInfo
            WHERE row_num > 1
        
          """).show()

+-------+-------+--------+-------+
|OrderID|Product|Quantity|row_num|
+-------+-------+--------+-------+
|    101| Laptop|       1|      2|
|    103|Charger|       1|      2|
+-------+-------+--------+-------+



10. Use SQL to extract customer names where Amount > 800.

In [0]:
spark.sql("""
            SELECT
                Customer,
                SUM(Amount) AS TotalAmount
            FROM sales_view
            GROUP BY(Customer)
          """
          ).show()

+--------+-----------+
|Customer|TotalAmount|
+--------+-----------+
|     Ali|     2400.0|
|    Zara|      650.0|
|   Mohan|     1780.0|
|    Sara|      450.0|
+--------+-----------+



### Saving as Parquet and Reading Again

11. Save the exploded product-level DataFrame as a partitioned Parquet file by
 Region .

In [0]:

exploded_df = df_sales.withColumn("Item", explode(col("Items"))) \
                      .select(
                          "OrderID", 
                          "Customer", 
                          "Region", 
                          "Amount", 
                          col("Item.Product").alias("Product"), 
                          col("Item.Qty").alias("Quantity")
                      )

exploded_df.write.mode('Overwrite')\
           .partitionBy("Region") \
           .parquet("dbfs:/FileStore/exploded_parquet")
          

12. Read the parquet back and perform a group-by on 
Product 

In [0]:
exploded_df_loaded = spark.read.parquet("dbfs:/FileStore/exploded_parquet")
exploded_df_loaded.groupBy(col('Product')) \
                  .agg(sum('Quantity').cast("int").alias("TotalQuantity")) \
                  .orderBy('TotalQuantity', ascending=False) \
                  .show()

+-------+-------------+
|Product|TotalQuantity|
+-------+-------------+
|  Phone|            2|
|  Mouse|            2|
| Laptop|            1|
| Tablet|            1|
|Charger|            1|
|   Desk|            1|
+-------+-------------+

