In [28]:
from pyspark.sql import SparkSession

# Build (or reuse, if one already exists) the Hive-enabled Spark session
# that every later cell uses through the name `spark`.
spark = (
    SparkSession.builder
    .appName("june12set1")
    .enableHiveSupport()
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
spark

In [29]:
from pyspark.sql import Row
# Sample orders as (OrderID, Customer, Items, Region, Amount) tuples.
# Items is a list with one {"Product", "Qty"} dict per line item of the order.
order_records = [
    (101, "Ali", [{"Product":"Laptop", "Qty":1}, {"Product":"Mouse", "Qty":2}], "Asia", 1200.0),
    (102, "Zara", [{"Product":"Tablet", "Qty":1}], "Europe", 650.0),
    (103, "Mohan", [{"Product":"Phone", "Qty":2}, {"Product":"Charger", "Qty":1}], "Asia", 890.0),
    (104, "Sara", [{"Product":"Desk", "Qty":1}], "US", 450.0),
]

# Wrap each tuple in a Row so Spark picks up the column names from the keywords.
data = [
    Row(OrderID=order_id, Customer=customer, Items=items, Region=region, Amount=amount)
    for order_id, customer, items, region, amount in order_records
]

# Order-level DataFrame: one row per order, with the nested Items list intact.
df_sales = spark.createDataFrame(data)
df_sales.show(truncate=False)

+-------+--------+--------------------------------------------------------------+------+------+
|OrderID|Customer|Items                                                         |Region|Amount|
+-------+--------+--------------------------------------------------------------+------+------+
|101    |Ali     |[{Product -> Laptop, Qty -> 1}, {Product -> Mouse, Qty -> 2}] |Asia  |1200.0|
|102    |Zara    |[{Product -> Tablet, Qty -> 1}]                               |Europe|650.0 |
|103    |Mohan   |[{Product -> Phone, Qty -> 2}, {Product -> Charger, Qty -> 1}]|Asia  |890.0 |
|104    |Sara    |[{Product -> Desk, Qty -> 1}]                                 |US    |450.0 |
+-------+--------+--------------------------------------------------------------+------+------+



In [30]:
# Working with JSON & Nested Fields
# 1. Flatten the Items array using explode() to create one row per product.
# 2. Count total quantity sold per product.
# 3. Count number of orders per region.
from pyspark.sql.functions import explode,col
from pyspark.sql.types import IntegerType

# 1. explode() emits one output row per element of the Items array; the other
#    order-level columns (OrderID, Customer, Region, Amount) repeat on each row.
df_exploded = df_sales.withColumn("Items", explode("Items"))
df_exploded.show()

# Pull the Product and Qty entries of each exploded item into their own columns.
# Qty is cast to an integer so it can be summed below.
df_flat = df_exploded.select(
    "OrderID",
    "Customer",
    col("Items.Product").alias("Product"),
    col("Items.Qty").cast(IntegerType()).alias("Qty"),
    "Region",
    "Amount",
)
df_flat.show()

# 2. Total quantity sold per product (product-level rows are the right grain here).
df_flat.groupBy("Product").sum("Qty").withColumnRenamed("sum(Qty)", "TotalQty").show()

# 3. Orders per region must be counted on the order-level frame. df_flat holds one
#    row per product, so counting there would report line items instead of orders
#    (e.g. Asia would show 4 rather than its 2 orders).
df_sales.groupBy("Region").count().show()

+-------+--------+--------------------+------+------+
|OrderID|Customer|               Items|Region|Amount|
+-------+--------+--------------------+------+------+
|    101|     Ali|{Product -> Lapto...|  Asia|1200.0|
|    101|     Ali|{Product -> Mouse...|  Asia|1200.0|
|    102|    Zara|{Product -> Table...|Europe| 650.0|
|    103|   Mohan|{Product -> Phone...|  Asia| 890.0|
|    103|   Mohan|{Product -> Charg...|  Asia| 890.0|
|    104|    Sara|{Product -> Desk,...|    US| 450.0|
+-------+--------+--------------------+------+------+

+-------+--------+-------+---+------+------+
|OrderID|Customer|Product|Qty|Region|Amount|
+-------+--------+-------+---+------+------+
|    101|     Ali| Laptop|  1|  Asia|1200.0|
|    101|     Ali|  Mouse|  2|  Asia|1200.0|
|    102|    Zara| Tablet|  1|Europe| 650.0|
|    103|   Mohan|  Phone|  2|  Asia| 890.0|
|    103|   Mohan|Charger|  1|  Asia| 890.0|
|    104|    Sara|   Desk|  1|    US| 450.0|
+-------+--------+-------+---+------+------+

+-------

In [31]:
# Using when and otherwise
# 4. Create a new column HighValueOrder :
# "Yes" if Amount > 1000
# "No" otherwise
# 5. Add a column ShippingZone :
# Asia → "Zone A", Europe → "Zone B", US → "Zone C"
from pyspark.sql.functions import when

# 4. Flag rows whose order Amount exceeds 1000. Amount is the order total, so every
#    product row belonging to the same order receives the same flag.
df_flat = df_flat.withColumn(
    "HighValueOrder",
    when(col("Amount") > 1000, "Yes").otherwise("No"),
)
df_flat.show()

# 5. Map each known region to its zone explicitly. There is deliberately no
#    catch-all branch: a region outside the three listed (or a null region) yields
#    a null ShippingZone instead of being silently labelled "Zone C".
df_flat = df_flat.withColumn(
    "ShippingZone",
    when(col("Region") == "Asia", "Zone A")
    .when(col("Region") == "Europe", "Zone B")
    .when(col("Region") == "US", "Zone C"),
)
df_flat.show()

+-------+--------+-------+---+------+------+--------------+
|OrderID|Customer|Product|Qty|Region|Amount|HighValueOrder|
+-------+--------+-------+---+------+------+--------------+
|    101|     Ali| Laptop|  1|  Asia|1200.0|           Yes|
|    101|     Ali|  Mouse|  2|  Asia|1200.0|           Yes|
|    102|    Zara| Tablet|  1|Europe| 650.0|            No|
|    103|   Mohan|  Phone|  2|  Asia| 890.0|            No|
|    103|   Mohan|Charger|  1|  Asia| 890.0|            No|
|    104|    Sara|   Desk|  1|    US| 450.0|            No|
+-------+--------+-------+---+------+------+--------------+

+-------+--------+-------+---+------+------+--------------+------------+
|OrderID|Customer|Product|Qty|Region|Amount|HighValueOrder|ShippingZone|
+-------+--------+-------+---+------+------+--------------+------------+
|    101|     Ali| Laptop|  1|  Asia|1200.0|           Yes|      Zone A|
|    101|     Ali|  Mouse|  2|  Asia|1200.0|           Yes|      Zone A|
|    102|    Zara| Tablet|  1|Euro

In [32]:
# Temporary & Permanent Views
# 6. Register df_sales as a temporary view named sales_view .
# 7. Write a SQL query to:
# Count orders by Region
# Find average amount per region
# 8. Create a permanent view using saveAsTable() .

# 6. createOrReplaceTempView() registers the view as a side effect and returns
#    None, so its result is not assigned to a variable; the view is reached by
#    name ("sales_view") from spark.sql().
df_sales.createOrReplaceTempView("sales_view")

# 7. Per-region order count and average order amount.
spark.sql("select Region,count(*) as regioncount from sales_view group by Region").show()
spark.sql("select Region,avg(Amount) from sales_view group by Region").show()

# 8. saveAsTable() persists the data as a table named "sales_table" (a table, not
#    a view); mode("overwrite") replaces any existing table so the cell can be re-run.
df_sales.write.mode("overwrite").saveAsTable("sales_table")


+------+-----------+
|Region|regioncount|
+------+-----------+
|Europe|          1|
|  Asia|          2|
|    US|          1|
+------+-----------+

+------+-----------+
|Region|avg(Amount)|
+------+-----------+
|Europe|      650.0|
|  Asia|     1045.0|
|    US|      450.0|
+------+-----------+



In [33]:
# SQL Queries via Spark
# 9. Use SQL to filter all orders with more than 1 item.
# 10. Use SQL to extract customer names where Amount > 800.

# 9. Size(Items) is the number of entries in the Items array of each order.
multi_item_orders_sql = "SELECT OrderID, Customer, Size(Items) as ItemCount FROM sales_view WHERE Size(Items) > 1"
multi_item_orders = spark.sql(multi_item_orders_sql)
multi_item_orders.show()

# 10. Customers who placed an order with Amount above 800.
big_spenders_sql = "SELECT Customer FROM sales_view WHERE Amount > 800"
big_spenders = spark.sql(big_spenders_sql)
big_spenders.show()


+-------+--------+---------+
|OrderID|Customer|ItemCount|
+-------+--------+---------+
|    101|     Ali|        2|
|    103|   Mohan|        2|
+-------+--------+---------+

+--------+
|Customer|
+--------+
|     Ali|
|   Mohan|
+--------+



In [35]:
# Saving as Parquet and Reading Again
# 11. Save the exploded product-level DataFrame as a Parquet file partitioned by Region.
# 12. Read the parquet back and perform a group-by on Product.

# Single definition of the dataset location, shared by the write and the read.
# NOTE(review): the path looks like a mounted Google Drive folder - confirm the
# drive is mounted before running this cell.
PARQUET_PATH = "/content/drive/MyDrive/ParquetData/sales_by_region"

# 11. Write the product-level rows, partitioned by Region. The write lives in this
#     cell so the read below works on a fresh kernel (Restart & Run All).
#     Only the six base columns are stored; mode("overwrite") keeps the cell
#     re-runnable.
(
    df_flat
    .select("OrderID", "Customer", "Product", "Qty", "Amount", "Region")
    .write
    .mode("overwrite")
    .partitionBy("Region")
    .parquet(PARQUET_PATH)
)

# 12. Read the dataset back and count rows per product.
df_parquet = spark.read.parquet(PARQUET_PATH)
df_parquet.show()
df_parquet.groupBy("Product").count().show()



+-------+--------+-------+---+------+------+
|OrderID|Customer|Product|Qty|Amount|Region|
+-------+--------+-------+---+------+------+
|    103|   Mohan|  Phone|  2| 890.0|  Asia|
|    103|   Mohan|Charger|  1| 890.0|  Asia|
|    101|     Ali| Laptop|  1|1200.0|  Asia|
|    101|     Ali|  Mouse|  2|1200.0|  Asia|
|    102|    Zara| Tablet|  1| 650.0|Europe|
|    104|    Sara|   Desk|  1| 450.0|    US|
+-------+--------+-------+---+------+------+

+-------+-----+
|Product|count|
+-------+-----+
|  Phone|    1|
| Laptop|    1|
|Charger|    1|
|  Mouse|    1|
|   Desk|    1|
| Tablet|    1|
+-------+-----+

