In [0]:
# 1. Read the above data from CSV into a DataFrame and print the schema.
df_csv = spark.read.option("header", True).option("inferSchema", True).csv("dbfs:/FileStore/tables/products.csv")
df_csv.printSchema()
df_csv.show()

root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- quantity: integer (nullable = true)

+----------+------------+-----------+-----+--------+
|product_id|product_name|   category|price|quantity|
+----------+------------+-----------+-----+--------+
|       101|      Laptop|Electronics|55000|      10|
|       102|  Smartphone|Electronics|30000|      25|
|       103|       Chair|  Furniture| 2500|      50|
|       104|        Book| Stationery|  400|     200|
|       105|  Headphones|Electronics| 1500|     100|
|       106|       Table|  Furniture| 3200|      40|
|       107|         Pen| Stationery|   20|     500|
|       108|     Monitor|Electronics|12000|      15|
|       109|    Notebook| Stationery|   60|     300|
|       110|        Sofa|  Furniture|45000|       5|
+----------+------------+-----------+-----+--------+



In [0]:
# 2. Read the same data from JSON and compare with the CSV schema. Any differences?
# read json
df_json = spark.read.json('dbfs:/FileStore/tables/products.json')
df_json.printSchema()

root
 |-- category: string (nullable = true)
 |-- price: long (nullable = true)
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- quantity: long (nullable = true)



In [0]:
# 3. Convert the CSV data into Parquet format and save to disk.
# read parquet
df_csv.write.mode('overwrite').parquet('dbfs:/tmp/product_data.parquet')

In [0]:
# 4. Measure the size of CSV vs JSON vs Parquet on disk. Which one is smallest?
csv_size = dbutils.fs.ls("dbfs:/FileStore/tables/products.csv")[0].size
json_size = dbutils.fs.ls("dbfs:/FileStore/tables/products.json")[0].size
parquet_size = dbutils.fs.ls("dbfs:/tmp/product_data.parquet")[0].size

print(f"CSV size: {csv_size} bytes")
print(f"JSON size: {json_size} bytes")
print(f"Parquet size: {parquet_size} bytes")

sizes = {"CSV": csv_size, "JSON": json_size, "Parquet": parquet_size}
smallest = min(sizes, key=sizes.get)
print(f"Smallest format: {smallest} with size {sizes[smallest]} bytes")


CSV size: 362 bytes
JSON size: 955 bytes
Parquet size: 223 bytes
Smallest format: Parquet with size 223 bytes


In [0]:
# 5. Add a column total_revenue = price * quantity for each record.
csv_df = df.withColumn('Total_revenue', df['price'] * df['quantity'])
csv_df.show()

+----------+------------+-----------+-----+--------+-------------+----------+
|product_id|product_name|   category|price|quantity|Total_revenue|price_band|
+----------+------------+-----------+-----+--------+-------------+----------+
|       101|      Laptop|Electronics|55000|      10|       550000|      High|
|       102|  Smartphone|Electronics|30000|      25|       750000|      High|
|       103|       Chair|  Furniture| 2500|      50|       125000|       Low|
|       104|        Book| Stationery|  400|     200|        80000|       Low|
|       105|  Headphones|Electronics| 1500|     100|       150000|       Low|
|       106|       Table|  Furniture| 3200|      40|       128000|    Medium|
|       107|         Pen| Stationery|   20|     500|        10000|       Low|
|       108|     Monitor|Electronics|12000|      15|       180000|      High|
|       109|    Notebook| Stationery|   60|     300|        18000|       Low|
|       110|        Sofa|  Furniture|45000|       5|       22500

In [0]:
# 6. Find the top 3 products with the highest total revenue.
from pyspark.sql.functions import col
csv_df.orderBy(col("total_revenue").desc()).limit(3).show()

+----------+------------+-----------+-----+--------+-------------+----------+
|product_id|product_name|   category|price|quantity|Total_revenue|price_band|
+----------+------------+-----------+-----+--------+-------------+----------+
|       102|  Smartphone|Electronics|30000|      25|       750000|      High|
|       101|      Laptop|Electronics|55000|      10|       550000|      High|
|       110|        Sofa|  Furniture|45000|       5|       225000|      High|
+----------+------------+-----------+-----+--------+-------------+----------+



In [0]:
# 7. Filter and display only Furniture products with price > 3000.
csv_df.filter((col('category') == 'Furniture') & (col('price') > 3000)).show()

+----------+------------+---------+-----+--------+-------------+----------+
|product_id|product_name| category|price|quantity|Total_revenue|price_band|
+----------+------------+---------+-----+--------+-------------+----------+
|       106|       Table|Furniture| 3200|      40|       128000|    Medium|
|       110|        Sofa|Furniture|45000|       5|       225000|      High|
+----------+------------+---------+-----+--------+-------------+----------+



In [0]:
# 8. Create a new column price_band with values:
# 'High' if price > 10000
# 'Medium' if 3000 < price <= 10000
# 'Low' if price ≤ 3000
from pyspark.sql.functions import when
csv_df.withColumn("price_band",when(col("price") > 10000, "High").when((col("price") > 3000) & (col("price") <= 10000), "Medium").otherwise("Low")
)
df.show()

+----------+------------+-----------+-----+--------+-------------+----------+
|product_id|product_name|   category|price|quantity|Total_revenue|price_band|
+----------+------------+-----------+-----+--------+-------------+----------+
|       101|      Laptop|Electronics|55000|      10|       550000|      High|
|       102|  Smartphone|Electronics|30000|      25|       750000|      High|
|       103|       Chair|  Furniture| 2500|      50|       125000|       Low|
|       104|        Book| Stationery|  400|     200|        80000|       Low|
|       105|  Headphones|Electronics| 1500|     100|       150000|       Low|
|       106|       Table|  Furniture| 3200|      40|       128000|    Medium|
|       107|         Pen| Stationery|   20|     500|        10000|       Low|
|       108|     Monitor|Electronics|12000|      15|       180000|      High|
|       109|    Notebook| Stationery|   60|     300|        18000|       Low|
|       110|        Sofa|  Furniture|45000|       5|       22500

In [0]:
# 9. Group by category and calculate total quantity sold.
from pyspark.sql.functions import sum
csv_df.groupBy("category").agg(sum("quantity").alias("total_quantity")).show()

+-----------+--------------+
|   category|total_quantity|
+-----------+--------------+
|Electronics|           150|
| Stationery|          1000|
|  Furniture|            95|
+-----------+--------------+



In [0]:
# 10. Calculate average price of products for each category.
from pyspark.sql.functions import avg
csv_df.groupBy("category").agg(avg("price").alias("avg_price")).show()

+-----------+---------+
|   category|avg_price|
+-----------+---------+
|Electronics|  24625.0|
| Stationery|    160.0|
|  Furniture|  16900.0|
+-----------+---------+



In [0]:
# 11. Count how many products fall in each price_band .
csv_df.groupBy("price_band").count().show()

+----------+-----+
|price_band|count|
+----------+-----+
|      High|    4|
|       Low|    5|
|    Medium|    1|
+----------+-----+



In [0]:
# 12. Write the filtered Electronics products (price > 5000) into a Parquet file.
elect_filter = csv_df.filter((col('category') == 'Electronics') & (col('price') > 5000) )
elect_filter.write.mode('overwrite').parquet('dbfs:/tmp/product_data_electronics.parquet')

In [0]:
# 13. Write the Stationery products into a JSON file.
stationery_filter = csv_df.filter((col('category') == 'Stationery'))
stationery_filter.write.mode('overwrite').json('dbfs:/tmp/product_data_stationery.json')

In [0]:
# 14. Load Parquet back and run a query to find which category has highest total revenue.
elect_df = spark.read.parquet('dbfs:/tmp/product_data_electronics.parquet')
elect_df.groupBy("category").agg(sum("Total_revenue").alias("total_revenue")).orderBy(col("total_revenue").desc()).show()

+-----------+-------------+
|   category|total_revenue|
+-----------+-------------+
|Electronics|      1480000|
+-----------+-------------+



In [0]:
# 15. BONUS: Create a temporary view from the DataFrame and run Spark SQL to find all products with quantity > 100 and price < 1000.
df.createOrReplaceTempView("product_data")
spark.sql("select * from product_data where quantity > 100 AND price < 1000").show()

+----------+------------+----------+-----+--------+-------------+----------+
|product_id|product_name|  category|price|quantity|Total_revenue|price_band|
+----------+------------+----------+-----+--------+-------------+----------+
|       104|        Book|Stationery|  400|     200|        80000|       Low|
|       107|         Pen|Stationery|   20|     500|        10000|       Low|
|       109|    Notebook|Stationery|   60|     300|        18000|       Low|
+----------+------------+----------+-----+--------+-------------+----------+

