In [0]:
data = [
    (101, "Laptop", "Electronics", 55000, 10),
    (102, "Smartphone", "Electronics", 30000, 25),
    (103, "Chair", "Furniture", 2500, 50),
    (104, "Book", "Stationery", 400, 200),
    (105, "Headphones", "Electronics", 1500, 100),
    (106, "Table", "Furniture", 3200, 40),
    (107, "Pen", "Stationery", 20, 500),
    (108, "Monitor", "Electronics", 12000, 15),
    (109, "Notebook", "Stationery", 60, 300),
    (110, "Sofa", "Furniture", 45000, 5)
]
columns = ["product_id", "product_name", "category", "price", "quantity"]
df = spark.createDataFrame(data, columns)
df.show()

+----------+------------+-----------+-----+--------+
|product_id|product_name|   category|price|quantity|
+----------+------------+-----------+-----+--------+
|       101|      Laptop|Electronics|55000|      10|
|       102|  Smartphone|Electronics|30000|      25|
|       103|       Chair|  Furniture| 2500|      50|
|       104|        Book| Stationery|  400|     200|
|       105|  Headphones|Electronics| 1500|     100|
|       106|       Table|  Furniture| 3200|      40|
|       107|         Pen| Stationery|   20|     500|
|       108|     Monitor|Electronics|12000|      15|
|       109|    Notebook| Stationery|   60|     300|
|       110|        Sofa|  Furniture|45000|       5|
+----------+------------+-----------+-----+--------+



In [0]:
df.write.mode("overwrite").option("header", "true").csv("/tmp/products_csv")
df.write.mode("overwrite").json("/tmp/products_json")

In [0]:
csv_df = spark.read.option("header", "true").csv("/tmp/products_csv")
csv_df.show()

json_df = spark.read.json("/tmp/products_json")
json_df.show()

csv_df.write.mode("overwrite").parquet("/tmp/products_parquet")

parquet_df = spark.read.parquet("/tmp/products_parquet")
parquet_df.show()

+----------+------------+-----------+-----+--------+
|product_id|product_name|   category|price|quantity|
+----------+------------+-----------+-----+--------+
|       104|        Book| Stationery|  400|     200|
|       105|  Headphones|Electronics| 1500|     100|
|       109|    Notebook| Stationery|   60|     300|
|       110|        Sofa|  Furniture|45000|       5|
|       102|  Smartphone|Electronics|30000|      25|
|       108|     Monitor|Electronics|12000|      15|
|       101|      Laptop|Electronics|55000|      10|
|       103|       Chair|  Furniture| 2500|      50|
|       106|       Table|  Furniture| 3200|      40|
|       107|         Pen| Stationery|   20|     500|
+----------+------------+-----------+-----+--------+

+-----------+-----+----------+------------+--------+
|   category|price|product_id|product_name|quantity|
+-----------+-----+----------+------------+--------+
| Stationery|  400|       104|        Book|     200|
|Electronics| 1500|       105|  Headphones|  

In [0]:
from pyspark.sql.functions import col
df_with_revenue = csv_df.withColumn("total_revenue", col("price") * col("quantity"))
df_with_revenue.show()


+----------+------------+-----------+-----+--------+-------------+
|product_id|product_name|   category|price|quantity|total_revenue|
+----------+------------+-----------+-----+--------+-------------+
|       104|        Book| Stationery|  400|     200|      80000.0|
|       105|  Headphones|Electronics| 1500|     100|     150000.0|
|       109|    Notebook| Stationery|   60|     300|      18000.0|
|       110|        Sofa|  Furniture|45000|       5|     225000.0|
|       102|  Smartphone|Electronics|30000|      25|     750000.0|
|       108|     Monitor|Electronics|12000|      15|     180000.0|
|       101|      Laptop|Electronics|55000|      10|     550000.0|
|       103|       Chair|  Furniture| 2500|      50|     125000.0|
|       106|       Table|  Furniture| 3200|      40|     128000.0|
|       107|         Pen| Stationery|   20|     500|      10000.0|
+----------+------------+-----------+-----+--------+-------------+



In [0]:
df_with_revenue.orderBy(col("total_revenue").desc()).show(3)

+----------+------------+-----------+-----+--------+-------------+
|product_id|product_name|   category|price|quantity|total_revenue|
+----------+------------+-----------+-----+--------+-------------+
|       102|  Smartphone|Electronics|30000|      25|     750000.0|
|       101|      Laptop|Electronics|55000|      10|     550000.0|
|       110|        Sofa|  Furniture|45000|       5|     225000.0|
+----------+------------+-----------+-----+--------+-------------+
only showing top 3 rows


In [0]:
furniture_df = df_with_revenue.filter((col("category") == "Furniture") & (col("price") > 3000))
furniture_df.show()

+----------+------------+---------+-----+--------+-------------+
|product_id|product_name| category|price|quantity|total_revenue|
+----------+------------+---------+-----+--------+-------------+
|       110|        Sofa|Furniture|45000|       5|     225000.0|
|       106|       Table|Furniture| 3200|      40|     128000.0|
+----------+------------+---------+-----+--------+-------------+



In [0]:
from pyspark.sql.functions import when
df_with_band = df_with_revenue.withColumn(
    "price_band",when(col("price") > 10000, "High")
    .when((col("price") > 3000) & (col("price") <= 10000), "Medium")
    .otherwise("Low")
)
df_with_band.show()


+----------+------------+-----------+-----+--------+-------------+----------+
|product_id|product_name|   category|price|quantity|total_revenue|price_band|
+----------+------------+-----------+-----+--------+-------------+----------+
|       104|        Book| Stationery|  400|     200|      80000.0|       Low|
|       105|  Headphones|Electronics| 1500|     100|     150000.0|       Low|
|       109|    Notebook| Stationery|   60|     300|      18000.0|       Low|
|       110|        Sofa|  Furniture|45000|       5|     225000.0|      High|
|       102|  Smartphone|Electronics|30000|      25|     750000.0|      High|
|       108|     Monitor|Electronics|12000|      15|     180000.0|      High|
|       101|      Laptop|Electronics|55000|      10|     550000.0|      High|
|       103|       Chair|  Furniture| 2500|      50|     125000.0|       Low|
|       106|       Table|  Furniture| 3200|      40|     128000.0|    Medium|
|       107|         Pen| Stationery|   20|     500|      10000.

In [0]:
df_with_band.withColumn("quantity", col("quantity").cast("float")).groupBy("category").sum("quantity").withColumnRenamed("sum(quantity)", "total_quantity").show()

+-----------+--------------+
|   category|total_quantity|
+-----------+--------------+
| Stationery|        1000.0|
|  Furniture|          95.0|
|Electronics|         150.0|
+-----------+--------------+



In [0]:
df_with_band.withColumn("price", col("price").cast("float")) \
    .groupBy("category") \
    .avg("price") \
    .withColumnRenamed("avg(price)", "average_price") \
    .show()


+-----------+-------------+
|   category|average_price|
+-----------+-------------+
| Stationery|        160.0|
|  Furniture|      16900.0|
|Electronics|      24625.0|
+-----------+-------------+



In [0]:
df_with_band.groupBy("price_band").count().show()

+----------+-----+
|price_band|count|
+----------+-----+
|       Low|    5|
|      High|    4|
|    Medium|    1|
+----------+-----+



In [0]:
electronics_df = df_with_band.filter((col("category") == "Electronics") & (col("price") > 5000))
electronics_df.write.mode("overwrite").parquet("/tmp/electronics_filtered_parquet")


In [0]:
stationery_df = df_with_band.filter(col("category") == "Stationery")
stationery_df.write.mode("overwrite").json("/tmp/stationery_json")

In [0]:
parquet_df = spark.read.parquet("dbfs:/tmp/products_parquet")
parquet_df = parquet_df.withColumn("total_revenue", col("price") * col("quantity"))
parquet_df.groupBy("category") \
    .sum("total_revenue") \
    .withColumnRenamed("sum(total_revenue)", "category_revenue") \
    .orderBy(col("category_revenue").desc()) \
    .show(1)



+-----------+----------------+
|   category|category_revenue|
+-----------+----------------+
|Electronics|       1630000.0|
+-----------+----------------+
only showing top 1 row


In [0]:
df_with_band.createOrReplaceTempView("products")
spark.sql("""
    select * from products
    where quantity > 100 and price < 1000
""").show()


+----------+------------+----------+-----+--------+-------------+----------+
|product_id|product_name|  category|price|quantity|total_revenue|price_band|
+----------+------------+----------+-----+--------+-------------+----------+
|       104|        Book|Stationery|  400|     200|      80000.0|       Low|
|       109|    Notebook|Stationery|   60|     300|      18000.0|       Low|
|       107|         Pen|Stationery|   20|     500|      10000.0|       Low|
+----------+------------+----------+-----+--------+-------------+----------+

