In [0]:
# Locations of the cleaned CSV extracts.
# NOTE(review): the sales file name has a space before ".csv"; presumably this
# matches the uploaded file's real name - confirm before "tidying" it.
csv_path1 = "/Volumes/workspace/default/product/cleaned_sales .csv"
csv_path2 = "/Volumes/workspace/default/product/cleaned_products.csv"

# Read both files with the first row as the header and with column types
# inferred from the data.
sales_df = spark.read.csv(csv_path1, header="true", inferSchema="true")
products_df = spark.read.csv(csv_path2, header="true", inferSchema="true")

# Preview the first rows of each frame.
sales_df.show()
products_df.show()


+-------+----------+--------+--------+----------+--------+
|sale_id|product_id|store_id|quantity| sale_date|discount|
+-------+----------+--------+--------+----------+--------+
|   1001|         1|       1|    10.0|2022-03-15|    10.0|
|   1002|         2|       2|     5.0|2023-06-20|     5.0|
|   1003|         3|       3|    20.0|2024-01-05|    10.0|
|   1004|         4|       4|    15.0|2023-11-12|     5.0|
|   1005|         5|       5|    25.0|2022-12-30|    10.0|
|   1006|         6|       6|    18.0|2024-07-10|     0.0|
|   1007|         7|       7|    22.0|2025-04-18|     0.0|
|   1008|         1|       2|     5.0|2025-05-22|     0.0|
|   1009|         3|       1|    10.0|2024-09-14|     0.0|
|   1010|         5|       3|    30.0|2023-02-28|     0.0|
|   1011|         6|       4|     8.0|2022-10-09|     0.0|
|   1012|         8|       1|     1.0|2024-06-01|     0.0|
|   1013|         9|       2|    10.0|0000-00-00|     0.0|
|   1014|        10|       3|     5.0|0000-00-00|     0.

In [0]:
from pyspark.sql.functions import col, to_date
# The discount column is divided by 100, i.e. it is treated as a percentage.
discount_fraction = col("discount") / 100

# Left-join sales to products on product_id (every sale row is kept, even if
# no product matches), then derive the monetary columns:
#   revenue         = quantity * price
#   discount_amount = price * discount fraction * quantity
#   profit          = revenue - discount_amount (no cost column is involved)
df = (
    sales_df.join(products_df, on="product_id", how="left")
    .withColumn("revenue", col("quantity") * col("price"))
    .withColumn("discount_amount", col("price") * discount_fraction * col("quantity"))
    .withColumn("profit", col("revenue") - col("discount_amount"))
)

df.show()


+----------+-------+--------+--------+----------+--------+--------------------+----------------+-----+-------+---------------+------+
|product_id|sale_id|store_id|quantity| sale_date|discount|        product_name|        category|price|revenue|discount_amount|profit|
+----------+-------+--------+--------+----------+--------+--------------------+----------------+-----+-------+---------------+------+
|         1|   1001|       1|    10.0|2022-03-15|    10.0|  Parachute Hair Oil| Health & Beauty| 90.0|  900.0|           90.0| 810.0|
|         2|   1002|       2|     5.0|2023-06-20|     5.0|     Dettol Handwash|Health & Hygiene|120.0|  600.0|           30.0| 570.0|
|         3|   1003|       3|    20.0|2024-01-05|    10.0|  Britannia Biscuits|            Food| 35.0|  700.0|           70.0| 630.0|
|         4|   1004|       4|    15.0|2023-11-12|     5.0|         Amul Butter|           Dairy| 50.0|  750.0|           37.5| 712.5|
|         5|   1005|       5|    25.0|2022-12-30|    10.0|    

In [0]:
from pyspark.sql.functions import sum as spark_sum, round, col, when
import time

# Total revenue and total profit for each product category.
category_totals = df.groupBy("category").agg(
    spark_sum("revenue").alias("total_revenue"),
    spark_sum("profit").alias("total_profit"),
)

# Profit as a percentage of revenue. The guard leaves the margin null when
# total_revenue is 0 instead of dividing by zero.
margin_percent = when(
    col("total_revenue") != 0,
    (col("total_profit") / col("total_revenue")) * 100,
).otherwise(None)

# `round` here is pyspark.sql.functions.round (imported at the top of this
# cell), not the Python builtin, which that import shadows.
profit_margin_df = category_totals.withColumn(
    "profit_margin_percent",
    round(margin_percent, 2),
)

# Save as Delta table, overwriting whatever the table held before.
delta_writer = profit_margin_df.write.format("delta").mode("overwrite")
delta_writer.saveAsTable("profit_margin_by_category")


In [0]:
# Spark SQL query: the three rows with the highest summed quantity.
# NOTE(review): sale_id is part of the GROUP BY, so SUM(quantity) is computed
# per (product, sale) pair rather than per product, and the same product can
# appear more than once in the result. If the intent is the three best-selling
# products, sale_id should be dropped from both SELECT and GROUP BY - confirm
# first, because that changes the columns of `result`.
query = """
SELECT 
    product_id,
    product_name,
    sale_id,
    SUM(quantity) AS total_sold
FROM sales_cleaned
GROUP BY product_id, sale_id,product_name
ORDER BY total_sold DESC
LIMIT 3;
"""

# The query reads from this temp view, so it must be registered before the
# query is executed.
df.createOrReplaceTempView("sales_cleaned")

result = spark.sql(query)
display(result)

product_id,product_name,sale_id,total_sold
5,Maggi Noodles,1010,30.0
5,Maggi Noodles,1005,25.0
7,Surf Excel Detergent,1007,22.0


In [0]:
# Save the query result as the Delta table "top_3", overwriting any data the
# table already holds.
top_3_writer = result.write.format("delta").mode("overwrite")
top_3_writer.saveAsTable("top_3")
