#Task 1: Ingest and Save

In [0]:
# Load the CSV into a PySpark DataFrame.
# I have uploaded it as a csv file in dbfs
df = spark.read.csv("dbfs:/FileStore/tables/sales_transaction.csv", header=True, inferSchema=True)
df.show()

+--------------+-------------+------+---------------+-------------+--------+----------+----------+
|transaction_id|customer_name|region|        product|     category|quantity|unit_price|      date|
+--------------+-------------+------+---------------+-------------+--------+----------+----------+
|             1|       Rajesh| North|         Laptop|  Electronics|       1|     55000|2024-01-12|
|             2|        Sneha|  West|   Refrigerator|  Electronics|       1|     32000|2024-02-05|
|             3|         Anil| South|        Shampoo|Personal Care|       5|       150|2024-01-17|
|             4|        Divya| North|         Mobile|  Electronics|       2|     20000|2024-03-22|
|             5|       Vikram|  East|Washing Machine|  Electronics|       1|     28000|2024-02-28|
|             6|       Preeti|  West|       Sneakers|      Fashion|       2|      4000|2024-01-31|
|             7|         Aman| South|             TV|  Electronics|       1|     45000|2024-02-15|
|         

In [0]:
# Save it in Parquet and Delta formats (in different folders).
df.write.mode("overwrite").parquet("/tmp/sales_transactions.parquet")
df.write.format("delta").mode("overwrite").save("/tmp/sales_transactions.delta")

In [0]:
# Create and register Delta Table called sales_transactions .
df.write.format("delta").mode("overwrite").saveAsTable("sales_transactions")
spark.sql("SELECT * FROM sales_transactions").show()

+--------------+-------------+------+---------------+-------------+--------+----------+----------+
|transaction_id|customer_name|region|        product|     category|quantity|unit_price|      date|
+--------------+-------------+------+---------------+-------------+--------+----------+----------+
|             1|       Rajesh| North|         Laptop|  Electronics|       1|     55000|2024-01-12|
|             2|        Sneha|  West|   Refrigerator|  Electronics|       1|     32000|2024-02-05|
|             3|         Anil| South|        Shampoo|Personal Care|       5|       150|2024-01-17|
|             4|        Divya| North|         Mobile|  Electronics|       2|     20000|2024-03-22|
|             5|       Vikram|  East|Washing Machine|  Electronics|       1|     28000|2024-02-28|
|             6|       Preeti|  West|       Sneakers|      Fashion|       2|      4000|2024-01-31|
|             7|         Aman| South|             TV|  Electronics|       1|     45000|2024-02-15|
|         

#Task 2: Data Transformation

In [0]:
# Add a new column total_amount = quantity × unit_price .
from pyspark.sql.functions import col
df = df.withColumn('total_amount', col("quantity") * col("unit_price"))
df.show()

+--------------+-------------+------+---------------+-------------+--------+----------+----------+------------+
|transaction_id|customer_name|region|        product|     category|quantity|unit_price|      date|total_amount|
+--------------+-------------+------+---------------+-------------+--------+----------+----------+------------+
|             1|       Rajesh| North|         Laptop|  Electronics|       1|     55000|2024-01-12|       55000|
|             2|        Sneha|  West|   Refrigerator|  Electronics|       1|     32000|2024-02-05|       32000|
|             3|         Anil| South|        Shampoo|Personal Care|       5|       150|2024-01-17|         750|
|             4|        Divya| North|         Mobile|  Electronics|       2|     20000|2024-03-22|       40000|
|             5|       Vikram|  East|Washing Machine|  Electronics|       1|     28000|2024-02-28|       28000|
|             6|       Preeti|  West|       Sneakers|      Fashion|       2|      4000|2024-01-31|      

In [0]:
# Add another column month extracted from the date.
from pyspark.sql.functions import month
df = df.withColumn('month', month(col("date")))
df.show()

+--------------+-------------+------+---------------+-------------+--------+----------+----------+------------+-----+
|transaction_id|customer_name|region|        product|     category|quantity|unit_price|      date|total_amount|month|
+--------------+-------------+------+---------------+-------------+--------+----------+----------+------------+-----+
|             1|       Rajesh| North|         Laptop|  Electronics|       1|     55000|2024-01-12|       55000|    1|
|             2|        Sneha|  West|   Refrigerator|  Electronics|       1|     32000|2024-02-05|       32000|    2|
|             3|         Anil| South|        Shampoo|Personal Care|       5|       150|2024-01-17|         750|    1|
|             4|        Divya| North|         Mobile|  Electronics|       2|     20000|2024-03-22|       40000|    3|
|             5|       Vikram|  East|Washing Machine|  Electronics|       1|     28000|2024-02-28|       28000|    2|
|             6|       Preeti|  West|       Sneakers|   

In [0]:
# Format date as dd-MMM-yyyy and display.
from pyspark.sql.functions import date_format
df = df.withColumn('date', date_format(col("date"), "dd-MMM-yyyy"))
df.show()

+--------------+-------------+------+---------------+-------------+--------+----------+-----------+------------+-----+
|transaction_id|customer_name|region|        product|     category|quantity|unit_price|       date|total_amount|month|
+--------------+-------------+------+---------------+-------------+--------+----------+-----------+------------+-----+
|             1|       Rajesh| North|         Laptop|  Electronics|       1|     55000|12-Jan-2024|       55000|    1|
|             2|        Sneha|  West|   Refrigerator|  Electronics|       1|     32000|05-Feb-2024|       32000|    2|
|             3|         Anil| South|        Shampoo|Personal Care|       5|       150|17-Jan-2024|         750|    1|
|             4|        Divya| North|         Mobile|  Electronics|       2|     20000|22-Mar-2024|       40000|    3|
|             5|       Vikram|  East|Washing Machine|  Electronics|       1|     28000|28-Feb-2024|       28000|    2|
|             6|       Preeti|  West|       Snea

In [0]:
# Create a column is_high_value (TRUE if total_amount > 30,000 , else FALSE).
from pyspark.sql.functions import when
df = df.withColumn("high_value", when(col("total_amount") > 30000, True).otherwise(False))
df.show()

+--------------+-------------+------+---------------+-------------+--------+----------+-----------+------------+-----+----------+
|transaction_id|customer_name|region|        product|     category|quantity|unit_price|       date|total_amount|month|high_value|
+--------------+-------------+------+---------------+-------------+--------+----------+-----------+------------+-----+----------+
|             1|       Rajesh| North|         Laptop|  Electronics|       1|     55000|12-Jan-2024|       55000|    1|      true|
|             2|        Sneha|  West|   Refrigerator|  Electronics|       1|     32000|05-Feb-2024|       32000|    2|      true|
|             3|         Anil| South|        Shampoo|Personal Care|       5|       150|17-Jan-2024|         750|    1|     false|
|             4|        Divya| North|         Mobile|  Electronics|       2|     20000|22-Mar-2024|       40000|    3|      true|
|             5|       Vikram|  East|Washing Machine|  Electronics|       1|     28000|28-

#Task 3: Aggregations & Insights

In [0]:
# Count transactions per region.
df.groupBy("region").count().show()

+------+-----+
|region|count|
+------+-----+
| South|    2|
|  East|    2|
|  West|    3|
| North|    3|
+------+-----+



In [0]:
# Get top 3 categories by total sales amount.
df.groupBy("category").sum("total_amount").orderBy(col("sum(total_amount)").desc()).limit(3).show()

+-------------+-----------------+
|     category|sum(total_amount)|
+-------------+-----------------+
|  Electronics|           200000|
|      Fashion|             8000|
|Personal Care|             1350|
+-------------+-----------------+



In [0]:
# Find month-wise revenue trend.
df.groupBy("month").sum("total_amount").orderBy(col("month")).show()

+-----+-----------------+
|month|sum(total_amount)|
+-----+-----------------+
|    1|            64350|
|    2|           105000|
|    3|            40800|
+-----+-----------------+



In [0]:
# Show customer(s) who made the highest purchase in one transaction.
df.orderBy(col("total_amount").desc()).limit(1).show()

+--------------+-------------+------+-------+-----------+--------+----------+-----------+------------+-----+----------+
|transaction_id|customer_name|region|product|   category|quantity|unit_price|       date|total_amount|month|high_value|
+--------------+-------------+------+-------+-----------+--------+----------+-----------+------------+-----+----------+
|             1|       Rajesh| North| Laptop|Electronics|       1|     55000|12-Jan-2024|       55000|    1|      true|
+--------------+-------------+------+-------+-----------+--------+----------+-----------+------------+-----+----------+



In [0]:
# Calculate total sales done in Q1 (Jan–Mar).
df.filter(col("month").isin([1,2,3])).groupBy().sum("total_amount").show()

+-----------------+
|sum(total_amount)|
+-----------------+
|           210150|
+-----------------+



#Task 4: Update & Delete Scenarios

In [0]:
# backup
df_old = df
# df.write.format("delta").mode("overwrite").saveAsTable("sales_transactions_backup")

In [0]:
# Update price of all Stationery items to increase by 10%.
from delta.tables import DeltaTable
delta_table = DeltaTable.forName(spark, "sales_transactions")
print("original:")
delta_table.toDF().show()
delta_table.update(
    condition=col("category") == "Stationery",
    set={"unit_price": col("unit_price") * 1.10}
)
print("After price update (+10%) for Stationery:")
delta_table.toDF().show()

original:
+--------------+-------------+------+---------------+-------------+--------+----------+----------+
|transaction_id|customer_name|region|        product|     category|quantity|unit_price|      date|
+--------------+-------------+------+---------------+-------------+--------+----------+----------+
|             1|       Rajesh| North|         Laptop|  Electronics|       1|     55000|2024-01-12|
|             2|        Sneha|  West|   Refrigerator|  Electronics|       1|     32000|2024-02-05|
|             3|         Anil| South|        Shampoo|Personal Care|       5|       150|2024-01-17|
|             4|        Divya| North|         Mobile|  Electronics|       2|     20000|2024-03-22|
|             5|       Vikram|  East|Washing Machine|  Electronics|       1|     28000|2024-02-28|
|             6|       Preeti|  West|       Sneakers|      Fashion|       2|      4000|2024-01-31|
|             7|         Aman| South|             TV|  Electronics|       1|     45000|2024-02-15|


In [0]:
# Delete all records with quantity < 3 .
delta_table.delete(col("quantity") < 3)
print("After deleting records with quantity < 3:")
delta_table.toDF().show()

After deleting records with quantity < 3:
+--------------+-------------+------+----------+-------------+--------+----------+----------+
|transaction_id|customer_name|region|   product|     category|quantity|unit_price|      date|
+--------------+-------------+------+----------+-------------+--------+----------+----------+
|             8|         Isha| North|  Notebook|   Stationery|      10|        66|2024-01-10|
|             9|        Kunal|  East|    Pencil|   Stationery|      20|        11|2024-03-05|
|             3|         Anil| South|   Shampoo|Personal Care|       5|       150|2024-01-17|
|            10|        Tanvi|  West|Face Cream|Personal Care|       3|       200|2024-03-19|
+--------------+-------------+------+----------+-------------+--------+----------+----------+



In [0]:
# Add a new row into the Delta Table with today's transaction data.
from datetime import date
today = date.today().strftime("%Y-%m-%d")
new_row = [(11, "Meera", "South", "Tablet", "Electronics", 1, 25000, today)]
columns = ["transaction_id", "customer_name", "region", "product", "category", "quantity", "unit_price", "date"]
new_df = spark.createDataFrame(new_row, columns)
new_df.write.format("delta").mode("append").insertInto("sales_transactions")
spark.table("sales_transactions").show()

+--------------+-------------+------+----------+-------------+--------+----------+----------+
|transaction_id|customer_name|region|   product|     category|quantity|unit_price|      date|
+--------------+-------------+------+----------+-------------+--------+----------+----------+
|             8|         Isha| North|  Notebook|   Stationery|      10|        66|2024-01-10|
|             9|        Kunal|  East|    Pencil|   Stationery|      20|        11|2024-03-05|
|             3|         Anil| South|   Shampoo|Personal Care|       5|       150|2024-01-17|
|            10|        Tanvi|  West|Face Cream|Personal Care|       3|       200|2024-03-19|
|            11|        Meera| South|    Tablet|  Electronics|       1|     25000|2025-08-08|
+--------------+-------------+------+----------+-------------+--------+----------+----------+



#Task 5: Partitioning & Optimization (Bonus)

In [0]:
# Re-write the Delta table partitioned by region .
df_old.write.format("delta").mode("overwrite").partitionBy("region").saveAsTable("partioned_sales_transactions")
spark.table("partioned_sales_transactions").show()

+--------------+-------------+------+---------------+-------------+--------+----------+-----------+------------+-----+----------+
|transaction_id|customer_name|region|        product|     category|quantity|unit_price|       date|total_amount|month|high_value|
+--------------+-------------+------+---------------+-------------+--------+----------+-----------+------------+-----+----------+
|             8|         Isha| North|       Notebook|   Stationery|      10|        60|10-Jan-2024|         600|    1|     false|
|             4|        Divya| North|         Mobile|  Electronics|       2|     20000|22-Mar-2024|       40000|    3|      true|
|             1|       Rajesh| North|         Laptop|  Electronics|       1|     55000|12-Jan-2024|       55000|    1|      true|
|            10|        Tanvi|  West|     Face Cream|Personal Care|       3|       200|19-Mar-2024|         600|    3|     false|
|             6|       Preeti|  West|       Sneakers|      Fashion|       2|      4000|31-

In [0]:
# Create a second Delta table partitioned by month .
df_old.write.format("delta").mode("overwrite").partitionBy("month").saveAsTable("partioned_sales_transactions_month")
spark.table("partioned_sales_transactions_month").show()

+--------------+-------------+------+---------------+-------------+--------+----------+-----------+------------+-----+----------+
|transaction_id|customer_name|region|        product|     category|quantity|unit_price|       date|total_amount|month|high_value|
+--------------+-------------+------+---------------+-------------+--------+----------+-----------+------------+-----+----------+
|             8|         Isha| North|       Notebook|   Stationery|      10|        60|10-Jan-2024|         600|    1|     false|
|             6|       Preeti|  West|       Sneakers|      Fashion|       2|      4000|31-Jan-2024|        8000|    1|     false|
|             3|         Anil| South|        Shampoo|Personal Care|       5|       150|17-Jan-2024|         750|    1|     false|
|             1|       Rajesh| North|         Laptop|  Electronics|       1|     55000|12-Jan-2024|       55000|    1|      true|
|             7|         Aman| South|             TV|  Electronics|       1|     45000|15-