In [0]:
from pyspark.sql import SparkSession

# Build the session used by every later cell; getOrCreate() hands back an
# already-running session when there is one instead of starting a second.
spark = (
    SparkSession.builder
    .appName("Expense monitoring")
    .getOrCreate()
)
spark

In [0]:
# Read the cleaned expenses CSV; the file's first line supplies the column names.
expenses_df = spark.read.csv(
    "file:/Workspace/Shared/expenses_cleaned.csv",
    header=True,
)
expenses_df.show()

+-------+--------------------+------+------------+--------------------+
|user_id|            category|amount|expense_date|         description|
+-------+--------------------+------+------------+--------------------+
|      1|   Utilities & Bills|1200.0|  2025-06-01| BSNL broadband bill|
|      1|                Food| 850.0|  2025-05-03|Dinner at Saravan...|
|      2|           Transport|1000.0|  2025-05-02|Uber ride to airport|
|      2|          Healthcare| 450.0|  2025-05-04|Doctor visit and ...|
|      2|Savings & Investm...|3000.0|  2025-05-05|Monthly SIP in Ax...|
+-------+--------------------+------+------------+--------------------+



In [0]:
# Read the users CSV; the file's first line supplies the column names.
users_df = spark.read.csv(
    "file:/Workspace/Shared/users.csv",
    header=True,
)
users_df.show()

+-------+-------------+--------------------+
|user_id|         name|               email|
+-------+-------------+--------------------+
|      1|Ananya Sharma|ananya.sharma@exa...|
|      2|  Rahul Verma|rahul.verma@examp...|
+-------+-------------+--------------------+



In [0]:
# Attach each user's columns to their expense rows.
# Inner join on user_id: only rows with a match on both sides are kept.
combined_df = expenses_df.join(
    users_df,
    on=["user_id"],
    how="inner",
)
combined_df.show()

+-------+--------------------+------+------------+--------------------+-------------+--------------------+
|user_id|            category|amount|expense_date|         description|         name|               email|
+-------+--------------------+------+------------+--------------------+-------------+--------------------+
|      1|   Utilities & Bills|1200.0|  2025-06-01| BSNL broadband bill|Ananya Sharma|ananya.sharma@exa...|
|      1|                Food| 850.0|  2025-05-03|Dinner at Saravan...|Ananya Sharma|ananya.sharma@exa...|
|      2|           Transport|1000.0|  2025-05-02|Uber ride to airport|  Rahul Verma|rahul.verma@examp...|
|      2|          Healthcare| 450.0|  2025-05-04|Doctor visit and ...|  Rahul Verma|rahul.verma@examp...|
|      2|Savings & Investm...|3000.0|  2025-05-05|Monthly SIP in Ax...|  Rahul Verma|rahul.verma@examp...|
+-------+--------------------+------+------------+--------------------+-------------+--------------------+



In [0]:
# Create monthly summary with alerts
from pyspark.sql.functions import month, year, sum as _sum, when, col

# Monthly total above which a user's month is labelled "High Spend".
HIGH_SPEND_THRESHOLD = 20000

# The CSV files are loaded without a schema, so `amount` and `expense_date`
# arrive as strings. Cast them explicitly rather than depending on Spark's
# implicit string coercion inside sum()/month()/year().
typed_expenses_df = (
    combined_df
    .withColumn("amount", col("amount").cast("double"))
    .withColumn("expense_date", col("expense_date").cast("date"))
)

# One row per (user_id, year, month): total amount spent plus an alert label.
summary_df = (
    typed_expenses_df
    .withColumn("month", month("expense_date"))
    .withColumn("year", year("expense_date"))
    .groupBy("user_id", "year", "month")
    .agg(_sum("amount").alias("monthly_spend"))
    .withColumn(
        "alert",
        when(col("monthly_spend") > HIGH_SPEND_THRESHOLD, "High Spend").otherwise("Normal"),
    )
)
summary_df.show()

+-------+----+-----+-------------+------+
|user_id|year|month|monthly_spend| alert|
+-------+----+-----+-------------+------+
|      2|2025|    5|       4450.0|Normal|
|      1|2025|    6|       1200.0|Normal|
|      1|2025|    5|        850.0|Normal|
+-------+----+-----+-------------+------+



In [0]:
# Persist the monthly summary twice, replacing any previous output at each path:
# once as a Delta table, once as CSV with a header row.
(
    summary_df.write
    .mode("overwrite")
    .format("delta")
    .save("file:/Workspace/Shared/expense_summary_delta")
)
(
    summary_df.write
    .mode("overwrite")
    .format("csv")
    .option("header", True)
    .save("file:/Workspace/Shared/expense_summary_csv")
)