#1. Upload Cleaned Data into Databricks

In [0]:
expenses_df = spark.read.csv("dbfs:/FileStore/tables/cleaned_expenses.csv", header=True, inferSchema=True)
users_df = spark.read.csv("dbfs:/FileStore/tables/users-1.csv", header=True, inferSchema=True)

In [0]:
expenses_df.show()

+-------+-----------+-------+------------+--------------------+----------+
|user_id|category_id| amount|expense_date|         description|     month|
+-------+-----------+-------+------------+--------------------+----------+
|      1|          1| 2500.0|  2025-07-01|Weekly groceries ...|2025-07-01|
|      2|          2|10000.0|  2025-07-01|  Monthly house rent|2025-07-01|
|      3|          5| 1200.0|  2025-07-02| Dinner with friends|2025-07-01|
|      4|          3| 1800.0|  2025-03-07|Electricity and w...|2025-03-01|
|      5|          6| 2200.0|  2025-07-04|General health ch...|2025-07-01|
|      6|          4|  800.0|  2025-07-04|Local transport c...|2025-07-01|
|      7|          7| 5000.0|  2025-07-05|        Tuition fees|2025-07-01|
|      8|          8|  950.0|  2025-07-05|    Movie and snacks|2025-07-01|
|      1|          5|  700.0|  2025-07-06|       Lunch at cafe|2025-07-01|
|      2|          1| 3000.0|  2025-07-07|Vegetables and fr...|2025-07-01|
|      3|          4|  60

In [0]:
users_df.show()

+-------+--------+------------------+--------------+
|user_id|    name|             email|monthly_income|
+-------+--------+------------------+--------------+
|      1|Elakkiya|elakkiya@gmail.com|          2000|
|      2| Kashifa| kashifa@gmail.com|          4000|
|      3|    Roja|    roja@gmail.com|          3000|
|      4|Sereesha|sereesha@gmail.com|          5000|
|      5| Lavanya| lavanya@gmail.com|          5000|
|      6| Rithika| rithika@gmail.com|          4700|
|      7|Shobitha|shobitha@gmail.com|          4900|
|      8|  Harish|  harish@gmail.com|          3000|
+-------+--------+------------------+--------------+



#2. Combine User & Expense Data

In [0]:
combined_df = users_df.join(expenses_df, on="user_id", how="left")
combined_df.show()

+-------+--------+------------------+--------------+-----------+-------+------------+--------------------+----------+
|user_id|    name|             email|monthly_income|category_id| amount|expense_date|         description|     month|
+-------+--------+------------------+--------------+-----------+-------+------------+--------------------+----------+
|      1|Elakkiya|elakkiya@gmail.com|          2000|          1| 2500.0|  2025-07-01|Weekly groceries ...|2025-07-01|
|      1|Elakkiya|elakkiya@gmail.com|          2000|          5|  700.0|  2025-07-06|       Lunch at cafe|2025-07-01|
|      1|Elakkiya|elakkiya@gmail.com|          2000|          3|  950.0|  2025-07-14|Internet and elec...|2025-07-01|
|      6| Rithika| rithika@gmail.com|          4700|          4|  800.0|  2025-07-04|Local transport c...|2025-07-01|
|      6| Rithika| rithika@gmail.com|          4700|          6| 1500.0|  2025-11-07|      Dental checkup|2025-11-01|
|      6| Rithika| rithika@gmail.com|          4700|    

#3. Create Summary Table

In [0]:
from pyspark.sql.functions import col, date_format, sum as _sum, avg, when,round,col
combined_df = combined_df.withColumn("month", date_format("expense_date", "yyyy-MM"))
summary_df = combined_df.groupBy("user_id", "month").agg(
    _sum("amount").alias("monthly_spend"),
    round(avg("amount"),2).alias("avg_transaction")
)

summary_df = summary_df.join(users_df.select("user_id", "monthly_income"), on="user_id", how="left")

summary_df = summary_df.withColumn("monthly_savings",when((col("monthly_income") - col("monthly_spend")) < 0, 0).otherwise (col("monthly_income") - col("monthly_spend")))

summary_df = summary_df.withColumn("alert",when(col("monthly_spend") > col("monthly_income") * 0.9, "High Spending").otherwise("Normal"))
summary_df.orderBy("user_id").show()

+-------+-------+-------------+---------------+--------------+---------------+-------------+
|user_id|  month|monthly_spend|avg_transaction|monthly_income|monthly_savings|        alert|
+-------+-------+-------------+---------------+--------------+---------------+-------------+
|      1|2025-07|       4150.0|        1383.33|          2000|            0.0|High Spending|
|      2|2025-07|      14400.0|         4800.0|          4000|            0.0|High Spending|
|      3|2025-07|       4100.0|        1366.67|          3000|            0.0|High Spending|
|      4|2025-07|       4400.0|         2200.0|          5000|          600.0|       Normal|
|      4|2025-03|       1800.0|         1800.0|          5000|         3200.0|       Normal|
|      5|2025-07|      13600.0|        4533.33|          5000|            0.0|High Spending|
|      6|2025-11|       1500.0|         1500.0|          4700|         3200.0|       Normal|
|      6|2025-07|       4800.0|         2400.0|          4700|        

as the sample expenses are of small amount i have given the monthly income less than 5000 for all user inorder to get variation in savings and alerts.

#4. Save as Delta or CSV 

In [0]:
summary_df.write.format("delta").mode("overwrite").save("dbfs:/FileStore/expense_summary_delta")
summary_df.write.csv("dbfs:/FileStore/expense_summary_csv", header=True, mode="overwrite")