In [1]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook; getOrCreate() hands back an
# already-running session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("ExpenseVolumeAnalysis")
    .getOrCreate()
)
spark  # bare last expression so the notebook renders the session summary

In [8]:
# Load the raw expense records. The first CSV row supplies the column names,
# and Spark infers each column's type from the data.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv("expenses_large.csv")
df.show()

+-------+---------+------+------------+--------------------+
|user_id| category|amount|expense_date|         description|
+-------+---------+------+------------+--------------------+
|      1|     Food|   500|  2025-01-10|Dinner at restaurant|
|      1|     Food|   600|  2025-02-15|  Lunch with friends|
|      1|     Food|   700|  2025-03-12|      Weekend snacks|
|      1|     Food|  2000|  2025-04-05|      Party at hotel|
|      1|     Food|   300|  2025-05-09|       Chai & Samosa|
|      2|Transport|   800|  2025-01-12| Metro card recharge|
|      2|Transport|   900|  2025-02-20|           Uber ride|
|      2|Transport|   850|  2025-03-10|      Taxi to office|
|      2|Transport|   950|  2025-04-18|    Bus monthly pass|
|      2|Transport|  4000|  2025-05-25|       Flight ticket|
+-------+---------+------+------------+--------------------+



In [9]:
from pyspark.sql.functions import month, year, sum as _sum, col, avg, stddev
# Normalise expense_date to a date type, then derive the calendar month and
# year from it; the aggregation below groups on these two columns.
df = (
    df
    .withColumn("expense_date", col("expense_date").cast("date"))
    .withColumn("month", month(col("expense_date")))
    .withColumn("year", year(col("expense_date")))
)
df.show()

+-------+---------+------+------------+--------------------+-----+----+
|user_id| category|amount|expense_date|         description|month|year|
+-------+---------+------+------------+--------------------+-----+----+
|      1|     Food|   500|  2025-01-10|Dinner at restaurant|    1|2025|
|      1|     Food|   600|  2025-02-15|  Lunch with friends|    2|2025|
|      1|     Food|   700|  2025-03-12|      Weekend snacks|    3|2025|
|      1|     Food|  2000|  2025-04-05|      Party at hotel|    4|2025|
|      1|     Food|   300|  2025-05-09|       Chai & Samosa|    5|2025|
|      2|Transport|   800|  2025-01-12| Metro card recharge|    1|2025|
|      2|Transport|   900|  2025-02-20|           Uber ride|    2|2025|
|      2|Transport|   850|  2025-03-10|      Taxi to office|    3|2025|
|      2|Transport|   950|  2025-04-18|    Bus monthly pass|    4|2025|
|      2|Transport|  4000|  2025-05-25|       Flight ticket|    5|2025|
+-------+---------+------+------------+--------------------+-----+----+

In [10]:
# Collapse individual expenses into one total per user per calendar month.
monthly_spend = (
    df.groupBy("user_id", "year", "month")
    .agg(_sum("amount").alias("total_spent"))
)

# Per-user baseline over those monthly totals: the mean and the sample
# standard deviation. No spike detection happens here; these are only the
# statistics a later cell compares each month against.
spend_stats = [
    avg("total_spent").alias("avg_spent"),
    stddev("total_spent").alias("std_spent"),
]
unusual_df = monthly_spend.groupBy("user_id").agg(*spend_stats)
unusual_df.show()

+-------+---------+------------------+
|user_id|avg_spent|         std_spent|
+-------+---------+------------------+
|      1|    820.0| 676.0177512462228|
|      2|   1500.0|1398.6600730699365|
+-------+---------+------------------+



In [18]:
# Attach each user's baseline (mean and standard deviation of their monthly
# totals) to every one of that user's monthly rows.
detected = monthly_spend.join(unusual_df, "user_id")

# A month counts as a spike when its total exceeds the user's mean by more
# than one standard deviation. The sample standard deviation is undefined for
# a user with only one month of data (Spark yields NULL there), which would
# make the comparison NULL as well. The isNotNull() guard turns that case into
# an explicit false so is_spike is always a real boolean.
detected = detected.withColumn(
    "is_spike",
    col("std_spent").isNotNull()
    & (col("total_spent") > (col("avg_spent") + col("std_spent"))),
)
detected.show()

# Keep only the spike months. is_spike is already boolean, so filter on the
# column directly rather than comparing it with True.
detected_spikes = detected.filter(col("is_spike"))
detected_spikes.select("user_id", "year", "month", "total_spent", "avg_spent", "std_spent").show()


+-------+----+-----+-----------+---------+------------------+--------+
|user_id|year|month|total_spent|avg_spent|         std_spent|is_spike|
+-------+----+-----+-----------+---------+------------------+--------+
|      1|2025|    4|       2000|    820.0| 676.0177512462228|    true|
|      1|2025|    2|        600|    820.0| 676.0177512462228|   false|
|      1|2025|    1|        500|    820.0| 676.0177512462228|   false|
|      1|2025|    5|        300|    820.0| 676.0177512462228|   false|
|      1|2025|    3|        700|    820.0| 676.0177512462228|   false|
|      2|2025|    1|        800|   1500.0|1398.6600730699365|   false|
|      2|2025|    5|       4000|   1500.0|1398.6600730699365|    true|
|      2|2025|    3|        850|   1500.0|1398.6600730699365|   false|
|      2|2025|    4|        950|   1500.0|1398.6600730699365|   false|
|      2|2025|    2|        900|   1500.0|1398.6600730699365|   false|
+-------+----+-----+-----------+---------+------------------+--------+

+----