In [0]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook: getOrCreate() returns the
# already-active session when there is one, otherwise it builds a new one
# registered under the application name below.
spark = (
    SparkSession.builder
    .appName("Smart Home Tracker")
    .getOrCreate()
)
spark  # last expression of the cell, so the session summary is displayed

In [0]:
# Load the raw energy logs. The first CSV row is treated as the header and
# Spark infers each column's type from the data.
energy_logs_path = "file:/Workspace/Shared/energy_logs.csv"
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(energy_logs_path)
)
df.show()

+------+---------+-------------------+---------------+
|log_id|device_id|          timestamp|energy_used_kwh|
+------+---------+-------------------+---------------+
|     1|        1|2025-06-01 08:00:00|           0.05|
|     2|        1|2025-06-01 09:00:00|           0.06|
|     3|        2|2025-06-01 20:00:00|           0.12|
|     4|        3|2025-06-01 22:00:00|           0.08|
|     5|        4|2025-06-01 07:00:00|            0.1|
|     6|        5|2025-06-01 14:00:00|           0.75|
|     7|        6|2025-06-01 09:30:00|            0.2|
|     8|        1|2025-06-02 08:00:00|           0.07|
|     9|        3|2025-06-02 22:00:00|           0.09|
+------+---------+-------------------+---------------+



In [0]:
from pyspark.sql.functions import to_timestamp, date_format, weekofyear

# Derive the time columns in a single chain; each step sees the columns
# produced by the step before it:
#   timestamp - passed through to_timestamp (the column is replaced in place)
#   date      - "yyyy-MM-dd" text from date_format; note this is a string
#               column, not a DateType
#   week      - week-of-year number from weekofyear; it carries no year, so
#               the same week number in different years is indistinguishable
df = (
    df.withColumn("timestamp", to_timestamp("timestamp"))
    .withColumn("date", date_format("timestamp", "yyyy-MM-dd"))
    .withColumn("week", weekofyear("timestamp"))
)

df.show()

+------+---------+-------------------+---------------+----------+----+
|log_id|device_id|          timestamp|energy_used_kwh|      date|week|
+------+---------+-------------------+---------------+----------+----+
|     1|        1|2025-06-01 08:00:00|           0.05|2025-06-01|  22|
|     2|        1|2025-06-01 09:00:00|           0.06|2025-06-01|  22|
|     3|        2|2025-06-01 20:00:00|           0.12|2025-06-01|  22|
|     4|        3|2025-06-01 22:00:00|           0.08|2025-06-01|  22|
|     5|        4|2025-06-01 07:00:00|            0.1|2025-06-01|  22|
|     6|        5|2025-06-01 14:00:00|           0.75|2025-06-01|  22|
|     7|        6|2025-06-01 09:30:00|            0.2|2025-06-01|  22|
|     8|        1|2025-06-02 08:00:00|           0.07|2025-06-02|  23|
|     9|        3|2025-06-02 22:00:00|           0.09|2025-06-02|  23|
+------+---------+-------------------+---------------+----------+----+



In [0]:
# Per-device daily totals: add up energy_used_kwh for every (device_id, date) pair.
# NOTE: the import below rebinds the name `sum` to Spark's column aggregate,
# shadowing Python's built-in sum() in this notebook from this cell onward.
from pyspark.sql.functions import sum
daily_kwh_total = sum("energy_used_kwh").alias("daily_kwh")
daily_summary = df.groupBy("device_id", "date").agg(daily_kwh_total)
daily_summary.show()

+---------+----------+---------+
|device_id|      date|daily_kwh|
+---------+----------+---------+
|        3|2025-06-01|     0.08|
|        1|2025-06-02|     0.07|
|        1|2025-06-01|     0.11|
|        6|2025-06-01|      0.2|
|        2|2025-06-01|     0.12|
|        3|2025-06-02|     0.09|
|        4|2025-06-01|      0.1|
|        5|2025-06-01|     0.75|
+---------+----------+---------+



In [0]:
# Per-device weekly totals: add up energy_used_kwh for every (device_id, week) pair.
# The aggregate is referenced through the functions module so that this cell
# does not rely on a bare `sum` name having been rebound to Spark's aggregate
# elsewhere; with Python's built-in sum(), the call would raise a TypeError.
from pyspark.sql import functions as F

weekly_summary = (
    df.groupBy("device_id", "week")
    .agg(F.sum("energy_used_kwh").alias("weekly_kwh"))
)
weekly_summary.show()

+---------+----+----------+
|device_id|week|weekly_kwh|
+---------+----+----------+
|        3|  22|      0.08|
|        6|  22|       0.2|
|        4|  22|       0.1|
|        2|  22|      0.12|
|        5|  22|      0.75|
|        1|  22|      0.11|
|        1|  23|      0.07|
|        3|  23|      0.09|
+---------+----+----------+



In [0]:
# Save daily summary in Delta format, replacing any previous output at this path
daily_summary.write.format("delta").mode("overwrite").save("file:/Workspace/Shared/daily_summary")

# Save weekly summary as CSV with a header row.
# mode("overwrite") mirrors the Delta write above. Without an explicit mode the
# writer uses its default save mode, which raises an error when the target path
# already exists, so running this notebook a second time would fail here.
weekly_summary.write.mode("overwrite").csv("file:/Workspace/Shared/weekly_summary.csv", header=True)