In [5]:
# Stop a SparkSession left over from an earlier run of this notebook before a
# new one is built.
# SparkSession is imported here because this cell runs before the cell that
# imports it; without the import, the isinstance() check raises NameError
# whenever `spark` is already defined but `SparkSession` is not in scope.
from pyspark.sql import SparkSession

# globals().get() returns None when `spark` has never been defined, so a fresh
# kernel falls straight through without touching the name.
if isinstance(globals().get("spark"), SparkSession):
    print("Stopping existing SparkSession...")
    spark.stop()

Stopping existing SparkSession...


In [6]:
from pyspark.sql import SparkSession

# Build the session for this notebook, or reuse the active one if it exists.
spark = (
    SparkSession.builder
    .appName("Smart Home Tracker")
    .getOrCreate()
)

# Bare last expression: the notebook renders the session summary.
spark

In [7]:
# NOTE: `sum` imported here is the Spark aggregate and shadows Python's
# built-in sum() for the rest of the notebook.
from pyspark.sql.functions import col, hour, when, sum

# Read the energy log CSV (first row is the header, column types are inferred)
# and add an `hour` column holding the hour-of-day of each `timestamp`.
df = (
    spark.read
    .csv("energy_logs.csv", header=True, inferSchema=True)
    .withColumn("hour", hour(col("timestamp")))
)
df.show()

+------+---------+-------------------+---------------+----+
|log_id|device_id|          timestamp|energy_used_kwh|hour|
+------+---------+-------------------+---------------+----+
|     1|        1|2025-06-01 08:00:00|           0.05|   8|
|     2|        1|2025-06-01 09:00:00|           0.06|   9|
|     3|        2|2025-06-01 20:00:00|           0.12|  20|
|     4|        3|2025-06-01 22:00:00|           0.08|  22|
|     5|        4|2025-06-01 07:00:00|            0.1|   7|
|     6|        5|2025-06-01 14:00:00|           0.75|  14|
|     7|        6|2025-06-01 09:30:00|            0.2|   9|
|     8|        1|2025-06-02 08:00:00|           0.07|   8|
|     9|        3|2025-06-02 22:00:00|           0.09|  22|
+------+---------+-------------------+---------------+----+



In [8]:
# Classify each log entry as 'peak' when its hour-of-day is 18 through 22
# inclusive (timestamps from 18:00 up to 22:59), otherwise 'off_peak'.
df = df.withColumn(
    "period",
    when((col("hour") >= 18) & (col("hour") <= 22), "peak").otherwise("off_peak"),
)

# Total energy per device, split by peak / off-peak period.
# `sum` is the Spark aggregate imported from pyspark.sql.functions, not the
# Python built-in.
device_usage = df.groupBy("device_id", "period").agg(
    sum("energy_used_kwh").alias("total_kwh")
)

# Total energy per device across all periods, highest consumers first.
total_usage = (
    df.groupBy("device_id")
    .agg(sum("energy_used_kwh").alias("total_kwh"))
    .orderBy(col("total_kwh").desc())
)

# Spark's default save mode raises an error when the target path already
# exists, which made this cell fail on every re-run of the notebook.
# Overwrite the previous export so Restart & Run All works.
total_usage.write.mode("overwrite").csv("output/top_devices.csv", header=True)