# Load a large dataset of sensor logs using PySpark

In [1]:
# The cleaned data produced in Python (week 2) is stored in Google Drive,
# so Drive is mounted into the Colab filesystem before anything is read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one; otherwise start a new one
# registered under this application name.
session_builder = SparkSession.builder.appName("SmartHomeEnergyAnalysis")
spark = session_builder.getOrCreate()

In [4]:
# Read the cleaned energy logs from Drive. The first row is treated as the
# header and column types are inferred from the data.
energy_logs_path = '/content/drive/My Drive/cleaned_energy_logs.csv'
df = spark.read.options(header=True, inferSchema=True).csv(energy_logs_path)

# Preview the loaded rows (show() prints at most 20 rows by default).
df.show()

+------+---------+---------------+-------------------+
|log_id|device_id|energy_used_kwh|           log_time|
+------+---------+---------------+-------------------+
|     1|      101|            2.5|2025-07-20 09:00:00|
|     2|      101|            3.0|2025-07-20 18:00:00|
|     3|      102|            1.2|2025-07-20 12:00:00|
|     4|      103|            0.0|2025-07-20 13:00:00|
|     5|      104|            4.5|2025-07-20 07:00:00|
|     6|      104|            5.0|2025-07-20 20:00:00|
|     7|      105|            1.8|2025-07-20 10:00:00|
+------+---------+---------------+-------------------+



# Group by device and calculate peak vs off-peak usage

In [31]:
from pyspark.sql.functions import col, hour

# Peak window expressed as hours of the day (0-23), inclusive on both ends.
# These bounds are only example values chosen for this dataset; adjust them
# to match the real tariff / usage pattern.
PEAK_START_HOUR = 12
PEAK_END_HOUR = 20

# Derive the hour of day from the log timestamp.
df = df.withColumn("hour", hour(col("log_time")))

# A log is "peak" when its hour lies inside the peak window.
is_peak_hour = col("hour").between(PEAK_START_HOUR, PEAK_END_HOUR)
peak_df = df.filter(is_peak_hour)

# Off-peak is defined as the complement of the peak window. The previous
# hand-written range (0-10) left hour 11 and hours 21-23 in neither set, so
# logs at those hours were silently dropped from the analysis.
# NOTE: rows whose log_time is null still land in neither set, because a null
# predicate is filtered out by Spark.
off_peak_df = df.filter(~is_peak_hour)

In [34]:
# Per-device maximum energy reading inside the peak and the off-peak windows.
# NOTE: this import makes `max` refer to the Spark aggregate (shadowing the
# Python builtin) for the rest of the notebook; later cells rely on it.
from pyspark.sql.functions import max, avg


def _max_energy_per_device(frame, alias):
    """Group `frame` by device_id and return the max energy_used_kwh as `alias`."""
    return frame.groupBy("device_id").agg(max("energy_used_kwh").alias(alias))


peak_usage = _max_energy_per_device(peak_df, "energy_used_peak_hr")
peak_usage.show()

off_peak_usage = _max_energy_per_device(off_peak_df, "energy_used_offpeak_hr")
off_peak_usage.show()

+---------+-------------------+
|device_id|energy_used_peak_hr|
+---------+-------------------+
|      101|                3.0|
|      103|                0.0|
|      102|                1.2|
|      104|                5.0|
+---------+-------------------+

+---------+----------------------+
|device_id|energy_used_offpeak_hr|
+---------+----------------------+
|      101|                   2.5|
|      105|                   1.8|
|      104|                   4.5|
+---------+----------------------+



# Identify top energy-consuming devices

In [36]:
from pyspark.sql.functions import desc

# How many devices to keep in the ranking.
TOP_N_DEVICES = 5

# Peak and off-peak figures side by side. The outer join keeps devices that
# appear in only one of the two windows; their missing figure becomes 0.
usage_summary = peak_usage.join(off_peak_usage, "device_id", "outer").na.fill(0)

# Overall per-device statistics across every log, regardless of hour.
# (`avg` and `max` are the Spark aggregates imported in an earlier cell.)
total_usage = df.groupBy("device_id").agg(
    avg("energy_used_kwh").alias("avg_energy"),
    max("energy_used_kwh").alias("max_energy"),
)

# Right join so that every device present in the logs is kept, even one with
# no rows in either the peak or the off-peak set (an inner join would drop it
# from the ranking). Only the peak/off-peak columns are zero-filled, so the
# overall statistics are left exactly as computed. The column order is the
# same as with the inner join: device_id, peak, off-peak, avg, max.
final_summary = usage_summary.join(total_usage, "device_id", "right").na.fill(
    0, subset=["energy_used_peak_hr", "energy_used_offpeak_hr"]
)

# Rank by highest single reading; device_id breaks ties so the top-N selection
# is deterministic across runs.
top_devices = final_summary.orderBy(desc("max_energy"), "device_id").limit(TOP_N_DEVICES)
top_devices.show()

+---------+-------------------+----------------------+----------+----------+
|device_id|energy_used_peak_hr|energy_used_offpeak_hr|avg_energy|max_energy|
+---------+-------------------+----------------------+----------+----------+
|      104|                5.0|                   4.5|      4.75|       5.0|
|      101|                3.0|                   2.5|      2.75|       3.0|
|      105|                0.0|                   1.8|       1.8|       1.8|
|      102|                1.2|                   0.0|       1.2|       1.2|
|      103|                0.0|                   0.0|       0.0|       0.0|
+---------+-------------------+----------------------+----------+----------+



In [40]:
# Save the top devices, with their peak and off-peak energy figures, to Drive
# as CSV. Spark treats the target path as an output directory, so this creates
# a folder named top_devices_output containing a part-*.csv file rather than a
# single standalone CSV file; coalesce(1) keeps it to one part file.
output_dir = "/content/drive/My Drive/top_devices_output"
(
    top_devices.coalesce(1)
    .write.mode('overwrite')
    .option("header", True)
    .csv(output_dir)
)