## Data Loading

In [2]:
import pandas as pd
# Load the household energy consumption dataset; the relative path is resolved
# against the notebook's current working directory.
data = pd.read_csv('household_energy_consumption.csv')
# Preview the first rows to confirm the file parsed as expected.
data.head()

Unnamed: 0,Household_ID,Date,Energy_Consumption_kWh,Household_Size,Avg_Temperature_C,Has_AC,Peak_Hours_Usage_kWh
0,H00001,2025-04-01,8.4,4,17.8,No,3.2
1,H00001,2025-04-02,7.9,4,17.3,No,2.8
2,H00001,2025-04-03,9.2,4,18.6,No,3.0
3,H00001,2025-04-04,7.9,4,18.2,No,2.7
4,H00001,2025-04-05,9.6,4,11.9,No,3.2


In [3]:
# Show column dtypes, non-null counts and memory usage of the freshly loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90000 entries, 0 to 89999
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Household_ID            90000 non-null  object 
 1   Date                    90000 non-null  object 
 2   Energy_Consumption_kWh  90000 non-null  float64
 3   Household_Size          90000 non-null  int64  
 4   Avg_Temperature_C       90000 non-null  float64
 5   Has_AC                  90000 non-null  object 
 6   Peak_Hours_Usage_kWh    90000 non-null  float64
dtypes: float64(3), int64(1), object(3)
memory usage: 4.8+ MB


## Data Cleaning and Preprocessing

In [4]:
# Boolean mask marking rows that repeat an earlier row across all columns,
# followed by the number of such rows.
duplicate = data.duplicated()
duplicate_count = duplicate.sum()
print(duplicate_count)

0


In [5]:
# Element-wise missing-value mask, then the number of missing entries per column.
missing = data.isna()
missing_per_column = missing.sum()
print(missing_per_column)

Household_ID              0
Date                      0
Energy_Consumption_kWh    0
Household_Size            0
Avg_Temperature_C         0
Has_AC                    0
Peak_Hours_Usage_kWh      0
dtype: int64


In [6]:
# Parse the date strings into datetime64 so the .dt accessor can be used on them.
# pd.to_datetime is safe to re-run on a column that is already datetime.
data['Date'] = pd.to_datetime(data['Date'])

# Encode Has_AC as 1/0.
# Series.map() turns every value that is missing from the mapping into NaN, so:
#   * re-running this cell on the already-encoded column would wipe it out, and
#   * an unexpected label (e.g. 'yes', ' No') would be silently lost.
# Only map while the column still holds the raw labels, and fail loudly if any
# label is not covered by the mapping.
if not pd.api.types.is_numeric_dtype(data['Has_AC']):
    has_ac_encoded = data['Has_AC'].map({'Yes': 1, 'No': 0})
    unmapped = has_ac_encoded.isna()
    if unmapped.any():
        unexpected = data.loc[unmapped, 'Has_AC'].unique().tolist()
        raise ValueError(f"Unexpected Has_AC values: {unexpected}")
    data['Has_AC'] = has_ac_encoded.astype('int64')

# Confirm the resulting dtypes (Date -> datetime64, Has_AC -> int64).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90000 entries, 0 to 89999
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Household_ID            90000 non-null  object        
 1   Date                    90000 non-null  datetime64[ns]
 2   Energy_Consumption_kWh  90000 non-null  float64       
 3   Household_Size          90000 non-null  int64         
 4   Avg_Temperature_C       90000 non-null  float64       
 5   Has_AC                  90000 non-null  int64         
 6   Peak_Hours_Usage_kWh    90000 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(2), object(1)
memory usage: 4.8+ MB


In [8]:
# Summary statistics (count, mean, quartiles, min/max, std) for the datetime and numeric columns.
data.describe()

Unnamed: 0,Date,Energy_Consumption_kWh,Household_Size,Avg_Temperature_C,Has_AC,Peak_Hours_Usage_kWh
count,90000,90000.0,90000.0,90000.0,90000.0,90000.0
mean,2025-04-04 00:00:03.840000256,10.571988,3.487811,17.505802,0.494356,4.319557
min,2025-04-01 00:00:00,0.5,1.0,10.0,0.0,0.2
25%,2025-04-02 00:00:00,6.0,2.0,15.8,0.0,2.3
50%,2025-04-04 00:00:00,10.4,3.0,17.5,0.0,4.0
75%,2025-04-06 00:00:00,14.8,5.0,19.2,1.0,6.0
max,2025-04-08 00:00:00,20.0,6.0,25.0,1.0,10.0
std,,5.519494,1.709761,2.491621,0.499971,2.531432


In [10]:
# Derive calendar features from the parsed Date column.
# Targets are assigned left to right, so the new columns keep the order
# Month, Day, DayOfWeek.
data['Month'], data['Day'], data['DayOfWeek'] = (
    data['Date'].dt.month,
    data['Date'].dt.day,
    data['Date'].dt.dayofweek,  # Monday=0 ... Sunday=6
)

# Saturday (5) and Sunday (6) are the only dayofweek values >= 5.
data['Is_Weekend'] = (data['DayOfWeek'] >= 5).astype(int)

data.head()

Unnamed: 0,Household_ID,Date,Energy_Consumption_kWh,Household_Size,Avg_Temperature_C,Has_AC,Peak_Hours_Usage_kWh,Month,Day,DayOfWeek,Is_Weekend
0,H00001,2025-04-01,8.4,4,17.8,0,3.2,4,1,1,0
1,H00001,2025-04-02,7.9,4,17.3,0,2.8,4,2,2,0
2,H00001,2025-04-03,9.2,4,18.6,0,3.0,4,3,3,0
3,H00001,2025-04-04,7.9,4,18.2,0,2.7,4,4,4,0
4,H00001,2025-04-05,9.6,4,11.9,0,3.2,4,5,5,1
