# Project Description

# Part 1: Data Preparation

In [2]:
import pandas as pd

# Load the raw dataset; the first CSV column is used as the row index.
df = pd.read_csv('KAG_energydata_complete.csv', index_col=0)

# Quick inspection: dimensions, column names and the first ten rows,
# followed by the dtype / non-null summary.
print(df.shape, df.columns, df.head(10), sep='\n')
df.info()

(19735, 28)
Index(['Appliances', 'lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3', 'T4',
       'RH_4', 'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8', 'RH_8', 'T9',
       'RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility',
       'Tdewpoint', 'rv1', 'rv2'],
      dtype='object')
                     Appliances  lights         T1       RH_1     T2  \
date                                                                   
2016-01-11 17:00:00          60      30  19.890000  47.596667  19.20   
2016-01-11 17:10:00          60      30  19.890000  46.693333  19.20   
2016-01-11 17:20:00          50      30  19.890000  46.300000  19.20   
2016-01-11 17:30:00          50      40  19.890000  46.066667  19.20   
2016-01-11 17:40:00          60      40  19.890000  46.333333  19.20   
2016-01-11 17:50:00          50      40  19.890000  46.026667  19.20   
2016-01-11 18:00:00          60      50  19.890000  45.766667  19.20   
2016-01-11 18:10:00          60      50  19.856667 

# Part 2: Feature Engineering

## 1. Time features

In [4]:
import numpy as np

In [5]:
# Work on a copy of the loaded frame and replace its index with parsed
# datetimes, so the DatetimeIndex accessors (.hour, .dayofweek, ...) used by
# the following cells are available.
df = df.copy().set_axis(pd.to_datetime(df.index), axis="index")

In [6]:
# Basic calendar features read straight off the DatetimeIndex.
# Maps new column name -> DatetimeIndex attribute; the dict order is the
# order in which the columns are appended to df.
calendar_attrs = {
    'hour': 'hour',
    'day_of_week': 'dayofweek',  # Monday=0 ... Sunday=6
    'month': 'month',
    'day_of_month': 'day',
    'day_of_year': 'dayofyear',
}
for feature_name, index_attr in calendar_attrs.items():
    df[feature_name] = getattr(df.index, index_attr)

In [7]:
# Weekend flag: 1 when the day of week is Saturday (5) or Sunday (6), else 0.
df['is_weekend'] = df.index.dayofweek.isin([5, 6]).astype(int)

In [8]:
# Bucket the hour of day into four periods:
#   0 = morning   (06-11)
#   1 = afternoon (12-17)
#   2 = evening   (18-23)
#   3 = night     (00-05, the fallback when no condition matches)
hour_of_day = df['hour']

# `between` is inclusive on both ends, hence the 11 / 17 / 23 upper bounds.
conditions = [
    hour_of_day.between(6, 11),
    hour_of_day.between(12, 17),
    hour_of_day.between(18, 23),
]
choices = [0, 1, 2]

df['time_period'] = np.select(conditions, choices, default=3)


In [9]:
# Cyclical encoding of the periodic time features.
#
# Each time unit is mapped onto a circle with a sine/cosine pair, turning the
# discrete value into continuous circular coordinates. This keeps adjacent
# periods close to each other (e.g. hour 23 and hour 0) and removes the
# artificial jump at the period boundary.


def add_cyclical_encoding(frame, column, period):
    """Add `<column>_sin` and `<column>_cos` columns to `frame` in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame that already contains `column`.
    column : str
        Name of the periodic feature to encode.
    period : scalar or array-like
        Length of one full cycle. May be an array with one entry per row when
        the cycle length varies from row to row.
    """
    angle = 2 * np.pi * frame[column] / period
    frame[f'{column}_sin'] = np.sin(angle)
    frame[f'{column}_cos'] = np.cos(angle)


# hour
add_cyclical_encoding(df, 'hour', 24)

# week
add_cyclical_encoding(df, 'day_of_week', 7)

# month
add_cyclical_encoding(df, 'month', 12)

# day of month
# Months have 28-31 days. With a fixed period of 30, day 31 lands on exactly
# the same point of the circle as day 1, so the real length of each row's
# month is used as the period instead.
add_cyclical_encoding(df, 'day_of_month', df.index.days_in_month.to_numpy())

# day of year (366 = length of a leap year)
add_cyclical_encoding(df, 'day_of_year', 366)

## 2. Temperature features

In [33]:
# Indoor room temperature sensors: T1-T9 without T6, which this notebook
# treats as an outdoor reading. The columns are listed explicitly because a
# substring match on 'T' also selects T_out, Tdewpoint and every derived T_*
# column (including the ones created in this very cell); that contaminates the
# "8 rooms" statistics and makes the result change each time the cell is re-run.
indoor_temp_cols = [f"T{room}" for room in range(1, 10) if room != 6]
indoor_temps = df[indoor_temp_cols]

df["T_indoor_avg"] = indoor_temps.mean(axis=1)  # average temperature of the 8 rooms
df["T_indoor_std"] = indoor_temps.std(axis=1)  # sample standard deviation across the 8 rooms
df["T_indoor_max"] = indoor_temps.max(axis=1)  # maximum temperature of the 8 rooms
df["T_indoor_min"] = indoor_temps.min(axis=1)  # minimum temperature of the 8 rooms
df["T_indoor_range"] = df["T_indoor_max"] - df["T_indoor_min"]  # spread between warmest and coldest room
df["T_indoor_change"] = df["T_indoor_avg"].diff()  # change vs. the previous row (first row is NaN)

In [21]:
# Indoor-minus-outdoor temperature gaps, using the two outdoor references
# available in the data: the T6 sensor and the T_out column.
indoor_avg_temp = df["T_indoor_avg"]
df["T_diff_1"] = indoor_avg_temp.sub(df["T6"])
df["T_diff_2"] = indoor_avg_temp.sub(df["T_out"])

## 3. Humidity features

In [34]:
# Indoor room humidity sensors: RH_1-RH_9 without RH_6, which this notebook
# treats as an outdoor reading. The columns are listed explicitly because a
# substring match on 'RH' also selects RH_out and every derived RH_* column
# (including the ones created in this very cell); that contaminates the
# "8 rooms" statistics and makes the result change each time the cell is re-run.
indoor_rh_cols = [f"RH_{room}" for room in range(1, 10) if room != 6]
indoor_rh = df[indoor_rh_cols]

df["RH_indoor_avg"] = indoor_rh.mean(axis=1)  # average humidity of the 8 rooms
df["RH_indoor_std"] = indoor_rh.std(axis=1)  # sample standard deviation across the 8 rooms
df["RH_indoor_max"] = indoor_rh.max(axis=1)  # maximum humidity of the 8 rooms
df["RH_indoor_min"] = indoor_rh.min(axis=1)  # minimum humidity of the 8 rooms
df["RH_indoor_range"] = df["RH_indoor_max"] - df["RH_indoor_min"]  # spread between most and least humid room
df["RH_indoor_change"] = df["RH_indoor_avg"].diff()  # change vs. the previous row (first row is NaN)

In [23]:
# Indoor-minus-outdoor humidity gaps, using the two outdoor references
# available in the data: the RH_6 sensor and the RH_out column.
indoor_avg_rh = df["RH_indoor_avg"]
df["RH_diff_1"] = indoor_avg_rh.sub(df["RH_6"])
df["RH_diff_2"] = indoor_avg_rh.sub(df["RH_out"])

In [None]:
pip install metpy

In [31]:
# comfort measures
import metpy.calc as mpcalc
from metpy.calc import heat_index
from metpy.units import units

# MetPy calculations operate on unit-aware quantities, so attach units to the
# raw arrays first.
temp = df["T_indoor_avg"].values * units.degC
humidity = df["RH_indoor_avg"].values * units.percent

# heat index
# By default MetPy masks the heat index wherever the air temperature is below
# 80 degF (about 26.7 degC). Room temperatures in this dataset sit around
# 20 degC, so the default would mask the feature away; mask_undefined=False
# keeps a numeric value for every row. The result is converted to degC (the
# unit of the other temperature columns) and stored as plain floats rather
# than a pint Quantity.
df["heat_index"] = heat_index(temp, humidity, mask_undefined=False).m_as("degC")

# dewpoint
# Stored as plain floats in degC for the same reason.
df["dewpoint"] = mpcalc.dewpoint_from_relative_humidity(temp, humidity).m_as("degC")

## 4. Weather measures

In [37]:
# Composite weather indicator: outdoor temperature x outdoor relative humidity
# (scaled by 0.01) x wind speed, computed on the raw arrays.
df["weather_com"] = (
    df["T_out"].to_numpy()
    * df["RH_out"].to_numpy()
    * 0.01
    * df["Windspeed"].to_numpy()
)

In [36]:
# Row-to-row change of the outdoor weather readings (first row is NaN).
# Pairs are (new column, source column), in the order the columns are added.
change_sources = (
    ('T_out_change', 'T_out'),
    ('Press_change', 'Press_mm_hg'),
    ('Windspeed_change', 'Windspeed'),
)
for change_col, source_col in change_sources:
    df[change_col] = df[source_col].diff()

## 5. Lag features

In [None]:
# Lagged copies of the target series. Lags are counted in rows; the first
# `n_steps` rows of each lag column are NaN.
target_col = 'Appliances'
for n_steps in (1, 2, 3, 6, 144):
    df[f'{target_col}_lag{n_steps}'] = df[target_col].shift(n_steps)

In [41]:
# Lagged copies of the `lights` series: column name -> number of rows shifted.
# NOTE(review): `lights_lag4` and `lights_lag5` hold shifts of 6 and 144 rows,
# not 4 and 5. The names are kept as-is because other code may reference them;
# consider renaming to lights_lag6 / lights_lag144 to match the Appliances lags.
lights_lag_steps = {
    'lights_lag1': 1,
    'lights_lag2': 2,
    'lights_lag3': 3,
    'lights_lag4': 6,
    'lights_lag5': 144,
}
for lag_col, n_steps in lights_lag_steps.items():
    df[lag_col] = df['lights'].shift(n_steps)

In [40]:
# Lagged indoor-average and outdoor temperatures, 1 and 6 rows back.
# Pairs are (source column, prefix of the generated column names).
for source_col, prefix in (('T_indoor_avg', 'T_indoor'), ('T_out', 'T_out')):
    for n_steps in (1, 6):
        df[f'{prefix}_lag{n_steps}'] = df[source_col].shift(n_steps)

## 6. Rolling features

In [None]:
# appliance rolling
# `target_col` is the prediction target, so every window is taken over the
# *previous* rows only (shift(1) before rolling). A window that ends on the
# current row contains the value being predicted; combined with the lag
# features it lets a model reconstruct the target exactly (target leakage).
# Assumes the model predicts the target at the row's own timestamp.
# As with the lag columns, the first row is NaN because it has no history.
# `target_col` is defined in the lag-feature cell above.
past_target = df[target_col].shift(1)

df[f'{target_col}_rolling_mean_6'] = past_target.rolling(window=6, min_periods=1).mean()
df[f'{target_col}_rolling_mean_18'] = past_target.rolling(window=18, min_periods=1).mean()
df[f'{target_col}_rolling_max_6'] = past_target.rolling(window=6, min_periods=1).max()
df[f'{target_col}_rolling_min_6'] = past_target.rolling(window=6, min_periods=1).min()
df[f'{target_col}_rolling_std_18'] = past_target.rolling(window=18, min_periods=1).std()
df[f'{target_col}_rolling_std_36'] = past_target.rolling(window=36, min_periods=1).std()
df[f'{target_col}_MA_3'] = past_target.rolling(window=3, min_periods=1).mean()
df[f'{target_col}_MA_12'] = past_target.rolling(window=12, min_periods=1).mean()

In [42]:
# Rolling-window statistics of the `lights` series.
# Each spec is (column suffix, window length in rows, Rolling method name);
# the list order is the order in which the columns are added. min_periods=1
# lets the first rows use a shorter window instead of producing NaN.
lights_roll_specs = [
    ('rolling_mean_6', 6, 'mean'),
    ('rolling_mean_18', 18, 'mean'),
    ('rolling_max_6', 6, 'max'),
    ('rolling_min_6', 6, 'min'),
    ('rolling_std_18', 18, 'std'),
    ('rolling_std_36', 36, 'std'),
    ('MA_3', 3, 'mean'),
    ('MA_12', 12, 'mean'),
]
for suffix, window, stat in lights_roll_specs:
    roller = df['lights'].rolling(window=window, min_periods=1)
    df[f'lights_{suffix}'] = getattr(roller, stat)()

In [43]:
# 6-row rolling statistics of the indoor-average and outdoor temperatures.
# Each spec is (source column, output prefix, statistics to compute).
temp_roll_specs = (
    ('T_indoor_avg', 'T_indoor', ('mean', 'max', 'min', 'std')),
    ('T_out', 'T_out', ('mean', 'std')),
)
for source_col, prefix, stats in temp_roll_specs:
    window_6 = df[source_col].rolling(window=6, min_periods=1)
    for stat in stats:
        df[f'{prefix}_rolling_{stat}_6'] = getattr(window_6, stat)()