# Project Description

# Part 1: Data Preparation

In [2]:
import pandas as pd

df = pd.read_csv('KAG_energydata_complete.csv', index_col=0)
print(df.shape)
print(df.columns)
print(df.head(10))
df.info()

(19735, 28)
Index(['Appliances', 'lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3', 'T4',
       'RH_4', 'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8', 'RH_8', 'T9',
       'RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility',
       'Tdewpoint', 'rv1', 'rv2'],
      dtype='object')
                     Appliances  lights         T1       RH_1     T2  \
date                                                                   
2016-01-11 17:00:00          60      30  19.890000  47.596667  19.20   
2016-01-11 17:10:00          60      30  19.890000  46.693333  19.20   
2016-01-11 17:20:00          50      30  19.890000  46.300000  19.20   
2016-01-11 17:30:00          50      40  19.890000  46.066667  19.20   
2016-01-11 17:40:00          60      40  19.890000  46.333333  19.20   
2016-01-11 17:50:00          50      40  19.890000  46.026667  19.20   
2016-01-11 18:00:00          60      50  19.890000  45.766667  19.20   
2016-01-11 18:10:00          60      50  19.856667 

# Part 2: Feature Engineering

## 1. Time features

In [3]:
import numpy as np

In [4]:
df = df.copy()
df.index = pd.to_datetime(df.index)

In [5]:
# extract basic time features
df['hour'] = df.index.hour
df['day_of_week'] = df.index.dayofweek  
df['month'] = df.index.month
df['day_of_month'] = df.index.day
df['day_of_year'] = df.index.dayofyear

In [6]:
# weekend or not
df['is_weekend'] = (df.index.dayofweek >= 5).astype(int)

In [7]:
conditions = [
    (df['hour'] >= 6) & (df['hour'] < 12),
    (df['hour'] >= 12) & (df['hour'] < 18),
    (df['hour'] >= 18) & (df['hour'] < 24)
]

choices = [0, 1, 2]  # morning, afternoon, evening

df['time_period'] = np.select(conditions, choices, default=3) # night


In [8]:
# to encode periodic time features

"""
In order to capture periodic patterns, we apply cyclical encoding using sine and cosine transformations. 
This converts discrete time units into continuous circular coordinates.
It maintains the proximity between adjacent periods and eliminating artificial discontinuities at period boundaries.

"""

# hour
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

# week
df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

# month
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

# day of month
df['day_of_month_sin'] = np.sin(2 * np.pi * df['day_of_month'] / 30)
df['day_of_month_cos'] = np.cos(2 * np.pi * df['day_of_month'] / 30)

# day of year
df['day_of_year_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 366)
df['day_of_year_cos'] = np.cos(2 * np.pi * df['day_of_year'] / 366)

## 2. Temperature features

In [38]:
df["T_indoor_avg"] = df[df.filter(like='T').columns.drop('T6','T_out')].mean(axis=1) # average temperature of 8 rooms
df["T_indoor_std"] = df[df.filter(like='T').columns.drop('T6','T_out')].std(axis=1) # standard error of temperature of 8 rooms
df["T_indoor_max"] = df[df.filter(like='T').columns.drop('T6','T_out')].max(axis=1) # maximum temperature of 8 rooms
df["T_indoor_min"] = df[df.filter(like='T').columns.drop('T6','T_out')].min(axis=1) # minimum temperature of 8 rooms
df["T_indoor_range"] = df["T_indoor_max"] - df["T_indoor_min"] # the range of indoor temperature
df['T_indoor_change'] = df['T_indoor_avg'].diff() # change rate of indoor temperature

In [10]:
# the difference between indoor and outdoor temperatures
df["T_diff_1"] = df["T_indoor_avg"] - df["T6"]
df["T_diff_2"] = df["T_indoor_avg"] - df["T_out"]

## 3. Humidity features

In [None]:
df["RH_indoor_avg"] = df[df.filter(like='RH').columns.drop('RH_6','RH_out')].mean(axis=1) # average humidity of 8 rooms
df["RH_indoor_std"] = df[df.filter(like='RH').columns.drop('RH_6','RH_out')].std(axis=1) # standard error of humidity of 8 rooms
df["RH_indoor_max"] = df[df.filter(like='RH').columns.drop('RH_6','RH_out')].max(axis=1) # maximum humidity of 8 rooms
df["RH_indoor_min"] = df[df.filter(like='RH').columns.drop('RH_6','RH_out')].min(axis=1) # minimum humidity of 8 rooms
df["RH_indoor_range"] = df["RH_indoor_max"] - df["RH_indoor_min"] # the range of indoor humidity
df['RH_indoor_change'] = df['RH_indoor_avg'].diff() # change rate of indoor humidity

In [12]:
# the difference between indoor and outdoor humidity
df["RH_diff_1"] = df["RH_indoor_avg"] - df["RH_6"]
df["RH_diff_2"] = df["RH_indoor_avg"] - df["RH_out"]

In [None]:
pip install metpy

In [13]:
# comfort measures
import metpy.calc as mpcalc
from metpy.calc import heat_index
from metpy.units import units
temp = df["T_indoor_avg"].values * units.degC
humidity = df["RH_indoor_avg"].values * units.percent

# heat index
df["heat_index"] = heat_index(temp, humidity)

# dewpoint
df["dewpoint"] = mpcalc.dewpoint_from_relative_humidity(temp, humidity)

## 4. Weather measures

In [14]:
# weather comprehensive measure
df["weather_com"] = df["T_out"].values * df["RH_out"].values *0.01 * df["Windspeed"].values

In [15]:
# change rate
df['T_out_change'] = df['T_out'].diff()
df['Press_change'] = df['Press_mm_hg'].diff()
df['Windspeed_change'] = df['Windspeed'].diff()

## 5. Lag features

In [16]:
# appliance lag
target_col='Appliances'
df[f'{target_col}_lag1'] = df[target_col].shift(1)  
df[f'{target_col}_lag2'] = df[target_col].shift(2)  
df[f'{target_col}_lag3'] = df[target_col].shift(3)
df[f'{target_col}_lag6'] = df[target_col].shift(6)
df[f'{target_col}_lag144'] = df[target_col].shift(144)

In [17]:
# lights lag
df['lights_lag1'] = df['lights'].shift(1)  
df['lights_lag2'] = df['lights'].shift(2)  
df['lights_lag3'] = df['lights'].shift(3)
df['lights_lag4'] = df['lights'].shift(6)
df['lights_lag5'] = df['lights'].shift(144)

In [18]:
# weather lag
df['T_indoor_lag1'] = df['T_indoor_avg'].shift(1)
df['T_indoor_lag6'] = df['T_indoor_avg'].shift(6)

df['T_out_lag1'] = df['T_out'].shift(1)
df['T_out_lag6'] = df['T_out'].shift(6)

## 6. Rolling features

In [19]:
# appliance rolling
df[f'{target_col}_rolling_mean_6'] = df[target_col].rolling(window=6, min_periods=1).mean()
df[f'{target_col}_rolling_mean_18'] = df[target_col].rolling(window=18, min_periods=1).mean()
df[f'{target_col}_rolling_max_6'] = df[target_col].rolling(window=6, min_periods=1).max()
df[f'{target_col}_rolling_min_6'] = df[target_col].rolling(window=6, min_periods=1).min()
df[f'{target_col}_rolling_std_18'] = df[target_col].rolling(window=18, min_periods=1).std()
df[f'{target_col}_rolling_std_36'] = df[target_col].rolling(window=36, min_periods=1).std()
df[f'{target_col}_MA_3'] = df[target_col].rolling(window=3, min_periods=1).mean()
df[f'{target_col}_MA_12'] = df[target_col].rolling(window=12, min_periods=1).mean()  

In [20]:
# lights rolling
df['lights_rolling_mean_6'] = df['lights'].rolling(window=6, min_periods=1).mean()
df['lights_rolling_mean_18'] = df['lights'].rolling(window=18, min_periods=1).mean()
df['lights_rolling_max_6'] = df['lights'].rolling(window=6, min_periods=1).max()
df['lights_rolling_min_6'] = df['lights'].rolling(window=6, min_periods=1).min()
df['lights_rolling_std_18'] = df['lights'].rolling(window=18, min_periods=1).std()
df['lights_rolling_std_36'] = df['lights'].rolling(window=36, min_periods=1).std()
df['lights_MA_3'] = df['lights'].rolling(window=3, min_periods=1).mean()
df['lights_MA_12'] = df['lights'].rolling(window=12, min_periods=1).mean()  

In [21]:
# weather rolling
df['T_indoor_rolling_mean_6'] = df['T_indoor_avg'].rolling(window=6, min_periods=1).mean()
df['T_indoor_rolling_max_6'] = df['T_indoor_avg'].rolling(window=6, min_periods=1).max()
df['T_indoor_rolling_min_6'] = df['T_indoor_avg'].rolling(window=6, min_periods=1).min()
df['T_indoor_rolling_std_6'] = df['T_indoor_avg'].rolling(window=6, min_periods=1).std()

df['T_out_rolling_mean_6'] = df['T_out'].rolling(window=6, min_periods=1).mean()
df['T_out_rolling_std_6'] = df['T_out'].rolling(window=6, min_periods=1).std()

# Part 3: Feature Selection

In [None]:
# split train and test dataset
from sklearn.model_selection import train_test_split

X = df.drop(columns=["Appliances"]) 
y = df["Appliances"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [28]:
train_df = X_train.copy()
train_df["Appliances"] = y_train

In [35]:
# correlation-based feature selection
corr_matrix = train_df.corr()
corr_with_target = corr_matrix[target_col].drop(target_col, errors='ignore')

k = 60
top_k = corr_with_target.abs().sort_values(ascending=False)[:k].index

selected_target_corr = corr_with_target[top_k].sort_values(ascending=False)
for i, (feature, corr) in enumerate(selected_target_corr.items(), 1):
    sign = "+" if corr > 0 else "-"
    print(f"{i:2d}. {feature:35s}: {corr:7.4f} ({sign})")

 1. Appliances_MA_3                    :  0.8615 (+)
 2. Appliances_rolling_max_6           :  0.7603 (+)
 3. Appliances_lag1                    :  0.7568 (+)
 4. Appliances_rolling_mean_6          :  0.7287 (+)
 5. Appliances_MA_12                   :  0.6345 (+)
 6. Appliances_rolling_std_18          :  0.6001 (+)
 7. Appliances_rolling_mean_18         :  0.5783 (+)
 8. Appliances_rolling_min_6           :  0.5464 (+)
 9. Appliances_lag2                    :  0.5400 (+)
10. Appliances_rolling_std_36          :  0.4598 (+)
11. Appliances_lag3                    :  0.4391 (+)
12. Appliances_lag6                    :  0.3198 (+)
13. hour                               :  0.2260 (+)
14. lights_MA_3                        :  0.2251 (+)
15. lights_rolling_max_6               :  0.2246 (+)
16. lights                             :  0.2176 (+)
17. T_indoor_change                    :  0.2174 (+)
18. Appliances_lag144                  :  0.2137 (+)
19. lights_lag1                        :  0.20

1. The strong correlation between the target variable and its lagged and rolling statistics indicates autocorrelation, which is consistent with the persistent nature of residential appliance usage.  
2. "Lights" feature is a behavorial proxy rather than a main driver as its correlation score is not high.

In [32]:
# Bottom correlation features
all_correlations = corr_with_target.abs().sort_values(ascending=False)
print(all_correlations.tail(20))

T_indoor_std       0.028703
day_of_month       0.026763
day_of_week        0.024823
T_indoor_min       0.023654
T9                 0.022225
T_out_change       0.019856
Tdewpoint          0.016903
day_of_year_sin    0.013728
day_of_year        0.013239
dewpoint           0.012193
day_of_year_cos    0.011220
rv2                0.007158
rv1                0.007158
month_cos          0.007061
month              0.006134
Visibility         0.005451
RH_4               0.004894
is_weekend         0.002485
RH_5               0.002350
month_sin          0.001511
Name: Appliances, dtype: float64


AS the correlation results between rv1/rv2 and the target are approximately zero, we can delete these two random variables.

In [37]:
train_df = train_df.drop(columns=["rv1","rv2"])