In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Global plot styling applied to every figure drawn later in this notebook.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 6),         # default figure size
    'font.size': 12,                   # default font size
    'font.family': 'Times New Roman',  # default font family
})

In [34]:
# Location of the raw CSV, given relative to the notebook's working directory.
data_path = '../data/energydata_complete_raw.csv'

# Load the file with pandas' default parsing; the 'date' column is still text
# at this point and is converted to datetimes in the time-feature cell.
df = pd.read_csv(filepath_or_buffer=data_path)

Remove rv1 and rv2

In [35]:
# Discard 'rv1' and 'rv2'. errors='ignore' turns the drop into a no-op for any
# label that is already absent, so re-running this cell does not raise.
df = df.drop(['rv1', 'rv2'], axis='columns', errors='ignore')

Time feature extraction
1. hour_of_day
2. day_of_week
3. is_weekend
4. hour_sin
5. hour_cos
6. day_of_week_sin
7. day_of_week_cos

In [36]:
# Parse the timestamp strings (day-month-year hour:minute). errors='raise'
# aborts on the first value that does not match the format instead of
# silently producing NaT.
date_format = "%d-%m-%Y %H:%M"
df['date'] = pd.to_datetime(df['date'], format=date_format, errors='raise')

# Plain calendar features read off the parsed timestamps.
timestamps = df['date'].dt
df['hour_of_day'] = timestamps.hour
df['day_of_week'] = timestamps.dayofweek     # Monday:0, Sunday:6
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)  # Saturday/Sunday -> 1, else 0

# Cyclical encodings: map each periodic feature onto the unit circle so the
# end of a cycle (hour 23, Sunday) sits next to its start (hour 0, Monday).
# Each entry is (output-column prefix, source column, cycle length).
for prefix, source_col, period in (
    ('hour', 'hour_of_day', 24),
    ('day_of_week', 'day_of_week', 7),
):
    angle = 2 * np.pi * df[source_col] / period
    df[f'{prefix}_sin'] = np.sin(angle)
    df[f'{prefix}_cos'] = np.cos(angle)

Lag features

In [None]:
# One-step lags: every row receives the value recorded on the previous row.
# The first row has no predecessor, so its lag values are NaN.
df['Appliances_lag1'] = df['Appliances'].shift(periods=1)
# df['Appliances_lag144'] = df['Appliances'].shift(144) # lag of 1 day (144 ten-minute intervals)
df['T_out_lag1'] = df['T_out'].shift(periods=1)

Rolling statistical feature

In [38]:
# Mean of 'Appliances' over a 6-row window, then moved down one row so each
# value summarises only the six rows *before* the current one (the current
# reading is excluded). Rows without six full predecessors are NaN.
appliances_window_mean = df['Appliances'].rolling(window=6).mean()
df['Appliances_rolling_mean_6'] = appliances_window_mean.shift(1)

Data export

In [39]:
# Destination for the feature-engineered table, relative to the notebook's
# working directory.
output_path = '../data/energydata_complete_cleaned.csv'

# Preview the first five rows as plain text before writing.
print(df.head(n=5))

# index=False keeps the positional row index out of the CSV.
df.to_csv(output_path, index=False)

                 date  Appliances  lights     T1       RH_1    T2       RH_2  \
0 2016-01-11 17:00:00          60      30  19.89  47.596667  19.2  44.790000   
1 2016-01-11 17:10:00          60      30  19.89  46.693333  19.2  44.722500   
2 2016-01-11 17:20:00          50      30  19.89  46.300000  19.2  44.626667   
3 2016-01-11 17:30:00          50      40  19.89  46.066667  19.2  44.590000   
4 2016-01-11 17:40:00          60      40  19.89  46.333333  19.2  44.530000   

      T3       RH_3         T4  ...  day_of_week  is_weekend  hour_sin  \
0  19.79  44.730000  19.000000  ...            0           0 -0.965926   
1  19.79  44.790000  19.000000  ...            0           0 -0.965926   
2  19.79  44.933333  18.926667  ...            0           0 -0.965926   
3  19.79  45.000000  18.890000  ...            0           0 -0.965926   
4  19.79  45.000000  18.890000  ...            0           0 -0.965926   

   hour_cos  day_of_week_sin  day_of_week_cos  Appliances_lag1  \
0 -0.258