Import packages

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Global plotting defaults: seaborn grid style plus matplotlib figure and font settings.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 6),          # default figure size
    'font.size': 12,                    # default font size
    'font.family': 'Times New Roman',   # default font family
})

read data

In [6]:
# Path to the raw CSV; being relative, it is resolved against the current working directory.
data_path = '../data/energydata_complete_raw.csv'

# Load the whole file into the working DataFrame used by the following cells.
df = pd.read_csv(filepath_or_buffer=data_path)

Remove 'rv2' (it is identical to 'rv1')

In [7]:
# 'rv2' is a duplicate of 'rv1', so it is dropped.
# errors='raise' makes a missing 'rv2' column fail loudly instead of being ignored.
df = df.drop(labels=['rv2'], axis='columns', errors='raise')

Time feature extraction
1. hour_of_day
2. day_of_week
3. is_weekend
4. hour_sin
5. hour_cos
6. day_of_week_sin
7. day_of_week_cos

In [8]:
# Parse the 'date' strings (day-month-year hour:minute); unparseable values raise.
date_format = "%d-%m-%Y %H:%M"
df['date'] = pd.to_datetime(df['date'], format=date_format, errors='raise')

# Plain calendar features.
df['hour_of_day'] = df['date'].dt.hour
df['day_of_week'] = df['date'].dt.dayofweek     # Monday:0, Sunday:6
weekend_days = [5, 6]                           # Saturday and Sunday
df['is_weekend'] = df['day_of_week'].isin(weekend_days).astype(int)

# Cyclical encodings: map each periodic value onto the unit circle so that
# the end of a cycle (hour 23, Sunday) sits next to its start (hour 0, Monday).
hour_angle = 2 * np.pi * df['hour_of_day'] / 24
df['hour_sin'] = np.sin(hour_angle)
df['hour_cos'] = np.cos(hour_angle)

weekday_angle = 2 * np.pi * df['day_of_week'] / 7
df['day_of_week_sin'] = np.sin(weekday_angle)
df['day_of_week_cos'] = np.cos(weekday_angle)

Lag feature

In [9]:
# Lag features: each new column holds the value of a series from N rows earlier.
# The printed timestamps are 10 minutes apart, so one day corresponds to
# 24 * 60 / 10 = 144 rows (not half-hour intervals).
# NOTE(review): row-based lags assume the rows are sorted by 'date' with no gaps — confirm.
SAMPLES_PER_DAY = 24 * 60 // 10   # 144 ten-minute steps per day

df['Appliances_lag1'] = df['Appliances'].shift(1)                  # previous row (one step earlier)
df['Appliances_lag144'] = df['Appliances'].shift(SAMPLES_PER_DAY)  # lag of 1 day (144 ten-minute intervals)
df['T_out_lag1'] = df['T_out'].shift(1)                            # previous row (one step earlier)

Rolling statistical feature

In [10]:
# Mean of the six preceding 'Appliances' readings: the 6-row rolling mean is
# shifted down by one row so the current row's value is not part of its own feature.
appliances_window = df['Appliances'].rolling(window=6)
df['Appliances_rolling_mean_6'] = appliances_window.mean().shift(periods=1)

Data cleaning (drop the leading rows whose NaN values were introduced by the lag and rolling-window features)

In [11]:
# Data cleaning: the lag and rolling-window features leave NaN in the leading rows.
# Drop every row that contains at least one NaN, then renumber the index from 0.
df = df.dropna(axis=0, how='any')
df = df.reset_index(drop=True)

Data export

In [12]:
# Text preview of the first five rows of the cleaned frame.
print(df.head(n=5))

# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
df.to_csv(path_or_buf='../data/energydata_complete_cleaned.csv', index=False)

                 date  Appliances  lights         T1       RH_1         T2  \
0 2016-01-12 17:00:00          60       0  20.066667  42.833333  19.000000   
1 2016-01-12 17:10:00          60      10  20.000000  42.672500  19.000000   
2 2016-01-12 17:20:00         210      20  20.000000  42.530000  18.990000   
3 2016-01-12 17:30:00         380      20  20.033333  43.496667  18.902222   
4 2016-01-12 17:40:00         370      40  20.033333  42.963333  18.890000   

        RH_2         T3       RH_3     T4  ...  day_of_week  is_weekend  \
0  42.418182  19.790000  44.700000  19.26  ...            1           0   
1  42.433333  19.790000  44.663333  19.20  ...            1           0   
2  42.471818  19.790000  44.590000  19.20  ...            1           0   
3  42.580000  19.823333  44.590000  19.20  ...            1           0   
4  42.560000  19.890000  44.590000  19.36  ...            1           0   

   hour_sin  hour_cos  day_of_week_sin  day_of_week_cos  Appliances_lag1  \
0 -0