In [1]:
import os
import numpy as np
import pandas as pd

%matplotlib widget
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style='darkgrid')

from tsmoothie.smoother import *
from datetime import datetime, timedelta
from scipy import signal

<b>Define a function to resample the power consumption from 15 minute to 1 hour</b>

<b>*Note: The reason that pandas resample not use is the dataset start record at xx:15
(e.g. 01:15 -> 01:45 This is what pandas will resample; However, we want to in hour which mean 01:15 -> 02:00

In [2]:
def resample_data(series, window=4, types='sum'):
    #Resample the data by giving window width size
    n = len(series) / window
    split_data = np.split(series, n)
    new_data = list()
    for split in split_data:
        if types == 'sum':
            new_data.append(split.sum())
        else:
            new_data.append(split.mean())
    return np.array(new_data)

<h1> Import dataset

In [3]:
#import data
ds_path = '../dataset/raw/'
filename = 'building1retail.csv'

dataframe = pd.read_csv(os.path.join(ds_path, filename), header=0, index_col='Timestamp', parse_dates=True, low_memory=False)
dataframe.head(5)

Unnamed: 0_level_0,OAT (F),Power (kW)
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01 01:15:00,51,165.1
2010-01-01 01:30:00,51,151.6
2010-01-01 01:45:00,51,146.9
2010-01-01 02:00:00,51,153.7
2010-01-01 02:15:00,51,153.8


<h1>Resample data

In [4]:
#create variables hold series values
origin_power = dataframe['Power (kW)'].values
origin_temp = dataframe['OAT (F)'].values

In [5]:
#Resample data
resampled_power = resample_data(origin_power, window=4, types='sum')
resampled_temp = resample_data(origin_temp, window=4, types='mean')

In [6]:
#Create new dataframe for resample data
dates = pd.date_range("2010-01-01", periods=len(resampled_power), freq='H')

data = {'Power':resampled_power, 'Temperature':resampled_temp}
resampled_df = pd.DataFrame(data=data, index=dates)

resampled_df.head(5)

Unnamed: 0,Power,Temperature
2010-01-01 00:00:00,617.3,51.0
2010-01-01 01:00:00,633.7,51.0
2010-01-01 02:00:00,617.0,50.0
2010-01-01 03:00:00,648.5,50.0
2010-01-01 04:00:00,637.1,50.0


<h1> Filling missing values

In [7]:
#Replace missing values by the days between missing value at the same time

index_missing_vals = resampled_df.loc[resampled_df['Power'] == 0].index

for index in index_missing_vals:
    previous_day = index.day - 1
    next_day = index.day + 1
    
    #Change dates
    previous_day_index = index.replace(day=previous_day)
    next_day_index = index.replace(day=next_day)
    
    #Retreive value
    preivous_day_val = resampled_df._get_value(previous_day_index, 'Power')
    next_day_val = resampled_df._get_value(next_day_index, 'Power')
    #Replace missing value
    average_val = (preivous_day_val + next_day_val) / 2
    resampled_df.loc[index, 'Power'] = average_val

In [8]:
#Plotting before and after missing value filling

plt.close()
resampled_df.Power.plot()
plt.title('Power consumption after filling missing values')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0.5, 1.0, 'Power consumption after filling missing values')

<b>As shown above, there are no missing value (zero) in the graph. However, there are still a spark in the graph which is outliers.</b>

<b>So, we will remove it by anomaly detection using Tsmoothie library: https://pypi.org/project/tsmoothie/.

<h1> Outlier detection

In [9]:
resampled_df = resampled_df[:8712] #363 Days, Due to last record is 2010-12-31, 00:00:00

In [10]:
power_vals = resampled_df.Power.values

In [11]:
#Smooth by Convolution algorithm
smoother = DecomposeSmoother(smooth_type='convolution', periods=24, window_len=24, window_type='ones')

smoother.smooth(power_vals)

#Fine a confident intervals to detect outliers
lower, upper = smoother.get_intervals('sigma_interval', n_sigma=3)

is_anomaly = np.logical_or(
            power_vals[:] > upper,
            power_vals[:] < lower
).flatten()

In [12]:
#For plotting
anomalies = list()
for ind, ano in enumerate(is_anomaly):
    if ano:
        anomalies.append(power_vals[ind])
    else:
        anomalies.append(np.nan)

In [13]:
#Plot outliers
plt.close()
plt.plot(smoother.smooth_data[0], label='Smoothed power consumption')
plt.plot(smoother.data[0], label="Origin power consumption")
plt.plot(anomalies, 'x', color='r', label='Outliers')
plt.fill_between(range(len(smoother.data[0])), lower[0], upper[0], alpha=0.3) #Fill a confident interval
plt.title('Outliers detection')
plt.legend()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.legend.Legend at 0x17de1642248>

In [44]:
resampled_df['Smooth'] = smoother.smooth_data[0]

<h1> Adding features

In [45]:
#Adding a features such as Day of Week and Holiday-Events (By: Mark 1 as holiday)
#Please notice that we mark a day before holiday as 1! By this, because the model have to learn know that before head to predict power of holiday 

#Day of week 0 to 6 is Mon to Sunday
resampled_df['Day_of_Week'] = resampled_df.index.dayofweek

#Events
resampled_df['Events'] = 0
event_dates = ['2010-04-03', '2010-06-13', '2010-11-24', '2010-12-24']

for event in event_dates:
    resampled_df.loc[event, 'Events'] = 1

resampled_df

Unnamed: 0,Power,Temperature,Smooth,Day_of_Week,Events
2010-01-01 00:00:00,627.3,67.0,537.965366,4,0
2010-01-01 01:00:00,624.8,63.0,524.488132,4,0
2010-01-01 02:00:00,704.0,64.0,620.592628,4,0
2010-01-01 03:00:00,768.5,63.0,692.804419,4,0
2010-01-01 04:00:00,728.2,62.0,702.827174,4,0
...,...,...,...,...,...
2010-12-29 19:00:00,1363.7,64.0,1533.936705,2,0
2010-12-29 20:00:00,1305.8,62.0,1476.069433,2,0
2010-12-29 21:00:00,1288.8,63.0,1331.001113,2,0
2010-12-29 22:00:00,890.0,62.0,913.536397,2,0


In [46]:
#Boxplot of power consumption 
plt.close()
sns.boxplot(data=resampled_df, x=resampled_df.Day_of_Week, y=resampled_df.Smooth)
plt.title('Power consumption by day of week')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0.5, 1.0, 'Power consumption by day of week')

<h1> Save data

In [47]:
resampled_df.index = resampled_df.index.rename('Datetime')

#Save the dataframe
save_path = '../dataset/clean/'
full_path = os.path.join(save_path, filename)
resampled_df.to_csv(full_path)