# 06.01 - Features - Scoping

## Imports & setup

In [1]:
import pathlib
from datetime import datetime
import math
import sys

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates

from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from skoot.feature_selection import FeatureFilter
from skoot.preprocessing import SelectiveRobustScaler

sys.path.append("..")
from src.models.models import SetTempAsPower, SK_Prophet
from src.utils.utils import bound_precision, AnnualTimeSeriesSplit
from src.visualization.visualize import (plot_prediction,
                                         plot_joint_plot,
                                         residual_plots,
                                         print_residual_stats)

%matplotlib inline

# Widen pandas' display limits so the wide feature tables print without truncation.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# The project root is the parent of the current working directory; the cleaned
# input data lives under <root>/data/05-clean.
PROJECT_DIR = pathlib.Path.cwd().parent.resolve()
CLEAN_DATA_DIR = PROJECT_DIR.joinpath('data', '05-clean')

## Load Hourly Data

In [2]:
# Explicit dtypes for the cleaned hourly file. The builtin `object` replaces the
# `np.object` alias, which NumPy deprecated in 1.20 and removed in 1.24.
dtypes = {'temp': np.float64, 'dew_point_temp': np.float64, 'rel_hum': np.float64,
          'wind_speed': np.float64, 'visibility': np.float64, 'press': np.float64,
          'hmdx': np.float64, 'wind_chill': np.float64, 'weather': object,
          'hour_of_day': np.float64, 'year': np.float64, 'month': np.float64,
          'day_of_week': np.float64, 'day_of_year': np.float64, 'week_of_year': np.float64,
          'quarter': np.float64, 'stat_hol': 'category', 'day_light_hours': 'category',
          'hourly_demand': np.float64, 'daily_peak': np.float64}

# The first CSV column is parsed as the datetime index. `infer_datetime_format`
# is no longer passed: it is deprecated and has no effect from pandas 2.0.
df_orig = pd.read_csv(CLEAN_DATA_DIR / "clean-cut.csv", parse_dates=True,
                      dtype=dtypes, index_col=0)

# Labels for the numeric day-of-week codes (0.0 = Monday ... 6.0 = Sunday).
daymapper = {0.0: 'Mon', 1.0: 'Tue', 2.0: 'Wed', 3.0: 'Thu', 4.0: 'Fri', 5.0: 'Sat', 6.0: 'Sun'}
weekdaymapper = {0.0: 'Weekday', 1.0: 'Weekday', 2.0: 'Weekday', 3.0: 'Weekday', 4.0: 'Weekday',
                 5.0: 'Weekend', 6.0: 'Weekend'}
df_orig['dayofweek'] = df_orig['day_of_week'].map(daymapper)
df_orig['daytype'] = df_orig['day_of_week'].map(weekdaymapper)

# The flag columns are read as the strings 'True'/'False'; recode them to 1/0.
df_orig['stat_hol'] = df_orig['stat_hol'].map({'True':1, 'False':0})
df_orig['day_light_hours'] = df_orig['day_light_hours'].map({'True':1, 'False':0})

# Hour-to-hour change in the daylight flag: +1 where daylight starts, -1 where
# it ends, 0 otherwise. The first row has no predecessor and is set to 0.
df_orig['sun'] = df_orig['day_light_hours'].diff().fillna(0)

In [3]:
# Keep only the rows from 1994 through 2018 (label slice on the datetime index,
# both end years included).
df_orig = df_orig.loc[slice('1994', '2018')]
df_orig.head()

Unnamed: 0,temp,dew_point_temp,rel_hum,wind_speed,visibility,press,hmdx,wind_chill,weather,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,stat_hol,day_light_hours,hourly_demand,daily_peak,dayofweek,daytype,sun
1994-01-01 00:00:00,-1.2,-3.8,83.0,15.0,19.3,99.91,,-6.0,Cloudy,0.0,1994.0,1.0,5.0,1.0,52.0,1.0,1,0,14422.0,16892.0,Sat,Weekend,0.0
1994-01-01 01:00:00,-0.9,-3.0,86.0,20.0,16.1,99.91,,-6.0,Cloudy,1.0,1994.0,1.0,5.0,1.0,52.0,1.0,1,0,13845.0,16892.0,Sat,Weekend,0.0
1994-01-01 02:00:00,-0.7,-3.2,83.0,15.0,16.1,99.87,,-5.0,Cloudy,2.0,1994.0,1.0,5.0,1.0,52.0,1.0,1,0,13372.0,16892.0,Sat,Weekend,0.0
1994-01-01 03:00:00,-0.8,-2.4,89.0,15.0,12.9,99.81,,-5.0,Cloudy,3.0,1994.0,1.0,5.0,1.0,52.0,1.0,1,0,13025.0,16892.0,Sat,Weekend,0.0
1994-01-01 04:00:00,-1.0,-3.3,84.0,19.0,16.1,99.77,,-6.0,Mostly Cloudy,4.0,1994.0,1.0,5.0,1.0,52.0,1.0,1,0,12869.0,16892.0,Sat,Weekend,0.0


## Convert Hourly Data to Daily Features

The project objective is to identify peak demand days, but the data we have is hourly.

Therefore, we need to resample the data to a daily frequency.

However, we do not want to lose data within a day which may impact peak demand. For example, we may want to keep the minimum daily temperature and the maximum daily temperature for each day, or the number of hours where the temperature was higher than 25C. These are factors that could be potential drivers for peak demand.

So, our strategy will be to create these features prior to resampling down to daily data

The new feature set will be daily values for:
+ Temperature - minimum, maximum, mean, median, earliest hour with highest, earliest hour with lowest
+ Dew Point Temperature - minimum, maximum, mean, median, earliest hour with highest/lowest
+ Wind Speed - minimum, maximum, mean, median, earliest hour with highest/lowest
+ Relative Humidity - minimum, maximum, mean, median, earliest hour with highest/lowest
+ Visibility - minimum, maximum, mean, median, earliest hour with highest/lowest
+ Pressure - minimum, maximum, mean, median, earliest hour with highest/lowest
+ Humidex - minimum, maximum, mean, median, earliest hour with highest/lowest (some of the humidex values are missing, but these can be calculated - see [Humidex Calculation](https://en.wikipedia.org/wiki/Humidex))
+ Day of Week
+ Light Change

### Test Humidex Calculation against Collected Data

In [4]:
# Work on an independent copy so df_orig is left untouched.
df = df_orig.copy(deep=True)

# Recompute humidex from air temperature and dew point:
#   hmdxx = temp + 5/9 * (6.11 * e^(5417.7530 * (1/273.16 - 1/(273.15 + dew_point))) - 10)
dew_point_kelvin = 273.15 + df['dew_point_temp']
vapour_term = 6.11 * math.e ** (5417.7530 * (1/273.16 - 1/dew_point_kelvin))
df['hmdxx'] = df['temp'] + (5/9) * (vapour_term - 10)

# Show recorded (hmdx) and recomputed (hmdxx) values side by side for one day.
df.loc['1997-07-01'][['hmdx', 'hmdxx']].head(15)

INFO:numexpr.utils:NumExpr defaulting to 4 threads.


Unnamed: 0,hmdx,hmdxx
1997-07-01 00:00:00,27.0,27.209337
1997-07-01 01:00:00,27.0,26.979867
1997-07-01 02:00:00,26.0,26.298556
1997-07-01 03:00:00,27.0,26.831899
1997-07-01 04:00:00,27.0,26.880342
1997-07-01 05:00:00,27.0,27.410368
1997-07-01 06:00:00,28.0,28.323219
1997-07-01 07:00:00,28.0,28.058639
1997-07-01 08:00:00,29.0,28.57968
1997-07-01 09:00:00,29.0,29.131899


### Create Min, Max, Mean, Median Daily Features

In [5]:
features = ['temp', 'dew_point_temp', 'rel_hum', 'visibility', 'press', 'hmdxx']
daily_stats = ['min', 'max', 'mean', 'median']

# One grouping key per row: the calendar day of the datetime index.
# NOTE(review): the previous version grouped the statistics by the 'year' and
# 'day_of_year' columns; this is equivalent provided those columns agree with
# the index - confirm against the cleaning notebook.
day_key = df.index.normalize()

keeper_cols = []
for feature in features:
    by_day = df[feature].groupby(day_key)

    # Daily statistics broadcast back onto every hourly row of the same day.
    # Assigning by column name keeps the cell idempotent: re-running it
    # overwrites the columns instead of joining duplicates onto the frame.
    for stat in daily_stats:
        df[feature + '_' + stat] = by_day.transform(stat)
        keeper_cols.append(feature + '_' + stat)

    # Hour of the first daily maximum / minimum. idxmax/idxmin are evaluated on
    # the feature column only, and the result is aligned back through the day
    # key, so days with fewer or more than 24 rows are handled correctly
    # (the earlier np.repeat(..., 24) silently assumed complete days).
    for suffix, extreme_at in (('_max_hour', by_day.idxmax()),
                               ('_min_hour', by_day.idxmin())):
        extreme_at = extreme_at.dropna()  # a day with no valid reading has no extreme
        hour_by_day = pd.Series(df.loc[extreme_at.values, 'hour_of_day'].values,
                                index=extreme_at.index)
        df[feature + suffix] = hour_by_day.reindex(day_key).values
        keeper_cols.append(feature + suffix)

df.head(30)

Unnamed: 0,temp,dew_point_temp,rel_hum,wind_speed,visibility,press,hmdx,wind_chill,weather,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,stat_hol,day_light_hours,hourly_demand,daily_peak,dayofweek,daytype,sun,hmdxx,temp_min,temp_max,temp_mean,temp_median,temp_max_hour,temp_min_hour,dew_point_temp_min,dew_point_temp_max,dew_point_temp_mean,dew_point_temp_median,dew_point_temp_max_hour,dew_point_temp_min_hour,rel_hum_min,rel_hum_max,rel_hum_mean,rel_hum_median,rel_hum_max_hour,rel_hum_min_hour,visibility_min,visibility_max,visibility_mean,visibility_median,visibility_max_hour,visibility_min_hour,press_min,press_max,press_mean,press_median,press_max_hour,press_min_hour,hmdxx_min,hmdxx_max,hmdxx_mean,hmdxx_median,hmdxx_max_hour,hmdxx_min_hour
1994-01-01 00:00:00,-1.2,-3.8,83.0,15.0,19.3,99.91,,-6.0,Cloudy,0.0,1994.0,1.0,5.0,1.0,52.0,1.0,1,0,14422.0,16892.0,Sat,Weekend,0.0,-4.1915,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,19.0,7.0,73.0,99.0,87.0,86.5,14.0,9.0,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,0.0,16.0,-4.976663,0.686009,-1.8849,-1.513575,13.0,7.0
1994-01-01 01:00:00,-0.9,-3.0,86.0,20.0,16.1,99.91,,-6.0,Cloudy,1.0,1994.0,1.0,5.0,1.0,52.0,1.0,1,0,13845.0,16892.0,Sat,Weekend,0.0,-3.734133,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,19.0,7.0,73.0,99.0,87.0,86.5,14.0,9.0,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,0.0,16.0,-4.976663,0.686009,-1.8849,-1.513575,13.0,7.0
1994-01-01 02:00:00,-0.7,-3.2,83.0,15.0,16.1,99.87,,-5.0,Cloudy,2.0,1994.0,1.0,5.0,1.0,52.0,1.0,1,0,13372.0,16892.0,Sat,Weekend,0.0,-3.574269,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,19.0,7.0,73.0,99.0,87.0,86.5,14.0,9.0,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,0.0,16.0,-4.976663,0.686009,-1.8849,-1.513575,13.0,7.0
1994-01-01 03:00:00,-0.8,-2.4,89.0,15.0,12.9,99.81,,-5.0,Cloudy,3.0,1994.0,1.0,5.0,1.0,52.0,1.0,1,0,13025.0,16892.0,Sat,Weekend,0.0,-3.510459,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,19.0,7.0,73.0,99.0,87.0,86.5,14.0,9.0,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,0.0,16.0,-4.976663,0.686009,-1.8849,-1.513575,13.0,7.0
1994-01-01 04:00:00,-1.0,-3.3,84.0,19.0,16.1,99.77,,-6.0,Mostly Cloudy,4.0,1994.0,1.0,5.0,1.0,52.0,1.0,1,0,12869.0,16892.0,Sat,Weekend,0.0,-3.894137,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,19.0,7.0,73.0,99.0,87.0,86.5,14.0,9.0,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,0.0,16.0,-4.976663,0.686009,-1.8849,-1.513575,13.0,7.0
1994-01-01 05:00:00,-1.8,-3.6,87.0,11.0,16.1,99.67,,-6.0,Cloudy,5.0,1994.0,1.0,5.0,1.0,52.0,1.0,1,0,12866.0,16892.0,Sat,Weekend,0.0,-4.752946,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,19.0,7.0,73.0,99.0,87.0,86.5,14.0,9.0,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,0.0,16.0,-4.976663,0.686009,-1.8849,-1.513575,13.0,7.0
1994-01-01 06:00:00,-1.7,-3.9,85.0,15.0,19.3,99.57,,-7.0,Cloudy,6.0,1994.0,1.0,5.0,1.0,52.0,1.0,1,0,13015.0,16892.0,Sat,Weekend,0.0,-4.710583,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,19.0,7.0,73.0,99.0,87.0,86.5,14.0,9.0,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,0.0,16.0,-4.976663,0.686009,-1.8849,-1.513575,13.0,7.0
1994-01-01 07:00:00,-1.8,-4.8,80.0,15.0,16.1,99.37,,-7.0,Mostly Cloudy,7.0,1994.0,1.0,5.0,1.0,52.0,1.0,1,0,13577.0,16892.0,Sat,Weekend,0.0,-4.976663,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,19.0,7.0,73.0,99.0,87.0,86.5,14.0,9.0,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,0.0,16.0,-4.976663,0.686009,-1.8849,-1.513575,13.0,7.0
1994-01-01 08:00:00,-0.5,-3.8,78.0,17.0,25.0,99.31,,-5.0,Mostly Cloudy,8.0,1994.0,1.0,5.0,1.0,52.0,1.0,1,1,13567.0,16892.0,Sat,Weekend,1.0,-3.4915,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,19.0,7.0,73.0,99.0,87.0,86.5,14.0,9.0,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,0.0,16.0,-4.976663,0.686009,-1.8849,-1.513575,13.0,7.0
1994-01-01 09:00:00,0.5,-3.7,73.0,24.0,25.0,99.14,,,Mostly Cloudy,9.0,1994.0,1.0,5.0,1.0,52.0,1.0,1,1,14130.0,16892.0,Sat,Weekend,0.0,-2.472288,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,19.0,7.0,73.0,99.0,87.0,86.5,14.0,9.0,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,0.0,16.0,-4.976663,0.686009,-1.8849,-1.513575,13.0,7.0


In [6]:
features = ['sun']

# One grouping key per row: the calendar day of the datetime index.
day_key = df.index.normalize()

for feature in features:
    by_day = df[feature].groupby(day_key)

    # 'sun' is the hourly change in the daylight flag, so the first daily
    # minimum marks the hour daylight ends and the first daily maximum the hour
    # it starts. Results are aligned back through the day key rather than with
    # np.repeat(..., 24), so incomplete days cannot shift the values onto the
    # wrong rows, and idxmin/idxmax only look at the feature column.
    for suffix, event_at in (('_set', by_day.idxmin()),
                             ('_rise', by_day.idxmax())):
        event_at = event_at.dropna()
        hour_by_day = pd.Series(df.loc[event_at.values, 'hour_of_day'].values,
                                index=event_at.index)
        df[feature + suffix] = hour_by_day.reindex(day_key).values
        keeper_cols.append(feature + suffix)
df.head()

Unnamed: 0,temp,dew_point_temp,rel_hum,wind_speed,visibility,press,hmdx,wind_chill,weather,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,stat_hol,day_light_hours,hourly_demand,daily_peak,dayofweek,daytype,sun,hmdxx,temp_min,temp_max,temp_mean,temp_median,temp_max_hour,temp_min_hour,dew_point_temp_min,dew_point_temp_max,dew_point_temp_mean,dew_point_temp_median,dew_point_temp_max_hour,dew_point_temp_min_hour,rel_hum_min,rel_hum_max,rel_hum_mean,rel_hum_median,rel_hum_max_hour,rel_hum_min_hour,visibility_min,visibility_max,visibility_mean,visibility_median,visibility_max_hour,visibility_min_hour,press_min,press_max,press_mean,press_median,press_max_hour,press_min_hour,hmdxx_min,hmdxx_max,hmdxx_mean,hmdxx_median,hmdxx_max_hour,hmdxx_min_hour,sun_set,sun_rise
1994-01-01 00:00:00,-1.2,-3.8,83.0,15.0,19.3,99.91,,-6.0,Cloudy,0.0,1994.0,1.0,5.0,1.0,52.0,1.0,1,0,14422.0,16892.0,Sat,Weekend,0.0,-4.1915,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,19.0,7.0,73.0,99.0,87.0,86.5,14.0,9.0,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,0.0,16.0,-4.976663,0.686009,-1.8849,-1.513575,13.0,7.0,17.0,8.0
1994-01-01 01:00:00,-0.9,-3.0,86.0,20.0,16.1,99.91,,-6.0,Cloudy,1.0,1994.0,1.0,5.0,1.0,52.0,1.0,1,0,13845.0,16892.0,Sat,Weekend,0.0,-3.734133,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,19.0,7.0,73.0,99.0,87.0,86.5,14.0,9.0,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,0.0,16.0,-4.976663,0.686009,-1.8849,-1.513575,13.0,7.0,17.0,8.0
1994-01-01 02:00:00,-0.7,-3.2,83.0,15.0,16.1,99.87,,-5.0,Cloudy,2.0,1994.0,1.0,5.0,1.0,52.0,1.0,1,0,13372.0,16892.0,Sat,Weekend,0.0,-3.574269,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,19.0,7.0,73.0,99.0,87.0,86.5,14.0,9.0,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,0.0,16.0,-4.976663,0.686009,-1.8849,-1.513575,13.0,7.0,17.0,8.0
1994-01-01 03:00:00,-0.8,-2.4,89.0,15.0,12.9,99.81,,-5.0,Cloudy,3.0,1994.0,1.0,5.0,1.0,52.0,1.0,1,0,13025.0,16892.0,Sat,Weekend,0.0,-3.510459,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,19.0,7.0,73.0,99.0,87.0,86.5,14.0,9.0,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,0.0,16.0,-4.976663,0.686009,-1.8849,-1.513575,13.0,7.0,17.0,8.0
1994-01-01 04:00:00,-1.0,-3.3,84.0,19.0,16.1,99.77,,-6.0,Mostly Cloudy,4.0,1994.0,1.0,5.0,1.0,52.0,1.0,1,0,12869.0,16892.0,Sat,Weekend,0.0,-3.894137,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,19.0,7.0,73.0,99.0,87.0,86.5,14.0,9.0,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,0.0,16.0,-4.976663,0.686009,-1.8849,-1.513575,13.0,7.0,17.0,8.0


### Resample from Hourly to Daily Data

In [7]:
# Collapse the hourly frame to one row per calendar day, taking the last entry
# of each column within the day. The engineered daily columns are constant
# within a day, so their values carry over unchanged.
df_daily = df.resample('D').last()
df_daily.head()

Unnamed: 0,temp,dew_point_temp,rel_hum,wind_speed,visibility,press,hmdx,wind_chill,weather,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,stat_hol,day_light_hours,hourly_demand,daily_peak,dayofweek,daytype,sun,hmdxx,temp_min,temp_max,temp_mean,temp_median,temp_max_hour,temp_min_hour,dew_point_temp_min,dew_point_temp_max,dew_point_temp_mean,dew_point_temp_median,dew_point_temp_max_hour,dew_point_temp_min_hour,rel_hum_min,rel_hum_max,rel_hum_mean,rel_hum_median,rel_hum_max_hour,rel_hum_min_hour,visibility_min,visibility_max,visibility_mean,visibility_median,visibility_max_hour,visibility_min_hour,press_min,press_max,press_mean,press_median,press_max_hour,press_min_hour,hmdxx_min,hmdxx_max,hmdxx_mean,hmdxx_median,hmdxx_max_hour,hmdxx_min_hour,sun_set,sun_rise
1994-01-01,1.8,1.1,95.0,15.0,16.1,98.74,,-5.0,Rain,23.0,1994.0,1.0,5.0,1.0,52.0,1.0,1,0,14152.0,16892.0,Sat,Weekend,0.0,-0.082704,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,19.0,7.0,73.0,99.0,87.0,86.5,14.0,9.0,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,0.0,16.0,-4.976663,0.686009,-1.8849,-1.513575,13.0,7.0,17.0,8.0
1994-01-02,-14.3,-20.3,60.0,13.0,25.0,100.51,,-21.0,Mainly Clear,23.0,1994.0,1.0,6.0,2.0,52.0,1.0,0,0,16527.0,18947.0,Sun,Weekend,0.0,-19.165499,-14.3,1.7,-8.704167,-9.9,0.0,23.0,-20.3,0.5,-12.445833,-13.9,0.0,23.0,60.0,92.0,74.416667,75.5,0.0,23.0,6.4,40.2,28.3875,25.0,9.0,6.0,98.77,100.51,99.92,100.125,18.0,0.0,-19.165499,-0.338394,-12.827363,-14.393998,0.0,23.0,17.0,8.0
1994-01-03,-11.0,-15.0,72.0,19.0,24.1,99.41,,-19.0,Cloudy,23.0,1994.0,1.0,0.0,3.0,1.0,1.0,0,0,17550.0,21923.0,Mon,Weekday,0.0,-15.484201,-16.3,-10.3,-12.6125,-12.0,17.0,2.0,-23.2,-12.6,-15.820833,-15.0,17.0,2.0,55.0,91.0,77.25,80.5,9.0,2.0,1.2,25.0,10.404167,8.0,0.0,9.0,99.41,100.47,100.057083,100.155,0.0,23.0,-21.317386,-14.555717,-17.143932,-16.423198,17.0,2.0,17.0,8.0
1994-01-04,-8.8,-12.9,72.0,13.0,25.0,98.64,,-15.0,Mostly Cloudy,23.0,1994.0,1.0,1.0,4.0,1.0,1.0,0,0,17395.0,21457.0,Tue,Weekday,0.0,-13.086503,-11.5,-7.4,-9.833333,-9.95,15.0,8.0,-15.0,-11.5,-13.570833,-13.7,13.0,0.0,57.0,84.0,74.416667,76.0,10.0,16.0,9.7,40.2,24.879167,24.1,13.0,10.0,98.27,99.34,98.675417,98.61,0.0,14.0,-15.874948,-11.70666,-14.183286,-14.310177,15.0,8.0,17.0,8.0
1994-01-05,-17.4,-20.1,79.0,7.0,25.0,100.37,,-23.0,Mainly Clear,23.0,1994.0,1.0,2.0,5.0,1.0,1.0,0,0,18485.0,22082.0,Wed,Weekday,0.0,-22.253714,-17.5,-7.2,-11.866667,-11.75,5.0,22.0,-20.8,-10.7,-16.154167,-17.0,4.0,22.0,56.0,84.0,70.791667,74.0,4.0,14.0,1.0,40.2,27.033333,25.0,9.0,7.0,98.44,100.37,99.352083,99.375,22.0,3.0,-22.394182,-11.256395,-16.407695,-16.464188,5.0,22.0,17.0,8.0


### Create "Day Before" Features

In [8]:
# Add the previous day's median humidex as a feature, then drop the rows that
# have no previous-day value.
lag_col = 'hmdxx_median-1'
df_shifted = df_daily.copy(deep=True)
df_shifted[lag_col] = df_shifted['hmdxx_median'].shift(1)
keeper_cols.append(lag_col)
df_shifted = df_shifted.dropna(subset=[lag_col], axis=0)
df_shifted.head()

Unnamed: 0,temp,dew_point_temp,rel_hum,wind_speed,visibility,press,hmdx,wind_chill,weather,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,stat_hol,day_light_hours,hourly_demand,daily_peak,dayofweek,daytype,sun,hmdxx,temp_min,temp_max,temp_mean,temp_median,temp_max_hour,temp_min_hour,dew_point_temp_min,dew_point_temp_max,dew_point_temp_mean,dew_point_temp_median,dew_point_temp_max_hour,dew_point_temp_min_hour,rel_hum_min,rel_hum_max,rel_hum_mean,rel_hum_median,rel_hum_max_hour,rel_hum_min_hour,visibility_min,visibility_max,visibility_mean,visibility_median,visibility_max_hour,visibility_min_hour,press_min,press_max,press_mean,press_median,press_max_hour,press_min_hour,hmdxx_min,hmdxx_max,hmdxx_mean,hmdxx_median,hmdxx_max_hour,hmdxx_min_hour,sun_set,sun_rise,hmdxx_median-1
1994-01-02,-14.3,-20.3,60.0,13.0,25.0,100.51,,-21.0,Mainly Clear,23.0,1994.0,1.0,6.0,2.0,52.0,1.0,0,0,16527.0,18947.0,Sun,Weekend,0.0,-19.165499,-14.3,1.7,-8.704167,-9.9,0.0,23.0,-20.3,0.5,-12.445833,-13.9,0.0,23.0,60.0,92.0,74.416667,75.5,0.0,23.0,6.4,40.2,28.3875,25.0,9.0,6.0,98.77,100.51,99.92,100.125,18.0,0.0,-19.165499,-0.338394,-12.827363,-14.393998,0.0,23.0,17.0,8.0,-1.513575
1994-01-03,-11.0,-15.0,72.0,19.0,24.1,99.41,,-19.0,Cloudy,23.0,1994.0,1.0,0.0,3.0,1.0,1.0,0,0,17550.0,21923.0,Mon,Weekday,0.0,-15.484201,-16.3,-10.3,-12.6125,-12.0,17.0,2.0,-23.2,-12.6,-15.820833,-15.0,17.0,2.0,55.0,91.0,77.25,80.5,9.0,2.0,1.2,25.0,10.404167,8.0,0.0,9.0,99.41,100.47,100.057083,100.155,0.0,23.0,-21.317386,-14.555717,-17.143932,-16.423198,17.0,2.0,17.0,8.0,-14.393998
1994-01-04,-8.8,-12.9,72.0,13.0,25.0,98.64,,-15.0,Mostly Cloudy,23.0,1994.0,1.0,1.0,4.0,1.0,1.0,0,0,17395.0,21457.0,Tue,Weekday,0.0,-13.086503,-11.5,-7.4,-9.833333,-9.95,15.0,8.0,-15.0,-11.5,-13.570833,-13.7,13.0,0.0,57.0,84.0,74.416667,76.0,10.0,16.0,9.7,40.2,24.879167,24.1,13.0,10.0,98.27,99.34,98.675417,98.61,0.0,14.0,-15.874948,-11.70666,-14.183286,-14.310177,15.0,8.0,17.0,8.0,-16.423198
1994-01-05,-17.4,-20.1,79.0,7.0,25.0,100.37,,-23.0,Mainly Clear,23.0,1994.0,1.0,2.0,5.0,1.0,1.0,0,0,18485.0,22082.0,Wed,Weekday,0.0,-22.253714,-17.5,-7.2,-11.866667,-11.75,5.0,22.0,-20.8,-10.7,-16.154167,-17.0,4.0,22.0,56.0,84.0,70.791667,74.0,4.0,14.0,1.0,40.2,27.033333,25.0,9.0,7.0,98.44,100.37,99.352083,99.375,22.0,3.0,-22.394182,-11.256395,-16.407695,-16.464188,5.0,22.0,17.0,8.0,-14.310177
1994-01-06,-16.0,-18.4,82.0,22.0,4.0,99.77,,-26.0,Snow,23.0,1994.0,1.0,3.0,6.0,1.0,1.0,0,0,18796.0,22948.0,Thu,Weekday,0.0,-20.745923,-18.2,-14.0,-16.0,-15.75,13.0,4.0,-22.1,-16.2,-18.533333,-17.6,14.0,5.0,70.0,88.0,80.875,83.0,19.0,5.0,1.0,25.0,9.858333,3.2,0.0,16.0,99.71,100.41,100.021667,100.025,2.0,14.0,-23.127172,-18.584225,-20.744823,-20.476987,14.0,4.0,17.0,8.0,-16.464188


## When is Peak Season?

In [9]:
## Grab the first 4 Years and look for the Summer peaks
n_peaks = 10
# Get the first 4 years
df_first = df_orig.loc['1994':'1997']
# Get May to October
df_first_summers = df_first[df_first['month'].isin(list(range(5, 11)))]
# Get the annual peaks for each summer
for year in df_first_summers.index.year.unique():
    print(year)
    # Select the year through .loc: string-year row selection via plain []
    # indexing was removed in pandas 2.0.
    summer = df_first_summers.loc[str(year)]
    # Only 'daily_peak' is needed, so aggregate that column alone instead of
    # taking the max of every column in the frame.
    daily_peaks = (summer['daily_peak'].resample('D').max()
                   .sort_values(ascending=False).head(n_peaks))
    for peak_indx, peak in zip(daily_peaks.index, daily_peaks):
        # Build the mask from the same one-day frame it filters. The previous
        # mask came from the full frame and triggered pandas'
        # "Boolean Series key will be reindexed" warning.
        peak_day = summer.loc[peak_indx.strftime('%Y-%m-%d')]
        print(peak_day.loc[peak_day['hourly_demand'] == peak, ['hourly_demand', 'dayofweek']])
    print('\n')
    


1994



Boolean Series key will be reindexed to match DataFrame index.



                     hourly_demand dayofweek
1994-06-17 13:00:00        20918.0       Fri
                     hourly_demand dayofweek
1994-06-16 15:00:00        20468.0       Thu
                     hourly_demand dayofweek
1994-07-08 13:00:00        20239.0       Fri
                     hourly_demand dayofweek
1994-07-06 15:00:00        20196.0       Wed
                     hourly_demand dayofweek
1994-06-21 15:00:00        19953.0       Tue
                     hourly_demand dayofweek
1994-07-07 15:00:00        19839.0       Thu
                     hourly_demand dayofweek
1994-07-21 13:00:00        19777.0       Thu
                     hourly_demand dayofweek
1994-06-20 12:00:00        19313.0       Mon
                     hourly_demand dayofweek
1994-06-18 10:00:00        19304.0       Sat
                     hourly_demand dayofweek
1994-07-19 16:00:00        19290.0       Tue


1995
                     hourly_demand dayofweek
1995-08-15 15:00:00        21674.0       Tue
   

The earliest peak is mid June, and the latest peak is early September

Because of the variation for when heat waves start and end we can build some contingency into these dates by selecting our period of interest to be between mid May and early October. This should ensure we catch all heatwaves

We can consider the summer solstice to be a reference point each year, and build our window around this

### Use Summer Solstice as a Reference to Define the "Summer" Weeks

In [10]:
# Get the solstice date for each year
# Skyfield provides the timescale and the ephemeris that the next cell passes
# to almanac.seasons() to locate the seasonal events.
from skyfield import api
from skyfield import almanac

# Timescale object used to build the search window's start and end times.
ts = api.load.timescale()
# Ephemeris loaded from 'de421.bsp'.
# NOTE(review): presumably downloaded on first use if not already present locally - confirm.
e = api.load('de421.bsp')

In [11]:
# ISO week / weekday numbering reference:
# https://docs.python.org/3/library/datetime.html#datetime.date.isocalendar
t0 = ts.utc(1994, 1, 1)
t1 = ts.utc(2030, 12, 31)
t, y = almanac.find_discrete(t0, t1, almanac.seasons(e))

# Season code 1 is the event treated here as the summer solstice. For each one,
# print its UTC date, ISO week number and ISO weekday (1 = Monday).
for yi, ti in zip(y, t):
    if yi != 1:
        continue
    summ_sols = ti.utc_strftime('%Y-%m-%d')
    iso_calendar = datetime.strptime(summ_sols, '%Y-%m-%d').isocalendar()
    print(summ_sols, iso_calendar[1], iso_calendar[2])

1994-06-21 25 2
1995-06-21 25 3
1996-06-21 25 5
1997-06-21 25 6
1998-06-21 25 7
1999-06-21 25 1
2000-06-21 25 3
2001-06-21 25 4
2002-06-21 25 5
2003-06-21 25 6
2004-06-21 26 1
2005-06-21 25 2
2006-06-21 25 3
2007-06-21 25 4
2008-06-20 25 5
2009-06-21 25 7
2010-06-21 25 1
2011-06-21 25 2
2012-06-20 25 3
2013-06-21 25 5
2014-06-21 25 6
2015-06-21 25 7
2016-06-20 25 1
2017-06-21 25 3
2018-06-21 25 4
2019-06-21 25 5
2020-06-20 25 6
2021-06-21 25 1
2022-06-21 25 2
2023-06-21 25 3
2024-06-20 25 4
2025-06-21 25 6
2026-06-21 25 7
2027-06-21 25 1
2028-06-20 25 2
2029-06-21 25 4
2030-06-21 25 5


The solstice occurs most frequently in week 25. However, in 2004 it occurred on the Monday of week 26.

We will take Week 25 as a reference point and go back 4 weeks (28 days) so start at Week 21

Go forward 15 weeks (105 days)  to end at end of week 40

This gives us a 20 week window for summer

### Scope the Data to Working Days over Summer

In [12]:
# Get Summer Period: keep the days whose week-of-year number is 21 through 40
summer_weeks = list(range(21, 41))
df_summer = df_shifted[df_shifted['week_of_year'].isin(summer_weeks)]

In [13]:
# Drop weekends: day_of_week codes 0 to 4 are Monday to Friday (see daymapper)
weekday_codes = list(range(0, 5))
is_workday = df_summer['day_of_week'].isin(weekday_codes)
df_workdays = df_summer[is_workday]

df_workdays.iloc[0:60]

Unnamed: 0,temp,dew_point_temp,rel_hum,wind_speed,visibility,press,hmdx,wind_chill,weather,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,stat_hol,day_light_hours,hourly_demand,daily_peak,dayofweek,daytype,sun,hmdxx,temp_min,temp_max,temp_mean,temp_median,temp_max_hour,temp_min_hour,dew_point_temp_min,dew_point_temp_max,dew_point_temp_mean,dew_point_temp_median,dew_point_temp_max_hour,dew_point_temp_min_hour,rel_hum_min,rel_hum_max,rel_hum_mean,rel_hum_median,rel_hum_max_hour,rel_hum_min_hour,visibility_min,visibility_max,visibility_mean,visibility_median,visibility_max_hour,visibility_min_hour,press_min,press_max,press_mean,press_median,press_max_hour,press_min_hour,hmdxx_min,hmdxx_max,hmdxx_mean,hmdxx_median,hmdxx_max_hour,hmdxx_min_hour,sun_set,sun_rise,hmdxx_median-1
1994-05-23,11.7,7.4,75.0,17.0,25.0,99.32,25.0,,Mainly Clear,23.0,1994.0,5.0,0.0,143.0,21.0,2.0,1,0,11369.0,13882.0,Mon,Weekday,0.0,11.867941,11.7,24.0,18.070833,17.35,12.0,23.0,7.4,15.0,11.108333,11.35,11.0,23.0,45.0,79.0,65.083333,69.5,2.0,13.0,24.1,40.2,31.779167,25.0,7.0,5.0,99.02,99.32,99.17,99.16,23.0,4.0,11.867941,26.769401,19.949472,19.655075,11.0,23.0,21.0,6.0,23.988273
1994-05-24,10.5,8.7,89.0,9.0,16.1,98.52,,,Cloudy,23.0,1994.0,5.0,1.0,144.0,21.0,2.0,0,0,12386.0,16173.0,Tue,Weekday,0.0,11.201129,9.0,19.6,14.3,14.6,12.0,3.0,6.2,13.4,9.508333,8.95,18.0,11.0,42.0,92.0,75.083333,81.0,22.0,12.0,6.4,40.2,24.975,24.55,5.0,8.0,98.52,99.39,99.045,99.155,9.0,23.0,8.998045,19.818202,15.44033,17.205396,15.0,4.0,21.0,6.0,19.655075
1994-05-25,12.7,11.1,90.0,9.0,9.7,98.36,,,Fog,23.0,1994.0,5.0,2.0,145.0,21.0,2.0,0,0,12214.0,16017.0,Wed,Weekday,0.0,14.503621,10.4,18.2,14.491667,14.95,16.0,2.0,9.4,14.0,11.825,12.25,18.0,1.0,64.0,96.0,84.666667,87.0,2.0,16.0,6.4,12.9,9.358333,9.7,15.0,5.0,98.26,98.46,98.353125,98.36,0.0,17.0,11.406291,20.665711,16.695849,17.722172,18.0,3.0,21.0,6.0,17.205396
1994-05-26,4.6,2.7,87.0,11.0,24.1,99.09,,,Mostly Cloudy,23.0,1994.0,5.0,3.0,146.0,21.0,2.0,0,0,12828.0,16656.0,Thu,Weekday,0.0,3.163201,3.9,13.0,8.0375,7.0,0.0,20.0,1.9,12.3,6.729167,5.8,5.0,22.0,81.0,97.0,91.416667,91.5,5.0,22.0,4.8,24.1,9.65,8.85,23.0,5.0,97.89,99.09,98.295,98.125,21.0,6.0,2.563201,15.259916,8.102568,6.567827,2.0,17.0,21.0,6.0,17.722172
1994-05-27,7.8,0.9,62.0,13.0,25.0,99.49,,,Clear,23.0,1994.0,5.0,4.0,147.0,21.0,2.0,0,0,12218.0,16252.0,Fri,Weekday,0.0,5.864725,2.0,14.8,8.7125,8.7,16.0,3.0,-0.9,2.3,0.5625,0.4,14.0,15.0,36.0,95.0,59.833333,54.5,1.0,15.0,25.0,40.2,34.5,40.2,5.0,0.0,99.12,99.52,99.392083,99.45,9.0,0.0,-0.012865,12.970553,6.695368,6.599016,17.0,3.0,21.0,6.0,6.567827
1994-05-30,20.5,12.4,60.0,11.0,24.1,99.49,25.0,,Mainly Clear,23.0,1994.0,5.0,0.0,150.0,22.0,2.0,0,0,12521.0,16525.0,Mon,Weekday,0.0,22.970716,13.1,27.2,20.85,21.7,15.0,3.0,8.0,13.6,11.225,12.4,14.0,5.0,40.0,74.0,55.375,52.5,1.0,15.0,16.1,25.0,22.270833,24.1,0.0,1.0,99.39,99.62,99.519583,99.535,8.0,20.0,13.632519,30.133976,22.777217,23.590459,14.0,3.0,21.0,6.0,18.724332
1994-05-31,15.5,14.9,96.0,7.0,8.0,98.62,31.0,,Fog,23.0,1994.0,5.0,1.0,151.0,22.0,2.0,0,0,12407.0,17104.0,Tue,Weekday,0.0,19.407431,15.3,27.0,20.533333,19.5,13.0,22.0,11.9,17.9,15.220833,14.9,13.0,1.0,58.0,96.0,72.666667,71.5,23.0,0.0,8.0,40.2,21.491667,19.3,16.0,23.0,98.59,99.42,98.957083,98.825,0.0,14.0,18.96314,32.931899,24.700298,22.741797,13.0,22.0,21.0,6.0,23.590459
1994-06-01,7.2,1.2,66.0,15.0,25.0,99.35,,,Clear,23.0,1994.0,6.0,2.0,152.0,22.0,2.0,0,0,12246.0,15876.0,Wed,Weekday,0.0,5.343838,7.2,15.3,12.316667,13.0,15.0,23.0,-1.6,14.0,5.033333,4.3,0.0,19.0,38.0,94.0,63.083333,62.0,0.0,19.0,9.7,40.2,33.120833,40.2,5.0,0.0,98.59,99.35,98.963333,99.005,22.0,2.0,5.343838,18.365711,11.821877,12.297241,0.0,23.0,21.0,6.0,22.741797
1994-06-02,9.5,6.4,81.0,7.0,25.0,100.05,,,Clear,23.0,1994.0,6.0,3.0,153.0,22.0,2.0,0,0,12318.0,15818.0,Thu,Weekday,0.0,9.285912,5.4,17.0,11.433333,11.75,14.0,3.0,2.1,7.6,5.054167,5.65,14.0,0.0,52.0,84.0,65.916667,65.5,3.0,13.0,25.0,40.2,33.866667,40.2,6.0,0.0,99.26,100.05,99.586667,99.59,23.0,1.0,4.022227,17.247222,10.767852,11.289358,14.0,3.0,21.0,6.0,12.297241
1994-06-03,12.0,5.8,66.0,7.0,25.0,99.79,,,Clear,23.0,1994.0,6.0,4.0,154.0,22.0,2.0,0,0,11961.0,16169.0,Fri,Weekday,0.0,11.567827,7.0,23.0,15.7,16.8,17.0,4.0,3.2,7.8,5.3125,5.25,9.0,3.0,30.0,87.0,54.125,52.5,4.0,17.0,25.0,40.2,35.133333,40.2,5.0,0.0,99.72,100.15,99.944167,100.005,7.0,17.0,5.812192,22.373969,15.112276,16.345562,12.0,3.0,21.0,6.0,11.289358


In [14]:
# Drop stat hols, but mark workdays before and after stat holidays
df_workdays_ = df_workdays.copy(deep=True)

# Plain numeric view of the 0/1 holiday flag so it can be shifted and filled
# without depending on the column's categorical dtype.
is_stat_hol = df_workdays_['stat_hol'].astype(float)

# Shift within each calendar year only. This frame holds just the summer
# workdays, so an unrestricted shift pairs the last summer workday of one year
# with the first summer workday of the next. That would flag an October day as
# the "workday before" a holiday that falls in the following May.
hol_by_year = is_stat_hol.groupby(df_workdays_.index.year)
df_workdays_['workday_before_stat_hol'] = hol_by_year.shift(-1).fillna(0).astype(int)
df_workdays_['workday_after_stat_hol'] = hol_by_year.shift(1).fillna(0).astype(int)

# Drop Stat Hols
df_workdays_ = df_workdays_[df_workdays_['stat_hol'] == 0]
df_workdays_.head()

Unnamed: 0,temp,dew_point_temp,rel_hum,wind_speed,visibility,press,hmdx,wind_chill,weather,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,stat_hol,day_light_hours,hourly_demand,daily_peak,dayofweek,daytype,sun,hmdxx,temp_min,temp_max,temp_mean,temp_median,temp_max_hour,temp_min_hour,dew_point_temp_min,dew_point_temp_max,dew_point_temp_mean,dew_point_temp_median,dew_point_temp_max_hour,dew_point_temp_min_hour,rel_hum_min,rel_hum_max,rel_hum_mean,rel_hum_median,rel_hum_max_hour,rel_hum_min_hour,visibility_min,visibility_max,visibility_mean,visibility_median,visibility_max_hour,visibility_min_hour,press_min,press_max,press_mean,press_median,press_max_hour,press_min_hour,hmdxx_min,hmdxx_max,hmdxx_mean,hmdxx_median,hmdxx_max_hour,hmdxx_min_hour,sun_set,sun_rise,hmdxx_median-1,workday_before_stat_hol,workday_after_stat_hol
1994-05-24,10.5,8.7,89.0,9.0,16.1,98.52,,,Cloudy,23.0,1994.0,5.0,1.0,144.0,21.0,2.0,0,0,12386.0,16173.0,Tue,Weekday,0.0,11.201129,9.0,19.6,14.3,14.6,12.0,3.0,6.2,13.4,9.508333,8.95,18.0,11.0,42.0,92.0,75.083333,81.0,22.0,12.0,6.4,40.2,24.975,24.55,5.0,8.0,98.52,99.39,99.045,99.155,9.0,23.0,8.998045,19.818202,15.44033,17.205396,15.0,4.0,21.0,6.0,19.655075,0,1
1994-05-25,12.7,11.1,90.0,9.0,9.7,98.36,,,Fog,23.0,1994.0,5.0,2.0,145.0,21.0,2.0,0,0,12214.0,16017.0,Wed,Weekday,0.0,14.503621,10.4,18.2,14.491667,14.95,16.0,2.0,9.4,14.0,11.825,12.25,18.0,1.0,64.0,96.0,84.666667,87.0,2.0,16.0,6.4,12.9,9.358333,9.7,15.0,5.0,98.26,98.46,98.353125,98.36,0.0,17.0,11.406291,20.665711,16.695849,17.722172,18.0,3.0,21.0,6.0,17.205396,0,0
1994-05-26,4.6,2.7,87.0,11.0,24.1,99.09,,,Mostly Cloudy,23.0,1994.0,5.0,3.0,146.0,21.0,2.0,0,0,12828.0,16656.0,Thu,Weekday,0.0,3.163201,3.9,13.0,8.0375,7.0,0.0,20.0,1.9,12.3,6.729167,5.8,5.0,22.0,81.0,97.0,91.416667,91.5,5.0,22.0,4.8,24.1,9.65,8.85,23.0,5.0,97.89,99.09,98.295,98.125,21.0,6.0,2.563201,15.259916,8.102568,6.567827,2.0,17.0,21.0,6.0,17.722172,0,0
1994-05-27,7.8,0.9,62.0,13.0,25.0,99.49,,,Clear,23.0,1994.0,5.0,4.0,147.0,21.0,2.0,0,0,12218.0,16252.0,Fri,Weekday,0.0,5.864725,2.0,14.8,8.7125,8.7,16.0,3.0,-0.9,2.3,0.5625,0.4,14.0,15.0,36.0,95.0,59.833333,54.5,1.0,15.0,25.0,40.2,34.5,40.2,5.0,0.0,99.12,99.52,99.392083,99.45,9.0,0.0,-0.012865,12.970553,6.695368,6.599016,17.0,3.0,21.0,6.0,6.567827,0,0
1994-05-30,20.5,12.4,60.0,11.0,24.1,99.49,25.0,,Mainly Clear,23.0,1994.0,5.0,0.0,150.0,22.0,2.0,0,0,12521.0,16525.0,Mon,Weekday,0.0,22.970716,13.1,27.2,20.85,21.7,15.0,3.0,8.0,13.6,11.225,12.4,14.0,5.0,40.0,74.0,55.375,52.5,1.0,15.0,16.1,25.0,22.270833,24.1,0.0,1.0,99.39,99.62,99.519583,99.535,8.0,20.0,13.632519,30.133976,22.777217,23.590459,14.0,3.0,21.0,6.0,18.724332,0,0


In [15]:
# Columns to keep alongside the engineered daily features collected so far.
keeper_cols += ['day_of_week', 'week_of_year',
                'workday_before_stat_hol', 'workday_after_stat_hol',
                'daily_peak', 'year']
keeper_cols

['temp_min',
 'temp_max',
 'temp_mean',
 'temp_median',
 'temp_max_hour',
 'temp_min_hour',
 'dew_point_temp_min',
 'dew_point_temp_max',
 'dew_point_temp_mean',
 'dew_point_temp_median',
 'dew_point_temp_max_hour',
 'dew_point_temp_min_hour',
 'rel_hum_min',
 'rel_hum_max',
 'rel_hum_mean',
 'rel_hum_median',
 'rel_hum_max_hour',
 'rel_hum_min_hour',
 'visibility_min',
 'visibility_max',
 'visibility_mean',
 'visibility_median',
 'visibility_max_hour',
 'visibility_min_hour',
 'press_min',
 'press_max',
 'press_mean',
 'press_median',
 'press_max_hour',
 'press_min_hour',
 'hmdxx_min',
 'hmdxx_max',
 'hmdxx_mean',
 'hmdxx_median',
 'hmdxx_max_hour',
 'hmdxx_min_hour',
 'sun_set',
 'sun_rise',
 'hmdxx_median-1',
 'day_of_week',
 'week_of_year',
 'workday_before_stat_hol',
 'workday_after_stat_hol',
 'daily_peak',
 'year']

In [16]:
# Build the reduced frame in a single chain:
#   1. take a deep copy of `df_workdays_` so the source frame is left untouched,
#   2. keep only the columns listed in `keeper_cols`,
#   3. drop any column whose label repeats an earlier one (first occurrence wins).
df = (
    df_workdays_
    .copy(deep=True)
    [keeper_cols]
    .pipe(lambda frame: frame.loc[:, ~frame.columns.duplicated()])
)

df.head(10)

Unnamed: 0,temp_min,temp_max,temp_mean,temp_median,temp_max_hour,temp_min_hour,dew_point_temp_min,dew_point_temp_max,dew_point_temp_mean,dew_point_temp_median,dew_point_temp_max_hour,dew_point_temp_min_hour,rel_hum_min,rel_hum_max,rel_hum_mean,rel_hum_median,rel_hum_max_hour,rel_hum_min_hour,visibility_min,visibility_max,visibility_mean,visibility_median,visibility_max_hour,visibility_min_hour,press_min,press_max,press_mean,press_median,press_max_hour,press_min_hour,hmdxx_min,hmdxx_max,hmdxx_mean,hmdxx_median,hmdxx_max_hour,hmdxx_min_hour,sun_set,sun_rise,hmdxx_median-1,day_of_week,week_of_year,workday_before_stat_hol,workday_after_stat_hol,daily_peak,year
1994-05-24,9.0,19.6,14.3,14.6,12.0,3.0,6.2,13.4,9.508333,8.95,18.0,11.0,42.0,92.0,75.083333,81.0,22.0,12.0,6.4,40.2,24.975,24.55,5.0,8.0,98.52,99.39,99.045,99.155,9.0,23.0,8.998045,19.818202,15.44033,17.205396,15.0,4.0,21.0,6.0,19.655075,1.0,21.0,0,1,16173.0,1994.0
1994-05-25,10.4,18.2,14.491667,14.95,16.0,2.0,9.4,14.0,11.825,12.25,18.0,1.0,64.0,96.0,84.666667,87.0,2.0,16.0,6.4,12.9,9.358333,9.7,15.0,5.0,98.26,98.46,98.353125,98.36,0.0,17.0,11.406291,20.665711,16.695849,17.722172,18.0,3.0,21.0,6.0,17.205396,2.0,21.0,0,0,16017.0,1994.0
1994-05-26,3.9,13.0,8.0375,7.0,0.0,20.0,1.9,12.3,6.729167,5.8,5.0,22.0,81.0,97.0,91.416667,91.5,5.0,22.0,4.8,24.1,9.65,8.85,23.0,5.0,97.89,99.09,98.295,98.125,21.0,6.0,2.563201,15.259916,8.102568,6.567827,2.0,17.0,21.0,6.0,17.722172,3.0,21.0,0,0,16656.0,1994.0
1994-05-27,2.0,14.8,8.7125,8.7,16.0,3.0,-0.9,2.3,0.5625,0.4,14.0,15.0,36.0,95.0,59.833333,54.5,1.0,15.0,25.0,40.2,34.5,40.2,5.0,0.0,99.12,99.52,99.392083,99.45,9.0,0.0,-0.012865,12.970553,6.695368,6.599016,17.0,3.0,21.0,6.0,6.567827,4.0,21.0,0,0,16252.0,1994.0
1994-05-30,13.1,27.2,20.85,21.7,15.0,3.0,8.0,13.6,11.225,12.4,14.0,5.0,40.0,74.0,55.375,52.5,1.0,15.0,16.1,25.0,22.270833,24.1,0.0,1.0,99.39,99.62,99.519583,99.535,8.0,20.0,13.632519,30.133976,22.777217,23.590459,14.0,3.0,21.0,6.0,18.724332,0.0,22.0,0,0,16525.0,1994.0
1994-05-31,15.3,27.0,20.533333,19.5,13.0,22.0,11.9,17.9,15.220833,14.9,13.0,1.0,58.0,96.0,72.666667,71.5,23.0,0.0,8.0,40.2,21.491667,19.3,16.0,23.0,98.59,99.42,98.957083,98.825,0.0,14.0,18.96314,32.931899,24.700298,22.741797,13.0,22.0,21.0,6.0,23.590459,1.0,22.0,0,0,17104.0,1994.0
1994-06-01,7.2,15.3,12.316667,13.0,15.0,23.0,-1.6,14.0,5.033333,4.3,0.0,19.0,38.0,94.0,63.083333,62.0,0.0,19.0,9.7,40.2,33.120833,40.2,5.0,0.0,98.59,99.35,98.963333,99.005,22.0,2.0,5.343838,18.365711,11.821877,12.297241,0.0,23.0,21.0,6.0,22.741797,2.0,22.0,0,0,15876.0,1994.0
1994-06-02,5.4,17.0,11.433333,11.75,14.0,3.0,2.1,7.6,5.054167,5.65,14.0,0.0,52.0,84.0,65.916667,65.5,3.0,13.0,25.0,40.2,33.866667,40.2,6.0,0.0,99.26,100.05,99.586667,99.59,23.0,1.0,4.022227,17.247222,10.767852,11.289358,14.0,3.0,21.0,6.0,12.297241,3.0,22.0,0,0,15818.0,1994.0
1994-06-03,7.0,23.0,15.7,16.8,17.0,4.0,3.2,7.8,5.3125,5.25,9.0,3.0,30.0,87.0,54.125,52.5,4.0,17.0,25.0,40.2,35.133333,40.2,5.0,0.0,99.72,100.15,99.944167,100.005,7.0,17.0,5.812192,22.373969,15.112276,16.345562,12.0,3.0,21.0,6.0,11.289358,4.0,22.0,0,0,16169.0,1994.0
1994-06-06,13.5,25.2,18.075,16.45,14.0,8.0,6.0,17.5,12.095833,12.4,13.0,1.0,53.0,85.0,68.75,68.5,9.0,2.0,8.0,25.0,16.804167,19.3,0.0,12.0,98.22,99.16,98.585833,98.475,0.0,15.0,14.795895,29.74366,20.583455,18.620874,14.0,7.0,21.0,6.0,18.349587,0.0,23.0,0,0,17036.0,1994.0


### Save the Reduced Data

In [17]:
# Write the reduced frame to `daily.csv` inside the clean-data directory.
daily_csv_path = CLEAN_DATA_DIR / 'daily.csv'
df.to_csv(daily_csv_path)