# 03.04 - Calendar Data

## Imports & setup

In [1]:
import pathlib
import datetime
import dateutil
from os import PathLike
from typing import Union

#import simplegeneric
import pandas as pd
import numpy as np
from astral import Astral

import matplotlib.pyplot as plt
plt.style.use('grayscale')
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates
import palettable
%matplotlib inline


# Project root, taken as the parent of the current working directory
# (presumably the notebook is launched from a sub-folder of the repository).
PROJECT_DIR = pathlib.Path.cwd().parent.resolve()
# Input folder holding the imputed demand data.
IMPUTED_DATA_DIR_DEMAND = PROJECT_DIR.joinpath('data', '03-imputed', 'demand')
# Output folder for the calendar features produced by this notebook.
CALCULATED_FEATURES_DATA_DIR = PROJECT_DIR.joinpath('data', '03-calculated-features', 'calendar')

## Load

In [2]:
# Read the imputed demand series; the first CSV column becomes the datetime
# index, with every timestamp parsed by dateutil.
# NOTE(review): `date_parser` is deprecated in pandas >= 2.0 - confirm the
# timestamp format in demand.csv before switching to pandas' own parser.
demand_df = pd.read_csv(
    IMPUTED_DATA_DIR_DEMAND / 'demand.csv',
    index_col=0,
    parse_dates=True,
    date_parser=dateutil.parser.parse,
)
demand_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 222840 entries, 1994-01-01 00:00:00 to 2019-06-03 23:00:00
Data columns (total 1 columns):
ont_demand    222840 non-null float64
dtypes: float64(1)
memory usage: 3.4 MB


In [3]:
# Build the calendar features on a fresh frame keyed by the demand timestamps.
# Nothing is copied from demand_df and then dropped in place, so re-running
# this cell always produces the same frame.
timestamps = demand_df.index

# DatetimeIndex.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
# isocalendar().week is its replacement and yields the same ISO week number.
# Older pandas versions without isocalendar() keep using the original attribute.
if hasattr(timestamps, 'isocalendar'):
    # isocalendar() returns UInt32 values; cast back to the int64 used before.
    week_of_year = timestamps.isocalendar().week.astype('int64').to_numpy()
else:
    week_of_year = timestamps.weekofyear

# Copy the index so later index edits on either frame cannot affect the other.
features_df = pd.DataFrame(index=timestamps.copy())
features_df['hour_of_day'] = timestamps.hour
features_df['year'] = timestamps.year
features_df['month'] = timestamps.month
features_df['day_of_week'] = timestamps.dayofweek  # Monday=0 ... Sunday=6
features_df['day_of_year'] = timestamps.dayofyear
features_df['week_of_year'] = week_of_year
features_df['quarter'] = timestamps.quarter

features_df.head()

Unnamed: 0,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter
1994-01-01 00:00:00,0,1994,1,5,1,52,1
1994-01-01 01:00:00,1,1994,1,5,1,52,1
1994-01-01 02:00:00,2,1994,1,5,1,52,1
1994-01-01 03:00:00,3,1994,1,5,1,52,1
1994-01-01 04:00:00,4,1994,1,5,1,52,1


In [4]:
import holidays


In [5]:
# Holiday calendar for Ontario, requested explicitly via `state`
# (the original note says Ontario is also the library default - unverified here).
hols = holidays.Canada(state='ON')

# Spot-check membership using the calendar date of the first row on each probe day.
for probe_day in ('2018-01-01', '2018-12-27'):
    print(features_df.loc[probe_day].index.date[0] in hols)

True
False


In [6]:
# Flag each hourly row whose calendar date appears in the holiday calendar.
features_df['stat_hol'] = [day in hols for day in features_df.index.date]
features_df.head()

Unnamed: 0,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,stat_hol
1994-01-01 00:00:00,0,1994,1,5,1,52,1,True
1994-01-01 01:00:00,1,1994,1,5,1,52,1,True
1994-01-01 02:00:00,2,1994,1,5,1,52,1,True
1994-01-01 03:00:00,3,1994,1,5,1,52,1,True
1994-01-01 04:00:00,4,1994,1,5,1,52,1,True


In [7]:
# Inspect the last rows to check the holiday flag at the end of the series.
features_df.tail()

Unnamed: 0,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,stat_hol
2019-06-03 19:00:00,19,2019,6,0,154,23,2,False
2019-06-03 20:00:00,20,2019,6,0,154,23,2,False
2019-06-03 21:00:00,21,2019,6,0,154,23,2,False
2019-06-03 22:00:00,22,2019,6,0,154,23,2,False
2019-06-03 23:00:00,23,2019,6,0,154,23,2,False


In [8]:
# Sanity check of the astral lookup for a single date before applying it to
# every row. Astral is already imported in the setup cell at the top.
a = Astral()
city_name = 'Toronto'
city = a[city_name]

# local=True requests the times in the city's local timezone.
sun = city.sun(date=datetime.date(2019, 7, 2), local=True)
for event in ('sunrise', 'sunset'):
    print(sun[event])
print(type(sun['sunrise']))

2019-07-02 05:39:03-04:00
2019-07-02 21:03:43-04:00
<class 'datetime.datetime'>


In [9]:
# Show the first index timestamp on each probe day.
for probe_day in ('2018-01-01', '2018-12-27'):
    print(features_df.loc[probe_day].index[0])

2018-01-01 00:00:00
2018-12-27 00:00:00


In [10]:
# Re-display the first rows before the daylight column is added.
features_df.head()

Unnamed: 0,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,stat_hol
1994-01-01 00:00:00,0,1994,1,5,1,52,1,True
1994-01-01 01:00:00,1,1994,1,5,1,52,1,True
1994-01-01 02:00:00,2,1994,1,5,1,52,1,True
1994-01-01 03:00:00,3,1994,1,5,1,52,1,True
1994-01-01 04:00:00,4,1994,1,5,1,52,1,True


In [11]:
def get_daylight_hours(row, city):
    """Tell whether a row's timestamp lies strictly between sunrise and sunset.

    Parameters
    ----------
    row : object with a ``name`` attribute
        The row being evaluated; ``row.name`` is the timestamp to test and is
        also what gets passed to ``city.sun`` as ``date``.
    city : object with a ``sun(date=..., local=True)`` method
        Must return a mapping with ``'sunrise'`` and ``'sunset'`` datetimes.

    Returns
    -------
    bool
        True when sunrise < ``row.name`` < sunset; both bounds are exclusive.

    NOTE(review): the timezone is stripped from sunrise/sunset before the
    comparison, so ``row.name`` is assumed to be a timezone-naive timestamp in
    the city's local wall-clock time - confirm against the demand data.
    """
    solar_events = city.sun(date=row.name, local=True)
    # Drop tzinfo so the values are comparable with the naive row timestamp.
    first_light = solar_events['sunrise'].replace(tzinfo=None)
    last_light = solar_events['sunset'].replace(tzinfo=None)
    return first_light < row.name < last_light


# Look up Toronto once, then evaluate the daylight flag row by row.
# The row-wise apply triggers one city.sun() call per hourly row, so this
# cell is slow on the full series.
a = Astral()
city = a['Toronto']
features_df['day_light_hours'] = features_df.apply(
    lambda row: get_daylight_hours(row, city),
    axis=1,
)
features_df.head()

Unnamed: 0,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,stat_hol,day_light_hours
1994-01-01 00:00:00,0,1994,1,5,1,52,1,True,False
1994-01-01 01:00:00,1,1994,1,5,1,52,1,True,False
1994-01-01 02:00:00,2,1994,1,5,1,52,1,True,False
1994-01-01 03:00:00,3,1994,1,5,1,52,1,True,False
1994-01-01 04:00:00,4,1994,1,5,1,52,1,True,False


In [12]:
# Inspect the last rows to check the daylight flag around the evening hours.
features_df.tail()

Unnamed: 0,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,stat_hol,day_light_hours
2019-06-03 19:00:00,19,2019,6,0,154,23,2,False,True
2019-06-03 20:00:00,20,2019,6,0,154,23,2,False,True
2019-06-03 21:00:00,21,2019,6,0,154,23,2,False,False
2019-06-03 22:00:00,22,2019,6,0,154,23,2,False,False
2019-06-03 23:00:00,23,2019,6,0,154,23,2,False,False


In [13]:
# Persist the calendar features. The output folder is created first because
# to_csv fails when the target directory does not exist (e.g. a fresh checkout).
CALCULATED_FEATURES_DATA_DIR.mkdir(parents=True, exist_ok=True)
features_df.to_csv(CALCULATED_FEATURES_DATA_DIR / 'calendar.csv')

In [14]:
# Final check of the column dtypes and non-null counts of the feature frame.
features_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 222840 entries, 1994-01-01 00:00:00 to 2019-06-03 23:00:00
Data columns (total 9 columns):
hour_of_day        222840 non-null int64
year               222840 non-null int64
month              222840 non-null int64
day_of_week        222840 non-null int64
day_of_year        222840 non-null int64
week_of_year       222840 non-null int64
quarter            222840 non-null int64
stat_hol           222840 non-null bool
day_light_hours    222840 non-null bool
dtypes: bool(2), int64(7)
memory usage: 24.0 MB
