# Variables related to seasonal changes
* Weather Underground
* U.S. holidays

In [12]:
#Data management
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
import geopandas as gpd

#Data visualization
import pylab as pl
from matplotlib import ticker
%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


### Weather Underground
* Weather related variables
* The number of weather events per week

In [13]:
# Load the two weekly NYC weather tables.
# The measurements file is read with its 'Unnamed: 0' column parsed as dates;
# the events file is read with its 'Date' column parsed as dates.
weather = pd.read_csv(
    '../data/dataprocessing/NYC_Weekly_Weather_Data_1316.csv',
    parse_dates=['Unnamed: 0'],
)
weather_events = pd.read_csv(
    '../data/dataprocessing/NYC_Weekly_Weather_Events_1316.csv',
    parse_dates=['Date'],
)

In [14]:
# Expose the parsed timestamps under the name 'Date'. Assigning it as a new
# column (rather than renaming) places 'Date' last in the frame.
weather['Date'] = weather['Unnamed: 0']

# Drop the original unnamed date column together with the weather measures
# that are not carried forward.
dropped_weather_columns = [
    'Unnamed: 0',
    'Temp_mean (F)', 'Temp_max (F)', 'Temp_DewPoint (F)',
    'Prep_mean (in)', 'WindGust (mph)',
]
weather = weather.drop(dropped_weather_columns, axis=1)

# The events table carries an 'Unnamed: 0' column as well; remove it.
weather_events = weather_events.drop('Unnamed: 0', axis=1)

In [15]:
# Attach the weekly event counts to the weather measurements. The left join
# keeps every weather row even when the events table has no row for that date.
# fillna(0) is applied to the whole frame, so a missing value in ANY column
# becomes 0, not only the event count of unmatched weeks.
# NOTE(review): confirm that 0 is the intended fill for the weather measures.
wv = weather.merge(weather_events, on='Date', how='left').fillna(0)

# Make sure the join key is stored as datetimes.
wv['Date'] = pd.to_datetime(wv['Date'])

In [16]:
# Preview the first two rows of the merged weather + weather-events table.
wv.head(2)

Unnamed: 0,Temp_min (F),Prep_sum (in),Snow (in),WindSpeed (mph),Date,Weather_events
0,23.0,0.0,0.0,16.0,2012-12-31,0.0
1,30.0,0.64,0.0,9.0,2013-01-07,0.0


### U.S. holidays 

In [17]:
from pandas.tseries.holiday import get_calendar, USFederalHolidayCalendar,\
HolidayCalendarFactory, GoodFriday, AbstractHolidayCalendar
from datetime import datetime

# Daily and weekly timestamp ranges starting on 2013-01-01.
# 365 * 4 daily periods stop at 2016-12-30 because the 2016 leap day is not
# counted; freq='W' yields Sunday-anchored weeks.
# NOTE(review): `days` and `weeks` are not referenced by any other cell
# visible in this notebook.
days = pd.date_range(start='1/01/2013', periods=365 * 4, freq='D')
weeks = pd.date_range(start='1/01/2013', periods=52 * 4, freq='W')

# U.S. federal holiday calendar instance.
cal = USFederalHolidayCalendar()

# Calendar class combining the federal rules with Good Friday.
# NOTE(review): the visible cells only inspect this calendar's rules; the
# holiday dates computed afterwards come from `cal`, which has no Good Friday.
tradingCal = HolidayCalendarFactory('TradingCalendar', cal, GoodFriday)

# Display the rules of the combined calendar.
tradingCal.rules

[Holiday: Labor Day (month=9, day=1, offset=<DateOffset: kwds={'weekday': MO(+1)}>),
 Holiday: Presidents Day (month=2, day=1, offset=<DateOffset: kwds={'weekday': MO(+3)}>),
 Holiday: Columbus Day (month=10, day=1, offset=<DateOffset: kwds={'weekday': MO(+2)}>),
 Holiday: Veterans Day (month=11, day=11, observance=<function nearest_workday at 0x116e382a8>),
 Holiday: Good Friday (month=1, day=1, offset=[<Easter>, <-2 * Days>]),
 Holiday: Dr. Martin Luther King Jr. (month=1, day=1, offset=<DateOffset: kwds={'weekday': MO(+3)}>),
 Holiday: New Years Day (month=1, day=1, observance=<function nearest_workday at 0x116e382a8>),
 Holiday: Thanksgiving (month=11, day=1, offset=<DateOffset: kwds={'weekday': TH(+4)}>),
 Holiday: July 4th (month=7, day=4, observance=<function nearest_workday at 0x116e382a8>),
 Holiday: Christmas (month=12, day=25, observance=<function nearest_workday at 0x116e382a8>),
 Holiday: MemorialDay (month=5, day=31, offset=<DateOffset: kwds={'weekday': MO(-1)}>)]

In [18]:
# Holiday dates produced by the federal calendar `cal` for
# 2013-01-01 through 2016-12-31, returned as a DatetimeIndex.
holidays = cal.holidays(datetime(2013, 1, 1), datetime(2016, 12, 31))

# Call print as a function: with a single argument it prints the same text
# under Python 2 and Python 3, whereas the bare `print holidays` statement
# is a SyntaxError on Python 3.
print(holidays)

DatetimeIndex(['2013-01-01', '2013-01-21', '2013-02-18', '2013-05-27',
               '2013-07-04', '2013-09-02', '2013-10-14', '2013-11-11',
               '2013-11-28', '2013-12-25', '2014-01-01', '2014-01-20',
               '2014-02-17', '2014-05-26', '2014-07-04', '2014-09-01',
               '2014-10-13', '2014-11-11', '2014-11-27', '2014-12-25',
               '2015-01-01', '2015-01-19', '2015-02-16', '2015-05-25',
               '2015-07-03', '2015-09-07', '2015-10-12', '2015-11-11',
               '2015-11-26', '2015-12-25', '2016-01-01', '2016-01-18',
               '2016-02-15', '2016-05-30', '2016-07-04', '2016-09-05',
               '2016-10-10', '2016-11-11', '2016-11-24', '2016-12-26'],
              dtype='datetime64[ns]', freq=None)


In [19]:
# Start from one row per holiday date, each contributing a count of 1.
holiday = pd.DataFrame({'Date': holidays})
holiday['Holiday_Count'] = 1

# resample() needs a datetime index, so index the rows by their date and add
# up the counts within weekly 'W-MON' bins (labelled by Monday dates).
weekly_counts = holiday.set_index('Date').resample('W-MON')['Holiday_Count'].sum()

# Series back to a two-column frame (Date, Holiday_Count). Any week whose sum
# came back missing is recorded as 0, i.e. a week without a holiday.
holiday = weekly_counts.to_frame().reset_index().fillna(0)

holiday.head(2)

Unnamed: 0,Date,Holiday_Count
0,2013-01-07,1.0
1,2013-01-14,0.0


### Join Weather + Holiday dataset

In [20]:
# Row counts of both inputs, for comparison with the joined result.
# print is called as a function so the cell runs under Python 2 and Python 3
# (the bare `print x` statement is a SyntaxError on Python 3).
print(len(wv))
print(len(holiday))

# Left join keeps every weather week; weeks with no matching holiday row end
# up with a missing Holiday_Count, which is recorded as 0 holidays.
weather_holiday = pd.merge(wv, holiday, how='left', on='Date')
weather_holiday['Holiday_Count'] = weather_holiday['Holiday_Count'].fillna(0)

# A left join against one-row-per-week holiday counts must neither add nor
# drop weather weeks; fail loudly instead of relying on eyeballing the prints.
assert len(weather_holiday) == len(wv), 'left join on Date changed the number of weather rows'
print(len(weather_holiday))

210
208
210


In [21]:
# Preview the first two rows of the joined weather + holiday table.
weather_holiday.head(2)

Unnamed: 0,Temp_min (F),Prep_sum (in),Snow (in),WindSpeed (mph),Date,Weather_events,Holiday_Count
0,23.0,0.0,0.0,16.0,2012-12-31,0.0,0.0
1,30.0,0.64,0.0,9.0,2013-01-07,0.0,1.0


In [22]:
# Save the joined weekly weather + holiday table as CSV.
# No `index` argument is passed, so pandas' default applies and the row index
# is written as an unnamed leading column.
weather_holiday.to_csv('../data/output/00VAR_weather_holiday_2013_2016.csv')