# Load nyc_energy and enrich it with weather data

In this notebook, we try to enrich the NYC Energy data in Jupyter Notebook in a scalable way.
We enrich the input data by month, put the monthly enriched data in the temp folder, and save the final result in the current folder every time we have done one month.

* Load csv file which is downloaded from:  https://notebooks.azure.com/frlazzeri/projects/automatedml-ms-build/html/nyc_energy.csv
* Time range: 1/1/2012  to 8/12/2017
* Location: 'PORT AUTH DOWNTN MANHATTAN WALL ST' station at <font color='red'>lat: 40.701, long: -74.009</font>

In [1]:
# install packages if it's not availble.

# !pip uninstall -y azureml-opendatasets
# !pip install azureml-opendatasets

### Initialize global variables.

In [2]:
from datetime import datetime


start_date = datetime(2012, 1, 1, 0, 0)
end_date = datetime(2017, 8, 12, 23, 59)

start_date, end_date

(datetime.datetime(2012, 1, 1, 0, 0), datetime.datetime(2017, 8, 12, 23, 59))

In [3]:
from datetime import timedelta
from dateutil.relativedelta import relativedelta

import math


r = relativedelta(end_date, start_date)
months = r.years * 12 + r.months + math.floor((r.days + 30)/31)
months

68

In [4]:
lat, long = 40.701, -74.009
lat, long

(40.701, -74.009)

### Load ``"./nyc_energy.csv"`` (download and save to local) and preview the data.

In [5]:
from pandas import read_csv


df = read_csv('./nyc_energy.csv').drop(columns=['precip', 'temp'], axis=1)
df['lat'] = lat
df['long'] = long
df.head(5)

Unnamed: 0,timeStamp,demand,lat,long
0,2012-01-01 00:00:00,4937.5,40.701,-74.009
1,2012-01-01 01:00:00,4752.1,40.701,-74.009
2,2012-01-01 02:00:00,4542.6,40.701,-74.009
3,2012-01-01 03:00:00,4357.7,40.701,-74.009
4,2012-01-01 04:00:00,4275.5,40.701,-74.009


### Extend the timeStamp column so that we can filter it easily.

In [6]:
from dateutil import parser


df['new_datetime'] = df['timeStamp'].apply(parser.parse)
raw_columns = list(df.columns)
df.head(5)

Unnamed: 0,timeStamp,demand,lat,long,new_datetime
0,2012-01-01 00:00:00,4937.5,40.701,-74.009,2012-01-01 00:00:00
1,2012-01-01 01:00:00,4752.1,40.701,-74.009,2012-01-01 01:00:00
2,2012-01-01 02:00:00,4542.6,40.701,-74.009,2012-01-01 02:00:00
3,2012-01-01 03:00:00,4357.7,40.701,-74.009,2012-01-01 03:00:00
4,2012-01-01 04:00:00,4275.5,40.701,-74.009,2012-01-01 04:00:00


### Create the temp folder in which we save the enriched data per month.

In [7]:
!if [ ! -d "./temp" ]; then mkdir temp; fi

### Enriching...

In [8]:
import os.path
import pandas as pd
import numpy as np
from azureml.opendatasets.accessories.location_data import LatLongColumn
from azureml.opendatasets.accessories.location_time_customer_data \
    import LocationTimeCustomerData
from azureml.opendatasets import NoaaIsdWeather
from azureml.opendatasets.environ import PandasEnv


if os.path.exists('./nyc_energy_enriched.csv'):
    raise RuntimeError('nyc_energy_enriched.csv exists already.')
else:
    print('[%s] Start enriching...' % datetime.now())
    all = pd.DataFrame([])
    report_joined = {}
    i_date = start_date
    for m in range(months):
        j_date = i_date + relativedelta(months=1) - timedelta(milliseconds=1)

        # This is important to set monotonically increasing index for successful enrichemnt.
        df1 = df[(df['new_datetime'] >= i_date) & (df['new_datetime'] <= j_date)].copy()
        df1['idx'] = list(range(len(df1.index)))
        df1 = df1.set_index('idx')

        energy = LocationTimeCustomerData(
            df1,
            LatLongColumn('lat', 'long'),
            'new_datetime')

        weather = NoaaIsdWeather(
            cols=["temperature", "precipTime", "precipDepth", "snowDepth"],
            start_date=i_date,
            end_date=j_date)

        weather_enricher = weather.get_enricher()
        new_energy, processed_weather = weather_enricher.enrich_customer_data_no_agg(
            customer_data_object=energy,
            location_match_granularity=5, # higher for high join success rate, lower for performance.
            time_round_granularity='day')
        
        # ---=== Begin of cusomtized aggregation ===---
        
        processed_weather.data['precipDepth'] = processed_weather.data['precipDepth'].apply(
            lambda x: np.nan if x == 9999 else x)
        processed_weather.data['precipTime'] = processed_weather.data['precipTime'].apply(
            lambda x: np.nan if x == 99 else x)

        processed_weather.data['precipDepth/precipTime'] = \
        processed_weather.data[['precipDepth', 'precipTime']].apply(
            lambda x: np.nan if (
                pd.isna(x[0]) or pd.isna(x[1]) or x[1] == 0.0) else (x[0] / x[1]), axis=1)
        
        aggregations = {
            "temperature": "mean",
            "snowDepth": "mean",
            "precipDepth/precipTime": "mean",
            "precipDepth": "max",
            "precipTime": "max"}
        
        public_rankgroup = processed_weather.id

        public_join_time = [
            s for s in list(processed_weather.data.columns)
            if s.startswith('ds_join_time')][0]

        customer_rankgroup = weather_enricher.location_selector.customer_rankgroup

        customer_join_time = [
            s for s in list(new_energy.data.columns)
            if s.startswith('customer_join_time')][0]

        weather_df_grouped = processed_weather.data.groupby(
            by=[public_rankgroup, public_join_time]).agg(aggregations)
        
        joined_dataset = new_energy.data.merge(
            weather_df_grouped,
            left_on=[customer_rankgroup, customer_join_time],
            right_on=[public_rankgroup, public_join_time],
            how='left')

        final_df = joined_dataset[raw_columns + [
            "temperature", "precipTime", "precipDepth", "snowDepth", "precipDepth/precipTime"]]

        report_joined[i_date] = final_df.describe()
        
        # ---=== End of customized aggregation ===---
        
        fn = './temp/nyc_energy_enriched_%s.csv' % i_date
        final_df.to_csv(fn)

        all = pd.concat([all, final_df])
        all.to_csv('./nyc_energy_enriched.csv')

        i_date += relativedelta(months=1)

    print('[%s] End enriching...' % datetime.now())

[2019-04-29 09:00:03.181218] Start enriching...
ActivityStarted, get_enricher
ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=21.29 [ms]
ActivityStarted, enrich_customer_data_no_agg
ActivityStarted, enrich
Target paths: ['/year=2012/month=1/']
Looking for parquet files...
Reading them into Pandas dataframe...
Reading ISDWeather/year=2012/month=1/part-00004-tid-7816671341480880202-0b49e80b-f206-4731-ab5a-61d53f99b595-57.c000.snappy.parquet under container isdweatherdatacontainer
Done.
ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=84996.24 [ms]
ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=85094.1 [ms]
ActivityStarted, get_enricher
ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=18.5 [ms]
ActivityStarted, enrich_customer_data_no_agg
ActivityStarted, enrich
Target paths: ['/year=2012/month=2/']
Looking for parquet files...
Reading them into Pandas dataframe...
Reading ISDWeather/year=2012/mont

ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=44269.14 [ms]
ActivityStarted, get_enricher
ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.68 [ms]
ActivityStarted, enrich_customer_data_no_agg
ActivityStarted, enrich
Target paths: ['/year=2013/month=2/']
Looking for parquet files...
Reading them into Pandas dataframe...
Reading ISDWeather/year=2013/month=2/part-00011-tid-236689213593784421-264283c4-dffb-42b8-9bbf-d912ec6814af-77.c000.snappy.parquet under container isdweatherdatacontainer
Done.
ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=62644.02 [ms]
ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=62696.92 [ms]
ActivityStarted, get_enricher
ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=21.12 [ms]
ActivityStarted, enrich_customer_data_no_agg
ActivityStarted, enrich
Target paths: ['/year=2013/month=3/']
Looking for parquet files...
Reading them into Pa

ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=44652.11 [ms]
ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=44703.73 [ms]
ActivityStarted, get_enricher
ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.69 [ms]
ActivityStarted, enrich_customer_data_no_agg
ActivityStarted, enrich
Target paths: ['/year=2014/month=3/']
Looking for parquet files...
Reading them into Pandas dataframe...
Reading ISDWeather/year=2014/month=3/part-00001-tid-9219175779481662582-3729dfdb-ab32-4767-b9b6-11d2d644c3ce-80.c000.snappy.parquet under container isdweatherdatacontainer
Done.
ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=43063.67 [ms]
ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=43116.56 [ms]
ActivityStarted, get_enricher
ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=16.72 [ms]
ActivityStarted, enrich_customer_data_no_agg
ActivityStarted, enrich
Target pa

Done.
ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=44280.98 [ms]
ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=44339.07 [ms]
ActivityStarted, get_enricher
ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.61 [ms]
ActivityStarted, enrich_customer_data_no_agg
ActivityStarted, enrich
Target paths: ['/year=2015/month=4/']
Looking for parquet files...
Reading them into Pandas dataframe...
Reading ISDWeather/year=2015/month=4/part-00011-tid-2198075741767757560-e3eb994e-d560-4dfc-941e-0aae74c8d9ed-103.c000.snappy.parquet under container isdweatherdatacontainer
Done.
ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=43311.91 [ms]
ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=43347.73 [ms]
ActivityStarted, get_enricher
ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.16 [ms]
ActivityStarted, enrich_customer_data_no_agg
ActivityStarted, enrich
Ta

Done.
ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=43887.97 [ms]
ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=43966.98 [ms]
ActivityStarted, get_enricher
ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.72 [ms]
ActivityStarted, enrich_customer_data_no_agg
ActivityStarted, enrich
Target paths: ['/year=2016/month=5/']
Looking for parquet files...
Reading them into Pandas dataframe...
Reading ISDWeather/year=2016/month=5/part-00006-tid-6700213360605767691-4491b75c-f137-489b-b5df-4204b9326fda-111.c000.snappy.parquet under container isdweatherdatacontainer
Done.
ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=44077.01 [ms]
ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=44163.85 [ms]
ActivityStarted, get_enricher
ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.48 [ms]
ActivityStarted, enrich_customer_data_no_agg
ActivityStarted, enrich
Ta

Done.
ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=45802.38 [ms]
ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=45874.65 [ms]
ActivityStarted, get_enricher
ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.22 [ms]
ActivityStarted, enrich_customer_data_no_agg
ActivityStarted, enrich
Target paths: ['/year=2017/month=6/']
Looking for parquet files...
Reading them into Pandas dataframe...
Reading ISDWeather/year=2017/month=6/part-00010-tid-1321158002197267978-8e3eb092-4b7a-42de-97ee-e23297ed8955-128.c000.snappy.parquet under container isdweatherdatacontainer
Done.
ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=47608.01 [ms]
ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=47667.03 [ms]
ActivityStarted, get_enricher
ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.83 [ms]
ActivityStarted, enrich_customer_data_no_agg
ActivityStarted, enrich
Ta

### The final result has been saved to ``"./nyc_energy_enriched.csv"``

In [9]:
all.head(5)

Unnamed: 0,timeStamp,demand,lat,long,new_datetime,temperature,precipTime,precipDepth,snowDepth,precipDepth/precipTime
0,2012-01-01 00:00:00,4937.5,40.701,-74.009,2012-01-01 00:00:00,7.665934,24.0,58.0,0.0,0.046384
1,2012-01-01 01:00:00,4752.1,40.701,-74.009,2012-01-01 01:00:00,7.665934,24.0,58.0,0.0,0.046384
2,2012-01-01 02:00:00,4542.6,40.701,-74.009,2012-01-01 02:00:00,7.665934,24.0,58.0,0.0,0.046384
3,2012-01-01 03:00:00,4357.7,40.701,-74.009,2012-01-01 03:00:00,7.665934,24.0,58.0,0.0,0.046384
4,2012-01-01 04:00:00,4275.5,40.701,-74.009,2012-01-01 04:00:00,7.665934,24.0,58.0,0.0,0.046384


<font color='blue'>The join success rate is 100%</font>

In [10]:
all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49205 entries, 0 to 270
Data columns (total 10 columns):
timeStamp                 49205 non-null object
demand                    49124 non-null float64
lat                       49205 non-null float64
long                      49205 non-null float64
new_datetime              49205 non-null datetime64[ns]
temperature               49205 non-null float64
precipTime                49205 non-null float64
precipDepth               49205 non-null float64
snowDepth                 49205 non-null float64
precipDepth/precipTime    49205 non-null float64
dtypes: datetime64[ns](1), float64(8), object(1)
memory usage: 4.1+ MB


In [11]:
all.describe()

Unnamed: 0,demand,lat,long,temperature,precipTime,precipDepth,snowDepth,precipDepth/precipTime
count,49124.0,49205.0,49205.0,49205.0,49205.0,49205.0,49205.0,49205.0
mean,6067.447361,40.701,-74.009,13.372627,24.0,391.145534,1.072569,19.859086
std,1285.607657,7.1055e-15,0.0,9.64006,0.0,1042.9095,4.32994,61.989804
min,2859.6,40.701,-74.009,-13.226429,24.0,0.0,0.0,0.0
25%,5133.86225,40.701,-74.009,5.637931,24.0,0.0,0.0,0.0
50%,6020.071,40.701,-74.009,13.955882,24.0,10.0,0.0,0.065359
75%,6684.3,40.701,-74.009,22.236709,24.0,135.0,0.0,2.71158
max,11456.0,40.701,-74.009,32.852857,24.0,7630.0,51.228571,578.402778


In [12]:
report_joined[datetime(2016, 10, 1, 0, 0)]

Unnamed: 0,demand,lat,long,temperature,precipTime,precipDepth,snowDepth,precipDepth/precipTime
count,743.0,744.0,744.0,744.0,744.0,744.0,744.0,744.0
mean,5427.314386,40.701,-74.009,15.386461,24.0,81.774194,0.0,1.889804
std,880.418908,7.110207e-15,0.0,4.15907,0.0,138.53159,0.0,3.683908
min,3946.383,40.701,-74.009,7.38125,24.0,0.0,0.0,0.0
25%,4660.2705,40.701,-74.009,12.365179,24.0,0.0,0.0,0.0
50%,5433.483,40.701,-74.009,15.759633,24.0,0.0,0.0,0.0
75%,6101.083,40.701,-74.009,18.396,24.0,140.0,0.0,1.198718
max,7853.3,40.701,-74.009,24.327826,24.0,521.0,0.0,13.738443


In [13]:
report_joined[datetime(2016, 11, 1, 0, 0)]

Unnamed: 0,demand,lat,long,temperature,precipTime,precipDepth,snowDepth,precipDepth/precipTime
count,720.0,720.0,720.0,720.0,720.0,720.0,720.0,720.0
mean,5371.189944,40.701,-74.009,10.440883,24.0,1787.366667,0.0,130.880351
std,783.680586,7.110367e-15,0.0,3.318841,0.0,1170.974488,0.0,122.276823
min,3999.292,40.701,-74.009,4.346117,24.0,0.0,0.0,0.0
25%,4622.8295,40.701,-74.009,7.377561,24.0,0.0,0.0,0.0
50%,5419.7165,40.701,-74.009,11.029123,24.0,2540.0,0.0,78.905303
75%,6115.79575,40.701,-74.009,12.923214,24.0,2553.0,0.0,232.143362
max,6797.308,40.701,-74.009,18.216071,24.0,2646.0,0.0,404.605201


In [14]:
# EOF