Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

# Tutorial: Load demo data and enrich it with NOAA ISD Weather data.

In this tutorial, you load the demo data (a parquet file in Azure Blob storage), check the data schema, and enrich it with NOAA ISD Weather data.

Prerequisites:
> You must install the PyPI package on the cluster:
> * azureml-contrib-opendatasets --index-url https://azuremlsdktestpypi.azureedge.net/sdk-release/Candidate/604C89A437BA41BD942B4F46D9A3591D

Learn how to:
> * Load the demo data from Azure Blob
> * Check the demo data schema
> * Initialize NoaaIsdWeather class to load weather data
> * Enrich the demo data with weather data
> * Display the joined result and stats

## Load demo parquet file from Azure Blob

In [3]:
from azure.storage.blob import BlockBlobService
from pyspark.sql import SparkSession

# Obtain the Spark session used for every read in this notebook.
spark = SparkSession.builder.getOrCreate()

# Coordinates of the demo parquet file in Azure Blob storage.
container_name = 'tutorials'
account_name = 'azureopendatastorage'
relative_path = 'noaa_isd_weather/demo.parquet'

# Assemble the wasbs:// URL first, then hand it to the parquet reader.
blob_location = (container_name, account_name, relative_path)
demo_url = 'wasbs://%s@%s.blob.core.windows.net/%s' % blob_location
df = spark.read.parquet(demo_url)

# Trailing expression: the notebook echoes the row count of the loaded data.
df.count()

# Display the demo data

In [4]:
# Render the demo dataframe as a table (sample rows are shown below).
# NOTE(review): display() is not defined in this file — presumably the
# notebook-provided rendering helper; confirm before running outside a notebook.
display(df)

datetime,lat,long,stations.city,count,stations.dock_count
2015-05-01T00:00:00.000+0000,37.787152,-122.388013,San Francisco,28,15
2015-05-02T00:00:00.000+0000,37.787152,-122.388013,San Francisco,5,15
2015-05-03T00:00:00.000+0000,37.787152,-122.388013,San Francisco,11,15
2015-05-04T00:00:00.000+0000,37.787152,-122.388013,San Francisco,24,15
2015-05-05T00:00:00.000+0000,37.787152,-122.388013,San Francisco,24,15
2015-05-06T00:00:00.000+0000,37.787152,-122.388013,San Francisco,28,15
2015-05-07T00:00:00.000+0000,37.787152,-122.388013,San Francisco,20,15
2015-05-08T00:00:00.000+0000,37.787152,-122.388013,San Francisco,21,15
2015-05-09T00:00:00.000+0000,37.787152,-122.388013,San Francisco,9,15
2015-05-10T00:00:00.000+0000,37.787152,-122.388013,San Francisco,10,15


# Initialize NoaaIsdWeather class, get the enricher from it and enrich demo data

In [5]:
# This is a contrib package in preview. The package name is subject to change.

from azureml.contrib.opendatasets.accessories.location_data import LatLongColumn
from azureml.contrib.opendatasets.accessories.location_time_customer_data import LocationTimeCustomerData
from azureml.contrib.opendatasets import NoaaIsdWeather


# Describe the demo dataframe to the enricher: 'lat'/'long' are the location
# columns and 'datetime' is the time column.
_customer_data = LocationTimeCustomerData(
    df,
    LatLongColumn('lat', 'long'),
    'datetime',
)

# Request only the weather columns this tutorial uses.
weather = NoaaIsdWeather(cols=["temperature", "windSpeed", "seaLvlPressure"])
weather_enricher = weather.get_enricher()

# Join the weather columns onto the demo rows, aggregating with 'avg'.
# NOTE(review): the precise meaning of location_match_granularity=5 is defined
# by the opendatasets package — confirm against its documentation.
enrich_options = dict(
    customer_data_object=_customer_data,
    location_match_granularity=5,
    time_round_granularity='day',
    agg='avg',
)
joined_data = weather_enricher.enrich_customer_data_with_agg(**enrich_options)

# Display the joined result

In [6]:
# Render the enriched rows; the dataframe itself is exposed through the
# `.data` attribute of the object returned by the enricher.
# NOTE(review): display() is not defined in this file — presumably the
# notebook-provided rendering helper; confirm before running outside a notebook.
display(joined_data.data)

lat,long,datetime,stations.city,count,stations.dock_count,row_id,avg(temperature),avg(windSpeed),avg(seaLvlPressure)
37.330165,-121.885831,2015-05-27T00:00:00.000+0000,San Jose,2,15,555,17.041747572815535,4.40576923076923,1016.0708333333332
37.389218,-122.081896,2015-05-24T00:00:00.000+0000,Mountain View,1,15,607,14.564999999999998,3.620833333333334,1016.9791666666666
37.444521,-122.163093,2015-05-27T00:00:00.000+0000,Palo Alto,1,11,1848,14.850344827586207,3.705442176870746,1016.3666666666668
37.781039,-122.411748,2015-05-21T00:00:00.000+0000,San Francisco,38,23,1569,13.967088607594937,3.2710638297872303,1014.4471153846152
37.791464000000005,-122.391034,2015-05-23T00:00:00.000+0000,San Francisco,9,19,496,13.847904191616768,4.9656976744186005,1018.7134615384616
37.794139,-122.394434,2015-05-14T00:00:00.000+0000,San Francisco,50,23,430,14.23948717948718,3.145641025641024,1011.5009615384612
37.795392,-122.394203,2015-05-24T00:00:00.000+0000,San Francisco,30,23,1264,13.388111888111888,4.8027972027972,1017.2605769230772
37.795392,-122.394203,2015-05-31T00:00:00.000+0000,San Francisco,13,23,1271,13.20612244897959,4.1484693877551,1013.8009615384616
37.337391,-121.886995,2015-05-24T00:00:00.000+0000,San Jose,5,15,1800,16.338297872340423,3.821276595744681,1016.9791666666666
37.348742,-121.894715,2015-05-26T00:00:00.000+0000,San Jose,5,15,581,16.492473118279573,4.138709677419354,1015.5375


# Convert the joined spark dataframe to pandas dataframe

In [7]:
# Convert the joined Spark dataframe into a pandas dataframe for local inspection.
# NOTE(review): toPandas() presumably collects the whole result onto the driver —
# fine for this small demo, confirm before reusing on larger data.
joined_data_pandas = joined_data.data.toPandas()

# Check the stats of joined result

In [8]:
# Print pandas' built-in summary of the joined dataframe (per the pandas
# documentation: columns, non-null counts, dtypes and memory usage).
joined_data_pandas.info()