Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

# Tutorial: Load NOAA ISD Weather Data

## Install the azureml-contrib-opendatasets SDK

In [5]:
!pip uninstall -y azureml-contrib-opendatasets
!pip install azureml-contrib-opendatasets

Uninstalling azureml-contrib-opendatasets-1.0.30:
  Successfully uninstalled azureml-contrib-opendatasets-1.0.30
Collecting azureml-contrib-opendatasets
  Using cached https://files.pythonhosted.org/packages/64/51/4d3de57cf210941346d907584e0e6e56780067bc3555250b1fe62c2285f7/azureml_contrib_opendatasets-1.0.30-py3-none-any.whl
Installing collected packages: azureml-contrib-opendatasets
Successfully installed azureml-contrib-opendatasets-1.0.30


Import the NoaaIsdWeather class from azureml-contrib-opendatasets

In [6]:
# This is a contrib package in preview. The package name is subject to change.

from azureml.contrib.opendatasets import NoaaIsdWeather

from datetime import datetime
from dateutil import parser
from dateutil.relativedelta import relativedelta

> 1. Set start_date and end_date.
>   * For weather data, due to its size, by default only the last month is read when the date range spans multiple months. If you want to load more, see the section at the bottom of this notebook.
> 2. Create an instance of NoaaIsdWeather, and disable telemetry if you don't want to send logs to Azure.
> 3. Call the to_pandas_dataframe() method to get a pandas DataFrame for the last month of the date range. Only the last month of data is retrieved, for performance and memory reasons.

In [7]:
# Query window: the whole of January 2018.
start_date = parser.parse('2018-1-1')
end_date = parser.parse('2018-1-31')

# Telemetry is switched off so no logs are sent to Azure.
isd = NoaaIsdWeather(
    start_date=start_date,
    end_date=end_date,
    enable_telemetry=False,
)

# Print the frame summary without binding the frame to a name, so the loaded
# data is not kept referenced by the notebook namespace.
isd.to_pandas_dataframe().info()
print('isd done')

Target paths: ['/year=2018/month=1/']
Looking for parquet files...
Reading them into Pandas dataframe...
Reading ISDWeather/year=2018/month=1/part-00043-tid-2144550298476336893-08513eca-5858-4b26-81f4-947fb8ad614f-81081.c000.snappy.parquet under container isdweatherdatacontainer
Done.
<class 'pandas.core.frame.DataFrame'>
Int64Index: 11190325 entries, 0 to 11543557
Data columns (total 22 columns):
usaf                       object
wban                       object
datetime                   datetime64[ns]
latitude                   float64
longitude                  float64
elevation                  float64
windAngle                  float64
windSpeed                  float64
temperature                float64
seaLvlPressure             float64
cloudCoverage              object
presentWeatherIndicator    float64
pastWeatherIndicator       float64
precipTime                 float64
precipDepth                float64
snowDepth                  float64
stationName                object
c

## I want to load more months on my powerful machine
Define a PandasDataLoadLimitToMonths class to load the last N months of a given date range.

Note that this is useful if you have a powerful machine, but because of the size of the data, expect a longer response time.

In [8]:
from azure.storage.blob import BlockBlobService
from azureml.contrib.opendatasets._utils.time_utils import day_range, month_range
from azureml.contrib.opendatasets.dataaccess.pandas_data_load_limit import PandasDataLoadLimitNone


class PandasDataLoadLimitToMonths(PandasDataLoadLimitNone):
    """Pandas load limit that keeps only the last ``n_months`` months of a date range.

    One path is rendered from ``path_pattern`` for every item that
    ``month_range(start_date, end_date)`` yields; the resulting list is trimmed
    to its last ``n_months`` entries before the parent class is asked for the
    target blob paths.
    """

    def __init__(
            self,
            start_date,
            end_date,
            n_months,
            path_pattern='/year=%d/month=%d/'):
        """Store the date range and the month limit.

        Args:
            start_date: first date of the requested range.
            end_date: last date of the requested range.
            n_months: how many of the latest months to keep; must be >= 1.
            path_pattern: ``%``-format string that receives ``(year, month)``.

        Raises:
            ValueError: if ``n_months`` is smaller than 1.
        """
        # ``paths[-0:]`` is the whole list and a negative value flips the
        # slice, so a non-positive limit would silently select the wrong
        # months instead of failing.
        if n_months < 1:
            raise ValueError('n_months must be at least 1, got %r' % (n_months,))
        self.start_date = start_date
        self.end_date = end_date
        self.n_months = n_months
        self.path_pattern = path_pattern
        super().__init__()

    def get_target_blob_paths(
            self,
            blob_service: BlockBlobService,
            blob_container_name: str,
            blob_relative_path: str):
        """Select the month paths to load, then delegate to the parent class.

        Args:
            blob_service: blob client, forwarded unchanged to the parent.
            blob_container_name: container name, forwarded unchanged.
            blob_relative_path: relative path, forwarded unchanged.

        Returns:
            Whatever the parent ``get_target_blob_paths`` returns.
        """
        # NOTE(review): ``_match_paths`` is presumably read by the parent
        # implementation (not shown here) -- keep assigning it on ``self``.
        self._match_paths = [
            self.path_pattern % (current_month.year, current_month.month)
            for current_month in month_range(self.start_date, self.end_date)]

        if len(self._match_paths) > 1:
            self._match_paths = self._match_paths[-self.n_months:]
            # Report the real number of months kept (the range may hold fewer
            # than n_months) together with the month the selection ends at.
            print('We are taking the latest %d month(s), ending at: %s' % (
                len(self._match_paths), self._match_paths[-1]))

        print('Target paths: %s' % (self._match_paths))
        return super().get_target_blob_paths(
            blob_service=blob_service,
            blob_container_name=blob_container_name,
            blob_relative_path=blob_relative_path)

Define a NoaaIsdWeatherForMonths class that inherits from NoaaIsdWeather.
By overriding the get_pandas_limit() method, we can balance data load performance against the amount of data loaded.

In [9]:
from azureml.contrib.opendatasets import NoaaIsdWeather
from datetime import datetime
from dateutil import parser
from typing import List, Optional

class NoaaIsdWeatherForMonths(NoaaIsdWeather):
    """NoaaIsdWeather variant whose pandas load covers the last ``n_months`` months.

    ``get_pandas_limit()`` is overridden to return a
    ``PandasDataLoadLimitToMonths`` built from this instance's date range and
    ``n_months``.
    """

    _default_start_date = parser.parse('2008-01-01')
    # Snapshot taken once, when the class body runs. No longer used as the
    # ``end_date`` default (see __init__); retained in case the base class or
    # other code reads the attribute.
    _default_end_date = datetime.today()

    def __init__(
            self,
            start_date: datetime = _default_start_date,
            end_date: Optional[datetime] = None,
            n_months: int = 6,
            cols: Optional[List[str]] = None,
            enable_telemetry: bool = False):
        """Create the dataset accessor.

        Args:
            start_date: first date of the requested range.
            end_date: last date of the requested range; ``None`` means
                "today", resolved when the instance is created.
            n_months: number of latest months handed to the load limit.
            cols: optional list of column names, forwarded to the base class.
            enable_telemetry: forwarded to the base class.
        """
        # A ``datetime.today()`` default in the signature is evaluated only
        # once, at class-definition time, and goes stale in a long-lived
        # kernel -- resolve it per call instead.
        if end_date is None:
            end_date = datetime.today()
        # Set before the base constructor runs, as in the original ordering.
        self.n_months = n_months
        super().__init__(
            start_date=start_date,
            end_date=end_date,
            cols=cols,
            enable_telemetry=enable_telemetry)

    def get_pandas_limit(self):
        """Return the load limit restricting pandas loads to the last ``n_months`` months."""
        # NOTE(review): ``self.start_date`` / ``self.end_date`` are not assigned
        # in this class -- presumably set by the base constructor; confirm.
        return PandasDataLoadLimitToMonths(self.start_date, self.end_date, self.n_months)

In [10]:
# Request 1 February 2016 through 31 May 2016 with a four-month limit, and
# only the four listed columns.
weather = NoaaIsdWeatherForMonths(
    start_date=datetime(2016, 2, 1),
    end_date=datetime(2016, 5, 31, 23, 59),
    n_months=4,
    cols=["temperature", "precipTime", "precipDepth", "snowDepth"],
)

# Print the frame summary; the frame itself is not bound to a name.
weather.to_pandas_dataframe().info()
print('4 months loaded.')

We are taking the latest n months: /year=2016/month=5/
Target paths: ['/year=2016/month=2/', '/year=2016/month=3/', '/year=2016/month=4/', '/year=2016/month=5/']
Looking for parquet files...
Reading them into Pandas dataframe...
Reading ISDWeather/year=2016/month=2/part-00011-tid-6700213360605767691-4491b75c-f137-489b-b5df-4204b9326fda-116.c000.snappy.parquet under container isdweatherdatacontainer
Reading ISDWeather/year=2016/month=3/part-00004-tid-6700213360605767691-4491b75c-f137-489b-b5df-4204b9326fda-109.c000.snappy.parquet under container isdweatherdatacontainer
Reading ISDWeather/year=2016/month=4/part-00008-tid-6700213360605767691-4491b75c-f137-489b-b5df-4204b9326fda-113.c000.snappy.parquet under container isdweatherdatacontainer
Reading ISDWeather/year=2016/month=5/part-00006-tid-6700213360605767691-4491b75c-f137-489b-b5df-4204b9326fda-111.c000.snappy.parquet under container isdweatherdatacontainer
Done.
<class 'pandas.core.frame.DataFrame'>
Int64Index: 41043904 entries, 0 to 