In [1]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
from azureml.core import Workspace, Dataset, Datastore, Experiment
from azureml.core.compute import ComputeTarget
from azureml.train.automl import AutoMLConfig
from azureml.train.automl.run import AutoMLRun
from azureml.widgets import RunDetails
from azureml.opendatasets import OjSalesSimulated
from azureml.automl.core.forecasting_parameters import ForecastingParameters

# Initializing The Compute Cluster

In [2]:
ws = Workspace.from_config(path = 'config.json')
compute_name = 'automl-cluster'
compute_target = ComputeTarget(ws, compute_name)

# Loading The Orange Juice Sales (Simulated) Dataset

In [4]:
datastore = Datastore.get_default(ws)

# Pulling out the first 10 out of 4000 files from the OJ Sales Simulated Dataset
oj_sales_files = OjSalesSimulated.get_file_dataset().take(10)

os.makedirs('Orange Juice Sales', exist_ok = True)
oj_sales_files.download('Orange Juice Sales', overwrite = True)

['/mnt/batch/tasks/shared/LS_root/mounts/clusters/automl-example-compute/code/Users/edwin.spartan117/AutoML/Orange Juice Sales/https%3A/%2Fazureopendatastorage.azurefd.net/ojsales-simulatedcontainer/oj_sales_data/Store1000_dominicks.csv',
 '/mnt/batch/tasks/shared/LS_root/mounts/clusters/automl-example-compute/code/Users/edwin.spartan117/AutoML/Orange Juice Sales/https%3A/%2Fazureopendatastorage.azurefd.net/ojsales-simulatedcontainer/oj_sales_data/Store1000_tropicana.csv',
 '/mnt/batch/tasks/shared/LS_root/mounts/clusters/automl-example-compute/code/Users/edwin.spartan117/AutoML/Orange Juice Sales/https%3A/%2Fazureopendatastorage.azurefd.net/ojsales-simulatedcontainer/oj_sales_data/Store1001_minute.maid.csv',
 '/mnt/batch/tasks/shared/LS_root/mounts/clusters/automl-example-compute/code/Users/edwin.spartan117/AutoML/Orange Juice Sales/https%3A/%2Fazureopendatastorage.azurefd.net/ojsales-simulatedcontainer/oj_sales_data/Store1002_dominicks.csv',
 '/mnt/batch/tasks/shared/LS_root/mounts/c

In [5]:
oj_filepath = Path('Orange Juice Sales').rglob('*.csv')
oj_files = [x for x in oj_filepath]
oj_sales_df = pd.concat((pd.read_csv(file) for file  in oj_files))

## Exploring The Dataset And Ensuring That It Has Been Loaded In Correctly 

In [6]:
oj_sales_df.head(20)

Unnamed: 0,WeekStarting,Store,Brand,Quantity,Advert,Price,Revenue
0,1990-06-14,1000,dominicks,12003,1,2.59,31087.77
1,1990-06-21,1000,dominicks,10239,1,2.39,24471.21
2,1990-06-28,1000,dominicks,17917,1,2.48,44434.16
3,1990-07-05,1000,dominicks,14218,1,2.33,33127.94
4,1990-07-12,1000,dominicks,15925,1,2.01,32009.25
5,1990-07-19,1000,dominicks,17850,1,2.17,38734.5
6,1990-07-26,1000,dominicks,10576,1,1.97,20834.72
7,1990-08-02,1000,dominicks,9912,1,2.26,22401.12
8,1990-08-09,1000,dominicks,9571,1,2.11,20194.81
9,1990-08-16,1000,dominicks,15748,1,2.42,38110.16


# Registering The Dataset

In [7]:
Dataset.Tabular.register_pandas_dataframe(oj_sales_df, datastore, 'Orange Juice Sales (Simulated) Sample')

Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/730175e7-ae16-4baa-9921-d8f757786d8f/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


{
  "source": [
    "('workspaceblobstore', 'managed-dataset/730175e7-ae16-4baa-9921-d8f757786d8f/')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ReadParquetFile",
    "DropColumns"
  ],
  "registration": {
    "id": "88e9fe44-558f-4d06-8d32-150e831757e0",
    "name": "Orange Juice Sales (Simulated) Sample",
    "version": 1,
    "workspace": "Workspace.create(name='auotml-example-workspace', subscription_id='0c19fc19-85fd-4aa4-b133-61dd20fa93df', resource_group='edwin.spartan117-rg')"
  }
}

# Model Training

## Initializing The Experiment For Model Training

In [9]:
experiment = Experiment(workspace = ws, name = 'OJ-Sales-Forecasting')

In [10]:
oj_sales_dataset = Dataset.get_by_name(ws, 'Orange Juice Sales (Simulated) Sample', version = 'latest')

## Using Standard Time Series Algorithms

In [11]:
standard_params = ForecastingParameters.from_parameters_dict(
    {
        'country_or_region_for_holidays': 'US', 'drop_column_names': 'Revenue', 'forecast_horizon': 6, 
        'target_rolling_window_size': 'auto', 'target_lags': 'auto', 'feature_lags': 'auto', 'seasonlity': 'auto',
        'short_series_handling': True, 'use_stl': 'season_trend', 'time_column_name': 'WeekStarting', 
        'time_series_id_column_names': ['Store', 'Brand'], 'short_series_handling_configuration': 'auto'
    }, validate_params = True)

In [12]:
run_configuration = AutoMLConfig(task = 'forecasting', primary_metric = 'normalized_root_mean_squared_error', featurization = 'auto',
                                 compute_target = compute_target, training_data = oj_sales_dataset, label_column_name = 'Quantity',
                                 experiment_timeout_minutes = 15, enable_early_stopping = True, n_cross_validations = 3, 
                                 model_explainability = True, enable_stack_ensemble = False, enable_voting_ensemble = True,
                                 forecasting_parameters = standard_params)

AutoML_run_standard = experiment.submit(run_configuration, show_output = True)
RunDetails(AutoML_run_standard).show()

Submitting remote run.
No run_configuration provided, running on automl-cluster with default configuration
Running on remote compute: automl-cluster


Experiment,Id,Type,Status,Details Page,Docs Page
OJ-Sales-Forecasting,AutoML_8204a1cc-9200-47ae-bbe3-ff16db38500f,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: ModelSelection. Beginning model selection.
Heuristic parameters: Target_Lag = '[0]', Target_Rolling_Window = '0'.


********************************************************************************************
DATA GUARDRAILS: 

TYPE:         Time Series ID detection
STATUS:       PASSED
DESCRIPTION:  The data set was analyzed, and no duplicate time index were detected.
              Learn more about time-series forecasting configurations: https://aka.ms/AutomatedMLForecastingConfiguration

********************************************************************************************

TYPE:         Frequency detection
STATUS:       PASSED
DESCRIPTION:  The time series was analyzed, all data points are aligned with detected frequency.
              Learn more about data preparation for time-series forecasting: https://aka.ms/AutomatedMLDataPreparation

********************************************************************************************

TYPE:         Missing featur

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

## Using Prophet & ARIMA

In [13]:
ARIMA_params = ForecastingParameters.from_parameters_dict(
    {
        'country_or_region_for_holidays': None, 'drop_column_names': ['Revenue', 'Price', 'Advert'], 
        'forecast_horizon': 6, 'target_rolling_window_size': None, 'target_lags': None, 'feature_lags': None, 
        'seasonlity': 'auto', 'short_series_handling': True, 'use_stl': 'season_trend', 
        'time_column_name': 'WeekStarting', 'time_series_id_column_names': ['Store', 'Brand'], 
        'short_series_handling_configuration': 'auto'
    }, validate_params = True)

In [14]:
ARIMA_run_configuration = AutoMLConfig(task = 'forecasting', primary_metric = 'normalized_root_mean_squared_error', 
                                       featurization = 'auto', compute_target = compute_target, training_data = oj_sales_dataset, 
                                       label_column_name = 'Quantity', experiment_timeout_minutes = 30, enable_early_stopping = True, 
                                       n_cross_validations = 3, model_explainability = True, enable_stack_ensemble = False, 
                                       enable_voting_ensemble = True, forecasting_parameters = ARIMA_params)

AutoML_run_ARIMA = experiment.submit(ARIMA_run_configuration, show_output = True)
RunDetails(AutoML_run_ARIMA).show()

Submitting remote run.
No run_configuration provided, running on automl-cluster with default configuration
Running on remote compute: automl-cluster


Experiment,Id,Type,Status,Details Page,Docs Page
OJ-Sales-Forecasting,AutoML_28d9ced5-087e-4990-9384-114e83c36732,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: DatasetFeaturization. Beginning to featurize the CV split.
Current status: ModelSelection. Beginning model selection.

********************************************************************************************
DATA GUARDRAILS: 

TYPE:         Time Series ID detection
STATUS:       PASSED
DESCRIPTION:  The data set was analyzed, and no duplicate time index were detected.
              Learn more about time-series forecasting configurations: https://aka.ms/AutomatedMLForecastingConfiguration

********************************************************************************************

TYPE:         Frequency detection
STATUS:       PASSED
DESCRIPTION:  The time series was analyzed, all data points are aligned with detected frequency.
              Learn more about data preparation for time-series forecasting: https://aka.ms/AutomatedMLDataPreparation

********************************************************************************************

TYPE:         Non-station

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

# Model Registration For Future Use

In [15]:
AutoML_run_ARIMA.register_model(model_name = 'OJ-Sales-Forecasting-AutoML', 
                                description = 'Best AutoML Forecasting Model Run using the Orange Juice Sales (Simulated) Sample dataset',
                                tags = {'Project': 'Orange Juice Sales', 'Creator': ' Edwin Goh'})

Model(workspace=Workspace.create(name='auotml-example-workspace', subscription_id='0c19fc19-85fd-4aa4-b133-61dd20fa93df', resource_group='edwin.spartan117-rg'), name=OJ-Sales-Forecasting-AutoML, id=OJ-Sales-Forecasting-AutoML:1, version=1, tags={'Project': 'Orange Juice Sales', 'Creator': ' Edwin Goh'}, properties={})

## Using $R^2$ As The Evaluation Metric

In [16]:
AutoML_run_ARIMA.register_model(model_name = 'OJ-Sales-Forecasting-AutoML-R2', 
                                description = 'Best AutoML Forecasting Model Run using the Orange Juice Sales (Simulated) Sample dataset',
                                tags = {'Project': 'Orange Juice Sales', 'Creator': ' Edwin Goh', 'Metric': 'R-Squared'},
                                metric = 'r2_score')

Model(workspace=Workspace.create(name='auotml-example-workspace', subscription_id='0c19fc19-85fd-4aa4-b133-61dd20fa93df', resource_group='edwin.spartan117-rg'), name=OJ-Sales-Forecasting-AutoML-R2, id=OJ-Sales-Forecasting-AutoML-R2:1, version=1, tags={'Project': 'Orange Juice Sales', 'Creator': ' Edwin Goh', 'Metric': 'R-Squared'}, properties={})