In [1]:
import json
import logging

import azureml.core
import pandas as pd
from azureml.automl.core.featurization import FeaturizationConfig
from azureml.core import Experiment, Workspace, Dataset
from azureml.train.automl import AutoMLConfig

In [2]:
# Report the installed Azure ML SDK version so the recorded results can be tied to it.
print(f"You are currently using version {azureml.core.VERSION} of the Azure ML SDK")

You are currently using version 1.39.0 of the Azure ML SDK


In [3]:
# Load the workspace handle via Workspace.from_config().
ws = Workspace.from_config()

# Name of the run history container (experiment) inside the workspace.
experiment_name = "automl-ojforecasting"
experiment = Experiment(ws, experiment_name)

# Workspace details, listed in the order they are displayed.
output = {
    "Subscription ID": ws.subscription_id,
    "Workspace": ws.name,
    "SKU": ws.sku,
    "Resource Group": ws.resource_group,
    "Location": ws.location,
    "Run History Name": experiment_name,
}

# Lift the column-width limit so long values are not truncated in the table.
pd.set_option("display.max_colwidth", None)
outputDf = pd.DataFrame(data=output, index=[""])
outputDf.T

Unnamed: 0,Unnamed: 1
Subscription ID,ba7979f7-d040-49c9-af1a-7414402bf622
Workspace,yuzhua-rg-easts-2
SKU,Basic
Resource Group,yuzhua-rg-eastus
Location,eastus
Run History Name,automl-ojforecasting


In [4]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the CPU cluster the experiment runs on.
amlcompute_cluster_name = "oj-cluster"

# Reuse the cluster when the lookup succeeds; otherwise create it from a fresh
# provisioning configuration.
try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D12_V2",
        max_nodes=12,
    )
    compute_target = ComputeTarget.create(
        workspace=ws,
        name=amlcompute_cluster_name,
        provisioning_configuration=compute_config,
    )
else:
    print("Found existing cluster, use it.")

# Wait for the cluster operation to finish, echoing its progress.
compute_target.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [5]:
time_column_name = "WeekStarting"

# 'logQuantity' is removed right after loading because it is a leaky feature.
data = pd.read_csv("dominicks_OJ.csv", parse_dates=[time_column_name]).drop(
    columns="logQuantity"
)

data.head()

Unnamed: 0,WeekStarting,Store,Brand,Quantity,Advert,Price,Age60,COLLEGE,INCOME,Hincome150,Large HH,Minorities,WorkingWoman,SSTRDIST,SSTRVOL,CPDIST5,CPWVOL5
0,1990-06-14,2,dominicks,10560,1,1.59,0.232865,0.248935,10.553205,0.463887,0.103953,0.11428,0.303585,2.110122,1.142857,1.92728,0.376927
1,1990-06-14,2,minute.maid,4480,0,3.17,0.232865,0.248935,10.553205,0.463887,0.103953,0.11428,0.303585,2.110122,1.142857,1.92728,0.376927
2,1990-06-14,2,tropicana,8256,0,3.87,0.232865,0.248935,10.553205,0.463887,0.103953,0.11428,0.303585,2.110122,1.142857,1.92728,0.376927
3,1990-06-14,5,dominicks,1792,1,1.59,0.117368,0.321226,10.922371,0.535883,0.103092,0.053875,0.410568,3.801998,0.681818,1.600573,0.736307
4,1990-06-14,5,minute.maid,4224,0,2.99,0.117368,0.321226,10.922371,0.535883,0.103092,0.053875,0.410568,3.801998,0.681818,1.600573,0.736307


In [6]:
# One time series is identified by the (Store, Brand) pair.
time_series_id_column_names = ["Store", "Brand"]
nseries = data.groupby(time_series_id_column_names).ngroups
print(f"Data contains {nseries} individual time-series.")

Data contains 249 individual time-series.


In [7]:
# Number of trailing rows of every series held out as the test split; the same
# value is later passed to ForecastingParameters as the forecast horizon.
n_test_periods = 20


def split_last_n_by_series_id(df, n, time_col=None, id_cols=None):
    """Split every time series in ``df`` into its leading rows and its last ``n`` rows.

    Rows are ordered series by series (ascending identifier, then ascending
    time), and the ``n`` most recent rows of each series go to the tail split.
    A series with ``n`` rows or fewer ends up entirely in the tail.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one or more series.
    n : int
        Number of most recent rows per series to place in the tail. ``0`` puts
        every row in the head and returns an empty tail.
    time_col : str, optional
        Column used for ordering. Defaults to the notebook-level
        ``time_column_name``.
    id_cols : str or list of str, optional
        Column(s) identifying a series. Defaults to the notebook-level
        ``time_series_id_column_names``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(head, tail)``; both keep the original index labels and all columns.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative, got {0}".format(n))

    # Fall back to the notebook globals only when no explicit column is given.
    if time_col is None:
        time_col = time_column_name
    if id_cols is None:
        id_cols = time_series_id_column_names
    id_cols = [id_cols] if isinstance(id_cols, str) else list(id_cols)

    # Rows with a missing identifier belong to no series and are left out, the
    # same way a default groupby would skip them.
    ordered = df.dropna(subset=id_cols).sort_values(id_cols + [time_col])

    # 0 marks the newest row of a series, 1 the one before it, and so on.
    # Comparing against n (instead of slicing with -n) keeps n == 0 correct.
    rows_from_end = ordered.groupby(id_cols).cumcount(ascending=False)
    in_tail = rows_from_end < n

    return ordered[~in_tail], ordered[in_tail]


# Hold out the last n_test_periods rows of every series as the test set.
train, test = split_last_n_by_series_id(df=data, n=n_test_periods)

In [8]:
from azureml.data.dataset_factory import TabularDatasetFactory

datastore = ws.get_default_datastore()

# Both splits are uploaded under the same "dataset/" folder of the default datastore.
dataset_upload_target = (datastore, "dataset/")

train_dataset = TabularDatasetFactory.register_pandas_dataframe(
    dataframe=train, target=dataset_upload_target, name="dominicks_OJ_train"
)
# The held-out test split is registered under the name "dominicks_OJ_valid".
test_dataset = TabularDatasetFactory.register_pandas_dataframe(
    dataframe=test, target=dataset_upload_target, name="dominicks_OJ_valid"
)

Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to dataset//48ced0c5-75ba-4d42-bc36-88ce97a0a405/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.
Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to dataset//be03c8a5-902c-410d-958e-f49ec23cb6bd/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


In [9]:
# Sanity check: read the registered training dataset back into pandas and show its last five rows.
train_dataset.to_pandas_dataframe().tail(n=5)

Unnamed: 0,WeekStarting,Store,Brand,Quantity,Advert,Price,Age60,COLLEGE,INCOME,Hincome150,Large HH,Minorities,WorkingWoman,SSTRDIST,SSTRVOL,CPDIST5,CPWVOL5
23962,1992-04-16,137,tropicana,23680,0,3.19,0.209602,0.528362,10.96649,0.860739,0.092996,0.11325,0.330293,6.026484,0.705882,0.77253,0.333761
23963,1992-04-23,137,tropicana,25728,0,2.74,0.209602,0.528362,10.96649,0.860739,0.092996,0.11325,0.330293,6.026484,0.705882,0.77253,0.333761
23964,1992-04-30,137,tropicana,80384,1,2.39,0.209602,0.528362,10.96649,0.860739,0.092996,0.11325,0.330293,6.026484,0.705882,0.77253,0.333761
23965,1992-05-07,137,tropicana,30464,0,3.19,0.209602,0.528362,10.96649,0.860739,0.092996,0.11325,0.330293,6.026484,0.705882,0.77253,0.333761
23966,1992-05-14,137,tropicana,27904,0,3.19,0.209602,0.528362,10.96649,0.860739,0.092996,0.11325,0.330293,6.026484,0.705882,0.77253,0.333761


In [10]:
# Column to forecast; passed to AutoMLConfig as label_column_name.
target_column_name = "Quantity"

In [11]:
def prepare_dataset_for_parallel(
        input_dataset, workspace, datastore, dataset_name, dataset_type, grain_column_names):
    """Register ``input_dataset`` and, when grain columns are given, a partitioned copy of it.

    The dataset is first registered as ``<dataset_name>_<dataset_type>`` (new
    version). If ``grain_column_names`` is ``None`` that registered dataset is
    returned as is. Otherwise it is partitioned by the grain columns into the
    datastore folder of the same name, a tabular dataset is rebuilt from the
    resulting parquet files, registered as a further version under the same
    name, and the dataset fetched by that name is returned.

    Parameters
    ----------
    input_dataset
        Dataset object exposing ``register``.
    workspace
        Workspace the datasets are registered in.
    datastore
        Datastore that receives the partitioned files.
    dataset_name : str
        Base name; also used as the description of the first registration.
    dataset_type : str
        Suffix appended to ``dataset_name`` (for example ``"train"``).
    grain_column_names : str, list of str or None
        Column(s) to partition by; a single string is treated as one column.

    Returns
    -------
    The registered dataset (``None`` grain columns) or the dataset looked up by
    its registered name after partitioning.
    """
    register_name = f"{dataset_name}_{dataset_type}"
    registered_dataset = input_dataset.register(
        workspace=workspace, name=register_name, description=dataset_name, create_new_version=True)

    # Nothing to partition by: the plain registered dataset is the result.
    if grain_column_names is None:
        return registered_dataset

    # Accept a single column name as shorthand for a one-element list.
    keys = [grain_column_names] if isinstance(grain_column_names, str) else grain_column_names

    partitioned = registered_dataset.partition_by(
        partition_keys=keys,
        target=(datastore, register_name),
        name=f"{register_name}_partition")

    # NOTE(review): this reads private `_dataflow` internals to recover the path
    # the partitioned files were written to; it may break with other SDK
    # versions -- confirm against the SDK in use.
    upload_path = partitioned._dataflow._steps[0].arguments['datastores'][0]['path']
    segments = upload_path.split('/')
    # Only the first two path segments are kept as the root of the parquet files.
    parquet_root = f"/{segments[0]}/{segments[1]}/"

    # One "{column}" folder level per partition key, then the parquet files.
    placeholders = ["{" + key + "}" for key in keys]
    partition_format = "/".join(placeholders) + "/*.parquet"

    rebuilt = Dataset.Tabular.from_parquet_files(
        path=(datastore, parquet_root), partition_format=partition_format)
    rebuilt.register(workspace, register_name, create_new_version=True)

    return Dataset.get_by_name(workspace, register_name)

In [12]:
# Register the training data and partition it by the series identifier columns.
prepared_dataset = prepare_dataset_for_parallel(
    input_dataset=train_dataset,
    workspace=ws,
    datastore=datastore,
    dataset_name=train_dataset.name,
    dataset_type="train",
    grain_column_names=time_series_id_column_names,
)

Validating arguments.
Arguments validated.
Uploading file to /dominicks_OJ_train_train/35f8f6f0-19e1-4f3a-afa5-8a4f67f2c158/
Successfully uploaded file to datastore.
Creating a new dataset.
Successfully created a new dataset.
registering a new dataset.
Successfully created and registered a new dataset.


In [13]:
from azureml.automl.core.forecasting_parameters import ForecastingParameters

# Time-series structure of the training data: time axis, horizon and series identifiers.
forecasting_parameters = ForecastingParameters(
    time_column_name=time_column_name,
    forecast_horizon=n_test_periods,
    time_series_id_column_names=time_series_id_column_names,
)

# NOTE(review): the recorded output of this cell lists cv_step_size, target_lags,
# feature_lags, target_rolling_window_size and "cv based validation settings"
# without its header line -- confirm whether n_cross_validations is honoured
# for this DNN-only configuration.
automl_settings = {
    "task": "forecasting",
    "primary_metric": "normalized_root_mean_squared_error",
    "experiment_timeout_hours": 0.5,
    "training_data": prepared_dataset,
    "label_column_name": target_column_name,
    "verbosity": logging.INFO,
    "compute_target": compute_target,
    "max_concurrent_iterations": 10,
    "max_cores_per_iteration": -1,
    "enable_dnn": True,
    "enable_early_stopping": False,
    "forecasting_parameters": forecasting_parameters,
    "n_cross_validations": 3,
    "forecasting_dnn_models_only": True,
}
automl_config = AutoMLConfig(**automl_settings)

	cv_step_size
	target_lags
	feature_lags
	target_rolling_window_size
	cv based validation settings


In [14]:
from azureml.core import Experiment

# This rebinds `experiment`, which an earlier cell pointed at "automl-ojforecasting";
# the run submitted afterwards goes to this experiment instead.
experiment = Experiment(workspace=ws, name='oj-distributed-tcn')

print(f'Experiment name: {experiment.name}')

Experiment name: oj-distributed-tcn


In [None]:
# Submit the AutoML configuration to the experiment with show_output enabled.
remote_run = experiment.submit(
    config=automl_config,
    show_output=True,
)



Submitting remote run.
No run_configuration provided, running on oj-cluster with default configuration
Running on remote compute: oj-cluster


Experiment,Id,Type,Status,Details Page,Docs Page
oj-distributed-tcn,AutoML_43760cbd-0bcd-4885-a40c-0d2c77b1b584,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



