In [None]:
# !git clone https://github.com/microsoft/solution-accelerator-many-models

In [1]:
import os
import sys
sys.path.append('solution-accelerator-many-models')
import datetime
import numpy as np
import pandas as pd
from azureml.core import Workspace, Dataset, Datastore, Experiment
from azureml.core.compute import ComputeTarget
from scripts.helper import split_data

# Data Preparation

In [2]:
ManyModelsSample = pd.read_csv('ManyModelsSampleData.csv', header = 0)
os.makedirs('MMSA Sample', exist_ok = True)

In [3]:
ManyModelsSample['Date'] = ManyModelsSample['Date'].apply(lambda x: datetime.datetime.strptime(x, '%m/%d/%Y'))

In [4]:
ManyModelsSample

Unnamed: 0,Date,Store,Sales
0,2020-01-01,New York City,10
1,2020-01-02,New York City,20
2,2020-01-03,New York City,23
3,2020-01-04,New York City,2
4,2020-01-05,New York City,79
...,...,...,...
1819,2021-03-27,Seattle,15
1820,2021-03-28,Seattle,41
1821,2021-03-29,Seattle,85
1822,2021-03-30,Seattle,34


In [5]:
for store_name, store_data in ManyModelsSample.groupby('Store'):
    store_data.to_csv('MMSA Sample/{}.csv'.format(store_name), header = True, index_label = False)

In [6]:
train_path, inference_path = split_data('MMSA Sample', 'Date', '2021-03-01')

## Initializing The Workspace

In [7]:
ws = Workspace.from_config(path = 'config.json')
datastore = ws.get_default_datastore()

## Uploading Both The Training And Scoring (Inference) Data

In [8]:
datastore.upload(src_dir = train_path, target_path = 'MMSA Sample/Train', overwrite = True)
datastore.upload(src_dir = inference_path, target_path = 'MMSA Sample/Inference', overwrite = True)

"Datastore.upload" is deprecated after version 1.0.69. Please use "Dataset.File.upload_directory" to upload your files             from a local directory and create FileDataset in single method call. See Dataset API change notice at https://aka.ms/dataset-deprecation.


Uploading an estimated of 4 files
Uploading MMSA Sample/upload_train_data/New York City.csv
Uploaded MMSA Sample/upload_train_data/New York City.csv, 1 files out of an estimated total of 4
Uploading MMSA Sample/upload_train_data/San Francisco.csv
Uploaded MMSA Sample/upload_train_data/San Francisco.csv, 2 files out of an estimated total of 4
Uploading MMSA Sample/upload_train_data/Seattle.csv
Uploaded MMSA Sample/upload_train_data/Seattle.csv, 3 files out of an estimated total of 4
Uploading MMSA Sample/upload_train_data/Washington DC.csv
Uploaded MMSA Sample/upload_train_data/Washington DC.csv, 4 files out of an estimated total of 4
Uploaded 4 files
Uploading an estimated of 4 files
Uploading MMSA Sample/upload_inference_data/New York City.csv
Uploaded MMSA Sample/upload_inference_data/New York City.csv, 1 files out of an estimated total of 4
Uploading MMSA Sample/upload_inference_data/San Francisco.csv
Uploaded MMSA Sample/upload_inference_data/San Francisco.csv, 2 files out of an es

$AZUREML_DATAREFERENCE_58005197b8624c4185ac7410d99a7671

In [9]:
train_dataset = Dataset.File.from_files(path = datastore.path('MMSA Sample/Train'), validate = False)
inference_dataset = Dataset.File.from_files(path = datastore.path('MMSA Sample/Inference'), validate = False)

## Registering The Datasets

In [10]:
train_dataset.register(ws, 'MMSA Sample - Train', create_new_version = True)
inference_dataset.register(ws, 'MMSA Sample - Inference', create_new_version = True)

{
  "source": [
    "('workspaceblobstore', 'MMSA Sample/Inference')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ],
  "registration": {
    "id": "2b3570f1-b4d2-40a9-9d40-1d56bf91394b",
    "name": "MMSA Sample - Inference",
    "version": 1,
    "workspace": "Workspace.create(name='auotml-example-workspace', subscription_id='0c19fc19-85fd-4aa4-b133-61dd20fa93df', resource_group='edwin.spartan117-rg')"
  }
}

# Model Training & Registration

In [11]:
import logging
from azureml.contrib.automl.pipeline.steps import AutoMLPipelineBuilder
from azureml.pipeline.core import Pipeline
sys.path.append('solution-accelerator-many-models/Automated_ML/02_AutoML_Training_Pipeline')
from training_scripts.helper import get_training_output

In [12]:
experiment = Experiment(ws, 'MMSA-Training-Pipeline')
MMSA_Sample_dataset = Dataset.get_by_name(ws, name = 'MMSA Sample - Train')
MMSA_Sample_models_input = MMSA_Sample_dataset.as_named_input('MMSA_Sample_Train')

In [13]:
automl_configuration = {'task': 'forecasting', 'primary_metric': 'normalized_root_mean_squared_error', 'iterations': 15,
                        'iteration_timeout_minutes': 10, 'experiment_timeout_hours': 1, 'label_column_name': 'Sales',
                        'n_cross_validations': 3, 'verbosity': logging.INFO, 'debug_log': 'automl_pandas_debug.txt',
                        'time_column_name': 'Date', 'max_horizon': 31, 'track_child_runs': False, 
                        'partition_column_names': ['Store'], 'grain_column_names': ['Store'], 
                        'pipeline_fetch_max_batch_size': 15}

In [14]:
train_steps = AutoMLPipelineBuilder.get_many_models_train_steps(
    experiment = experiment, automl_settings = automl_configuration, train_data = MMSA_Sample_models_input, 
    compute_target = 'automl-cluster', partition_column_names = ['Store'], node_count = 1, process_count_per_node = 4,
    run_invocation_timeout = 4000, output_datastore = datastore)
MMSA_Sample_train_pipeline = Pipeline(workspace = ws, steps = train_steps)
MMSA_Sample_train_run = experiment.submit(MMSA_Sample_train_pipeline)
MMSA_Sample_train_run.wait_for_completion(show_output = True)

Parameter automl_settings will be deprecated in the future. Please use ManyModelsParameters instead.
Parameter partition_column_names will be deprecated in the future. Please use ManyModelsParameters instead.


Created step many-models-train [afc42b6f][cbe5c8d6-31d1-499c-8237-952d39eb4a23], (This step will run and generate new outputs)
Submitted PipelineRun 2fe8ddc0-ef75-4956-93d8-5dd52505a112
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/2fe8ddc0-ef75-4956-93d8-5dd52505a112?wsid=/subscriptions/0c19fc19-85fd-4aa4-b133-61dd20fa93df/resourcegroups/edwin.spartan117-rg/workspaces/auotml-example-workspace&tid=c5f4b1c2-b533-4788-b1c5-99d0f10fb9b6
PipelineRunId: 2fe8ddc0-ef75-4956-93d8-5dd52505a112
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/2fe8ddc0-ef75-4956-93d8-5dd52505a112?wsid=/subscriptions/0c19fc19-85fd-4aa4-b133-61dd20fa93df/resourcegroups/edwin.spartan117-rg/workspaces/auotml-example-workspace&tid=c5f4b1c2-b533-4788-b1c5-99d0f10fb9b6
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: 0835b7fd-f81a-4207-8b24-22e25a3989bb
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/0835b7fd-f81a-4207-8b24-22e25a3989bb?wsid=/sub



PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': '2fe8ddc0-ef75-4956-93d8-5dd52505a112', 'status': 'Completed', 'startTimeUtc': '2023-11-18T08:15:45.441877Z', 'endTimeUtc': '2023-11-18T08:27:57.608267Z', 'services': {}, 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'SDK', 'runType': 'SDK', 'azureml.parameters': '{}', 'azureml.continue_on_step_failure': 'False', 'azureml.continue_on_failed_optional_input': 'True', 'azureml.pipelineComponent': 'pipelinerun', 'azureml.pipelines.stages': '{"Initialization":null,"Execution":{"StartTime":"2023-11-18T08:15:45.872014+00:00","EndTime":"2023-11-18T08:27:57.4758904+00:00","Status":"Finished"}}'}, 'inputDatasets': [], 'outputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://auotmlexamplew5880114168.blob.core.windows.net/azureml/ExperimentRun/dcid.2fe8ddc0-ef75-4956-93d8-5dd52505a112/logs/azureml/executionlogs.txt?sv=2019-07-07&sr=b&sig=CORX%2Fst5HNGXTww1cBjVX%2FgzT%2Bl8dsB72DqFX4

'Finished'

In [15]:
MMSA_Sample_train_pipeline_published = MMSA_Sample_train_pipeline.publish(
    name = 'MMSA_Sample_pandas', description = 'MMSA Sample Solution using a pandas dataframe', version = '1', 
    continue_on_step_failure = False)

# Forecasting/Inference

In [16]:
import shutil
sys.path.append('solution-accelerator-many-models/Automated_ML/03_AutoML_Forecasting_Pipeline')
from forecasting_scripts.helper import get_forecasting_output

In [17]:
forecasting_experiment = Experiment(ws, 'MMSA-Forecasting-Pipeline')
MMSA_Sample_dataset = Dataset.get_by_name(ws, name = 'MMSA Sample - Inference')
MMSA_Sample_models_input = MMSA_Sample_dataset.as_named_input('MMSA_Sample_Inference')

training_experiment = 'MMSA-Training-Pipeline'
training_pipeline_run_id = '2fe8ddc0-ef75-4956-93d8-5dd52505a112'

In [18]:
inference_steps = AutoMLPipelineBuilder.get_many_models_batch_inference_steps(
    experiment = forecasting_experiment, inference_data = MMSA_Sample_models_input, compute_target = 'automl-cluster',
    node_count = 1, process_count_per_node = 4, run_invocation_timeout = 300, train_experiment_name = training_experiment,
    train_run_id = training_pipeline_run_id, partition_column_names = ['Store'], time_column_name = 'Date', 
    target_column_name = 'Sales')
MMSA_Sample_forecasting_pipeline = Pipeline(workspace = ws, steps = inference_steps)
MMSA_Sample_forecasting_run = forecasting_experiment.submit(MMSA_Sample_forecasting_pipeline)
MMSA_Sample_forecasting_run.wait_for_completion(show_output = True)

Parameter target_column_names will be deprecated in the future. Please use ManyModelsParameters instead.
Parameter time_column_name will be deprecated in the future. Please use ManyModelsParameters instead.
Parameter partition_column_names will be deprecated in the future. Please use ManyModelsParameters instead.
Output in the txt file does not include column header, use 'csv' file extension in 'append_row_file_name' parameter in 'get_many_models_batch_inference_steps' method to get column header in the output file.


Created step many-models-inference [6ef7d363][398218b8-be08-4a5e-b193-258ab1e7b8a8], (This step will run and generate new outputs)
Submitted PipelineRun 101555c1-4910-4972-b864-b49fda44cac4
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/101555c1-4910-4972-b864-b49fda44cac4?wsid=/subscriptions/0c19fc19-85fd-4aa4-b133-61dd20fa93df/resourcegroups/edwin.spartan117-rg/workspaces/auotml-example-workspace&tid=c5f4b1c2-b533-4788-b1c5-99d0f10fb9b6
PipelineRunId: 101555c1-4910-4972-b864-b49fda44cac4
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/101555c1-4910-4972-b864-b49fda44cac4?wsid=/subscriptions/0c19fc19-85fd-4aa4-b133-61dd20fa93df/resourcegroups/edwin.spartan117-rg/workspaces/auotml-example-workspace&tid=c5f4b1c2-b533-4788-b1c5-99d0f10fb9b6
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: c34aaba4-c7e3-4ff5-8857-034dc3960f5e
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/c34aaba4-c7e3-4ff5-8857-034dc3960f5e?wsid=



PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': '101555c1-4910-4972-b864-b49fda44cac4', 'status': 'Completed', 'startTimeUtc': '2023-11-18T08:28:04.154343Z', 'endTimeUtc': '2023-11-18T08:30:12.763454Z', 'services': {}, 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'SDK', 'runType': 'SDK', 'azureml.parameters': '{}', 'azureml.continue_on_step_failure': 'False', 'azureml.continue_on_failed_optional_input': 'True', 'azureml.pipelineComponent': 'pipelinerun', 'azureml.pipelines.stages': '{"Initialization":null,"Execution":{"StartTime":"2023-11-18T08:28:04.4857148+00:00","EndTime":"2023-11-18T08:30:12.6444325+00:00","Status":"Finished"}}'}, 'inputDatasets': [], 'outputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://auotmlexamplew5880114168.blob.core.windows.net/azureml/ExperimentRun/dcid.101555c1-4910-4972-b864-b49fda44cac4/logs/azureml/executionlogs.txt?sv=2019-07-07&sr=b&sig=J8o1krpZ01NuwXpko8gt%2BIWIszyK2qX0sOREZ%2Bc

'Finished'

In [19]:
MMSA_Sample_forecasting_pipeline_published = MMSA_Sample_forecasting_pipeline.publish(
    name = 'MMSA_Forecasting_Sample_pandas', description = 'MMSA Sample Forecasting Solution using training features',
    version = '1', continue_on_step_failure = False)

In [22]:
# from azureml.train.automl.run import AutoMLRun
# MMSA_Sample_forecasting_run = AutoMLRun(experiment = forecasting_experiment, run_id = 'a49dbdae-0bd7-4ed4-9b39-5d14b5756bbd')
forecast_file = get_forecasting_output(MMSA_Sample_forecasting_run, 'forecasting_results', 'many_models_inference_output')
MMSA_Sample_forecast_df = pd.read_csv(forecast_file, delimiter = ' ', header = None)
MMSA_Sample_forecast_df.columns = ['Date', 'Store', 'Sales', 'Predicted']
print("Prediction has", MMSA_Sample_forecast_df.shape[0], "rows. Here the first 10 rows are being displayed:")
MMSA_Sample_forecast_df.head(10)

Prediction has 124 rows. Here the first 10 rows are being displayed:


Unnamed: 0,Date,Store,Sales,Predicted
0,2021-03-01,New York City,12,50.61
1,2021-03-02,New York City,50,50.98
2,2021-03-03,New York City,67,51.34
3,2021-03-04,New York City,16,51.7
4,2021-03-05,New York City,77,52.07
5,2021-03-06,New York City,13,52.43
6,2021-03-07,New York City,64,52.8
7,2021-03-08,New York City,65,51.79
8,2021-03-09,New York City,63,52.15
9,2021-03-10,New York City,54,52.52
