# Retail Store: Azure ML forecasting

In this notebook, we'll build and analyze a new model to predict retail sales.

## Create working directory

The cell below creates our working directory. This will hold our generated scripts.

In [None]:
import warnings
import os
# Silence every warning for the rest of the session. The blanket 'ignore'
# filter already covers UserWarning and FutureWarning; the category-specific
# filters are kept so that intent stays explicit.
warnings.filterwarnings('ignore')
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", FutureWarning)

# Folder that holds the scripts generated by this notebook.
project_folder = './scripts'

# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(project_folder, exist_ok=True)

## Create utils.py file

In [None]:
%%writefile $project_folder/utils.py
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

def processResults(X_query, y_fcst_all, product_name='Canned Beans',
                   month_order=('September', 'October', 'November', 'December')):
    """Aggregate forecasts and mean temperature per month for plotting.

    Neither input frame is modified.

    Args:
        X_query: DataFrame with 'PRODUCT_NAME', 'DATE' (parseable with
            '%Y-%m-%d') and 'TEMPERATURE_MEAN' columns.
        y_fcst_all: DataFrame with 'DATE' and 'forecast' columns.
        product_name: Product label attached to every forecast row.
        month_order: Month names to keep, in display order. Rows for any
            other month are dropped.

    Returns:
        DataFrame with columns 'MONTH' (ordered categorical), 'PRODUCT_NAME',
        'forecast' (monthly sum) and 'TEMPERATURE_MEAN' (monthly mean), sorted
        by month. Rows without a matching temperature are dropped.
    """
    keys = ['MONTH', 'PRODUCT_NAME']

    # Mean temperature per (month, product). Copy first so the column
    # assignment below never touches a slice of the caller's frame.
    temperature = X_query[['PRODUCT_NAME', 'DATE', 'TEMPERATURE_MEAN']].copy()
    temperature['MONTH'] = (pd.to_datetime(temperature['DATE'], format='%Y-%m-%d')
                            .dt.strftime("%B"))
    temperature = temperature.groupby(keys, as_index=False)['TEMPERATURE_MEAN'].mean()

    # Total forecast per (month, product), again on a private copy so the
    # caller's y_fcst_all does not gain a PRODUCT_NAME column.
    forecast = y_fcst_all[['DATE', 'forecast']].copy()
    forecast['PRODUCT_NAME'] = product_name
    forecast['MONTH'] = (pd.to_datetime(forecast['DATE'], format='%Y-%m-%d')
                         .dt.strftime("%B"))
    forecast = forecast.groupby(keys, as_index=False)['forecast'].sum()

    # Join on the grouping keys rather than on the positional index: the two
    # aggregates only line up by position when they contain identical groups.
    result = forecast.merge(temperature, on=keys, how='left')

    # Months outside month_order become NaN in the categorical and are
    # removed by dropna(), together with rows lacking a temperature.
    result['MONTH'] = pd.Categorical(result['MONTH'],
                                     categories=list(month_order),
                                     ordered=True)
    result = result.sort_values(by='MONTH').dropna().reset_index(drop=True)
    return result
    
def facetgrid_two_axes(*args, **kwargs):
    """Plot mean temperature as a line and, optionally, forecast bars on a twin y-axis.

    Draws on the current matplotlib axes, so it can be used as the plotting
    function of ``FacetGrid.map_dataframe``.

    Keyword Args:
        data: DataFrame with 'MONTH', 'TEMPERATURE_MEAN' and 'forecast' columns.
        dual_axis: When truthy, add a second y-axis carrying the forecast bars.
        alpha: Opacity used for both the line and the bars (default 0.7).
        color: Accepted and discarded; the line is always drawn in red.
        Remaining keyword arguments are forwarded to ``Axes.plot``.
    """
    data = kwargs.pop('data')
    dual_axis = kwargs.pop('dual_axis')
    alpha = kwargs.pop('alpha', 0.7)
    # The line colour is fixed below. Drop any caller-supplied colour, and
    # tolerate its absence instead of raising KeyError.
    kwargs.pop('color', None)

    ax = plt.gca()
    ax2 = None
    if dual_axis:
        ax2 = ax.twinx()
        ax2.set_ylabel('forecast')

    ax.plot(data['MONTH'], data['TEMPERATURE_MEAN'], **kwargs, color='red', alpha=alpha)
    if dual_axis:
        # Pass x/y by keyword (positional data arguments are rejected by newer
        # seaborn) and target the twin axis explicitly rather than relying on
        # whichever axes happens to be current.
        sns.barplot(x=data['MONTH'], y=data['forecast'], alpha=alpha, ax=ax2)

### Kernel Restart

Please restart your kernel after the above cell has finished execution.

## Setup Azure ML

In the next cell, we connect to the Workspace using the `<tenant_id>`, `<subscription_id>`, `<resource_group_name>`, and `<workspace_name>`, and write its configuration to a local config file. This will fetch the matching Workspace and prompt you for authentication. Please click on the link and input the provided details.

For more information on **Workspace**, please visit: [Microsoft Workspace Documentation](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.workspace.workspace?view=azure-ml-py)

`<subscription_id>` = You can get this ID from the landing page of your Resource Group.

`<resource_group_name>` = This is the name of your Resource Group.

`<workspace_name>` = This is the name of your Workspace.

In [None]:
from azureml.core.workspace import Workspace
from azureml.core.authentication import InteractiveLoginAuthentication

try:
    # Interactive login against the given tenant.
    interactive_auth = InteractiveLoginAuthentication(tenant_id='<tenant_id>')
    # Get instance of the Workspace and write it to config file
    ws = Workspace(
        subscription_id = '<subscription_id>',
        resource_group = '<resource_group_name>',
        workspace_name = '<workspace_name>',
        auth = interactive_auth)

    # Writes workspace config file
    ws.write_config()

    print('Library configuration succeeded')
except Exception as e:
    print(e)
    print('Workspace not found')
    # Every later cell needs `ws`. Stop here with the real error instead of
    # letting the notebook continue and fail later with a NameError.
    raise

## Load training data

Let's retrieve our dataset from the default workspace Datastore.

In [None]:
from azureml.core import Dataset, Datastore
from azureml.data.datapath import DataPath

# Default datastore of the workspace.
datastore = ws.get_default_datastore()

# Path of the training parquet file inside that datastore.
datastore_path = [DataPath(datastore, 'train_data.parquet')]

# Build a tabular dataset from the parquet file and register it in the
# workspace under a fixed name (create_new_version=True is passed).
train_tabular = Dataset.Tabular.from_parquet_files(path=datastore_path)
train_tabular = train_tabular.register(workspace=ws, 
                                       name='retail_sales_training',
                                       description='Retail sales forecast training data',
                                       create_new_version=True)
# Look the dataset up again by its registered name.
train_tabular = Dataset.get_by_name(ws, 'retail_sales_training')

# Load the dataset into a pandas DataFrame for local exploration.
train_data = train_tabular.to_pandas_dataframe()

# Preview the first two rows.
train_data.head(2)

Next, we'll take a subset of our data and then proceed to visualize it to better understand any patterns and trends that might exist to drive good ML models.

## Dataset Description

Describe our current dataset. The table below shows the different statistical values for our training subset.

In [None]:
# Take a seeded sample of the dataset (probability=0.01, seed=123) and load it into pandas.
subset = train_tabular.take_sample(probability=0.01, seed=123).to_pandas_dataframe()
# Summary statistics of the sampled rows.
subset.describe()

## 2019 Canned Beans Sales by Month

In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# Select the Canned Beans rows and the columns needed for the chart. The
# explicit copy keeps the assignments below off a slice of train_data.
canned = train_data.loc[train_data['PRODUCT_NAME'] == 'Canned Beans',
                        ['PRODUCT_NAME', 'QUANTITY', 'DATE']].copy()
canned['DATE'] = pd.to_datetime(canned['DATE'], format='%Y-%m-%d')
canned['MONTH'] = canned['DATE'].dt.strftime("%B")

# Keep exactly calendar year 2019. The previous `< '2019-12-31'` test dropped
# 31 December and let any earlier year through.
canned = canned.loc[canned['DATE'].dt.year == 2019]

# Total quantity per month.
canned = canned.groupby(['MONTH', 'PRODUCT_NAME'], as_index=False)['QUANTITY'].sum()

# Order the months chronologically rather than alphabetically.
new_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
canned['MONTH'] = pd.Categorical(canned['MONTH'], categories=new_order, ordered=True)
canned = canned.sort_values(by='MONTH')

f, ax = plt.subplots(figsize=(14.5, 6.5))
ax = sns.barplot(x="MONTH", y="QUANTITY", data=canned, ax=ax)
# Same y-range as the 2020 chart so the two years can be compared visually.
ax.set(ylim=(0, 1600))

## 2020 Canned Beans Sales by Month

In [None]:
# Select the Canned Beans rows and the columns needed for the chart. The
# explicit copy keeps the assignments below off a slice of train_data.
canned = train_data.loc[train_data['PRODUCT_NAME'] == 'Canned Beans',
                        ['PRODUCT_NAME', 'QUANTITY', 'DATE']].copy()
canned['DATE'] = pd.to_datetime(canned['DATE'], format='%Y-%m-%d')
canned['MONTH'] = canned['DATE'].dt.strftime("%B")

# Keep exactly calendar year 2020. The previous `> '2020-01-01'` test dropped
# 1 January and let any later year through.
canned = canned.loc[canned['DATE'].dt.year == 2020]

# Total quantity per month.
canned = canned.groupby(['MONTH', 'PRODUCT_NAME'], as_index=False)['QUANTITY'].sum()

# Order the months chronologically rather than alphabetically.
new_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
canned['MONTH'] = pd.Categorical(canned['MONTH'], categories=new_order, ordered=True)
canned = canned.sort_values(by='MONTH')

f, ax = plt.subplots(figsize=(14.5, 6.5))
ax = sns.barplot(x="MONTH", y="QUANTITY", data=canned, ax=ax)
# Same y-range as the 2019 chart so the two years can be compared visually.
ax.set(ylim=(0, 1600))

## Create a Workspace Experiment

The `Experiment` constructor creates an experiment instance. It takes the current workspace, which is fetched by calling `Workspace.from_config()`, and an experiment name.

For more information on **Experiment**, please visit: [Microsoft Experiment Documentation](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.experiment.experiment?view=azure-ml-py)

In [None]:
from azureml.core.experiment import Experiment

# Get an instance of the Workspace from the config file
ws = Workspace.from_config()

# Name passed to the Experiment constructor below
experiment_name = 'retail-forecast-experiment'

# Create the Experiment object for this workspace and name
experiment = Experiment(ws, experiment_name)

## Create Azure ML Compute cluster

Firstly, check for the existence of the cluster. If it already exists, we are able to reuse it. Checking for the existence of the cluster can be performed by calling the constructor `ComputeTarget()` with the current workspace and name of the cluster.

In case the cluster does not exist, the next step is to provide a configuration for the new AML cluster by calling the function `AmlCompute.provisioning_configuration()`. It takes as parameters the VM size and the minimum and maximum number of nodes that the cluster can scale between. After the configuration has been created, `ComputeTarget.create()` should be called with the workspace object, the cluster name, and the previously created configuration object.

For more information on **ComputeTarget**, please visit: [Microsoft ComputeTarget Documentation](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.compute.computetarget?view=azure-ml-py)

For more information on **AmlCompute**, please visit: [Microsoft AmlCompute Documentation](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.compute.amlcompute.amlcompute?view=azure-ml-py)


**Note:** Please wait for the execution of the cell to finish before moving forward.

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Create AML CPU Compute Cluster, reusing it when it already exists
cluster_name = 'cpucluster'

try:
    # Raises ComputeTargetException when no compute target has this name.
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='Standard_DS12_v2',
        min_nodes=0,
        max_nodes=4,
    )

    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
    # Block until provisioning has finished, streaming progress to the cell output.
    compute_target.wait_for_completion(show_output=True)
else:
    print('Found existing compute target.')

## Submit Experiment

We'll use remote compute for this job.

The `experiment.submit()` function is called to send the experiment for execution. Here it receives the `AutoMLConfig` object and `show_output=False`.

In [None]:
import logging
from azureml.train.automl import AutoMLConfig

# Column the model is trained to predict.
label =  "QUANTITY"

# Forecasting-specific settings, unpacked into AutoMLConfig below.
time_series_settings = {
    # Column holding the timestamp of each observation.
    "time_column_name": "DATE",
    # Column(s) identifying the individual series; here one per PRODUCT_ID.
    "grain_column_names": ["PRODUCT_ID"],
    # The remaining options are all left on 'auto'.
    "max_horizon": 'auto',
    "target_lags": 'auto',
    "target_rolling_window_size": 'auto',
    "featurization": 'auto',
}

# AutoML forecasting job configuration: runs on the remote compute target
# against the registered training dataset.
automl_config = AutoMLConfig(task='forecasting',
                             experiment_timeout_minutes=15,
                             compute_target=compute_target,
                             enable_early_stopping=True,
                             training_data=train_tabular,  # tabular dataset registered earlier
                             label_column_name=label,
                             n_cross_validations=5,
                             enable_ensembling=False,
                             verbosity=logging.INFO,
                             **time_series_settings)

# Submit the job; show_output=False is passed, so monitoring happens in the next section.
run = experiment.submit(automl_config, show_output=False)
# Display the run object as the cell output.
run

## Monitor Experiment

Creating an object of type `AutoMLRun` enables us to observe the experiment's progress and results. The object is created by calling the constructor `AutoMLRun()`, which takes as arguments the experiment and the identifier of the run to fetch. After the object has been instantiated, the `RunDetails()` widget retrieves the progress, metrics, and tasks for the specified run. They are displayed by calling `show()` on the widget.

**Note:** Please wait for the execution of the cell to finish before moving forward. (Status should be **Completed**)

In [None]:
from azureml.train.automl.run import AutoMLRun
from azureml.widgets import RunDetails

# Re-create the submitted run as an AutoMLRun, looked up by its id within the experiment.
run = AutoMLRun(experiment, run.id)
# Show the run-details widget for this run.
RunDetails(run).show()

In [None]:
# Unpack the result of get_output() into the best run and its fitted model.
best_run, fitted_model = run.get_output()
# Display the steps of the fitted model.
fitted_model.steps

## Featurization

You can access the engineered feature names generated in time-series featurization.

In [None]:
# Names of the features engineered by the 'timeseriestransformer' step of the fitted model.
fitted_model.named_steps['timeseriestransformer'].get_engineered_feature_names()

In [None]:
import pandas as pd

# Featurization summary reported by the 'timeseriestransformer' step.
featurization_summary = fitted_model.named_steps['timeseriestransformer'].get_featurization_summary()
# Render the summary records as a table.
pd.DataFrame.from_records(featurization_summary)

## Register Model

Next, register the model obtained from the best run. In order to register the model, the function `register_model()` should be called. This will take care of registering the model obtained from the best run.

In [None]:
# Model name taken from the properties of the best run.
model_name = best_run.properties['model_name']

description = 'AutoML forecaster'
# Register the model through the AutoML run, using that name and description.
model = run.register_model(model_name = model_name, description = description)

# Print the model id exposed by the run after registration.
print(run.model_id)

## Develop the scoring script
For the deployment we need a function which will run the forecast on serialized data. It can be obtained from the best_run.

In [None]:
# Local file name for the scoring script; reused as the entry script at deployment.
script_file_name = 'score_fcast.py'
# Download the scoring file from the best run's outputs to that local file.
best_run.download_file('outputs/scoring_file_v_1_0_0.py', script_file_name)

## Deploy model to Azure Container Instance

In order to deploy the model to an Azure Container Instance, the function `Model.deploy()` should be called, passing along the workspace object, service name and list of models to deploy.

For more information on **Model**, please visit: [Microsoft Model Documentation](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.model.model?view=azure-ml-py)


**Note:** Please wait for the execution of the cell to finish before moving forward.

In [None]:
from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice
from azureml.exceptions import WebserviceException
from azureml.core.webservice import Webservice
from azureml.core.model import Model

# Inference configuration: the best run's environment plus the downloaded scoring script.
inference_config = InferenceConfig(environment = best_run.get_environment(), 
                                   entry_script = script_file_name)

# Container sizing and metadata for the ACI deployment.
aciconfig = AciWebservice.deploy_configuration(cpu_cores = 1, 
                                               memory_gb = 2, 
                                               tags = {'type': "automl-forecasting"},
                                               description = "Automl forecasting service")

service_name_aci = 'forecasting-service'
print(service_name_aci)

try:
    # Reuse the web service if one with this name can be retrieved.
    aci_service = Webservice(ws, service_name_aci)
    print(aci_service.state)
except WebserviceException:
    # Otherwise deploy the registered model and wait for the deployment to finish.
    aci_service = Model.deploy(ws, service_name_aci, [model], inference_config, aciconfig)
    aci_service.wait_for_deployment(True)
    print(aci_service.state)

## Load test data

In [None]:
# Path of the test parquet file inside the datastore obtained earlier.
datastore_path = [DataPath(datastore, 'ntest_data.parquet')]

# Build a tabular dataset from the parquet file and register it in the
# workspace under a fixed name (create_new_version=True is passed).
test_tabular = Dataset.Tabular.from_parquet_files(path=datastore_path)
test_tabular = test_tabular.register(workspace=ws, 
                                       name='retail_sales_test',
                                       description='Retail sales forecast test data',
                                       create_new_version=True)
# Look the dataset up again by its registered name.
test_tabular = Dataset.get_by_name(ws, 'retail_sales_test')

# Load into pandas and split the label column off from the features.
test_data = test_tabular.to_pandas_dataframe()
test_labels = test_data.pop(label).values

## Connect to the deployed webservice

Now with test data, we can get it into a suitable format to consume the web service. 

In [None]:
import json
import pandas as pd

# Features (X_test) and actual values (y_test) of the test set.
X_test = test_tabular.to_pandas_dataframe().reset_index(drop=True)
y_test = X_test.pop(label).values
X_query = X_test.copy()
# We have to convert datetime to string, because Timestamps cannot be serialized to JSON.

X_query['DATE'] = X_query['DATE'].astype(str)

# The Service object accept the complex dictionary, which is internally converted to JSON string.
# The section 'data' contains the data frame in the form of dictionary.
test_sample = json.dumps({'data': X_query.to_dict(orient='records')})
response = aci_service.run(input_data = test_sample)

# translate from networkese to datascientese
try:
    res_dict = json.loads(response)
    y_fcst_all = pd.DataFrame(res_dict['index'])
    y_fcst_all['DATE'] = pd.to_datetime(y_fcst_all['DATE'], unit = 'ms')
    y_fcst_all['forecast'] = res_dict['forecast']
except Exception:
    # Print the raw response: `res_dict` is unbound when json.loads itself
    # fails. Then re-raise so later cells do not run without `y_fcst_all`.
    print(response)
    raise

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Import the helpers by name instead of `import *`, so it is clear where
# each name used in this cell comes from.
from scripts.utils import facetgrid_two_axes, processResults

# Monthly forecast totals with the matching mean temperature.
forecast = processResults(X_query, y_fcst_all)

# FacetGrid's `size` argument was renamed to `height`; the old name is
# rejected by current seaborn releases.
win_plot = sns.FacetGrid(forecast, height=8.5)
(win_plot.map_dataframe(facetgrid_two_axes, dual_axis=True)
         .set_axis_labels("MONTH", "TEMPERATURE_MEAN"))
plt.show()