### Import packagesting

In [7]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from azureml.core import Workspace, Dataset
import azureml.core
import os

# check core SDK version number
print("Azure ML SDK Version: ", azureml.core.VERSION)

Azure ML SDK Version:  1.4.0


#### Connecting to workspace

In [8]:
# load workspace configuration from the config.json file in the current folder.
ws = Workspace.from_config()
print(ws.name, ws.location, ws.resource_group, sep='\t')

ml-environment	eastus	AIMLDemo


In [9]:
ws

Workspace.create(name='ml-environment', subscription_id='483b73e5-b0bf-40ea-840d-cdb839f75a3b', resource_group='AIMLDemo')

#### Create Experiment

In [10]:
experiment_name = 'sklearn-storage-prediction'

from azureml.core import Experiment
exp = Experiment(workspace=ws, name=experiment_name)

#### Create or Attach existing compute resource

In [12]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

# choose a name for your cluster
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "cpu-cluster")
compute_min_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0)
compute_max_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 4)

# This example uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_D2_V2")


if compute_name in ws.compute_targets:
    compute_target = ws.compute_targets[compute_name]
    if compute_target and type(compute_target) is AmlCompute:
        print("found compute target: " + compute_name)
else:
    print("creating new compute target...")
    provisioning_config = AmlCompute.provisioning_configuration(vm_size = vm_size,
                                                                min_nodes = compute_min_nodes, 
                                                                max_nodes = compute_max_nodes)

    # create the cluster
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)
    
    # can poll for a minimum number of nodes and for a specific timeout. 
    # if no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)
    
     # For a more detailed view of current AmlCompute status, use get_status()
    print(compute_target.get_status().serialize())

found compute target: cpu-cluster


#### Importing data

In [13]:
dataset = Dataset.get_by_name(ws, name='storage')
storage = dataset.to_pandas_dataframe()
storage.head()

Unnamed: 0,Week,Year,Pie_id_1_stock,Pie_id_2_stock,Pie_id_3_stock,Pie_id_4_stock,Pie_id_5_stock,Pie_id_6_stock,Pie_id_7_stock,Pie_id_8_stock,Pie_id_9_stock,Pie_id_10_stock,Pie_id_11_stock
0,1,2020,5,9,9,4,6,3,8,2,5,6,7
1,2,2020,2,6,9,8,6,8,6,6,5,9,8
2,3,2020,9,7,9,5,9,7,1,3,9,9,8
3,4,2020,1,8,0,2,5,2,1,3,2,3,2
4,5,2020,6,4,8,4,7,3,7,0,1,0,0


#### Create a directory

Create a directory to deliver the necessary code from your computer to the remote resource.

In [15]:
script_folder = os.path.join(os.getcwd(), "sklearn-storage-prediction")
os.makedirs(script_folder, exist_ok=True)

#### Creating the trainig code
- Everything should be included in the script below

In [16]:
import argparse
import os
import numpy as np
import glob
import math

from sklearn.linear_model import LogisticRegression
import joblib

from azureml.core import Run
from utils import load_data

In [17]:
ws = Workspace.from_config()
dataset = Dataset.get_by_name(ws, name='storage')
storage = dataset.to_pandas_dataframe()

In [18]:

storage = storage.drop(['Month'], axis=1, errors='ignore')
storage.insert(2, 'Month', [1,1,1,1,2,2,2,2,3,3,3,3,3,4,4,4,4,5,5,5,5,
                            6,6,6,6,6,7,7,7,7,8,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,11,12,12,12,12])

In [19]:
for columnName in storage.columns.drop(['Week','Year','Month']):
    prior1 = columnName+"_1_week_ago"
    prior2 = columnName+"_2_week_ago"
    storage = storage.drop([prior1], axis=1, errors='ignore')
    storage = storage.drop([prior2], axis=1, errors='ignore')
    storage.insert(3, prior1, storage[columnName].shift(1))
    storage.insert(3, prior2, storage[columnName].shift(2))
    
storage=storage.fillna(0)

In [20]:
currentPie = 'Pie_id_1_stock'
tempDf = storage.loc[:,['Week','Year','Month',(currentPie),(currentPie+'_1_week_ago'),(currentPie+'_2_week_ago')]]

x_train = tempDf.loc[0:math.floor(tempDf.shape[0]/3)*2,tempDf.columns.drop(currentPie)]
x_test = tempDf.loc[math.floor(tempDf.shape[0]/3)*2+1:,tempDf.columns.drop(currentPie)]
y_train = tempDf[currentPie].loc[0:math.floor(tempDf.shape[0]/3)*2]
y_test = tempDf[currentPie].loc[math.floor(tempDf.shape[0]/3)*2+1:]

In [21]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(x_train, y_train);
# Use the forest's predict method on the test data
predictions = rf.predict(x_test)

In [22]:
# get hold of the current run
run = Run.get_context()
# calculate accuracy on the prediction
acc = np.average(predictions == y_test)
print('Accuracy is', acc)


run.log('accuracy', np.float(acc))

os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=rf, filename='outputs/sklearn-storage-prediction_model.pkl')

Accuracy is 0.0
Attempted to log scalar metric accuracy:
0.0


['outputs/sklearn-storage-prediction_model.pkl']

### Create a actual training script

To submit the job to the cluster, first create a training script. Run the following code to create the training script called `train.py` in the directory you just created. 

In [37]:
%%writefile $script_folder/train.py

import argparse
import os
import numpy as np
import glob
import math

import azureml.core
from azureml.core import Workspace,Dataset
from sklearn.linear_model import LogisticRegression
import joblib

from azureml.core import Run
from utils import load_data



# let user feed in 2 parameters, the dataset to mount or download, and the regularization rate of the logistic regression model
# parser = argparse.ArgumentParser()
# parser.add_argument('--week', type=int, dest='week', help='week of the wanted estimate')
# parser.add_argument('--year', type=int, dest='year', help='year of the wanted estimate')
# parser.add_argument('--month', type=int, dest='month', help='month of the wanted estimate')
# parser.add_argument('--stock_1_week_ago', type=int, dest='stock_1_week_ago', help='stock the week before the wanted week')
# parser.add_argument('--stock_2_week_ago', type=int, dest='stock_2_week_ago', help='stock the 2 weeks before the wanted week')
# args = parser.parse_args()

# data_folder = args.data_folder
# print('Data folder:', data_folder)

# get hold of the current run
run = Run.get_context()

workspace = run.experiment.workspace
dataset_name = 'storage'
# Get a dataset by name
dataset = Dataset.get_by_name(workspace=workspace, name=dataset_name)
storage = dataset.to_pandas_dataframe()

storage = storage.drop(['Month'], axis=1, errors='ignore')
storage.insert(2, 'Month', [1,1,1,1,2,2,2,2,3,3,3,3,3,4,4,4,4,5,5,5,5,
                            6,6,6,6,6,7,7,7,7,8,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,11,12,12,12,12])

for columnName in storage.columns.drop(['Week','Year','Month']):
    prior1 = columnName+"_1_week_ago"
    prior2 = columnName+"_2_week_ago"
    storage = storage.drop([prior1], axis=1, errors='ignore')
    storage = storage.drop([prior2], axis=1, errors='ignore')
    storage.insert(3, prior1, storage[columnName].shift(1))
    storage.insert(3, prior2, storage[columnName].shift(2))
    
storage=storage.fillna(0)
    

currentPie = 'Pie_id_1_stock'
tempDf = storage.loc[:,['Week','Year','Month',(currentPie),(currentPie+'_1_week_ago'),(currentPie+'_2_week_ago')]]

x_train = tempDf.loc[0:math.floor(tempDf.shape[0]/3)*2,tempDf.columns.drop(currentPie)]
x_test = tempDf.loc[math.floor(tempDf.shape[0]/3)*2+1:,tempDf.columns.drop(currentPie)]
# x_test = [args.week,args.year,args.month,args.stock_1_week_ago,args.stock_1_week_ago]
y_train = tempDf[currentPie].loc[0:math.floor(tempDf.shape[0]/3)*2]
y_test = tempDf[currentPie].loc[math.floor(tempDf.shape[0]/3)*2+1:]




# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(x_train, y_train);
# Use the forest's predict method on the test data
predictions = rf.predict(x_test)

# calculate accuracy on the prediction
acc = np.average(predictions == y_test)
print('Accuracy is', acc)


run.log('accuracy', np.float(acc))

os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=rf, filename='outputs/sklearn-storage-prediction_model.pkl')


Overwriting /mnt/batch/tasks/shared/LS_root/mounts/clusters/bethanys-ml-compute/code/users/apdarr/bethanys/sklearn-storage-prediction/train.py


The file `utils.py` is referenced from the training script to load the dataset correctly.  Copy this script into the script folder so that it can be accessed along with the training script on the remote resource.

In [38]:
import shutil
shutil.copy('utils.py', script_folder)

'/mnt/batch/tasks/shared/LS_root/mounts/clusters/bethanys-ml-compute/code/users/apdarr/bethanys/sklearn-storage-prediction/utils.py'

### Create an estimator

An estimator object is used to submit the run. Azure Machine Learning has pre-configured estimators for common machine learning frameworks, as well as generic Estimator. Create an estimator by specifying

* The name of the estimator object, `est`
* The directory that contains your scripts. All the files in this directory are uploaded into the cluster nodes for execution. 
* The compute target.  In this case you will use the AmlCompute you created
* The training script name, train.py
* An environment that contains the libraries needed to run the script
* Parameters required from the training script. 

In this tutorial, the target is AmlCompute. All files in the script folder are uploaded into the cluster nodes for execution. The data_folder is set to use the dataset.

First, create the environment that contains: the scikit-learn library, azureml-dataprep required for accessing the dataset, and azureml-defaults which contains the dependencies for logging metrics. The azureml-defaults also contains the dependencies required for deploying the model as a web service later in the part 2 of the tutorial.

Once the environment is defined, register it with the Workspace to re-use it in part 2 of the tutorial.

In [39]:
from azureml.core.environment import Environment
from azureml.core.conda_dependencies import CondaDependencies

# to install required packages
env = Environment('bethanys-env')
cd = CondaDependencies.create(pip_packages=['azureml-dataprep[pandas,fuse]>=1.1.14', 'azureml-defaults'], conda_packages = ['scikit-learn==0.22.1'])

env.python.conda_dependencies = cd

# Register environment to re-use later
env.register(workspace = ws)

{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/base:intelmpi2018.3-ubuntu16.04",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "username": null
        },
        "enabled": false,
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "bethanys-env",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "anaconda",
                "conda-forge"
            ],
            "dependencies": [
                "python=3.6.2",
                {
                    "pip": [
                

### Submit the job to the cluster

Run the experiment by submitting the estimator object. And you can navigate to Azure portal to monitor the run.

In [40]:
from azureml.train.estimator import Estimator

script_params = {
   }

est = Estimator(source_directory=script_folder,
              script_params=script_params,
              compute_target=compute_target,
              environment_definition=env,
              entry_script='train.py')

In [41]:
run = exp.submit(config=est)
run

Experiment,Id,Type,Status,Details Page,Docs Page
sklearn-storage-prediction,sklearn-storage-prediction_1589439291_9c3f6b4f,azureml.scriptrun,Starting,Link to Azure Machine Learning studio,Link to Documentation


In [42]:
from azureml.widgets import RunDetails
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [43]:
# specify show_output to True for a verbose log
run.wait_for_completion(show_output=True) 

RunId: sklearn-storage-prediction_1589439291_9c3f6b4f
Web View: https://ml.azure.com/experiments/sklearn-storage-prediction/runs/sklearn-storage-prediction_1589439291_9c3f6b4f?wsid=/subscriptions/483b73e5-b0bf-40ea-840d-cdb839f75a3b/resourcegroups/AIMLDemo/workspaces/ml-environment

Streaming azureml-logs/55_azureml-execution-tvmps_8103651318e5fdfb0a8c3de7ea4353c929ffb52f72ca2ca9bacc9d0d161f1707_d.txt

2020-05-14T07:01:06Z Starting output-watcher...
2020-05-14T07:01:06Z IsDedicatedCompute == True, won't poll for Low Pri Preemption
9125e2303251fcff723c8bcdfce6a8d647f25e3fe8d9d6b2dcd6456722e1b70a
2020/05/14 07:02:47 Instrumentation Key Is Empty Skipping App Insight Logger
2020/05/14 07:02:47 Version: 3.0.01220.0001 Branch: master Commit: 1565d0f6
2020/05/14 07:02:47 /dev/infiniband/uverbs0 found (implying presence of InfiniBand)?: false
2020/05/14 07:02:47 sshd inside container not required for job, skipping setup.

Streaming azureml-logs/65_job_prep-tvmps_8103651318e5fdfb0a8c3de7ea4353c

{'runId': 'sklearn-storage-prediction_1589439291_9c3f6b4f',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2020-05-14T07:01:00.126024Z',
 'endTimeUtc': '2020-05-14T07:03:26.711567Z',
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': 'ede1e08c-5b04-460d-bfe0-ce08b6e923d6',
  'AzureML.DerivedImageName': 'azureml/azureml_7b48670a0423858f69b60b23972c8e06',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [{'dataset': {'id': 'eb6fb24f-0ee3-4051-8a1f-9281d0e4f376'}, 'consumptionDetails': {'type': 'Reference'}}],
 'runDefinition': {'script': 'train.py',
  'useAbsolutePath': False,
  'arguments': [],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'cpu-cluster',
  'dataReferences': {},
  'data': {},
  'jobName': None,
  'maxRunDurationSeconds': None,
  'nodeCount': 1,
  'environment': {'name': 'bethanys-env'

In [45]:
print(run.get_metrics())

{'accuracy': 0.0}


In [44]:
print(run.get_file_names())

['azureml-logs/55_azureml-execution-tvmps_8103651318e5fdfb0a8c3de7ea4353c929ffb52f72ca2ca9bacc9d0d161f1707_d.txt', 'azureml-logs/65_job_prep-tvmps_8103651318e5fdfb0a8c3de7ea4353c929ffb52f72ca2ca9bacc9d0d161f1707_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_8103651318e5fdfb0a8c3de7ea4353c929ffb52f72ca2ca9bacc9d0d161f1707_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'logs/azureml/100_azureml.log', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log', 'outputs/sklearn-storage-prediction_model.pkl']


In [47]:
# register model 
model = run.register_model(model_name='sklearn-storage-prediction', model_path='outputs/sklearn-storage-prediction_model.pkl')
print(model.name, model.id, model.version, sep='\t')

sklearn-storage-prediction	sklearn-storage-prediction:1	1
