# Automated ML

In [14]:
import logging
import os
import csv
import json
import requests
import pkg_resources
import azureml.core
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt

from sklearn import datasets

from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.pipeline.steps import AutoMLStep
from azureml.core.dataset import Dataset
from azureml.core.webservice import AciWebservice, Webservice
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.core.model import InferenceConfig, Model

from azureml.pipeline.steps import AutoMLStep
from azureml.widgets import RunDetails

# Record which Azure ML SDK version this notebook was executed with
sdk_version = azureml.core.VERSION
print("SDK version:", sdk_version)

SDK version: 1.27.0


In [2]:
# Load the Azure ML workspace from the local workspace configuration
ws = Workspace.from_config()

# Name of the experiment and the folder handed to AutoML as its project path
experiment_name = 'predict_rain_automl'
project_folder = '.'

experiment = Experiment(ws, experiment_name)

amlcompute_cluster_name = "notebook144713"

# Reuse the compute target if one with this name already exists;
# only provision a new cluster when the lookup fails.
try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # The AutoML settings in this notebook request 5 concurrent iterations and
    # each iteration occupies one node, so the cluster must be allowed to scale
    # past 5 nodes (6 = one spare node above the concurrency limit).
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
                                                           max_nodes=6)
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)

# Block until provisioning has finished, then display the cluster status
compute_target.wait_for_completion(show_output=True)
compute_target.get_status()

Found existing cluster, use it.

Running


{
  "errors": [],
  "creationTime": "2021-05-13T12:33:15.043599+00:00",
  "createdBy": {
    "userObjectId": "dda90ccb-9476-4fa5-9c6f-fbc50afb3b62",
    "userTenantId": "660b3398-b80e-49d2-bc5b-ac1dc93b5254",
    "userName": null
  },
  "modifiedTime": "2021-05-13T12:35:31.546388+00:00",
  "state": "Running",
  "vmSize": "STANDARD_DS3_V2"
}

## Dataset

### Overview
For this capstone project, I will use a [dataset](https://www.kaggle.com/jsphyg/weather-dataset-rattle-package?select=weatherAUS.csv) compiled by Joe Young, originally hosted on Kaggle and uploaded to my personal [GitHub account](https://github.com/Aschteroth/udacity_capstone_project).

The dataset contains 10 years' worth of meteorological data (2007-2017) from different locations in Australia, including whether it rained on the day of the observation and whether it rained the day after.

This project aims to deploy a webservice that predicts whether it will rain tomorrow in Australia, given a specific set of weather parameters.

In [3]:
# Try to load the dataset from the Workspace. Otherwise, create it from the file
key = "rain-in-australia"
description_text = """
This dataset contains about 10 years of daily weather observations from many locations across Australia. 
RainTomorrow is the target variable to predict. It means -- did it rain the next day, Yes or No? This column is Yes if the rain for that day was 1mm or more.

Source & Acknowledgements
Observations were drawn from numerous weather stations. The daily observations are available from http://www.bom.gov.au/climate/data.
An example of latest weather observations in Canberra: http://www.bom.gov.au/climate/dwo/IDCJDW2801.latest.shtml

Definitions adapted from http://www.bom.gov.au/climate/dwo/IDCJDW0000.shtml
Data source: http://www.bom.gov.au/climate/dwo/ and http://www.bom.gov.au/climate/data.

Copyright Commonwealth of Australia 2010, Bureau of Meteorology.
"""
# Raw CSV that backs the dataset when it has not been registered yet
url = 'https://raw.githubusercontent.com/Aschteroth/udacity_capstone_project/main/weatherAUS.csv'

# Read the workspace's registered datasets once and reuse the result for
# both the membership test and the lookup.
registered_datasets = ws.datasets
found = key in registered_datasets

if found:
    dataset = registered_datasets[key]
else:
    # Create a tabular AML Dataset from the CSV and register it in the Workspace
    dataset = Dataset.Tabular.from_delimited_files(url)
    dataset = dataset.register(workspace=ws,
                               name=key,
                               description=description_text)

# Materialise the dataset as a pandas DataFrame and summarise its columns
df = dataset.to_pandas_dataframe()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
Date             145460 non-null object
Location         145460 non-null object
MinTemp          143975 non-null float64
MaxTemp          144199 non-null float64
Rainfall         142199 non-null float64
Evaporation      82670 non-null object
Sunshine         75625 non-null object
WindGustDir      135134 non-null object
WindGustSpeed    135197 non-null float64
WindDir9am       134894 non-null object
WindDir3pm       141232 non-null object
WindSpeed9am     143693 non-null float64
WindSpeed3pm     142398 non-null float64
Humidity9am      142806 non-null float64
Humidity3pm      140953 non-null float64
Pressure9am      130395 non-null float64
Pressure3pm      130432 non-null float64
Cloud9am         89572 non-null float64
Cloud3pm         86102 non-null float64
Temp9am          143693 non-null float64
Temp3pm          141851 non-null float64
RainToday        142199 non-null object

## AutoML Configuration

For this project, I decided to stick with the AutoML settings that we used in the previous exercise.

- The timeout is set to 20 minutes for efficiency reasons
- the number of max concurrent iterations is set to 5, one below the maximum number of nodes
- for the primary metric, I chose AUC_weighted (Area under the curve) 

My choices for the config were as follows: 

- Since we want to answer the question "Will it rain in Australia tomorrow?", and the answer is binary (either "Yes" or "No"), we are dealing with a classification problem.
- Our target column is "RainTomorrow"
- Since this is just a project to show my newly acquired skills, I enabled early stopping (otherwise, the experiment might run for several days).

In [4]:
# Run-level knobs: overall time budget, parallelism and the metric to optimise
automl_settings = dict(
    experiment_timeout_minutes=20,
    max_concurrent_iterations=5,
    primary_metric='AUC_weighted',
)

# Classification on the "RainTomorrow" column, trained on the remote compute
# target with automatic featurization and early stopping enabled.
automl_config = AutoMLConfig(
    task="classification",
    compute_target=compute_target,
    training_data=dataset,
    label_column_name="RainTomorrow",
    featurization='auto',
    enable_early_stopping=True,
    path=project_folder,
    debug_log="automl_errors.log",
    **automl_settings,
)

In [5]:
# Hand the AutoML configuration to the experiment and show the run output inline
remote_run = experiment.submit(config=automl_config, show_output=True)

Submitting remote run.
No run_configuration provided, running on notebook144713 with default configuration
Running on remote compute: notebook144713


Experiment,Id,Type,Status,Details Page,Docs Page
predict_rain_automl,AutoML_87495046-e406-4aaf-94ae-1fe2afbc4734,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Train-Test data split
STATUS:       DONE
DESCRIPTION:  Your input data has been split into a training dataset and a holdout test dataset for validation of the model. The test holdout dataset reflects the original distribution of your input data.
              
DETAILS:      
+---------------------------------+---------------------------------+---------------------------------+
|Dataset                          |Row counts                       |Percentage                       |
|train                            |130914           

## Run Details

In [6]:
# Display the notebook widget for the submitted AutoML run
run_details_widget = RunDetails(remote_run)
run_details_widget.show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

## Best Model

The best model was a **StackEnsemble** with a **weighted AUC** of approximately 0.88999.



In [7]:
# Retrieve the best run of the AutoML experiment together with its fitted model
best_run, fitted_model = remote_run.get_output()

# Metrics of the best run; reused when the model is registered
best_run_metrics = best_run.get_metrics()

print(best_run)
print('Best Run Id: ', best_run.id)

Run(Experiment: predict_rain_automl,
Id: AutoML_87495046-e406-4aaf-94ae-1fe2afbc4734_19,
Type: azureml.scriptrun,
Status: Completed)
Best Run Id:  AutoML_87495046-e406-4aaf-94ae-1fe2afbc4734_19


In [8]:
# Show the textual description of the fitted pipeline
pipeline_description = str(fitted_model)
print(pipeline_description)

Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('stackensembleclassifier',
                 StackE...
                                         meta_learner=LogisticRegressionCV(Cs=10,
                                                                           class_weight=None,
                                                                           cv=None,
                                                                           dual=False,
                           

In [9]:
# Metadata attached to the registered model: how it was produced and how well it scored
registration_tags = {'Method': 'AutoML'}
registration_properties = {'AUC_weighted': best_run_metrics['AUC_weighted']}

# Register the model file from the best run's outputs under a fixed name
autoMLmodel = best_run.register_model(
    model_name='rain_autoML_model',
    model_path='outputs/model.pkl',
    tags=registration_tags,
    properties=registration_properties,
)

print(autoMLmodel)

Model(workspace=Workspace.create(name='quick-starts-ws-144713', subscription_id='61c5c3f0-6dc7-4ed9-a7f3-c704b20e3b30', resource_group='aml-quickstarts-144713'), name=rain_autoML_model, id=rain_autoML_model:1, version=1, tags={'Method': 'AutoML'}, properties={'AUC_weighted': '0.8899867796008221'})


## Model Deployment

Since the AutoML run yielded a slightly better result than the HyperDrive runs (around 3%), I will deploy the "rain_autoML_model".

In [15]:
# Download the scoring script and conda environment file from the best run's outputs
for remote_path, local_path in (
    ('outputs/scoring_file_v_1_0_0.py', 'score.py'),
    ('outputs/conda_env_v_1_0_0.yml', 'env.yml'),
):
    best_run.download_file(remote_path, local_path)

# Container sizing for the ACI web service
aciconfig = AciWebservice.deploy_configuration(
    cpu_cores=1,
    memory_gb=1,
    description='Predict if it will rain tomorrow in Australia',
)

# Score with the downloaded entry script inside the best run's environment
scoring_environment = best_run.get_environment()
inference_config = InferenceConfig(entry_script="score.py", environment=scoring_environment)



In [17]:
# Deploy the registered model as a web service using the ACI and inference configs
deployment_kwargs = dict(
    workspace=ws,
    name='predict-rain-webservice',
    models=[autoMLmodel],
    inference_config=inference_config,
    deployment_config=aciconfig,
)
service = Model.deploy(**deployment_kwargs)

In [18]:
# Wait until the deployment has finished, then report the service state and endpoints
service.wait_for_deployment(show_output=True)

for label, value in (
    ("Service State: ", service.state),
    ("Scoring URI: ", service.scoring_uri),
    ("Swagger URI: ", service.swagger_uri),
):
    print(label, value)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2021-05-13 14:51:20+00:00 Creating Container Registry if not exists.
2021-05-13 14:51:20+00:00 Registering the environment.
2021-05-13 14:51:21+00:00 Use the existing image.
2021-05-13 14:51:21+00:00 Generating deployment configuration.
2021-05-13 14:51:22+00:00 Submitting deployment to compute.
2021-05-13 14:51:25+00:00 Checking the status of deployment predict-rain-webservice..
2021-05-13 14:55:11+00:00 Checking the status of inference endpoint predict-rain-webservice.
Succeeded
ACI service creation operation finished, operation "Succeeded"
Service State:  Healthy
Scoring URI:  http://fd3b1788-817d-4183-916f-de6dda0b7bb8.southcentralus.azurecontainer.io/score
Swagger URI:  http://fd3b1788-817d-4183-916f-de6dda0b7bb8.southcentralus.azurecontainer.io/swagger.json


In [37]:
# Build a JSON payload from ten reproducibly sampled rows of the original dataset
df_test = df.dropna()  # keep only rows without missing values
test_sample = df_test.sample(n=10, random_state=10)

# Split off the target column so the payload carries features only
label_sample = test_sample.pop('RainTomorrow')

payload = {'data': test_sample.to_dict(orient="records")}
samples = json.dumps(payload)

print(samples)

{"data": [{"Date": "28.07.2010", "Location": "Sale", "MinTemp": 1.2, "MaxTemp": 12.4, "Rainfall": 0.2, "Evaporation": "1.2", "Sunshine": "5.7", "WindGustDir": "E", "WindGustSpeed": 19.0, "WindDir9am": "NW", "WindDir3pm": "ESE", "WindSpeed9am": 6.0, "WindSpeed3pm": 13.0, "Humidity9am": 97.0, "Humidity3pm": 67.0, "Pressure9am": 1029.0, "Pressure3pm": 1023.2, "Cloud9am": 7.0, "Cloud3pm": 6.0, "Temp9am": 4.9, "Temp3pm": 12.2, "RainToday": false}, {"Date": "12.04.2010", "Location": "MountGambier", "MinTemp": 10.5, "MaxTemp": 16.7, "Rainfall": 0.2, "Evaporation": "3.4", "Sunshine": "2.2", "WindGustDir": "SW", "WindGustSpeed": 39.0, "WindDir9am": "SW", "WindDir3pm": "SW", "WindSpeed9am": 20.0, "WindSpeed3pm": 22.0, "Humidity9am": 59.0, "Humidity3pm": 56.0, "Pressure9am": 1021.9, "Pressure3pm": 1022.0, "Cloud9am": 3.0, "Cloud3pm": 7.0, "Temp9am": 14.7, "Temp3pm": 15.6, "RainToday": false}, {"Date": "29.07.2014", "Location": "Portland", "MinTemp": 10.6, "MaxTemp": 15.2, "Rainfall": 3.6, "Evapor

In [38]:
# Endpoint, request body and headers for the scoring call
scoring_uri = service.scoring_uri
input_data = samples
headers = {'Content-Type': 'application/json'}

# Send request to webservice, display response.
# A timeout keeps the cell from hanging forever if the endpoint is unreachable.
resp = requests.post(scoring_uri, data=input_data, headers=headers, timeout=60)
print(resp.text)

# Surface HTTP errors (4xx/5xx) instead of silently continuing; the body is
# printed first so the service's error message is still visible.
resp.raise_for_status()

"{\"result\": [false, false, true, false, false, false, false, false, false, false]}"


In [39]:
# Print the web service logs, then delete the service
service_logs = service.get_logs()
print(service_logs)

service.delete()


2021-05-13T14:55:05,952620900+00:00 - gunicorn/run 
2021-05-13T14:55:05,957248900+00:00 - iot-server/run 
2021-05-13T14:55:05,952058900+00:00 - rsyslog/run 
2021-05-13T14:55:05,994146300+00:00 - nginx/run 
/usr/sbin/nginx: /azureml-envs/azureml_98cae94c606e3ceb655a787040a8a93c/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_98cae94c606e3ceb655a787040a8a93c/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_98cae94c606e3ceb655a787040a8a93c/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_98cae94c606e3ceb655a787040a8a93c/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_98cae94c606e3ceb655a787040a8a93c/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
rsyslogd