# Automated ML



In [1]:
pip install xgboost==0.90

Collecting xgboost==0.90
  Downloading xgboost-0.90-py2.py3-none-manylinux1_x86_64.whl (142.8 MB)
[K     |████████████████████████████████| 142.8 MB 25 kB/s s eta 0:00:01��████████████████████████▌   | 127.4 MB 78.1 MB/s eta 0:00:01
Installing collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 1.3.3
    Uninstalling xgboost-1.3.3:
      Successfully uninstalled xgboost-1.3.3
Successfully installed xgboost-0.90
Note: you may need to restart the kernel to use updated packages.


In [2]:
import azureml.core
from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core import Dataset
from azureml.train.automl.utilities import get_primary_metrics
from azureml.train.automl import AutoMLConfig
from azureml.pipeline.steps import AutoMLStep
from azureml.widgets import RunDetails
import os
import joblib
from azureml.core.environment import Environment
from azureml.core.model import InferenceConfig, Model
from azureml.core.webservice.aci import AciWebservice
from azureml.core.webservice import Webservice
from azureml.core.model import Model

## Dataset

### Overview

The dataset is taken from Kaggle: <https://www.kaggle.com/arashnic/hr-analytics-job-change-of-data-scientists>.

The task is to predict whether an employee will leave their current job, based on the following features:

- `enrollee_id`: Unique ID of the candidate
- `city`: City code
- `city_development_index`: Development index of the city (scaled)
- `gender`: Gender of the candidate
- `relevent_experience`: Relevant experience of the candidate (the column name is spelled this way in the dataset)
- `enrolled_university`: Type of university course enrolled in, if any
- `education_level`: Education level of the candidate
- `major_discipline`: Major discipline of the candidate's education
- `experience`: Candidate's total experience in years
- `company_size`: Number of employees in the current employer's company
- `company_type`: Type of the current employer
- `last_new_job`: Difference in years between the previous job and the current job
- `training_hours`: Training hours completed
- `target`: 0 – Not looking for a job change, 1 – Looking for a job change



In [3]:
# Connect to the workspace described by the local Azure ML config file.
ws = Workspace.from_config()

# All runs submitted from this notebook are grouped under this experiment name.
experiment_name = 'automl-job-change-expt'
experiment = Experiment(ws, experiment_name)

# Show which workspace we are attached to, one field per line.
print('\n'.join((
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
)))

# NOTE(review): `run` is not referenced again in the visible notebook and is
# never explicitly completed — confirm whether this interactive run is needed.
run = experiment.start_logging()

Workspace name: quick-starts-ws-141031
Azure region: southcentralus
Subscription id: 610d6e37-4747-4a20-80eb-3aad70a55f43
Resource group: aml-quickstarts-141031


In [4]:
compute_cluster_name = "demo-1-cluster"

try:
    # Look the cluster up by name; raises ComputeTargetException when the lookup fails.
    Compute_target = ComputeTarget(workspace=ws, name=compute_cluster_name)
except ComputeTargetException:
    # Lookup failed: provision a new cluster and block until it is ready.
    config_of_compute = AmlCompute.provisioning_configuration(
        vm_size="Standard_D2_V2",
        max_nodes=4,
    )
    Compute_target = ComputeTarget.create(ws, compute_cluster_name, config_of_compute)
    Compute_target.wait_for_completion(show_output=True)
else:
    print("Found existing cluster! No need to create new!")

Found existing cluster! No need to create new!


In [5]:
# Fetch the dataset registered in the workspace under the name 'aug_train'
# and display it as a pandas DataFrame.
dataset = Dataset.get_by_name(workspace=ws, name='aug_train')
dataset.to_pandas_dataframe()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19153,7386,city_173,0.878,Male,No relevent experience,no_enrollment,Graduate,Humanities,14,,,1,42,1.0
19154,31398,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,,,4,52,1.0
19155,24576,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,4,44,0.0
19156,5756,city_65,0.802,Male,Has relevent experience,no_enrollment,High School,,<1,500-999,Pvt Ltd,2,97,0.0


In [6]:
# Drop the candidate identifier and the city code before training.
# NOTE(review): the preview above also shows an `Unnamed: 0` column, which is
# not dropped here and therefore stays in the training data — confirm that is intended.
columns_to_drop = ["enrollee_id", "city"]
dataset = dataset.drop_columns(columns_to_drop)

## AutoML Configuration



In [7]:
# Problem definition and time budget: a classification task on the `target`
# column, optimised for accuracy, with a 30-minute experiment timeout.
automl_settings = dict(
    experiment_timeout_minutes=30,
    task="classification",
    primary_metric="accuracy",
    training_data=dataset,
    label_column_name="target",
)

# Execution options (cross-validation folds, early stopping, remote compute,
# parallelism, featurization) combined with the settings above.
automl_config = AutoMLConfig(
    n_cross_validations=6,
    enable_early_stopping=True,
    compute_target=Compute_target,
    max_cores_per_iteration=-1,
    max_concurrent_iterations=4,
    featurization="auto",
    **automl_settings,
)

In [8]:
# TODO: Submit your experiment
remote_run = experiment.submit(automl_config, show_output=True)

Running on remote.
No run_configuration provided, running on demo-1-cluster with default configuration
Running on remote compute: demo-1-cluster
Parent Run ID: AutoML_4c8b955c-9895-4d84-bbc7-96a12522b18d

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       DONE
DESCRIPTION:  If the missing values ar

## Run Details



In [9]:
# Display the run-monitoring widget for the submitted AutoML run.
run_details_widget = RunDetails(remote_run)
run_details_widget.show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [10]:
# Block until the AutoML run finishes, streaming its log output; the value
# returned by the call is displayed as the cell result.
remote_run.wait_for_completion(show_output=True)



****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       DONE
DESCRIPTION:  If the missing values are expected, let the run complete. Otherwise cancel the current run and use a script to customize the handling of missing feature values that may be more appropriate based on the data type and business requirement.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization
DETAILS:      
+---------------------------------+---------------------------------+
|Column name                   

{'runId': 'AutoML_4c8b955c-9895-4d84-bbc7-96a12522b18d',
 'target': 'demo-1-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-03-21T12:03:02.798092Z',
 'endTimeUtc': '2021-03-21T12:34:21.700401Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '6',
  'target': 'demo-1-cluster',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"2c34e0c2-7327-45ef-b8c2-4c40d6343374\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"UI/03-21-2021_113957_UTC/aug_train.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-141031\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"610d6e37-4747-4a20-80eb-3aad70a

## Best Model





In [11]:
# get_output() yields a pair that is unpacked here into the best run and its model.
best_run_and_model = remote_run.get_output()
automl_best_run, model = best_run_and_model
automl_best_run

Experiment,Id,Type,Status,Details Page,Docs Page
automl-job-change-expt,AutoML_4c8b955c-9895-4d84-bbc7-96a12522b18d_48,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [12]:
# Metrics and details recorded for the best run.
metrics = automl_best_run.get_metrics()
parameters = automl_best_run.get_details()

# Report the run id and the accuracy (the primary metric configured for this experiment).
best_run_id = automl_best_run.id
best_accuracy = metrics["accuracy"]
print("Best Run ID: ", best_run_id)
print("Accuracy: ", best_accuracy)

Best Run ID:  AutoML_4c8b955c-9895-4d84-bbc7-96a12522b18d_48
Accuracy:  0.8004488986324251


In [13]:
# Inspect the last step of the fitted pipeline (the estimator that produces
# the predictions). `steps` is the public list of (name, estimator) pairs, so
# this avoids reaching into the private `_final_estimator` attribute.
model.steps[-1][1]

PreFittedSoftVotingClassifier(classification_labels=None,
                              estimators=[('31',
                                           Pipeline(memory=None,
                                                    steps=[('standardscalerwrapper',
                                                            <azureml.automl.runtime.shared.model_wrappers.StandardScalerWrapper object at 0x7fadc75c9c50>),
                                                           ('lightgbmclassifier',
                                                            LightGBMClassifier(boosting_type='gbdt',
                                                                               class_weight=None,
                                                                               colsample_bytree=0.6933333333333332,
                                                                               importance_typ...
                                                                                   min_sample

In [14]:
# Save the best model
joblib.dump(model, filename="outputs/automl_model.joblib")

['outputs/automl_model.joblib']

In [15]:
# The best run stores the name of its model under the 'model_name' property.
best_run_properties = automl_best_run.properties
model_name = best_run_properties['model_name']
model_name

'AutoML4c8b955c948'

## Model Deployment



In [16]:
# Environment of the best run; `env` is passed to InferenceConfig further down.
env = automl_best_run.get_environment()

# save_to_directory() only writes the definition to disk and returns nothing,
# so it must not be chained into the `env` assignment (that would leave `env`
# as None). overwrite=True keeps the cell re-runnable once the folder exists.
env.save_to_directory(path='environments', overwrite=True)

# Download the scoring script stored with the best run; it becomes the
# entry script of the deployed service.
script_file = 'score.py'
automl_best_run.download_file('outputs/scoring_file_v_1_0_0.py', script_file)

In [17]:
# Register the AutoML run's model in the workspace.
# Note: from here on `model` refers to the value returned by register_model(),
# replacing the object previously unpacked from get_output().
model = remote_run.register_model(
    model_name=model_name,
    description='AutoML best model',
)

model.id

'AutoML4c8b955c948:1'

In [18]:
# Scoring entry point and environment for the service.
# NOTE(review): `env` is whatever the environment cell above assigned — verify
# it holds an Environment object rather than None.
inference_config = InferenceConfig(
    entry_script=script_file,
    environment=env,
)

# Container resources for the ACI deployment: 1 CPU core and 1 GB of memory.
aci_config = AciWebservice.deploy_configuration(
    cpu_cores=1,
    memory_gb=1,
)

In [19]:
# Deploy the registered model as the 'analytics-api' web service and wait
# for the deployment to finish, streaming progress to the cell output.
service = Model.deploy(
    workspace=ws,
    name='analytics-api',
    models=[model],
    inference_config=inference_config,
    deployment_config=aci_config,
)
service.wait_for_deployment(show_output=True)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2021-03-21 12:42:08+00:00 Creating Container Registry if not exists.
2021-03-21 12:42:09+00:00 Use the existing image.
2021-03-21 12:42:09+00:00 Generating deployment configuration.
2021-03-21 12:42:11+00:00 Submitting deployment to compute..
2021-03-21 12:42:16+00:00 Checking the status of deployment analytics-api..
2021-03-21 12:45:54+00:00 Checking the status of inference endpoint analytics-api.
Succeeded
ACI service creation operation finished, operation "Succeeded"


In [20]:
# Report the service state and the scoring URI, one per line.
print(
    "State: " + service.state,
    "Scoring URI: " + service.scoring_uri,
    sep="\n",
)

State: Healthy
Scoring URI: http://62222515-5a71-4175-a1ae-13605caa2184.southcentralus.azurecontainer.io/score


In [21]:
# Run the external endpoint.py script; the output below is what that script prints.
# NOTE(review): endpoint.py is not shown in this notebook — presumably it sends
# sample records to the scoring URI printed above; confirm against the script.
%run endpoint.py

{"result": [0.0, 0.0, 0.0]}


In [26]:
# Clean up: delete the deployed web service once testing is done.
service.delete()