## Predecir Notas de Estudiantes: Ajuste de Hiperparámetros del Modelo con HyperDrive

In [1]:
import azureml.core
from azureml.core import Workspace, Dataset

# Obtain the workspace handle via Workspace.from_config(), then look up the
# dataset registered under the name 'Notas Estudiantes' in that workspace.
ws = Workspace.from_config()
ds_notas = Dataset.get_by_name(workspace=ws, name='Notas Estudiantes')


### Crear Script de Entrenamiento

In [2]:
%%writefile notas_training.py
"""Training script executed once per HyperDrive child run.

Reads the hyperparameters from the command line, trains a
GradientBoostingRegressor on the run's 'training_data' input dataset, logs the
hyperparameters and the hold-out R2 score to the run, and pickles the fitted
model under the run's outputs/ folder.
"""
import argparse, joblib, os
from azureml.core import Run
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score

# Predictor columns and the column being predicted.
FEATURE_COLUMNS = ['g1', 'g2', 'dalc', 'walc', 'goout', 'failures', 'age', 'medu', 'fedu']
TARGET_COLUMN = 'g3'

# Folder and file the fitted model is written to. Azure ML uploads the
# contents of ./outputs to the run history, which makes the model available
# as a run artifact after the job finishes.
OUTPUT_DIR = 'outputs'
MODEL_FILENAME = 'predecir_notas.pkl'

run = Run.get_context()

parser = argparse.ArgumentParser()

# The value of --input-data is not read directly below; the dataset is fetched
# from run.input_datasets by its input name instead.
parser.add_argument("--input-data", type=str, dest='input_data', help='training dataset')

# Hyperparameters supplied per run; defaults apply when an argument is omitted.
parser.add_argument('--learning_rate', type=float, dest='learning_rate', default=0.1, help='learning rate')
parser.add_argument('--n_estimators', type=int, dest='n_estimators', default=100, help='number of estimators')

args = parser.parse_args()

# Builtin float/int replace the deprecated np.float/np.int aliases, which
# newer NumPy releases no longer provide.
run.log('learning_rate', float(args.learning_rate))
run.log('n_estimators', int(args.n_estimators))

print("Loading Data...")
# Fetch the dataset registered on the run under the input name 'training_data'.
ds_notas = run.input_datasets['training_data'].to_pandas_dataframe()

X, y = ds_notas[FEATURE_COLUMNS].values, ds_notas[TARGET_COLUMN].values

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical across runs so R2 values are comparable between hyperparameter sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=64)

print('Training a regression model')
model = GradientBoostingRegressor(learning_rate=args.learning_rate,
                                  n_estimators=args.n_estimators).fit(X_train, y_train)

# Score on the hold-out split and log it under the metric name 'R2'.
y_hat = model.predict(X_test)
r2score = r2_score(y_test, y_hat)
print('R2:', r2score)
run.log('R2', float(r2score))

# Persist the fitted model inside outputs/ so it is kept with the run.
os.makedirs(OUTPUT_DIR, exist_ok=True)
joblib.dump(value=model, filename=os.path.join(OUTPUT_DIR, MODEL_FILENAME))

run.complete()

Writing notas_training.py


### Crear Compute Cluster

In [3]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "my-ml-cluster"

try:
    # Reuse the compute target when one with this name already exists.
    training_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # The lookup failed, so provision a new cluster with up to 3 nodes.
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=3)
        training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        training_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        # Report the failure, then re-raise: swallowing it would leave
        # `training_cluster` undefined (or half-provisioned) and surface later
        # as an unrelated NameError when the experiment is configured.
        print(ex)
        raise
    

InProgress......
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [4]:
%%writefile hyperdrive_env.yml
# Conda environment specification written to hyperdrive_env.yml; a later cell
# loads it with Environment.from_conda_specification.
# NOTE(review): the name 'batch_environment' does not match this file's
# HyperDrive purpose and looks carried over from another example - confirm.
name: batch_environment
dependencies:
# NOTE(review): only the Python version is pinned; package versions float, so
# rebuilt environments may differ - consider pinning.
- python=3.6.2
# The training script imports joblib, which is not listed explicitly;
# presumably it is installed as a scikit-learn dependency - verify.
- scikit-learn
- pandas
- numpy
- pip
- pip:
  - azureml-defaults

Writing hyperdrive_env.yml


### Ejecutar Experimento de Tuning con HyperDrive

In [5]:
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.train.hyperdrive import BayesianParameterSampling, HyperDriveConfig, PrimaryMetricGoal, choice
from azureml.widgets import RunDetails

# Environment for the training runs, built from the conda specification file.
hyper_env = Environment.from_conda_specification("experiment_env", "hyperdrive_env.yml")

# Fixed (non-tuned) script argument: the training dataset, exposed to the
# script under the input name 'training_data'.
training_input = ds_notas.as_named_input('training_data')

script_config = ScriptRunConfig(
    source_directory='.',
    script='notas_training.py',
    arguments=['--input-data', training_input],
    environment=hyper_env,
    compute_target=training_cluster,
)

# Discrete candidate values for each tuned command-line argument.
search_space = {
    '--learning_rate': choice(0.01, 0.1, 1.0),
    '--n_estimators': choice(10, 50, 100, 250),
}
params = BayesianParameterSampling(search_space)

# Maximise the 'R2' metric logged by the script; no termination policy is set.
hyperdrive = HyperDriveConfig(
    run_config=script_config,
    hyperparameter_sampling=params,
    policy=None,
    primary_metric_name='R2',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=12,
    max_concurrent_runs=3,
)

# Submit the sweep as an experiment run.
experiment = Experiment(workspace=ws, name='predecir-notas-hyperdrive')
run = experiment.submit(config=hyperdrive)

# Block until the sweep finishes; the returned value is shown as the cell output.
run.wait_for_completion()

For best results with Bayesian Sampling we recommend using a maximum number of runs greater than or equal to 20 times the number of hyperparameters being tuned. Recommendend value:40.


{'runId': 'HD_9cb7861d-2d4c-41c4-98bc-3b48dcac4e09',
 'target': 'my-ml-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-11-06T12:55:10.033932Z',
 'endTimeUtc': '2021-11-06T13:06:50.124165Z',
 'services': {},
 'properties': {'primary_metric_config': '{"name": "R2", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': 'd7e4ccad-6786-4267-877e-8ad3cbe4e250',
  'user_agent': 'python/3.6.9 (Linux-5.4.0-1056-azure-x86_64-with-debian-buster-sid) msrest/0.6.21 Hyperdrive.Service/1.0.0 Hyperdrive.SDK/core.1.34.0',
  'score': '0.8330504796502911',
  'best_child_run_id': 'HD_9cb7861d-2d4c-41c4-98bc-3b48dcac4e09_11',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mymlworkspace0006243372.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_9cb7861d-2d4c-41c4-98bc-3b48dcac4e09/azureml-logs/hyperdrive

### Obtener Mejor Modelo

In [6]:
# Print each child run in the order returned by
# get_children_sorted_by_primary_metric().
ranked_children = run.get_children_sorted_by_primary_metric()
for child_run in ranked_children:
    print(child_run)

# Fetch the best child run, its logged metrics, and the arguments recorded in
# its run definition.
best_run = run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
best_run_details = best_run.get_details()
script_arguments = best_run_details['runDefinition']['arguments']

print('Best Run Id: ', best_run.id)
print(' -R2:', best_run_metrics['R2'])
print(' -Arguments:', script_arguments)

{'run_id': 'HD_9cb7861d-2d4c-41c4-98bc-3b48dcac4e09_11', 'hyperparameters': '{"--learning_rate": 0.1, "--n_estimators": 50}', 'best_primary_metric': 0.8330504796502911, 'status': 'Completed'}
{'run_id': 'HD_9cb7861d-2d4c-41c4-98bc-3b48dcac4e09_5', 'hyperparameters': '{"--learning_rate": 0.1, "--n_estimators": 100}', 'best_primary_metric': 0.82991265736463, 'status': 'Completed'}
{'run_id': 'HD_9cb7861d-2d4c-41c4-98bc-3b48dcac4e09_6', 'hyperparameters': '{"--learning_rate": 0.01, "--n_estimators": 250}', 'best_primary_metric': 0.8225300923528687, 'status': 'Completed'}
{'run_id': 'HD_9cb7861d-2d4c-41c4-98bc-3b48dcac4e09_9', 'hyperparameters': '{"--learning_rate": 0.1, "--n_estimators": 250}', 'best_primary_metric': 0.8108274711844944, 'status': 'Completed'}
{'run_id': 'HD_9cb7861d-2d4c-41c4-98bc-3b48dcac4e09_1', 'hyperparameters': '{"--learning_rate": 1.0, "--n_estimators": 10}', 'best_primary_metric': 0.7616028752977622, 'status': 'Completed'}
{'run_id': 'HD_9cb7861d-2d4c-41c4-98bc-3b4

### Registrar Mejor Modelo

In [8]:
# Register a model from the best run, tagged with its training context and
# carrying the run's R2 value as a property.
# NOTE(review): model_path='.' does not name a specific artifact file - confirm
# that the pickled model is actually part of what gets registered.
model_tags = {'Training context': 'Hyperdrive'}
model_properties = {'R2': best_run_metrics['R2']}
best_run.register_model(
    model_name='predecir_notas.pkl',
    model_path='.',
    tags=model_tags,
    properties=model_properties,
)

Model(workspace=Workspace.create(name='my_ml_workspace', subscription_id='030feb6f-715f-420c-90a9-4d556309931c', resource_group='my_ml_workspace'), name=predecir_notas.pkl, id=predecir_notas.pkl:1, version=1, tags={'Training context': 'Hyperdrive'}, properties={'R2': '0.8330504796502911'})

### Eliminar Compute Cluster

In [11]:
# Delete the compute cluster now that the experiment has finished. Re-running
# earlier cells after this point requires running the cluster cell again.
training_cluster.delete()