In [1]:
from azureml.core import Workspace, Experiment

ws = Workspace.get(name="ilkkaamlws")
exp = Experiment(workspace=ws, name="udacity-project-new")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Workspace name: ilkkaamlws
Azure region: northeurope
Subscription id: 1f63a07e-5703-4d10-925f-b1c603594482
Resource group: ilkka-aml


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

### YOUR CODE HERE ###
compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
training_cluster = ComputeTarget.create(ws, "minunudacluster2", compute_config)

In [3]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform
from azureml.train.hyperdrive import choice
import os

# Specify parameter sampler
param_space = {
                 '--C': choice(0.1, 0.3, 1),
                 '--max_iter': choice(10, 20, 30, 100)
              }
ps = RandomParameterSampling(param_space)

# Specify a Policy
policy = BanditPolicy(slack_amount = 0.2, evaluation_interval=1)

if "training" not in os.listdir():
    os.mkdir("./training")

# Create a SKLearn estimator for use with train.py
est = SKLearn(source_directory=".",
              entry_script='train.py',
             compute_target = training_cluster)

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(estimator=est,
                                     hyperparameter_sampling=ps,
                                     policy=policy,
                                     primary_metric_name='Accuracy',
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     max_total_runs=5,
                                     max_concurrent_runs=2)

In [4]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

### YOUR CODE HERE ###
experiment = Experiment(workspace=ws, name ='minunudahyperi2')
run = experiment.submit(config=hyperdrive_config)
RunDetails(run).show()
run.wait_for_completion()



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

{'runId': 'HD_c8789b19-6b5f-4cbf-a65d-a0d21c92a797',
 'target': 'minunudacluster2',
 'status': 'Completed',
 'startTimeUtc': '2020-11-06T09:59:16.231682Z',
 'endTimeUtc': '2020-11-06T10:09:01.418581Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '4fa74c66-c122-4097-932d-44c4cfdeefe0',
  'score': '0.9087157076960427',
  'best_child_run_id': 'HD_c8789b19-6b5f-4cbf-a65d-a0d21c92a797_2',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://ilkkaamlws0991200259.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_c8789b19-6b5f-4cbf-a65d-a0d21c92a797/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=oMY%2BddEQCzhdidkaLbX9G1GvylhoRSJIsK06XQROOeA%3D&st=2020-11-06T09%3A59%3A09Z&se=2020-11-06T18%3A09%3A09Z&sp=r'}}

In [5]:
import joblib
# Get your best run and save the model from that run.

### YOUR CODE HERE ###
for child_run in run.get_children_sorted_by_primary_metric():
    print(child_run)
    
best_run = run.get_best_run_by_primary_metric()
#joblib.dump(best_run, 'model.joblib')

{'run_id': 'HD_c8789b19-6b5f-4cbf-a65d-a0d21c92a797_2', 'hyperparameters': '{"--C": 0.1, "--max_iter": 100}', 'best_primary_metric': 0.9087157076960427, 'status': 'Completed'}
{'run_id': 'HD_c8789b19-6b5f-4cbf-a65d-a0d21c92a797_4', 'hyperparameters': '{"--C": 0.1, "--max_iter": 20}', 'best_primary_metric': 0.9077445982034474, 'status': 'Completed'}
{'run_id': 'HD_c8789b19-6b5f-4cbf-a65d-a0d21c92a797_0', 'hyperparameters': '{"--C": 0.3, "--max_iter": 30}', 'best_primary_metric': 0.9073804321437242, 'status': 'Completed'}
{'run_id': 'HD_c8789b19-6b5f-4cbf-a65d-a0d21c92a797_3', 'hyperparameters': '{"--C": 1, "--max_iter": 100}', 'best_primary_metric': 0.9045884923525127, 'status': 'Completed'}
{'run_id': 'HD_c8789b19-6b5f-4cbf-a65d-a0d21c92a797_1', 'hyperparameters': '{"--C": 1, "--max_iter": 10}', 'best_primary_metric': 0.9038601602330663, 'status': 'Completed'}
{'run_id': 'HD_c8789b19-6b5f-4cbf-a65d-a0d21c92a797_preparation', 'hyperparameters': None, 'best_primary_metric': None, 'status

In [6]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
### YOUR CODE HERE ###
ds = TabularDatasetFactory.from_delimited_files("https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv")


In [7]:
import pandas as pd
def clean_data(data):
    # Dict for cleaning data
    months = {"jan":1, "feb":2, "mar":3, "apr":4, "may":5, "jun":6, "jul":7, "aug":8, "sep":9, "oct":10, "nov":11, "dec":12}
    weekdays = {"mon":1, "tue":2, "wed":3, "thu":4, "fri":5, "sat":6, "sun":7}

    # Clean and one hot encode data
    x_df = data.to_pandas_dataframe().dropna()
    jobs = pd.get_dummies(x_df.job, prefix="job")
    x_df.drop("job", inplace=True, axis=1)
    x_df = x_df.join(jobs)
    
    x_df["marital"] = x_df.marital.apply(lambda s: 1 if s == "married" else 0)
    x_df["default"] = x_df.default.apply(lambda s: 1 if s == "yes" else 0)
    x_df["housing"] = x_df.housing.apply(lambda s: 1 if s == "yes" else 0)
    x_df["loan"] = x_df.loan.apply(lambda s: 1 if s == "yes" else 0)
    contact = pd.get_dummies(x_df.contact, prefix="contact")
    x_df.drop("contact", inplace=True, axis=1)
    x_df = x_df.join(contact)
    education = pd.get_dummies(x_df.education, prefix="education")
    x_df.drop("education", inplace=True, axis=1)
    x_df = x_df.join(education)
    x_df["month"] = x_df.month.map(months)
    x_df["day_of_week"] = x_df.day_of_week.map(weekdays)
    x_df["poutcome"] = x_df.poutcome.apply(lambda s: 1 if s == "success" else 0)

    #y_df = x_df.pop("y").apply(lambda s: 1 if s == "yes" else 0)
    return x_df
    #return x_df, y_df

In [8]:
# tryin to import this causes some weird ipykernel error...
#from train import clean_data

# Use the clean_data function to clean your data.
#x, y = clean_data(ds)
# It makes no sense to split it to x and y in the clean data...
from sklearn.model_selection import train_test_split

cleaned_df = clean_data(ds)
train_df, test_df = train_test_split(cleaned_df, test_size=0.2)


In [25]:
cleaned_df.head()

Unnamed: 0,age,marital,default,housing,loan,month,day_of_week,duration,campaign,pdays,...,contact_cellular,contact_telephone,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown
0,57,1,0,0,1,5,1,371,1,999,...,1,0,0,0,0,1,0,0,0,0
1,55,1,0,1,0,5,4,285,2,999,...,0,1,0,0,0,0,0,0,0,1
2,33,1,0,0,0,5,5,52,1,999,...,1,0,0,0,1,0,0,0,0,0
3,36,1,0,0,0,6,5,355,4,999,...,0,1,0,0,0,1,0,0,0,0
4,27,1,0,1,0,7,5,189,2,999,...,1,0,0,0,0,1,0,0,0,0


In [9]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.

# nowhere in the course I saw any instructions on what to fill in here so I'll
# follow the microsoft documentation... I'm not should I use accuracy here as a metric or not

# In the project instructions on 5. automl run it says to "split data into train and valid tests"
# so I do that even though I would like to let the automl do its on cross-validation which
# seems like a better idea

# Note on the above, I can\t even use the crossvalidation if I split the data
# myself, yet another mistake in the project instructions. Anyway, I'll skip
# splitting the data myself then...
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task='classification',
    primary_metric='accuracy',
    training_data= cleaned_df,
    label_column_name='y',
    n_cross_validations=2)

In [11]:
# Submit your automl run

### YOUR CODE HERE ###
#from azureml.core.experiment import Experiment

automl_experiment = Experiment(ws, 'automl_experiment2')
automl_run = automl_experiment.submit(automl_config)



In [12]:
# Retrieve and save your best automl model.

### YOUR CODE HERE ###
best_run, fitted_model = automl_run.get_output()
best_run_metrics = best_run.get_metrics()
for metric_name in best_run_metrics:
    metric = best_run_metrics[metric_name]
    print(metric_name, metric)

precision_score_macro 0.8024709413022642
precision_score_micro 0.9162367223065251
AUC_macro 0.9478183556341432
recall_score_macro 0.7389831565343454
weighted_accuracy 0.9602670741443091
norm_macro_recall 0.47796631306869086
f1_score_weighted 0.9113310776187307
f1_score_micro 0.9162367223065251
matthews_correlation 0.537508841672188
average_precision_score_macro 0.8247844802527047
AUC_micro 0.9807562789990814
accuracy 0.9162367223065251
balanced_accuracy 0.7389831565343454
f1_score_macro 0.7652844325323995
log_loss 0.19406776784049354
recall_score_micro 0.9162367223065251
average_precision_score_micro 0.9815786721005222
average_precision_score_weighted 0.9555007096105637
AUC_weighted 0.9478183556341431
precision_score_weighted 0.9091819202578171
recall_score_weighted 0.9162367223065251
confusion_matrix aml://artifactId/ExperimentRun/dcid.AutoML_2726df91-1472-4473-9ace-0b0e30bc8f19_61/confusion_matrix
accuracy_table aml://artifactId/ExperimentRun/dcid.AutoML_2726df91-1472-4473-9ace-0b0e3

In [10]:
cleaned_df.to_csv('training_data.csv')

In [26]:
#clean_ds = TabularDatasetFactory.get_by_name()

AttributeError: type object 'TabularDatasetFactory' has no attribute 'get_by_name'

In [59]:
#for ds_name in ws.datastores:
#    print(ds_name)

my_temp_datastore
workspaceblobstore
workspacefilestore


In [None]:
#tab_dataset = Dataset.Tabular.from_delimited_files(path=(default_ds,'training_data.csv'))

In [11]:
default_ds = ws.get_default_datastore()
default_ds.upload_files(files=['./training_data.csv'], # Upload the diabetes csv files in /data
                       target_path='training-data/', # Put it in a folder path in the datastore
                       overwrite=True, # Replace existing files of the same name
                       show_progress=True)

Uploading an estimated of 1 files
Uploading ./training_data.csv
Uploaded ./training_data.csv, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_c4016d474917437aa06d0c3b301e4bf8

In [14]:
from azureml.core import Dataset

blob_ds = ws.get_default_datastore()
csv_paths = [(blob_ds, './training-data/training_data.csv')]
tab_ds = Dataset.Tabular.from_delimited_files(path=csv_paths)
tab_ds = tab_ds.register(workspace=ws, name='csv_table')

In [15]:
# Also running in the cloud compute target in case that was a requirement
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task='classification',
    primary_metric='accuracy',
    training_data= tab_ds,
    compute_target = training_cluster,
    label_column_name='y',
    n_cross_validations=2)

In [16]:
automl_experiment = Experiment(ws, 'automl_experiment3')
automl_run = automl_experiment.submit(automl_config)

Running on remote.


In [28]:
# Retrieve and save your best automl model.

### YOUR CODE HERE ###
best_run, fitted_model = automl_run.get_output()
best_run_metrics = best_run.get_metrics()
for metric_name in best_run_metrics:
    metric = best_run_metrics[metric_name]
    print(metric_name, metric)

balanced_accuracy 0.7276485524057095
norm_macro_recall 0.45529710481141905
recall_score_macro 0.7276485524057095
f1_score_micro 0.9143854324734446
AUC_micro 0.9802809673920803
weighted_accuracy 0.9607722394588862
log_loss 0.17855504287929735
precision_score_macro 0.7992603434698753
accuracy 0.9143854324734446
f1_score_macro 0.7564311285923817
precision_score_micro 0.9143854324734446
f1_score_weighted 0.9086173079094375
matthews_correlation 0.5217934644908475
average_precision_score_micro 0.9811264177399588
average_precision_score_weighted 0.9545539983425243
AUC_weighted 0.946096917302776
precision_score_weighted 0.9063985559064617
recall_score_micro 0.9143854324734446
average_precision_score_macro 0.8213968176240831
recall_score_weighted 0.9143854324734446
AUC_macro 0.946096917302776
confusion_matrix aml://artifactId/ExperimentRun/dcid.AutoML_5c4204b6-5881-49e9-abe4-01f73c9c1e68_1/confusion_matrix
accuracy_table aml://artifactId/ExperimentRun/dcid.AutoML_5c4204b6-5881-49e9-abe4-01f73c9

In [29]:
fitted_model

PipelineWithYTransformations(Pipeline={'memory': None,
                                       'steps': [('datatransformer',
                                                  DataTransformer(enable_dnn=None,
                                                                  enable_feature_sweeping=None,
                                                                  feature_sweeping_config=None,
                                                                  feature_sweeping_timeout=None,
                                                                  featurization_config=None,
                                                                  force_text_dnn=None,
                                                                  is_cross_validation=None,
                                                                  is_onnx_compatible=None,
                                                                  logger=None,
                                                              