# Automated ML

Import Dependencies.

In [1]:
from azureml.core import Workspace, Experiment,Dataset
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.widgets import RunDetails

import azureml.core
from azureml.data.dataset_factory import TabularDatasetFactory
from sklearn.model_selection import train_test_split
from azureml.train.automl import AutoMLConfig

import os
import joblib
import pandas as pd
import numpy as np

# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

SDK version: 1.19.0


## Initialize Workspace

In [2]:
ws = Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\n')

quick-starts-ws-133132
aml-quickstarts-133132
southcentralus
81cefad3-d2c9-4f77-a466-99a7f541c7bb


## Create an Azure ML experiment

In [3]:
experiment_name = 'automl-experiment'
project_folder = './automl-model'

experiment = Experiment(ws, experiment_name)
experiment

Name,Workspace,Report Page,Docs Page
automl-experiment,quick-starts-ws-133132,Link to Azure Machine Learning studio,Link to Documentation


## Dataset

### Overview

We will build prediction model that will classify if a customer is going to churn or not from kaggle's data set [Credit Card customers](https://www.kaggle.com/sakshigoyal7/credit-card-customers). Data set was downloaded as a csv and uploaded i.e. registered as data set in the workspace.

In [4]:
dataset=Dataset.get_by_name(ws,name="Credit-Card-Churners")
run = experiment.start_logging()
df = dataset.to_pandas_dataframe()
df.describe()

Unnamed: 0,CLIENTNUM,Customer_Age,Dependent_count,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
count,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0
mean,739177600.0,46.32596,2.346203,35.928409,3.81258,2.341167,2.455317,8631.953698,1162.814061,7469.139637,0.759941,4404.086304,64.858695,0.712222,0.274894,0.159997,0.840003
std,36903780.0,8.016814,1.298908,7.986416,1.554408,1.010622,1.106225,9088.77665,814.987335,9090.685324,0.219207,3397.129254,23.47257,0.238086,0.275691,0.365301,0.365301
min,708082100.0,26.0,0.0,13.0,1.0,0.0,0.0,1438.3,0.0,3.0,0.0,510.0,10.0,0.0,0.0,8e-06,0.00042
25%,713036800.0,41.0,1.0,31.0,3.0,2.0,2.0,2555.0,359.0,1324.5,0.631,2155.5,45.0,0.582,0.023,9.9e-05,0.99966
50%,717926400.0,46.0,2.0,36.0,4.0,2.0,2.0,4549.0,1276.0,3474.0,0.736,3899.0,67.0,0.702,0.176,0.000181,0.99982
75%,773143500.0,52.0,3.0,40.0,5.0,3.0,3.0,11067.5,1784.0,9859.0,0.859,4741.0,81.0,0.818,0.503,0.000337,0.9999
max,828343100.0,73.0,5.0,56.0,6.0,6.0,6.0,34516.0,2517.0,34516.0,3.397,18484.0,139.0,3.714,0.999,0.99958,0.99999


In [5]:
currDir=os.getcwd()
print(currDir)
os.listdir(currDir)

/mnt/batch/tasks/shared/LS_root/mounts/clusters/compute-aml-cluster/code/Users/odl_user_133132


['.config',
 '.ipynb_checkpoints',
 'automl-model',
 'automl.ipynb',
 'automl.log',
 'automl_errors.log',
 'azureml_automl.log',
 'conda_dependencies.yml',
 'CreditCardChurners.csv',
 'hyperparameter_tuning.ipynb',
 'outputs',
 'README.md',
 'train.py',
 'training',
 'udacity-project.ipynb']

## Create or Attach a Compute Resource

In [6]:
compute_name = "compute-aml-cluster"

# Check if the compute target exists
try:
    compute_aml_cluster = ComputeTarget(workspace=ws, name=compute_name)
    print('Found existing cluster.')
except ComputeTargetException:
    # If not, create it
    compute_config = AmlCompute.provisionibng_configuration(vm_size='STANDARD_DS12_V2',
                                                           max_nodes=5)
    compute_aml_cluster = ComputeTarget.create(ws, compute_name, compute_config)

compute_aml_cluster.wait_for_completion(show_output=True)

Found existing cluster.

Running


In [7]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Use the clean_data function to clean your data.
#x, y = clean_data(df)

def binary_encode(df, column, positive_value):
    df = df.copy()
    df[column] = df[column].apply(lambda x: 1 if x == positive_value else 0)
    return df

def ordinal_encode(df, column, ordering):
    df = df.copy()
    df[column] = df[column].apply(lambda x: ordering.index(x))
    return df

def onehot_encode(df, column, prefix):
    df = df.copy()
    dummies = pd.get_dummies(df[column], prefix=prefix)
    df = pd.concat([df, dummies], axis=1)
    df = df.drop(column, axis=1)
    return df

def pre_processing(data):
    # Clean and one hot encode data
    x_df = data.to_pandas_dataframe().dropna()

    # Drop last two columns (unneeded)
    x_df.drop(x_df.columns[-2:],inplace=True, axis=1)

    # Drop CLIENTNUM columns
    x_df.drop("CLIENTNUM",inplace=True, axis=1)

    # Encode unknown values as np.NaN
    x_df = x_df.replace('Unknown', np.NaN)

    # Fill ordinal missing values with modes (Education_Level and Income_Category columns)
    x_df['Education_Level'] = x_df['Education_Level'].fillna('Graduate')
    x_df['Income_Category'] = x_df['Income_Category'].fillna('Less than $40K')

    # Encode binary columns
    x_df = binary_encode(x_df, 'Attrition_Flag', positive_value='Attrited Customer')
    x_df = binary_encode(x_df, 'Gender', positive_value='M')

    # Encode ordinal columns
    education_ordering = [
        'Uneducated',
        'High School',
        'College',
        'Graduate',
        'Post-Graduate',
        'Doctorate'
    ]
    income_ordering = [
        'Less than $40K',
        '$40K - $60K',
        '$60K - $80K',
        '$80K - $120K',
        '$120K +'
    ]

    x_df = ordinal_encode(x_df, 'Education_Level', ordering=education_ordering)
    x_df = ordinal_encode(x_df, 'Income_Category', ordering=income_ordering)

    # Encode nominal columns
    x_df = onehot_encode(x_df, 'Marital_Status', prefix='Marital_Status')
    x_df = onehot_encode(x_df, 'Card_Category', prefix='Card_Category')

    # Split df into X and y
    X = x_df.drop('Attrition_Flag', axis=1).copy()
    y = x_df['Attrition_Flag'].copy()

    # Scale X with a standard scaler
    scaler = StandardScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

    return X, y

x, y = pre_processing(dataset)

all_data = pd.concat([x,y], axis = 1)
all_data.head()

Unnamed: 0,Customer_Age,Gender,Dependent_count,Education_Level,Income_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,...,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Marital_Status_Divorced,Marital_Status_Married,Marital_Status_Single,Card_Category_Blue,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver,Attrition_Flag
0,-0.165406,1.059956,0.503368,-0.89368,0.5973,0.384621,0.763943,-1.327136,0.492404,0.446622,...,3.834003,-0.775882,-0.282405,1.077338,-0.798507,0.270611,-0.107644,-0.044484,-0.240794,0
1,0.33357,-0.943436,2.043199,0.593388,-0.887628,1.010715,1.407306,-1.327136,-0.411616,-0.041367,...,12.608573,-0.616276,-0.282405,-0.928214,1.252337,0.270611,-0.107644,-0.044484,-0.240794,0
2,0.583058,1.059956,0.503368,0.593388,1.339764,0.008965,0.120579,-1.327136,-2.219655,-0.573698,...,6.807864,-0.997155,-0.282405,1.077338,-0.798507,0.270611,-0.107644,-0.044484,-0.240794,0
3,-0.789126,-0.943436,1.273283,-0.89368,-0.887628,-0.241473,-0.522785,1.641478,-1.315636,-0.585251,...,6.807864,1.759686,-0.282405,-0.928214,-0.798507,0.270611,-0.107644,-0.044484,-0.240794,0
4,-0.789126,1.059956,0.503368,-1.637214,0.5973,-1.869317,0.763943,-1.327136,-2.219655,-0.430877,...,7.509325,-0.997155,-0.282405,1.077338,-0.798507,0.270611,-0.107644,-0.044484,-0.240794,0


In [36]:
from sklearn.model_selection import train_test_split

x_train, x_test = train_test_split(all_data, test_size=0.2,random_state=0)
# columns=x_train.columns
# x_train.reset_index(drop=True, inplace=True)
# x_test.reset_index(drop=True, inplace=True)
print(x_train.head(3))
print(x_test.head(3))
print(x_train.shape)
print(x_test.shape)

if "training" not in os.listdir():
    os.mkdir("./training")

x_train.to_csv('training/train_data.csv',index=False)
x_test.to_csv('training/test_data.csv',index=False)

# x_train.to_csv(path_or_buf='training/train_data.csv', columns=columns, header=True, index=False) 
# x_test.to_csv(path_or_buf='training/test_data.csv', columns=columns, header=True, index=False)

      Customer_Age  Gender  Dependent_count  Education_Level  Income_Category  \
147           1.08    1.06            -0.27             0.59             2.08   
1985         -1.79    1.06            -1.81             0.59            -0.89   
8316         -1.41   -0.94             1.27            -1.64            -0.89   

      Months_on_book  Total_Relationship_Count  Months_Inactive_12_mon  \
147             1.76                     -0.52                   -0.34   
1985           -1.12                      1.41                    0.65   
8316           -1.37                     -1.17                    0.65   

      Contacts_Count_12_mon  Credit_Limit  ...  Total_Ct_Chng_Q4_Q1  \
147                   -2.22          0.25  ...                 1.21   
1985                   0.49          0.07  ...                -0.98   
8316                  -0.41         -0.58  ...                -0.04   

      Avg_Utilization_Ratio  Marital_Status_Divorced  Marital_Status_Married  \
147          

In [32]:
# get the datastore to upload prepared data
datastore = ws.get_default_datastore()
print(datastore)

# upload the local file from src_dir to the target_path in datastore
datastore.upload(src_dir='training/', target_path='data/')
# datastore.upload_files(['trainset.csv'])

# create a dataset referencing the cloud location
train_data = TabularDatasetFactory.from_delimited_files(path = [(datastore, ('data/train_data.csv'))])
test_data = TabularDatasetFactory.from_delimited_files(path = [(datastore, ('data/test_data.csv'))])

{
  "name": "workspaceblobstore",
  "container_name": "azureml-blobstore-1bc1184b-0d99-47b2-b8b1-a9aba83249ed",
  "account_name": "mlstrg133132",
  "protocol": "https",
  "endpoint": "core.windows.net"
}
Uploading an estimated of 3 files
Target already exists. Skipping upload for data/test_data.csv
Target already exists. Skipping upload for data/train_data.csv
Target already exists. Skipping upload for data/.ipynb_checkpoints/train_data-checkpoint.csv
Uploaded 0 files


## AutoML Configuration

In [10]:
# TODO: Put your automl settings here
automl_settings = {
    "experiment_timeout_minutes": 30,
    "max_concurrent_iterations": 10,
    "primary_metric" : 'accuracy',
    "n_cross_validations": 3,
    "max_cores_per_iteration":-1
}

# TODO: Put your automl config here
automl_config = AutoMLConfig(compute_target=compute_aml_cluster,
                             task = "classification",
                             training_data=train_data,
                             label_column_name="Attrition_Flag",   
                             path = project_folder,
                             enable_early_stopping= True,
                             debug_log = "automl_errors.log",
                             **automl_settings
)

## Submit Run

In [11]:
remote_run = experiment.submit(automl_config, show_output=True)

Running on remote.
No run_configuration provided, running on compute-aml-cluster with default configuration
Running on remote compute: compute-aml-cluster
Parent Run ID: AutoML_ceea2569-ef01-4a09-ac5c-5230d248456c

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------

## Run Details

In [12]:
remote_run

Experiment,Id,Type,Status,Details Page,Docs Page
automl-experiment,AutoML_ceea2569-ef01-4a09-ac5c-5230d248456c,automl,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [13]:
RunDetails(remote_run).show()
remote_run.wait_for_completion(show_output=True)

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…



****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------------------------------+
|Size of the smallest class       |Name/Label of the smallest class |Number of samples in the training data|
|1318                             |1                                |8101                                  |
+---------------------------------+---------------------------------+--------------------------------------+

********************************************

{'runId': 'AutoML_ceea2569-ef01-4a09-ac5c-5230d248456c',
 'target': 'compute-aml-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-01-03T12:42:57.064186Z',
 'endTimeUtc': '2021-01-03T12:53:56.929822Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '3',
  'target': 'compute-aml-cluster',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"ab55ce9e-4800-458d-9c95-c0641a633cfa\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"data/train_data.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-133132\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"81cefad3-d2c9-4f77-a466-99a7f541c7bb\\\\

## Best Model

In [14]:
best_run, fitted_automl_best_model = remote_run.get_output()
best_run_metrics = best_run.get_metrics()
print('Best Run ID',best_run.id)

Best Run ID AutoML_ceea2569-ef01-4a09-ac5c-5230d248456c_57


In [15]:
best_run

Experiment,Id,Type,Status,Details Page,Docs Page
automl-experiment,AutoML_ceea2569-ef01-4a09-ac5c-5230d248456c_57,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [16]:
fitted_automl_best_model

Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                               max_depth=10,
                                                                                               max_leaves=127,
                                                                                               min_child_weight=1,
                                                         

In [17]:
# Metric in best run
for metric_name in best_run_metrics:
    metric = best_run_metrics[metric_name]
    print(metric_name, metric)
    
print('\nAccuracy of Best run',best_run_metrics['accuracy'],sep='\n')
print(best_run)

model_name = best_run.properties['model_name']
model_name

precision_score_weighted 0.9884679366831852
log_loss 0.03950192825182192
precision_score_micro 0.9885200268761913
matthews_correlation 0.9574955630022789
average_precision_score_weighted 0.9983986967778214
norm_macro_recall 0.9488050764311423
f1_score_micro 0.9885200268761913
recall_score_macro 0.9744025382155712
AUC_weighted 0.9982497423027814
f1_score_weighted 0.9884665953344453
recall_score_weighted 0.9885200268761913
accuracy 0.9885200268761913
precision_score_macro 0.9831402968284659
AUC_micro 0.9989819049621033
average_precision_score_micro 0.9989988841805254
AUC_macro 0.9982497423027814
weighted_accuracy 0.9937742520007841
f1_score_macro 0.9786967986141067
recall_score_micro 0.9885200268761913
average_precision_score_macro 0.9958124604969072
balanced_accuracy 0.9744025382155712
accuracy_table aml://artifactId/ExperimentRun/dcid.AutoML_ceea2569-ef01-4a09-ac5c-5230d248456c_57/accuracy_table
confusion_matrix aml://artifactId/ExperimentRun/dcid.AutoML_ceea2569-ef01-4a09-ac5c-5230d24

'AutoMLceea2569e57'

In [18]:
#TODO: Save the best model

os.makedirs('./outputs', exist_ok=True)

joblib.dump(fitted_automl_best_model, filename='outputs/automl.joblib')

['outputs/automl.joblib']

## Fetch Environment

In [19]:
env = best_run.get_environment()

script_file = 'score.py'
best_run.download_file('outputs/scoring_file_v_1_0_0.py', script_file)

## Model Deployment

Remember you have to deploy only one of the two models you trained.. Perform the steps in the rest of this notebook only if you wish to deploy this model.

TODO: In the cell below, register the model, create an inference config and deploy the model as a web service.

In [20]:
#Register the model
model = remote_run.register_model(model_name=model_name)
print(model.name, model.id, model.version, sep='\t')

AutoMLceea2569e57	AutoMLceea2569e57:1	1


In [22]:
from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice
from azureml.core.webservice import Webservice
from azureml.core.model import Model
from azureml.core.environment import Environment

inference_config = InferenceConfig(entry_script=script_file, environment=env)

deployment_config = AciWebservice.deploy_configuration(auth_enabled=True, #Genrates API key to secure access
                                                       cpu_cores = 1, 
                                                       memory_gb = 1)

deploy_service_name= 'automl-model-deployment'
service = Model.deploy(ws,deploy_service_name,  [model], inference_config, deployment_config)

service.wait_for_deployment(show_output = True)

scoring_uri = service.scoring_uri

print("State: ",service.state)
print("\nScoring URI: ", scoring_uri)
print("\nService Logs:\n",service.get_logs())

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running............
Succeeded
ACI service creation operation finished, operation "Succeeded"
State:  Healthy

Scoring URI:  http://70543f65-1d64-4754-9d48-266698ebf690.southcentralus.azurecontainer.io/score

Service Logs 2021-01-03T12:57:45,803367800+00:00 - rsyslog/run 
2021-01-03T12:57:45,802026300+00:00 - gunicorn/run 
2021-01-03T12:57:45,824800300+00:00 - iot-server/run 
2021-01-03T12:57:45,854668600+00:00 - nginx/run 
/usr/sbin/nginx: /azureml-envs/azureml_8eff28b157f42edcd2424a5aae6c8074/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_8eff28b157f42edcd2424a5aae6c8074/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_8eff28b157f42edcd2424a5aae6c8074/lib

## Test the Deploy model with two different methods

### Method 1

In [24]:
import json
data_test = test_data.to_pandas_dataframe().dropna()
data_sample = data_test.sample(3)
y_true = data_sample.pop('Attrition_Flag')
sample_json = json.dumps({'data':data_sample.to_dict(orient='records')})
print(sample_json)

Uploading an estimated of 3 files
Target already exists. Skipping upload for data/test_data.csv
Target already exists. Skipping upload for data/train_data.csv
Uploading training/.ipynb_checkpoints/train_data-checkpoint.csv
Uploaded training/.ipynb_checkpoints/train_data-checkpoint.csv, 1 files out of an estimated total of 1
Uploaded 1 files
{"data": [{"Column1": 8510, "Customer_Age": -0.5396375527574563, "Gender": 1.0599556481738364, "Dependent_count": 1.2732834007681237, "Education_Level": -1.6372140257929357, "Income_Category": -0.1451643075621299, "Months_on_book": 0.259402089648562, "Total_Relationship_Count": -1.809511627862365, "Months_Inactive_12_mon": 1.6414782926434333, "Contacts_Count_12_mon": -1.315635733377402, "Credit_Limit": -0.6808682264320558, "Total_Revolving_Bal": -1.4268583411935547, "Avg_Open_To_Buy": -0.5528062607047155, "Total_Amt_Chng_Q4_Q1": 0.12801033415221885, "Total_Trans_Amt": -0.05419145983530442, "Total_Trans_Ct": -0.6330550070392089, "Total_Ct_Chng_Q4_Q1"

TODO: In the cell below, print the logs of the web service and delete the service

In [25]:
Output = service.run(sample_json)
print(Output)

{"result": [0, 0, 0]}


In [26]:
print('Prediction: ',Output)
print('True Values: ', y_true.values)

Prediction:  {"result": [0, 0, 0]}
True Values:  [0 0 0]


In [None]:
service.get_logs()

In [None]:
primary_key,secondary_key=service.get_keys()
print(primary_key,secondary_key,sep='\n')

In [None]:
import requests

headers = {'Content-Type': 'application/json'}
# If authentication is enabled, set the authorization header
headers['Authorization'] = f'Bearer {primary_key}'

# Make the request and display the response
resp = requests.post(scoring_uri, sample_json, headers=headers)
print(resp.json())

### Method 2

In [None]:
%run endpoint.py

## Clean Up

In [None]:
service.delete()