Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

In [2]:
%run "../includes/setup_env"

# Model Building

We now run a training experiment and use the Azure ML SDK to save it to our AML Workspace.

In [5]:
import os
import pprint
import numpy as np

from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [6]:
import azureml.core

# Report which version of the Azure ML core SDK this notebook is running against.
sdk_version = azureml.core.VERSION
print("SDK version:", sdk_version)

Let's load our Azure ML Workspace first:

In [8]:
# Connect to the Azure ML Workspace described by aml_config/config.json
# underneath `config_path`.
from azureml.core import Workspace

config_path = '/dbfs/tmp/'
workspace_config_file = os.path.join(config_path, 'aml_config', 'config.json')

ws = Workspace.from_config(path=workspace_config_file)

# Summarise the workspace we connected to, one attribute per line.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))

In [9]:
# Read the preprocessed table from parquet and cache it, since the cells
# below filter it more than once.
preprocessed_path = "dbfs:/FileStore/tables/preprocessed"
df = spark.read.parquet(preprocessed_path)
df = df.cache()
display(df)

In [10]:
# Split the data chronologically into a training set and a test set.
#
# `datetime` is imported from the standard library: the `pandas.datetime`
# alias was deprecated in pandas 1.0 and removed in pandas 2.0.
from datetime import datetime
from pyspark.sql.functions import col, hour

# Rows strictly before this timestamp are used for training.
train_end = datetime(2015, 10, 1)
# Rows strictly after this timestamp are used for testing; rows between the
# two cut-offs end up in neither set.
test_start = datetime(2015, 10, 15)

# Sub-sampling the training rows by hour is currently disabled; to re-enable
# it, add `& (hour(col('datetime')) % 3 == 0)` to the training filter.
df_train = df.filter(col('datetime') < train_end)
df_test = df.filter(col('datetime') > test_start)

In [11]:
# Columns removed from both splits before modelling; `y_0` is kept and
# renamed to "error", which is the label column used by the model below.
unused_columns = ["y_1", "y_2", "y_3", "datetime", "machineID"]

df_train = df_train.drop(*unused_columns).withColumnRenamed("y_0", "error")
df_train.cache()

df_test = df_test.drop(*unused_columns).withColumnRenamed("y_0", "error")
df_test.cache()

# Report (rows, columns) for each split.
n_train_rows = df_train.count()
n_train_cols = len(df_train.columns)
print("train: ({}, {})".format(n_train_rows, n_train_cols))

n_test_rows = df_test.count()
n_test_cols = len(df_test.columns)
print("test: ({}, {})".format(n_test_rows, n_test_cols))

df_train.printSchema()

# Define Model

It is time to run the experiment. To do so, we create the root run by calling the experiment's `start_logging` method. We then execute each iteration of the experiment as a child run and use its `run` object to log the metrics we want to track. Examine the code below to see it all in action.

In [14]:
from azureml.core.run import Run
from azureml.core.experiment import Experiment
import numpy as np
import os
import shutil

# Name under which Spark saves the pipeline model, and the same location as
# seen through the /dbfs mount (used for zipping and clean-up below).
model_name = "PdM_logistic_regression.mml"
model_dbfs = os.path.join("/dbfs", model_name)
run_history_name = 'spark-ml-notebook'

# start a training run by defining an experiment
# NOTE(review): "AI_Airlft" looks like a typo for "AI_Airlift"; it is left
# unchanged because the experiment is identified by this exact string.
myexperiment = Experiment(ws, "AI_Airlft")
root_run = myexperiment.start_logging()

# Regularization rates to try, one child run per value.
regs = [0.0001, 0.001, 0.01, 0.1]

try:
    # try a bunch of regularization rates in a Logistic Regression model
    for reg in regs:
        print("Regularization rate: {}".format(reg))
        # create one child run per regularization rate
        with root_run.child_run("reg-" + str(reg)) as run:
            # create a new Logistic Regression model.
            lr = (LogisticRegression(regParam=reg)
                  .setLabelCol("error")
                  .setFeaturesCol("norm_features"))

            # put together the pipeline
            pipe = Pipeline(stages=[lr])

            # train the model
            model_p = pipe.fit(df_train)

            # make prediction
            # NOTE(review): the test set is used here to compare the runs and
            # again later to evaluate the selected model -- confirm this is
            # intended.
            pred = model_p.transform(df_test)

            # evaluate. note only 2 metrics are supported out of the box by Spark ML.
            bce = (BinaryClassificationEvaluator()
                   .setLabelCol("error")
                   .setRawPredictionCol('rawPrediction'))

            au_roc = bce.setMetricName('areaUnderROC').evaluate(pred)
            au_prc = bce.setMetricName('areaUnderPR').evaluate(pred)

            print("Area under ROC: {}".format(au_roc))
            print("Area Under PR: {}".format(au_prc))

            # log reg, au_roc, au_prc and feature names in run history
            run.log("reg", reg)
            run.log("au_roc", au_roc)
            run.log("au_prc", au_prc)
            run.log_list("columns", df_train.columns)

            # save model
            model_p.write().overwrite().save(model_name)

            # upload the serialized model into run history record.
            # The saved model is a folder, which upload_file cannot handle,
            # so it is zipped first. splitext (rather than split(".")) keeps
            # this working if the model name ever contains more than one dot.
            mdl = os.path.splitext(model_name)[0]
            model_zip = mdl + ".zip"
            shutil.make_archive(mdl, 'zip', model_dbfs)
            run.upload_file("outputs/" + model_name, model_zip)

            # now delete the serialized model from local folder since it is already uploaded to run history
            shutil.rmtree(model_dbfs)
            os.remove(model_zip)
except Exception as exc:
    # Without this the root run would stay in the "Running" state forever
    # when any iteration fails.
    root_run.fail(error_details=str(exc))
    raise
else:
    # Declare run completed
    root_run.complete()

root_run_id = root_run.id
print ("run id:", root_run.id)

In [15]:
# Index every child run of the root run by its run id so that a run can be
# looked up once the best run id is known.
child_runs = {child.id: child for child in root_run.get_children()}

We can now select the best model based on the metric we choose.

In [17]:
# Pick the child run with the highest area under the ROC curve.
metrics = root_run.get_metrics(recursive = True)

# Only consider known child runs that actually logged `au_roc`; a run without
# that metric (for example one that failed before logging) would otherwise
# raise a KeyError inside max().
scored_metrics = {
    run_id: run_metrics
    for run_id, run_metrics in metrics.items()
    if run_id in child_runs and 'au_roc' in run_metrics
}
if not scored_metrics:
    raise ValueError("no child run logged an 'au_roc' metric")

best_run_id = max(scored_metrics, key = lambda k: scored_metrics[k]['au_roc'])
best_run = child_runs[best_run_id]
print('Best run is:', best_run_id)
print('Metrics:', metrics[best_run_id]['au_roc'], metrics[best_run_id]['reg'])

We save the best model on disk for future use.

In [19]:
# Fetch the zipped model that the best run uploaded under outputs/ and store
# it locally as `best_model_file_name`.
best_model_file_name = "best_model.zip"
remote_model_path = 'outputs/' + model_name
best_run.download_file(name=remote_model_path, output_file_path=best_model_file_name)

# Model Evaluation

We can now load the best model we selected earlier and evaluate its performance on the test set.

In [22]:
## unzip the model to dbfs (as load() seems to require that) and load it
# Clear whatever is already at the target path. rmtree only works on
# directories, so a plain file has to be removed with os.remove instead.
if os.path.isdir(model_dbfs):
    shutil.rmtree(model_dbfs)
elif os.path.isfile(model_dbfs):
    os.remove(model_dbfs)
shutil.unpack_archive(best_model_file_name, model_dbfs)

model_p_best = PipelineModel.load(model_name)

In [23]:
# Score the test set with the best model and preview the first few rows.
df_pred = model_p_best.transform(df_test)
preview_rows = df_pred.limit(5)
display(preview_rows)

In [24]:
import pyspark.sql.functions as F

# Show the five rows with the highest `prediction` together with the five
# rows with the lowest. The two limited frames are unioned with each other;
# unioning the full `df_pred` instead would return every prediction and drop
# the top-five selection.
top_predictions = df_pred.orderBy(F.desc('prediction')).limit(5)
bottom_predictions = df_pred.orderBy(F.asc('prediction')).limit(5)
df_select = top_predictions.union(bottom_predictions)

display(df_select)

In [25]:
# Evaluate the best model's predictions with the two metrics that Spark ML's
# binary classification evaluator supports out of the box.
bce = BinaryClassificationEvaluator()
bce.setLabelCol("error")
bce.setRawPredictionCol('rawPrediction')

# The same evaluator is reused; only the metric name is switched between runs.
bce.setMetricName('areaUnderROC')
au_roc = bce.evaluate(df_pred)

bce.setMetricName('areaUnderPR')
au_prc = bce.evaluate(df_pred)

print("Area under ROC: {}".format(au_roc))
print("Area Under PR: {}".format(au_prc))

# Model Persistence

In [27]:
# Show the model name with its last four characters (the ".mml" suffix) removed;
# this is the path the best model is persisted under.
print(model_name[:-4])

In [28]:
## NOTE: by default the model is saved to and loaded from /dbfs/ instead of cwd!
# Persist the best model under the model name without its ".mml" suffix.
model_path = model_name[:-4]
model_p_best.write().overwrite().save(model_path)
# Report the location that was actually written. `model_dbfs` still carries
# the ".mml" suffix and therefore points at a different path.
print("saved model to {}".format(os.path.join("/dbfs", model_path)))

In [29]:
%sh

# List, in long format including hidden files, everything under the
# persisted model's directory.
ls -la /dbfs/PdM_logistic_regression/*

In [30]:
# Exit the notebook here, passing "success" as its exit value.
dbutils.notebook.exit("success")

Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.