# Step 4a: Model operationalization & Deployment

The best model is saved as a .model file along with the relevant schema for deployment. The functions are first tested locally before the model is operationalized with the Azure Machine Learning Model Management environment for real-time use in production.


In [1]:
## Setup our environment by importing required libraries
import os
import csv

import pandas as pd
import io
import requests

import glob
from azure.storage.blob import BlockBlobService
from azure.storage.blob import PublicAccess

# For creating pipelines and model
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, VectorIndexer
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Setup the pyspark environment
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists, otherwise create a new one.
spark = SparkSession.builder.getOrCreate()

In [2]:
%%time
# load the previous created final dataset into the workspace
from azure.storage.blob import BlockBlobService
import glob
import os

# define parameters 
ACCOUNT_NAME = "pdmvienna"
ACCOUNT_KEY = "PDuXK61GpmMVWMrWdvr29THbPdlOXa61fN5RfgQV/jBO8berC1zLzZ678Nxrx+D3CRp4+ZvSff9al+lrUh8qUQ=="
CONTAINER_NAME = "featureengineering"

# define your blob service     
my_service = BlockBlobService(account_name=ACCOUNT_NAME, account_key=ACCOUNT_KEY)

# create a local path where to store the results later.
LOCAL_DIRECT = 'model_operationalize.parquet'
if not os.path.exists(LOCAL_DIRECT):
    os.makedirs(LOCAL_DIRECT)
    print('DONE creating a local directory!')

# define your blob service     
my_service = BlockBlobService(account_name=ACCOUNT_NAME, account_key=ACCOUNT_KEY)

# download the entire parquet result folder to local path for a new run 
for blob in my_service.list_blobs(CONTAINER_NAME):
    if 'featureengineering_files.parquet' in blob.name:
        local_file = os.path.join(LOCAL_DIRECT, os.path.basename(blob.name))
        my_service.get_blob_to_path(CONTAINER_NAME, blob.name, local_file)

data = spark.read.parquet('model_operationalize.parquet')
#data.persist()
data.show(5)
print('Feature engineering final dataset files loaded!')

DONE creating a local directory!
+---------+--------------------+------------------+--------------------+----------------------+-----------------------+-------------------+---------------------+-----------------------+------------------------+------------------+-------------------+---------------------+----------------------+------------------+--------------------+----------------------+-----------------------+------------------------+------------------------+------------------------+------------------------+------------------------+-----------------+-----------------+-----------------+-----------------+------+---+-------------+--------+-------+
|machineID|        dt_truncated|volt_rollingmean_3|rotate_rollingmean_3|pressure_rollingmean_3|vibration_rollingmean_3|volt_rollingmean_24|rotate_rollingmean_24|pressure_rollingmean_24|vibration_rollingmean_24| volt_rollingstd_3|rotate_rollingstd_3|pressure_rollingstd_3|vibration_rollingstd_3|volt_rollingstd_24|rotate_rollingstd_24|pressure_rol

# Define the features and labels for the model

In [3]:
# Input columns for downstream modeling, generated from their naming pattern.
# Note: the 'model' variable is left out because string columns were not supported.
input_features = (
    # rolling mean / std of each telemetry signal over the 3 and 24 windows
    [
        '{}_rolling{}_{}'.format(signal, stat, window)
        for stat in ('mean', 'std')
        for window in (3, 24)
        for signal in ('volt', 'rotate', 'pressure', 'vibration')
    ]
    # 24-window rolling mean of each of the five error counters
    + ['error{}sum_rollingmean_24'.format(n) for n in range(1, 6)]
    # per-component sums
    + ['comp{}sum'.format(n) for n in range(1, 5)]
    + ['age']
)

# Label column and the columns that identify a row (machine and timestamp).
label_var = ['label_e']
key_cols = ['machineID', 'dt_truncated']

In [4]:
# Pack the individual input columns into a single 'features' vector column and
# keep only the key, label and feature-vector columns.
# Note: `data` is rebound here, so re-running this cell without reloading the
# dataset fails because the raw input columns are no longer present.
va = VectorAssembler(outputCol='features', inputCols=input_features)
data = va.transform(data).select(key_cols + label_var + ['features'])

In [5]:
# Index the feature vector. With maxCategories=10, any feature that has more
# than 10 distinct values is treated as continuous rather than categorical.
featureIndexer = VectorIndexer(
    maxCategories=10,
    outputCol="indexedFeatures",
    inputCol="features",
).fit(data)

In [6]:
# Fit the label indexer on the full dataset (before the train/test split) so
# that every label value present in the data is included in the index.
labelIndexer = StringIndexer(
    outputCol="indexedLabel",
    inputCol="label_e",
).fit(data)

In [7]:
# Date-based train/test split: rows strictly between 2015-01-01 and 2015-09-30
# are used for training, rows strictly after 2015-09-30 for testing.
# Both bounds are exclusive, so a row equal to a boundary value is in neither set.
training = data.filter(
    (data.dt_truncated > "2015-01-01") & (data.dt_truncated < "2015-09-30")
)
testing = data.filter(data.dt_truncated > "2015-09-30")

# Row counts of the two splits, one per line.
print(training.count(), testing.count(), sep="\n")

2174000
747000


# Train your best model

In [8]:
# Train a random forest classifier on the indexed label / feature columns.
# The seed is fixed so the randomized parts of training (row and feature
# sampling) are repeatable from one run of the notebook to the next.
rf = RandomForestClassifier(
    labelCol="indexedLabel",
    featuresCol="indexedFeatures",
    numTrees=10,
    seed=42,
)

# Chain the already-fitted indexers and the forest in a single Pipeline so the
# saved model carries the full transformation from raw columns to prediction.
pipeline_rf = Pipeline(stages=[labelIndexer, featureIndexer, rf])

# Fit the pipeline on the training split. The indexers are applied to the
# training data and the forest is trained on their output.
model_rf = pipeline_rf.fit(training)

In [9]:
# Score the held-out test split with the fitted pipeline model.
predictions_rf = model_rf.transform(testing)

In [10]:
# RDD of (indexedLabel, prediction) pairs taken from the scored test set.
# NOTE(review): the columns are ordered (label, prediction) although the
# variable name suggests (prediction, label) - confirm the order any consumer expects.
predictionAndLabels = predictions_rf.select("indexedLabel", "prediction").rdd

In [11]:
# Compare predictions with the indexed true labels and report test accuracy.
# The metric is overridden per call, so the evaluator's own default is unchanged.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="indexedLabel",
)
print(
    "Accuracy = %g"
    % evaluator.evaluate(predictions_rf, {evaluator.metricName: "accuracy"})
)

Accuracy = 0.994112


# Save your model and schema

Once you have a model that performs well, you can package it into a scoring service. To prepare for this, save your model and dataset schema locally first.

In [12]:
# Save the fitted pipeline model (indexers + random forest) to the shared
# folder; overwrite() replaces any model already stored at this path.
model_rf.write().overwrite().save("/azureml-share/pdmrfull.model")
print("Model saved")

Model saved


In [13]:
# List the shared folder to confirm that the saved model (pdmrfull.model) is there.
!ls /azureml-share

pdmrfull.model
