# Model Training

In [48]:
import os
from os import path, makedirs
import pandas as pd
import numpy as np

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from azureml.logging import get_azureml_logger
from pyspark.sql import SparkSession

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


# Fill in your Azure storage account information here.
# This name is substituted into the wasb:// URLs used below to read the
# training data and to store the trained model (both in the `model` container).
account_name = ''

# Initialize the Azure ML logger; it is used further down to record the chosen
# hyperparameters and the evaluation metrics of this run.
logger = get_azureml_logger()

## Load the preprocessed data

In [49]:
# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# The preprocessed training data is stored as parquet in the `model` container
# of the configured storage account.
data_filename = 'wasb://model@' + account_name + '.blob.core.windows.net/trainingdata'
df = spark.read.parquet(data_filename)

## Partition the data

In [50]:
# 80/20 split with a fixed seed so the partition is repeatable.
splits = df.randomSplit([0.8, 0.2], seed=0)
test = splits[1]

# Stratified down-sampling of the training partition only: keep 20% of the
# label-0.0 rows and 80% of the label-1.0 rows. The test partition is left
# untouched.
label_fractions = {0.0: 0.2, 1.0: 0.8}
train = splits[0].sampleBy('label', fractions=label_fractions, seed=0)

## Train the model

Many machine learning algorithms have one or more knobs, called hyperparameters. These knobs allow tuning of algorithms to optimize their performance over future data, measured according to user-specified metrics (for example, accuracy, AUC, RMSE). A data scientist needs to provide values of hyperparameters when building a model over training data and before seeing the future test data. How can we use the known training data to set the values of hyperparameters so that the model performs well on the unknown test data?

A popular technique for tuning hyperparameters is a grid search combined with cross-validation. Cross-validation is a technique that assesses how well a model, trained on a training set, predicts over the test set. Using this technique, initially we divide the dataset into K folds and then train the algorithm K times, in a round-robin fashion, on all but one of the folds, called held-out fold. We compute the average value of the metrics of K models over K held-out folds. This average value, called cross-validated performance estimate, depends on the values of hyperparameters used when creating K models. When tuning hyperparameters, we search through the space of candidate hyperparameter values to find the ones that optimize cross-validation performance estimate. Grid search is a common search technique, where the space of candidate values of multiple hyperparameters is a cross-product of sets of candidate values of individual hyperparameters. 

Grid search using cross-validation can be time-consuming. If an algorithm has 5 hyperparameters, each with 5 candidate values, and we use K=5 folds, then to complete a grid search we need to train $5^5 \times 5 = 5^6 = 15625$ models. Fortunately, grid search using cross-validation is an embarrassingly parallel procedure and all these models can be trained in parallel.

In this particular case, the grid has four combinations of values of hyperparameters (maxDepth and maxBins). We use 3-fold cross-validation, resulting in 4x3=12 runs of RandomForestClassifier. To measure the performance of the models, we use the default metric of the BinaryClassificationEvaluator, which is `areaUnderROC`. The following code finds the best model, which applies the values of hyperparameters from the grid that maximize the cross-validated `areaUnderROC`. We use this best model as our trained\_model, which is to be saved for future scoring.

For more information about hyperparameter tuning with Azure ML Workbench please refer to article [Distributed tuning of hyperparameters using Azure Machine Learning Workbench](https://docs.microsoft.com/en-us/azure/machine-learning/preview/scenario-distributed-tuning-of-hyperparameters).

In [51]:
# Define the classifier, the metric used for model selection, and the
# hyperparameter grid (2 values of maxDepth x 2 values of maxBins = 4 settings).
clf = RandomForestClassifier(seed=0)
evaluator = BinaryClassificationEvaluator()
paramGrid = (
    ParamGridBuilder()
    .addGrid(clf.maxDepth, [5, 10])
    .addGrid(clf.maxBins, [32, 64])
    .build()
)

# Create the 3-fold CrossValidator. The seed fixes the assignment of rows to
# folds so that repeated runs select the same model, consistent with the
# seeded split, sampling and classifier above.
cv = CrossValidator(
    estimator=clf,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    seed=0,
)

# Run cross-validation. This can take up to 5 minutes: there are 2*2=4
# parameter settings, each of which is trained on 3 training sets.
cvModel = cv.fit(train)

# Get the best model
trained_model = cvModel.bestModel

In [52]:
print("The evaluation metric is {}.".format(evaluator.getMetricName()))

# `clf` is the unfitted estimator, so its getters only return the constructor
# defaults, not the values chosen by cross-validation. Recover the winning
# parameter map from the grid instead: cvModel.avgMetrics holds one averaged
# metric per entry of paramGrid, in the same order, and the best entry is the
# largest (or smallest, depending on the evaluator) of those.
avg_metrics = list(cvModel.avgMetrics)
if evaluator.isLargerBetter():
    best_index = int(np.argmax(avg_metrics))
else:
    best_index = int(np.argmin(avg_metrics))
best_params = paramGrid[best_index]
best_max_depth = best_params[clf.maxDepth]
best_max_bins = best_params[clf.maxBins]

print("Parameter MaxDepth of the best model is {}.".format(best_max_depth))
print("Parameter MaxBins of the best model is {}.".format(best_max_bins))

# Record the selected hyperparameters with the run.
logger.log("MaxDepth", best_max_depth)
logger.log("MaxBins", best_max_bins)


The evaluation metric is areaUnderROC.
Parameter MaxDepth of the best model is 5.
Parameter MaxBins of the best model is 32.


<azureml.logging.script_run_request.ScriptRunRequest at 0x7f6ea6da7358>

In [55]:
# Store the model
model_filename = 'wasb://model@{}.blob.core.windows.net/model'.format(account_name)

trained_model.save(model_filename)

## Evaluate the model

In [56]:
# Score the held-out test partition with the best model from cross-validation.
predictions = trained_model.transform(test)

# Compute the evaluator's default metric ("areaUnderROC") on the test
# predictions and show it as the cell result.
test_metric = evaluator.evaluate(predictions)
test_metric

0.6632974750185294

In [57]:
# Create the confusion matrix for the binary prediction results
# (rows = true label, columns = predicted label).
# This result assumes a decision boundary of p = 0.5

# Only the two columns needed for the matrix are collected to the driver;
# the remaining columns of `predictions` are not required here.
pred_pd = predictions.select('label', 'prediction').toPandas()
confuse = pd.crosstab(pred_pd['label'], pred_pd['prediction'])

# Guarantee a full 2x2 table: if the model never predicts one of the classes
# (or a class is absent from the test set), crosstab omits that column/row and
# the metric computations that index confuse['0.0'] / confuse['1.0'] would fail.
confuse = confuse.reindex(index=[0.0, 1.0], columns=[0.0, 1.0], fill_value=0)

# Column labels are converted to strings ('0.0', '1.0') for later lookups.
confuse.columns = confuse.columns.map(str)
print(confuse)

prediction    0.0  1.0
label                 
0.0         17928  220
1.0          2270  144


In [58]:
# Derive the test-set metrics from the confusion matrix, where rows are the
# true labels and columns are the predicted labels ('0.0' / '1.0').
predicted_pos = confuse['1.0']
predicted_neg = confuse['0.0']

# True positives / true negatives: the diagonal entries.
tp = predicted_pos[1]
tn = predicted_neg[0]

# False positives: everything predicted 1.0 that is not a true positive.
fp = predicted_pos.sum() - tp

# False negatives: everything predicted 0.0 that is not a true negative.
fn = predicted_neg.sum() - tn

# Accuracy is diagonal / total.
acc_n = tn + tp
acc_d = confuse[['0.0', '1.0']].values.sum()
acc = acc_n / acc_d

# Precision, recall and their harmonic mean (F1).
prec = tp / (tp + fp)
rec = tp / (tp + fn)
f1 = 2.0 * prec * rec / (prec + rec)

# Print the evaluation metrics to the notebook
print("Accuracy = %g" % acc)
print("Precision = %g" % prec)
print("Recall = %g" % rec)
print("F1 = %g" % f1)
print("")

# logger writes information back into the AML Workbench run time page.
# Each title (i.e. "Model Accuracy") can be shown as a graph to track
# how the metric changes between runs.
logger.log("Model Accuracy", acc)
logger.log("Model Precision", prec)
logger.log("Model Recall", rec)
logger.log("Model F1", f1)

Accuracy = 0.878903
Precision = 0.395604
Recall = 0.059652
F1 = 0.103672



<azureml.logging.script_run_request.ScriptRunRequest at 0x7f6e9c1a4208>