In [1]:
import json
import os
import pickle
import sys
import tarfile
from collections import OrderedDict
from datetime import date
from functools import reduce

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyspark
from scipy import signal

from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml import PipelineModel
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorAssembler, VectorIndexer
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import udf, mean, lit, stddev, col, expr, when
from pyspark.sql.types import DoubleType, ArrayType, ShortType, LongType, IntegerType

In [2]:
# Create (or attach to) the Spark entry points explicitly instead of relying on
# names that only exist when the kernel pre-populates them.
# `SparkContext` itself is never imported in this notebook, so it is reached
# through the already-imported `pyspark` package.
sc = pyspark.SparkContext.getOrCreate()

# `spark` is used by later cells to read data; getOrCreate() returns the
# already-active session when one exists.
spark = SparkSession.builder.getOrCreate()

sql = SQLContext.getOrCreate(sc)

In [3]:
# Location of the feature table written by the upstream feature-engineering step.
STORAGE_ACCOUNT_SUFFIX = 'core.windows.net'
# Read from the environment so that no account detail is hard-coded in the
# notebook; raises KeyError immediately if the variable is not set.
STORAGE_ACCOUNT_NAME = os.environ['STORAGE_ACCOUNT_NAME']
STORAGE_CONTAINER_NAME = 'intermediate'

wasbUrlOutput = "wasb://{container}@{account}.blob.{suffix}/features.parquet".format(
    container=STORAGE_CONTAINER_NAME,
    account=STORAGE_ACCOUNT_NAME,
    suffix=STORAGE_ACCOUNT_SUFFIX)

# Load the feature table and show its schema as a quick sanity check.
dfa = spark.read.parquet(wasbUrlOutput)
dfa.printSchema()

root
 |-- EnqueuedTimeUtc: string (nullable = true)
 |-- machineID: string (nullable = true)
 |-- speed: double (nullable = true)
 |-- temperature: double (nullable = true)
 |-- pressure: double (nullable = true)
 |-- f0: double (nullable = true)
 |-- f1: double (nullable = true)
 |-- f2: double (nullable = true)
 |-- a0: double (nullable = true)
 |-- a1: double (nullable = true)
 |-- a2: double (nullable = true)
 |-- temperature_n: double (nullable = true)
 |-- pressure_n: double (nullable = true)
 |-- f0_n: double (nullable = true)
 |-- f1_n: double (nullable = true)
 |-- f2_n: double (nullable = true)
 |-- a0_n: double (nullable = true)
 |-- a1_n: double (nullable = true)
 |-- a2_n: double (nullable = true)



In [4]:
# Model inputs: every column except the machine identifier and the timestamp.
features = list(filter(lambda name: name not in ('machineID', 'EnqueuedTimeUtc'),
                       dfa.columns))

# Pack the input columns into the single vector column expected by Spark ML.
va = VectorAssembler(outputCol='features', inputCols=features)

# HACK: the label is derived from the last character of machineID, cast to an
# integer. In the current simulated dataset there are 4 machines: machine 0 is
# "good", whereas the rest are experiencing different failures. This is meant
# to be replaced eventually by merging telemetry with the "maintenance log".
feat_data = (
    va.transform(dfa)
      .withColumn('label_e', col('machineID').substr(-1, 1).cast(IntegerType()))
)

# Release the source frame and keep the assembled frame on disk for the
# repeated passes made by the following cells.
dfa.unpersist()
feat_data.persist(pyspark.StorageLevel.DISK_ONLY)

# Preview a few assembled rows.
feat_data.limit(10).toPandas().head()

Unnamed: 0,EnqueuedTimeUtc,machineID,speed,temperature,pressure,f0,f1,f2,a0,a1,...,temperature_n,pressure_n,f0_n,f1_n,f2_n,a0_n,a1_n,a2_n,features,label_e
0,2018-03-15T00:49:22.4250000Z,pm1-354,1220.0,89.188682,2439.961953,244.0,81.0,41.0,2128.662248,1211.514845,...,0.073105,1.999969,0.2,0.066393,0.033607,1.744805,0.993045,0.884288,"[1220.0, 89.1886821271, 2439.96195289, 244.0, ...",4
1,2018-03-15T00:49:23.3780000Z,pm1-354,1220.0,89.207477,2439.962003,244.0,81.0,41.0,2128.354364,1211.348214,...,0.073121,1.999969,0.2,0.066393,0.033607,1.744553,0.992908,0.884706,"[1220.0, 89.207476832, 2439.96200276, 244.0, 8...",4
2,2018-03-15T00:49:23.3940000Z,pm1-355,1330.0,98.270397,2659.946388,266.0,111.0,89.0,2128.128247,1253.766908,...,0.073888,1.99996,0.2,0.083459,0.066917,1.600096,0.942682,0.909901,"[1330.0, 98.270396719, 2659.94638802, 266.0, 1...",5
3,2018-03-15T00:49:24.4430000Z,pm1-354,1220.0,89.226255,2439.962053,244.0,81.0,41.0,2131.79487,1250.348996,...,0.073136,1.999969,0.2,0.066393,0.033607,1.747373,1.024876,0.912406,"[1220.0, 89.2262553905, 2439.96205252, 244.0, ...",4
4,2018-03-15T00:49:24.4900000Z,pm1-355,1330.0,98.290864,2659.946458,266.0,111.0,89.0,2123.030112,1283.127391,...,0.073903,1.99996,0.2,0.083459,0.066917,1.596263,0.964757,0.923692,"[1330.0, 98.290863509, 2659.94645819, 266.0, 1...",5


In [5]:
# Label indexer: fitted on the complete dataset (not just a training split)
# so that every label value is present in the index.
labelIndexer = StringIndexer(
    outputCol="indexedLabel",
    inputCol="label_e"
).fit(feat_data)

# Feature indexer: with maxCategories=10, any feature having more than 10
# distinct values is treated as continuous; the rest are indexed as categorical.
featureIndexer = VectorIndexer(
    maxCategories=10,
    inputCol="features",
    outputCol="indexedFeatures"
).fit(feat_data)

In [6]:
# Random 80/20 split with a fixed seed so the partition is repeatable.
training, test = feat_data.randomSplit([0.8, 0.2], seed=12345)

# Report the size of each partition (training first, then test).
for subset in (training, test):
    print(subset.count())

# Preview a few rows of the full feature table.
feat_data.limit(5).toPandas().head()

40030
10060


Unnamed: 0,EnqueuedTimeUtc,machineID,speed,temperature,pressure,f0,f1,f2,a0,a1,...,temperature_n,pressure_n,f0_n,f1_n,f2_n,a0_n,a1_n,a2_n,features,label_e
0,2018-03-15T00:49:22.4250000Z,pm1-354,1220.0,89.188682,2439.961953,244.0,81.0,41.0,2128.662248,1211.514845,...,0.073105,1.999969,0.2,0.066393,0.033607,1.744805,0.993045,0.884288,"[1220.0, 89.1886821271, 2439.96195289, 244.0, ...",4
1,2018-03-15T00:49:23.3780000Z,pm1-354,1220.0,89.207477,2439.962003,244.0,81.0,41.0,2128.354364,1211.348214,...,0.073121,1.999969,0.2,0.066393,0.033607,1.744553,0.992908,0.884706,"[1220.0, 89.207476832, 2439.96200276, 244.0, 8...",4
2,2018-03-15T00:49:23.3940000Z,pm1-355,1330.0,98.270397,2659.946388,266.0,111.0,89.0,2128.128247,1253.766908,...,0.073888,1.99996,0.2,0.083459,0.066917,1.600096,0.942682,0.909901,"[1330.0, 98.270396719, 2659.94638802, 266.0, 1...",5
3,2018-03-15T00:49:24.4430000Z,pm1-354,1220.0,89.226255,2439.962053,244.0,81.0,41.0,2131.79487,1250.348996,...,0.073136,1.999969,0.2,0.066393,0.033607,1.747373,1.024876,0.912406,"[1220.0, 89.2262553905, 2439.96205252, 244.0, ...",4
4,2018-03-15T00:49:24.4900000Z,pm1-355,1330.0,98.290864,2659.946458,266.0,111.0,89.0,2123.030112,1283.127391,...,0.073903,1.99996,0.2,0.083459,0.066917,1.596263,0.964757,0.923692,"[1330.0, 98.290863509, 2659.94645819, 266.0, 1...",5


In [7]:
model_type = 'RandomForest' # Use 'DecisionTree', or 'RandomForest'

# Hyper-parameters shared by both tree-based learners, defined once so the two
# model types cannot drift apart.
tree_params = dict(
    labelCol="indexedLabel",
    featuresCol="indexedFeatures",
    # Maximum depth of the tree. (>= 0)
    # E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.
    maxDepth=15,
    # Max number of bins for discretizing continuous features.
    # Must be >=2 and >= number of categories for any categorical feature.
    maxBins=32,
    # Minimum number of instances each child must have after split.
    # If a split causes the left or right child to have fewer than
    # minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.
    minInstancesPerNode=1,
    # Minimum information gain for a split to be considered at a tree node.
    minInfoGain=0.0,
    # Criterion used for information gain calculation (case-insensitive).
    # Supported options: entropy, gini
    impurity="gini",
    # Fixed seed (same value as the train/test split) so that re-running the
    # notebook trains the same model.
    seed=12345,
)

# train a model.
if model_type == 'DecisionTree':
    model = DecisionTreeClassifier(**tree_params)
elif model_type == 'RandomForest':
    model = RandomForestClassifier(
        # Number of trees to train (>= 1)
        numTrees=50,
        # The number of features to consider for splits at each tree node.
        # Supported options: auto, all, onethird, sqrt, log2, (0.0-1.0], [1-n].
        featureSubsetStrategy="sqrt",
        # Fraction of the training data used for learning each
        # decision tree, in range (0, 1].
        subsamplingRate=0.632,
        **tree_params)
else:
    # Fail loudly instead of silently training a RandomForest for a
    # misspelled or unsupported model_type.
    raise ValueError(
        "Unsupported model_type {0!r}; use 'DecisionTree' or 'RandomForest'".format(model_type))

# chain indexers and model in a Pipeline
pipeline_cls_mthd = Pipeline(stages=[labelIndexer, featureIndexer, model])

# train model.  This also runs the indexers.
model_pipeline = pipeline_cls_mthd.fit(training)

In [8]:
# Score the held-out split. The fitted pipeline applies the same indexing
# stages to the test rows before the classifier produces its predictions.
predictions = model_pipeline.transform(test)

# Confusion matrix for the multiclass results: one row per indexed true label,
# one column per predicted class. The classifier's `prediction` column is used
# as-is; no custom probability threshold is applied in this cell.
conf_table = predictions.crosstab('indexedLabel', 'prediction')

# Bring the small table to pandas and order the rows by true label.
confuse = conf_table.toPandas().sort_values('indexedLabel_prediction')
confuse.head()

Unnamed: 0,indexedLabel_prediction,0.0,1.0,2.0,3.0
3,0.0,2523,0,0,0
1,1.0,0,2491,0,0
0,2.0,0,0,2532,0
2,3.0,0,0,0,2514


In [16]:
# Persist the fitted pipeline, replacing any model saved by a previous run.
model_pipeline.write().overwrite().save('./model')

# NOTE(review): the save path is resolved by Spark, while tarfile reads the
# local filesystem -- confirm both refer to the same './model' directory in the
# environment where this notebook runs.

# Bundle the saved model directory into a gzip-compressed archive. The context
# manager guarantees the archive is closed (and flushed) even if add() raises.
with tarfile.open("model.tar.gz", "w:gz") as tar:
    tar.add("./model", arcname="model")