Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

-sandbox
# Logistic Regression Lab

## Getting Started

Run the following cell to configure our "classroom."

In [4]:
%run "../includes/setup_env"

## Reading the data

We begin by reading the data that we finished pre-processing in a prior Notebook.

In [7]:
# Load the pre-processed table, then mark it for caching because the cells
# below derive both the training and the test split from it.
df = spark.read.parquet("dbfs:/FileStore/tables/processed")
df.cache()
display(df)

machineID,datetime,age,diff_error_0,diff_error_1,diff_error_2,diff_error_3,diff_error_4,diff_fail_0,diff_fail_1,diff_fail_2,diff_fail_3,diff_maint_0,diff_maint_1,diff_maint_2,diff_maint_3,pressure_ma_3,pressure_sd_3,rotate_ma_3,rotate_sd_3,vibration_ma_3,vibration_sd_3,volt_ma_3,volt_sd_3,y_0,y_1,y_2,y_3
68,2015-06-02T02:00:00.000+0000,10,886.0,202.0,788.0,437.0,560.0,3744.0,1460.0,3744.0,3744.0,3620.0,1460.0,740.0,20.0,103.81582891024672,9.253355581538184,430.5864130097248,31.493749995315135,40.16587598649302,1.6465389692984078,158.83585185687275,9.677727435020747,0,0,0,0
68,2015-06-02T03:00:00.000+0000,10,887.0,203.0,789.0,438.0,561.0,3745.0,1461.0,3745.0,3745.0,3621.0,1461.0,741.0,21.0,102.7507371894521,9.46026253914918,449.09793827832,54.36847047577793,39.29227665841008,3.1737452872067906,155.62002882053525,5.934378021697804,0,0,0,0
68,2015-06-02T04:00:00.000+0000,10,888.0,204.0,790.0,439.0,562.0,3746.0,1462.0,3746.0,3746.0,3622.0,1462.0,742.0,22.0,98.40865905366375,9.921132987103828,454.0176178799003,46.8214802838024,39.63602781157347,3.2519960046284395,163.26790949871298,9.867349880395764,0,0,0,0
68,2015-06-02T05:00:00.000+0000,10,889.0,205.0,791.0,440.0,563.0,3747.0,1463.0,3747.0,3747.0,3623.0,1463.0,743.0,23.0,100.127746015699,8.682544288547872,412.387178314523,98.77834311146349,39.01357833550013,2.8780722984542235,169.02554035058023,11.294482815047914,0,0,0,0
68,2015-06-02T06:00:00.000+0000,10,890.0,206.0,792.0,441.0,564.0,3748.0,1464.0,3748.0,3748.0,3624.0,1464.0,744.0,24.0,96.3241649042921,3.698332492952136,395.977662295249,96.33139535678907,37.76242994650308,2.771101520027839,169.59476838986802,10.807074027044104,0,0,0,0
68,2015-06-02T07:00:00.000+0000,10,891.0,207.0,793.0,442.0,565.0,3749.0,1465.0,3749.0,3749.0,3625.0,1465.0,745.0,25.0,90.4357148378262,9.897125296488284,391.5564311429545,89.07959128648875,39.71850602993253,2.7650199464818943,167.53496086774126,14.127546346089517,0,0,0,0
68,2015-06-02T08:00:00.000+0000,10,892.0,208.0,794.0,443.0,566.0,3750.0,1466.0,3750.0,3750.0,3626.0,1466.0,746.0,26.0,94.40597784494906,12.9727921412837,381.05093645966303,89.72644636351149,38.47359267828488,3.2535570819381565,169.76856150497727,16.751344621025083,0,0,0,0
68,2015-06-02T09:00:00.000+0000,10,893.0,209.0,795.0,444.0,567.0,3751.0,1467.0,3751.0,3751.0,3627.0,1467.0,747.0,27.0,99.0644352604921,17.19794554905631,402.029343098885,65.2592113640437,40.49350742357385,5.676586773166511,162.77770812696176,17.09604748985298,0,0,0,0
68,2015-06-02T10:00:00.000+0000,10,894.0,210.0,796.0,445.0,568.0,3752.0,1468.0,3752.0,3752.0,3628.0,1468.0,748.0,28.0,97.65296306774668,17.681276410249108,388.24213452426574,75.79871445160546,41.29215582102195,5.034617237302095,160.78105152531725,17.468958502768274,0,0,0,0
68,2015-06-02T11:00:00.000+0000,10,895.0,211.0,797.0,446.0,569.0,3753.0,1469.0,3753.0,3753.0,3629.0,1469.0,749.0,29.0,103.56244389757204,10.71514820123632,381.425726346905,62.59894575579444,41.307563044273024,5.040233229223519,169.7362114948435,18.801349788702485,0,0,0,0


Let's begin by dividing the data into training and test sets. With time-series data, we usually split on a time cut-off, and to avoid **leakage** we also leave a gap (2 weeks in this case) between the training and test data. The data is collected hourly; if we do not wish to model at such a high frequency, another option is to sample every n-th row of the data.

In [9]:
# `pandas.datetime` was only a deprecated alias of the standard-library class
# (removed in pandas 2.0), so import it from `datetime` directly.
from datetime import datetime
from pyspark.sql.functions import col, hour

# Split on time: rows before 2015-10-01 are used for training and rows after
# 2015-10-15 for testing, which leaves a two-week gap between the two sets.
# Optional down-sampling to every 3rd hour is shown in the trailing comment on
# the training filter but is currently disabled.
df_train = df.filter(col('datetime') < datetime(2015, 10, 1)) # & (hour(col('datetime')) % 3 == 0))
df_test = df.filter(col('datetime') > datetime(2015, 10, 15))

Let's look at some summary statistics for the labels in the data.

In [11]:
# Summary statistics (count, mean, stddev, min, max) for the training split.
display(df_train.describe())

summary,machineID,age,diff_error_0,diff_error_1,diff_error_2,diff_error_3,diff_error_4,diff_fail_0,diff_fail_1,diff_fail_2,diff_fail_3,diff_maint_0,diff_maint_1,diff_maint_2,diff_maint_3,pressure_ma_3,pressure_sd_3,rotate_ma_3,rotate_sd_3,vibration_ma_3,vibration_sd_3,volt_ma_3,volt_sd_3,y_0,y_1,y_2,y_3
count,654600.0,654600.0,654600.0,654600.0,654600.0,654600.0,654600.0,654600.0,654600.0,654600.0,654600.0,654600.0,654600.0,654600.0,654600.0,654600.0,654600.0,654600.0,654600.0,654600.0,654600.0,654600.0,654600.0,654600.0,654600.0,654600.0,654600.0
mean,50.5,11.33,767.0613458600673,998.2097525206232,933.592835319279,1020.7992407577146,1615.1251894286588,2133.154399633364,1935.9317461044916,2652.421269477544,2349.709482126489,1113.8832447296058,1109.771384051329,1144.651457378552,1120.9993217231895,100.82513452018463,9.23999324497862,446.6710800808009,46.20725438107571,40.39648372227754,4.619880429306802,170.79153282266063,13.815335825063844,0.0146379468377635,0.0188496791934005,0.0108203483043079,0.0150733272227314
stddev,28.866092096380136,5.827619744009545,763.1257842489291,955.949091888753,994.1383169517125,995.1194154164108,1427.1820509876254,1655.1293083917906,1552.799216609122,1935.571758820077,1866.573331538548,1000.6749291181294,1023.52386226274,1038.5993590886062,1038.3281999294024,6.7831105591951895,3.91649947352248,29.65933821520036,19.4865553158303,3.172298181385354,1.9533044798825765,8.448016911894175,5.845427117512872,0.1200987068394558,0.1359941066395187,0.1034566803921302,0.121844756591689
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,76.0053324677532,0.0936970591161633,198.770337377762,0.2159233438022444,22.9732894818614,0.0164138199561251,135.9794565589355,0.0299919681797135,0.0,0.0,0.0,0.0
max,100.0,20.0,5482.0,6645.0,6645.0,6617.0,6645.0,6645.0,6645.0,6645.0,6645.0,6645.0,6645.0,6645.0,6645.0,164.87532404477776,32.69926439338247,576.923563024815,164.91776461098644,67.31208520010537,15.995336982084046,233.13577630387576,54.584229250416726,1.0,1.0,1.0,1.0


In [12]:
# Columns that must not be used as features. The original list had a misplaced
# quote that fused 'y_0' and 'y_1' into the single string 'y_0, y_1'.
X_drop = ['error_index', 'fail_index', 'maint_index', 'f_1', 'f_2', 'f_3', 'f_4', 'y_0', 'y_1', 'y_2', 'y_3', 'model']
# Label columns, one per component.
Y_keep = ['y_0', 'y_1', 'y_2', 'y_3']
# Identifier columns.
keys = ['machineID', 'datetime']

# Every remaining column is a feature. Iterating over df.columns (instead of
# converting a set to a list) keeps the feature order identical on every run.
excluded_cols = set(X_drop + Y_keep + keys)
X_keep = [c for c in df.columns if c not in excluded_cols]

We now build a classifier for `y_0` (failure in the first component) (and drop the other labels).

In [14]:
# Preview the columns dropped in the next cell: the key columns plus every label except the first.
keys + Y_keep[1:]

In [15]:
# Remove the key columns and every label except the first one, then expose the
# remaining label under the name "error". Both splits get the same treatment.
df_train = (df_train
            .drop(*(keys + Y_keep[1:]))
            .withColumnRenamed(Y_keep[0], "error"))
df_train.cache()

df_test = (df_test
           .drop(*(keys + Y_keep[1:]))
           .withColumnRenamed(Y_keep[0], "error"))
df_test.cache()

Let's make sure we don't have any null values in our DataFrame.

In [17]:
# Compare the total row count with the count left after discarding every row
# that contains a null; the difference is the number of rows with nulls.
noNullsRecordCount = df_train.dropna().count()
recordCount = df_train.count()

print("We have {} records that contain null values.".format(recordCount - noNullsRecordCount))

In [18]:
# Count the rows for each value of the label to see how the classes are balanced.
display(df_train.groupBy("error").count())

error,count
1,9582
0,645018


## Train a Logistic Regression Model

Before we can apply the logistic regression model, we will need to do some data preparation, such as one hot encoding our categorical variables using `StringIndexer` and `OneHotEncoderEstimator`.

Let's start by taking a look at all of our columns, and determine which ones are categorical.

In [20]:
# Print the schema of the training DataFrame so we can inspect its columns.
df_train.printSchema()

## Pipeline

Let's build some of the transformations we'll need in our pipeline, such as `VectorAssembler` and `LogisticRegression`.

We create a feature vector from the input columns using [`VectorAssembler`](https://spark.apache.org/docs/latest/ml-features.html#vectorassembler), rescale it using [`StandardScaler`](https://spark.apache.org/docs/latest/ml-features.html#standardscaler) to create the output column `norm_features`, which feeds into [`LogisticRegression`](https://spark.apache.org/docs/latest/ml-classification-regression.html#logistic-regression) to create our classifier for predicting failure.

### Hands-on lab
Create a pipeline that contains three stages as specified above. Then fit the pipeline to the training data and use the fitted model to get predictions for the test data.

In [24]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StandardScaler, VectorAssembler

# Stage 1: pack the feature columns listed in X_keep into one vector column.
vassembler = VectorAssembler(inputCols=X_keep, outputCol="features")

# Stage 2: rescale the assembled vector into "norm_features".
stndscaler = StandardScaler(inputCol="features", outputCol="norm_features")

# Stage 3: logistic regression on the rescaled features, with "error" as label.
lr = LogisticRegression(featuresCol="norm_features", labelCol="error")

# Chain the three stages in the order in which they must run.
pipeline = Pipeline(stages=[vassembler, stndscaler, lr])
print(pipeline.getStages())

In [25]:
# Fit every stage of the pipeline on the training split.
lr_model = pipeline.fit(df_train)

# Apply the fitted pipeline to the test split and keep only the label, the raw
# scores and the predicted class, which is all the evaluation cells need.
df_pred = (lr_model
           .transform(df_test)
           .select("error", "rawPrediction", "prediction"))
display(df_pred)

error,rawPrediction,prediction
0,"List(1, 2, List(), List(4.487099562530801, -4.487099562530801))",0.0
0,"List(1, 2, List(), List(5.013388183023125, -5.013388183023125))",0.0
0,"List(1, 2, List(), List(5.6455802986316534, -5.6455802986316534))",0.0
0,"List(1, 2, List(), List(6.7664629395784495, -6.7664629395784495))",0.0
0,"List(1, 2, List(), List(6.827698084260586, -6.827698084260586))",0.0
0,"List(1, 2, List(), List(7.18702061962561, -7.18702061962561))",0.0
0,"List(1, 2, List(), List(6.8735934094136795, -6.8735934094136795))",0.0
0,"List(1, 2, List(), List(6.368971756277576, -6.368971756277576))",0.0
0,"List(1, 2, List(), List(6.257945908729084, -6.257945908729084))",0.0
0,"List(1, 2, List(), List(5.336610385963146, -5.336610385963146))",0.0


### End of lab

## Evaluate the Model

In [28]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Create an evaluator with its default settings and print the description of
# each of its parameters so we can see what is configurable.
evaluator = BinaryClassificationEvaluator()
print(evaluator.explainParams())

In [29]:
# Point the evaluator at our label column and at the raw-score column.
evaluator.setLabelCol("error").setRawPredictionCol('rawPrediction')

# Score the test-set predictions with the evaluator's currently configured
# metric, and look up that metric's name for the report line.
metricVal = evaluator.evaluate(df_pred)
metricName = evaluator.getMetricName()

print("{}: {}".format(metricName, metricVal))

We could wrap this into a function to make it easier to get the output of multiple metrics.

In [31]:
def printEval(df, labelCol = "error", rawPredictionCol = "rawPrediction"):
  """Print the area under the ROC curve and under the PR curve for `df`.

  Args:
    df: DataFrame of predictions to evaluate.
    labelCol: name of the column holding the true label.
    rawPredictionCol: name of the column holding the raw prediction.

  Returns:
    None; the two metrics are written to standard output.
  """
  evaluator = BinaryClassificationEvaluator(labelCol=labelCol, rawPredictionCol=rawPredictionCol)

  # Evaluate the same predictions once per metric, in reporting order.
  scores = {}
  for metric in ("areaUnderROC", "areaUnderPR"):
    scores[metric] = evaluator.setMetricName(metric).evaluate(df)

  print("AUROC: {}\nAUPR: {}".format(scores["areaUnderROC"], scores["areaUnderPR"]))

In [32]:
# Report AUROC and AUPR for the test-set predictions.
printEval(df_pred)

## Conclusion
Hmmmm... our results are not great yet. We'll look into how to improve our results later.

Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.