# Step 2: Feature Engineering

In this notebook, we will load the data stored in Azure Blob containers in the previous **Data Ingestion** notebook, and create the features used in our predictive maintenance machine learning solution. 

In [17]:
# Setup our environment by importing required libraries
import os

from pyspark.sql.functions import col
from pyspark.sql.functions import datediff
from pyspark.sql.window import Window

from pyspark.ml import Pipeline

from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer

from pyspark.sql.types import DoubleType

from azure.storage.blob import BlockBlobService

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

## Load data from Azure Blob storage container

We have previously downloaded and stored the following data in an Azure blob storage container:


  * Machines: Features differentiating each machine. For example age and model.
  * Error: The log of non-critical errors. These errors may still indicate an impending component failure.
  * Maint: Machine maintenance history detailing component replacement or regular maintenance activities withe the date of replacement.
  * Telemetry: The operating conditions of a machine e.g. data collected from sensors.
  * Failure history: The failure history of a machine or component within the machine.

We'll load these files from blob, and create our analysis data set here. We'll write this data set back into a new blob container to use in our model building and evaluation notebook later. 

Since the Azure Blob storage account name and account key are not passed between notebooks, you'll need to provide those here again.

In [18]:
# Enter your Azure blob storage details here 
ACCOUNT_NAME = "pdmamlworkbench"   ## "<your blob storage account name>"

# You can find the account key under the _Access Keys_ link in the 
# [Azure Portal](portal.azure.com) page for your Azure storage container.
ACCOUNT_KEY = "O5uLzNKX7o+ZHFXtHDyS87SIev9QHlkdX2IhIbxYwhRo7sA9zp45HOOFFttUp4r0LyWCcLQ0cCA7l+e8Ct3Yew==" ## "<account key>"

#-------------------------------------------------------------------------------------------
# The data from the Data Aquisition note book is stored in the dataingestion container.
CONTAINER_NAME = "dataingestion"

# Connect to your blob service     
my_service = BlockBlobService(account_name=ACCOUNT_NAME, account_key=ACCOUNT_KEY)

### Machines data set

Load the machines data set.

In [19]:
# download the entire parquet result folder to local path for a new run 
LOCAL_DIRECT = 'dataingestion_machines_result.parquet'
if not os.path.exists(LOCAL_DIRECT):
    os.makedirs(LOCAL_DIRECT)
    print('DONE creating a local directory!')

for blob in my_service.list_blobs(CONTAINER_NAME):
    if 'machines_files.parquet' in blob.name:
        local_file = os.path.join(LOCAL_DIRECT, os.path.basename(blob.name))
        my_service.get_blob_to_path(CONTAINER_NAME, blob.name, local_file)

machines = spark.read.parquet(LOCAL_DIRECT)

print(machines.count())
machines.toPandas().head(20)

1000


Unnamed: 0,machineID,model,age
0,1,model2,18
1,2,model4,7
2,3,model3,8
3,4,model3,7
4,5,model2,2
5,6,model3,7
6,7,model4,20
7,8,model3,16
8,9,model1,7
9,10,model1,10


# Errors

In [20]:
# load the previous created final dataset into the workspace

# create a local path where to store the results later.
LOCAL_DIRECT = 'dataingestion_err_result.parquet'
if not os.path.exists(LOCAL_DIRECT):
    os.makedirs(LOCAL_DIRECT)
    print('DONE creating a local directory!')

# download the entire parquet result folder to local path for a new run 
for blob in my_service.list_blobs(CONTAINER_NAME):
    if 'errors_files.parquet' in blob.name:
        local_file = os.path.join(LOCAL_DIRECT, os.path.basename(blob.name))
        my_service.get_blob_to_path(CONTAINER_NAME, blob.name, local_file)

errors = spark.read.parquet(LOCAL_DIRECT)

print(errors.count())
errors.toPandas().head(20)

11967


Unnamed: 0,datetime,machineID,errorID
0,2015-04-08 19:00:00,251,error3
1,2015-06-09 06:00:00,251,error1
2,2015-08-08 06:00:00,251,error4
3,2015-09-07 06:00:00,251,error2
4,2015-09-07 06:00:00,251,error3
5,2015-09-22 06:00:00,251,error1
6,2015-09-22 06:00:00,251,error4
7,2015-12-06 06:00:00,251,error4
8,2015-01-03 06:00:00,252,error1
9,2015-01-20 12:00:00,252,error3


# Maintenance

In [21]:
# load the previous created final dataset into the workspace
# create a local path where to store the results later.
LOCAL_DIRECT = 'dataingestion_maint_result.parquet'
if not os.path.exists(LOCAL_DIRECT):
    os.makedirs(LOCAL_DIRECT)
    print('DONE creating a local directory!')

# download the entire parquet result folder to local path for a new run 
for blob in my_service.list_blobs(CONTAINER_NAME):
    if 'maint_files.parquet' in blob.name:
        local_file = os.path.join(LOCAL_DIRECT, os.path.basename(blob.name))
        my_service.get_blob_to_path(CONTAINER_NAME, blob.name, local_file)

maint = spark.read.parquet(LOCAL_DIRECT)

print(maint.count())
maint.toPandas().head(20)

32592


Unnamed: 0,datetime,machineID,comp
0,2015-01-04 06:00:00,252,comp1
1,2015-01-19 06:00:00,252,comp4
2,2015-02-18 06:00:00,252,comp3
3,2015-03-05 06:00:00,252,comp2
4,2015-03-20 06:00:00,252,comp1
5,2015-04-04 06:00:00,252,comp1
6,2015-04-19 06:00:00,252,comp2
7,2015-06-03 06:00:00,252,comp4
8,2015-06-18 06:00:00,252,comp1
9,2015-07-18 06:00:00,252,comp4


# Telemetry

In [22]:

# download the entire parquet result folder to local path for a new run 
# create a local path where to store the results later.
LOCAL_DIRECT = 'dataingestion_tel_result.parquet'
if not os.path.exists(LOCAL_DIRECT):
    os.makedirs(LOCAL_DIRECT)
    print('DONE creating a local directory!')
    
for blob in my_service.list_blobs(CONTAINER_NAME):
    if 'telemetry_files.parquet' in blob.name:
        local_file = os.path.join(LOCAL_DIRECT, os.path.basename(blob.name))
        my_service.get_blob_to_path(CONTAINER_NAME, blob.name, local_file)

telemetry = spark.read.parquet(LOCAL_DIRECT)

print(telemetry.count())
telemetry.toPandas().head(20)

8761000


Unnamed: 0,datetime,machineID,volt,rotate,pressure,vibration
0,1420711200000000000,501,165.775142,456.014484,96.779707,40.200315
1,1420714800000000000,501,167.694494,415.396525,106.346838,39.45432
2,1420718400000000000,501,149.286911,549.794168,110.590462,46.649346
3,1420722000000000000,501,164.315444,485.343432,102.644426,38.615502
4,1420725600000000000,501,178.789891,447.830204,100.238279,36.380291
5,1420729200000000000,501,137.114258,544.0499,114.2288,41.865415
6,1420732800000000000,501,186.462256,453.722096,89.747835,38.535097
7,1420736400000000000,501,194.811026,436.807652,86.59867,30.765251
8,1420740000000000000,501,149.091834,440.036746,93.430204,46.478393
9,1420743600000000000,501,175.940093,457.777419,103.471539,33.161014


# Failures

In [23]:
# load the previous created final dataset into the workspace
# create a local path where to store the results later.
LOCAL_DIRECT = 'dataingestion_fail_result.parquet'
if not os.path.exists(LOCAL_DIRECT):
    os.makedirs(LOCAL_DIRECT)
    print('DONE creating a local directory!')

# download the entire parquet result folder to local path for a new run 
for blob in my_service.list_blobs(CONTAINER_NAME):
    if 'failure_files.parquet' in blob.name:
        local_file = os.path.join(LOCAL_DIRECT, os.path.basename(blob.name))
        my_service.get_blob_to_path(CONTAINER_NAME, blob.name, local_file)

failures = spark.read.parquet(LOCAL_DIRECT)

print(failures.count())
failures.toPandas().head(20)

6726


Unnamed: 0,datetime,machineID,failure
0,2015-09-18 06:00:00,453,comp2
1,2015-12-17 06:00:00,453,comp2
2,2015-03-27 06:00:00,454,comp2
3,2015-08-24 06:00:00,454,comp2
4,2015-09-23 06:00:00,454,comp1
5,2015-11-07 06:00:00,454,comp1
6,2015-03-14 06:00:00,455,comp1
7,2015-12-24 06:00:00,455,comp1
8,2015-07-01 06:00:00,456,comp1
9,2015-11-28 06:00:00,456,comp1


# Feature engineering 

## Lag features from Telemetry

In [24]:
rolling_features = ['volt','rotate', 'pressure', 'vibration']
               
# lag window 3hrs, 24 hrs
lags = [3,24]

print(len(rolling_features))

4


In [25]:
# rolling mean
tel_mean = telemetry

for lag_n in lags:
    wSpec = Window.partitionBy('machineID').orderBy('datetime').rowsBetween(1-lag_n, 0)
    for col_name in rolling_features:
        tel_mean = tel_mean.withColumn(col_name+'_rollingmean_'+str(lag_n), F.avg(col(col_name)).over(wSpec))
        print("Lag = %d, Column = %s" % (lag_n, col_name))

NameError: name 'F' is not defined

In [None]:
# rolling std
tel_sd = telemetry

for lag_n in lags:
    wSpec = Window.partitionBy('machineID').orderBy('datetime').rowsBetween(1-lag_n, 0)
    for col_name in rolling_features:
        tel_sd = tel_sd.withColumn(col_name+'_rollingstd_'+str(lag_n), F.stddev(col(col_name)).over(wSpec))
        print("Lag = %d, Column = %s" % (lag_n, col_name))

In [None]:
tel_mean.where((col("machineID") == 1)).show(5)
#tel_sd.show(5)

# Resample telemetry time variable to every 3 hours

In [None]:
# tel_mean rolling mean
# 3 hours = 10800 seconds  
time_val = 10800
dt_truncated = ((round(unix_timestamp(col("datetime")) / time_val) * time_val)
    .cast("timestamp"))

tel_mean_resampled = tel_mean.withColumn("dt_truncated", dt_truncated).drop('volt', 'rotate', 'pressure', 'vibration')
tel_mean_resampled.where((col("machineID") == 1)).show(5)

tel_mean_resampled1 = (tel_mean_resampled.groupBy("machineID","dt_truncated")
                               .agg(F.mean('volt_rollingmean_3').alias('volt_rollingmean_3'),
                                    F.mean('rotate_rollingmean_3').alias('rotate_rollingmean_3'), 
                                    F.mean('pressure_rollingmean_3').alias('pressure_rollingmean_3'), 
                                    F.mean('vibration_rollingmean_3').alias('vibration_rollingmean_3'), 
                                    F.mean('volt_rollingmean_24').alias('volt_rollingmean_24'),
                                    F.mean('rotate_rollingmean_24').alias('rotate_rollingmean_24'), 
                                    F.mean('pressure_rollingmean_24').alias('pressure_rollingmean_24'), 
                                    F.mean('vibration_rollingmean_24').alias('vibration_rollingmean_24')))
tel_mean_resampled1.where((col("machineID") == 1)).show(5)
tel_mean_resampled1.count()

In [None]:
# tel_sd rolling sd
dt_truncated = ((round(unix_timestamp(col("datetime")) / time_val) * time_val)
    .cast("timestamp"))

tel_sd_resampled = (tel_sd.withColumn("dt_truncated", dt_truncated).drop('volt', 'rotate', 'pressure', 'vibration')
                        .fillna(0))
tel_sd_resampled.show(5)

tel_sd_resampled1 = (tel_sd_resampled.groupBy("machineID","dt_truncated")
                               .agg(F.mean('volt_rollingstd_3').alias('volt_rollingstd_3'),
                                    F.mean('rotate_rollingstd_3').alias('rotate_rollingstd_3'), 
                                    F.mean('pressure_rollingstd_3').alias('pressure_rollingstd_3'), 
                                    F.mean('vibration_rollingstd_3').alias('vibration_rollingstd_3'), 
                                    F.mean('volt_rollingstd_24').alias('volt_rollingstd_24'),
                                    F.mean('rotate_rollingstd_24').alias('rotate_rollingstd_24'), 
                                    F.mean('pressure_rollingstd_24').alias('pressure_rollingstd_24'), 
                                    F.mean('vibration_rollingstd_24').alias('vibration_rollingstd_24')))
tel_sd_resampled1.show(5)
tel_sd_resampled1.count()

## Lag features from Errors

In [None]:
errors.show(5)
errors.toPandas()['errorID'].unique()

In [None]:
# create a column for each errorID 
error1 = errors.groupBy("machineID","datetime","errorID").pivot('errorID').agg(F.count('machineID').alias('dummy'))

error1.show(5, False)
error1.count(), len(error1.columns)

In [None]:
# remove the column called errorID and fill in missing values
error2 = error1.drop('errorID').fillna(0)
error2.show(5, False)
error2.count(), len(error2.columns)

In [None]:
# combine errors for a given machine in a given hour
error3 = (error2.groupBy("machineID","datetime")
                .agg(F.sum('error1').alias('error1sum'), 
                     F.sum('error2').alias('error2sum'), 
                     F.sum('error3').alias('error3sum'), 
                     F.sum('error4').alias('error4sum'), 
                     F.sum('error5').alias('error5sum')))

error3.show(5, False)
error3.count(), len(error3.columns)

In [None]:
# join the telemetry data with errors
error_count = (telemetry.join(error3, ((telemetry['machineID'] == error3['machineID']) 
                                  & (telemetry['datetime'] == error3['datetime'])), "left")
               .drop('volt', 'rotate', 'pressure', 'vibration').drop(error3.machineID).drop(error3.datetime))

error_count.show(5, False)
error_count.count(), len(error_count.columns)

In [None]:
# fill in missing value
error_count1 = error_count.fillna(0)

error_count1.show(5, False)
error_count1.count(), len(error_count1.columns)

In [None]:
# check the data statistics
error_count1.describe("error1sum","error2sum", "error3sum", "error4sum", "error5sum").show()

In [None]:
rolling_features1 = ['error1sum','error2sum', 'error3sum', 'error4sum', 'error5sum']
               
# lag window 24 hrs
lags = [24]

print(len(rolling_features1))

In [None]:
# rolling mean
err_mean = error_count1

for lag_n in lags:
    wSpec = Window.partitionBy('machineID').orderBy('datetime').rowsBetween(1-lag_n, 0)
    for col_name in rolling_features1:
        err_mean = err_mean.withColumn(col_name+'_rollingmean_'+str(lag_n), F.avg(col(col_name)).over(wSpec))
        print("Lag = %d, Column = %s" % (lag_n, col_name))

In [None]:
err_mean.show(3)
err_mean.count()

# Resample error time variable to every 3 hours

In [None]:
dt_truncated = ((round(unix_timestamp(col("datetime")) / time_val) * time_val)
    .cast("timestamp"))

err_mean_resampled = (err_mean.withColumn("dt_truncated", dt_truncated)
                    .drop('error1sum', 'error2sum', 'error3sum', 'error4sum', 'error5sum').fillna(0))
err_mean_resampled.show(5)
err_mean_resampled.dtypes

err_mean_resampled1 = (err_mean_resampled.groupBy("machineID","dt_truncated")
                               .agg(F.mean('error1sum_rollingmean_24').alias('error1sum_rollingmean_24'), 
                                    F.mean('error2sum_rollingmean_24').alias('error2sum_rollingmean_24'), 
                                    F.mean('error3sum_rollingmean_24').alias('error3sum_rollingmean_24'), 
                                    F.mean('error4sum_rollingmean_24').alias('error4sum_rollingmean_24'), 
                                    F.mean('error5sum_rollingmean_24').alias('error5sum_rollingmean_24')))
err_mean_resampled1.show(5)
err_mean_resampled1.count()

## Days since last replacement from maintenance 

In [None]:
maint.show(5)
maint.toPandas()['comp'].unique()

In [None]:
# create a column for each comp 
maint1 = maint.groupBy("machineID","datetime","comp").pivot('comp').agg(F.count('machineID').alias('dummy'))

maint1.show(5, False)
maint1.count(), len(error1.columns)

In [None]:
# remove the column called comp and fill in missing values
maint2 = maint1.drop('comp').fillna(0)
maint2.show(5, False)
maint2.count(), len(maint2.columns)

In [None]:
# combine maintenance for a given machine in a given hour
maint3 = (maint2.groupBy("machineID","datetime").agg(F.sum('comp1').alias('comp1sum'), 
                                                    F.sum('comp2').alias('comp2sum'), 
                                                    F.sum('comp3').alias('comp3sum'),
                                                    F.sum('comp4').alias('comp4sum')))

maint3.show(5, False)
maint3.count(), len(maint3.columns)

In [None]:
## test code for days since last replacement
maint3.show(5, False)
maint3.count(), len(maint3.columns)
maint3.dtypes

## Days since last replacement for component-1

In [None]:
test_maint_comp1 = (maint3.where((col("comp1sum") == '1')).withColumnRenamed('datetime','datetime_maint')
                           .drop('comp2sum', 'comp3sum', 'comp4sum'))
print(test_maint_comp1.count())

In [None]:
test_tel_comp1 = (telemetry.withColumnRenamed('datetime','datetime_tel')
                  .drop(telemetry.volt).drop(telemetry.rotate).drop(telemetry.pressure).drop(telemetry.vibration))
test_tel_comp1.count()

In [None]:
test_maint_tel_comp1 = test_tel_comp1.join(test_maint_comp1, ((test_tel_comp1['machineID']==
                                                               test_maint_comp1['machineID']) 
                                            & (test_tel_comp1['datetime_tel'] > test_maint_comp1['datetime_maint']) 
                                            & (test_maint_comp1['comp1sum'] == '1'))).drop(test_maint_comp1.machineID)
test_maint_tel_comp1.show(5)

In [None]:
comp1 = (test_maint_tel_comp1.withColumn("sincelastcomp1", 
              datediff(test_maint_tel_comp1.datetime_tel, test_maint_tel_comp1.datetime_maint))
              .drop(test_maint_tel_comp1.datetime_maint).drop(test_maint_tel_comp1.comp1sum))
comp1.show(5)

In [None]:
comp1.describe("sincelastcomp1").show()

## Days since last replacement for component-2

In [None]:
test_maint_comp2 = (maint3.where(col("comp2sum") == '1').withColumnRenamed('datetime','datetime_maint')
                         .drop('comp1sum', 'comp3sum', 'comp4sum'))
print(test_maint_comp2.count())

In [None]:
test_tel_comp2 = (telemetry.withColumnRenamed('datetime','datetime_tel')
                          .drop(telemetry.volt).drop(telemetry.rotate).drop(telemetry.pressure)
                          .drop(telemetry.vibration))
print(test_tel_comp2.count())

In [None]:
test_maint_tel_comp2 = (test_tel_comp2.join(test_maint_comp2, ((test_tel_comp2['machineID']==
                                                                test_maint_comp2['machineID']) 
                                        & (test_tel_comp2['datetime_tel'] > test_maint_comp2['datetime_maint']) 
                                        & (test_maint_comp2['comp2sum'] == '1') 
                                           )).drop(test_maint_comp2.machineID))
test_maint_tel_comp2.show(5)

In [None]:
comp2 = (test_maint_tel_comp2.withColumn("sincelastcomp2", 
              datediff(test_maint_tel_comp2.datetime_tel, test_maint_tel_comp2.datetime_maint))
              .drop(test_maint_tel_comp2.datetime_maint).drop(test_maint_tel_comp2.comp2sum))
comp2.show(5)

In [None]:
comp2.describe("sincelastcomp2").show()

## Days since last replacement for component-3

In [None]:
test_maint_comp3 = (maint3.where(col("comp3sum") == '1').withColumnRenamed('datetime','datetime_maint')
                          .drop('comp1sum', 'comp2sum', 'comp4sum'))
print(test_maint_comp3.count())

In [None]:
test_tel_comp3 = (telemetry.withColumnRenamed('datetime','datetime_tel')
                    .drop(telemetry.volt).drop(telemetry.rotate).drop(telemetry.pressure).drop(telemetry.vibration))
print(test_tel_comp3.count())

In [None]:
test_maint_tel_comp3 = test_tel_comp3.join(test_maint_comp3, ((test_tel_comp3['machineID']==
                                                               test_maint_comp3['machineID']) 
                                        & (test_tel_comp3['datetime_tel'] > test_maint_comp3['datetime_maint']) 
                                        & (test_maint_comp3['comp3sum'] == '1') 
                                           )).drop(test_maint_comp3.machineID)
test_maint_tel_comp3.show(5)

In [None]:
comp3 = (test_maint_tel_comp3.withColumn("sincelastcomp3", 
              datediff(test_maint_tel_comp3.datetime_tel, test_maint_tel_comp3.datetime_maint))
              .drop(test_maint_tel_comp3.datetime_maint).drop(test_maint_tel_comp3.comp3sum))
comp3.show(5)

In [None]:
comp3.describe("sincelastcomp3").show()

## Days since last replacement for component-4

In [None]:
test_maint_comp4 = (maint3.where(col("comp4sum") == '1').withColumnRenamed('datetime','datetime_maint')
                         .drop('comp1sum', 'comp2sum', 'comp3sum'))
print(test_maint_comp4.count())

In [None]:
test_tel_comp4 = (telemetry.withColumnRenamed('datetime','datetime_tel')
                  .drop(telemetry.volt).drop(telemetry.rotate).drop(telemetry.pressure).drop(telemetry.vibration))
print(test_tel_comp4.count())

In [None]:
test_maint_tel_comp4 = test_tel_comp4.join(test_maint_comp4, ((test_tel_comp4['machineID']==
                                                               test_maint_comp4['machineID']) 
                                        & (test_tel_comp4['datetime_tel'] > test_maint_comp4['datetime_maint']) 
                                        & (test_maint_comp4['comp4sum'] == '1'))).drop(test_maint_comp4.machineID)
test_maint_tel_comp4.show(5)

In [None]:
comp4 = (test_maint_tel_comp4.withColumn("sincelastcomp4", 
              datediff(test_maint_tel_comp4.datetime_tel, test_maint_tel_comp4.datetime_maint))
              .drop(test_maint_tel_comp4.datetime_maint).drop(test_maint_tel_comp4.comp4sum))
comp4.show(5)

In [None]:
comp4.describe("sincelastcomp4").show()

##  Combine comp1, comp2, comp3, comp4 to generate the maintenance feature set

In [None]:
# left join comp3, comp4 
comp3_4 = (comp3.join(comp4, ((comp3['machineID'] == comp4['machineID']) 
                                  & (comp3['datetime_tel'] == comp4['datetime_tel'])), "left")
                                  .drop(comp4.machineID).drop(comp4.datetime_tel))
comp3_4.show(5)

In [None]:
# left join comp2 with (comp3, comp4) 
comp2_3_4 = (comp2.join(comp3_4, ((comp2['machineID'] == comp3_4['machineID']) 
                                  & (comp2['datetime_tel'] == comp3_4['datetime_tel'])), "left")
                                  .drop(comp3_4.machineID).drop(comp3_4.datetime_tel))
comp2_3_4.show(5)

In [None]:
# left join comp1 with (comp2, comp3, comp4) 
comp1_2_3_4 = (comp1.join(comp2_3_4, ((comp1['machineID'] == comp2_3_4['machineID']) 
                                  & (comp1['datetime_tel'] == comp2_3_4['datetime_tel'])), "left")
                                 .drop(comp2_3_4.machineID).drop(comp2_3_4.datetime_tel))
comp1_2_3_4.show(5)

In [None]:
comp1_2_3_4_final = (comp1_2_3_4.groupBy("machineID", "datetime_tel")
                                .agg(F.max('sincelastcomp1').alias('sincelastcomp1'), 
                                     F.max('sincelastcomp2').alias('sincelastcomp2'), 
                                     F.max('sincelastcomp3').alias('sincelastcomp3'), 
                                     F.max('sincelastcomp4').alias('sincelastcomp4')))

In [None]:
comp1_2_3_4_final.show(5)
#comp1_2_3_4_final.count()

In [None]:
# fill in missing value
maint_count1 = comp1_2_3_4_final.fillna(0)

maint_count1.show(5, False)
#maint_count1.count()

# Resample maintenance time variable to every 3 hours

In [None]:
# maint_count1 maintenance 
dt_truncated = ((round(unix_timestamp(col("datetime_tel")) / time_val) * time_val)
    .cast("timestamp"))

maint_resampled = maint_count1.withColumn("dt_truncated", dt_truncated)
maint_resampled.show(5)
maint_resampled.dtypes

In [None]:
maint_resampled1 = (maint_resampled.groupBy("machineID","dt_truncated")
                                  .agg(F.mean('sincelastcomp1').alias('comp1sum'), 
                                       F.mean('sincelastcomp2').alias('comp2sum'), 
                                       F.mean('sincelastcomp3').alias('comp3sum'), 
                                       F.mean('sincelastcomp4').alias('comp4sum')))
maint_resampled1.show(5)
#maint_resampled1.count()

## Machine features - need to do one hot encoding for variable model 

In [None]:
# check sample data
machines.show(5)

In [None]:
# one hot encoding of the variable model
catVarNames = ['model']  
    
sIndexers = [StringIndexer(inputCol=x, outputCol=x + '_indexed') for x in catVarNames]

machines_cat = Pipeline(stages=sIndexers).fit(machines).transform(machines)

# one-hot encode
ohEncoders = [OneHotEncoder(inputCol=x + '_indexed', outputCol=x + '_encoded')
              for x in catVarNames]
ohPipelineModel = Pipeline(stages=ohEncoders).fit(machines_cat)
machines_cat = ohPipelineModel.transform(machines_cat)

drop_list = [col_n for col_n in machines_cat.columns if 'indexed' in col_n]

machines_edit = machines_cat.select([column for column in machines_cat.columns if column not in drop_list])

machines_edit.show(5)

# Creating final feature matrix

In [None]:
# join error with components
#err_mean_resampled1.show(3)
#maint_resampled1.show(3)

error_maint = (err_mean_resampled1.join(maint_resampled1, 
                                ((err_mean_resampled1['machineID'] == maint_resampled1['machineID']) 
                                  & (err_mean_resampled1['dt_truncated'] == maint_resampled1['dt_truncated'])), "left")
                                  .drop(maint_resampled1.machineID).drop(maint_resampled1.dt_truncated))
#error_maint.show(10, False)
#error_maint.count(), len(error_maint.columns)

# now join with machines
#machines_edit.show(1)

err_maint_mach = (error_maint.join(machines_edit, ((error_maint['machineID'] == machines_edit['machineID'])), "left")
                             .drop(machines_edit.machineID))
err_maint_mach_select = (err_maint_mach.select([c for c in err_maint_mach.columns if c not in 
                                               {'error1sum', 'error2sum', 'error3sum', 'error4sum', 'error5sum'}]))
#err_maint_mach_select.show(10, False)
#err_maint_mach_select.count(), len(err_maint_mach_select.columns)

telemetry_all = (tel_mean_resampled1.join(tel_sd_resampled1, 
                             ((tel_mean_resampled1['machineID'] == tel_sd_resampled1['machineID']) 
                              & (tel_mean_resampled1['dt_truncated'] == tel_sd_resampled1['dt_truncated'])), "left")
                              .drop(tel_sd_resampled1.machineID).drop(tel_sd_resampled1.dt_truncated))
#telemetry_all.show(10, False)
#telemetry_all.count(), len(telemetry_all.columns)

# join telemetry_all with err_maint_mach_select to create final feature matrix
final_feat = (telemetry_all.join(err_maint_mach_select, 
                                ((telemetry_all['machineID'] == err_maint_mach_select['machineID']) 
                                  & (telemetry_all['dt_truncated'] == err_maint_mach_select['dt_truncated'])), "left")
                                 .drop(err_maint_mach_select.machineID).drop(err_maint_mach_select.dt_truncated))
final_feat.show(5, False)
#final_feat.count(), len(final_feat.columns)

# Label construction

In [None]:
# check failure sample data
failures.show(5)

# check the dimensions of the data
failures.count(), len(failures.columns)

In [None]:
# check to see if there are duplicate rows based on machine, datetime
failures1 = failures.dropDuplicates(['machineID', 'datetime'])

# check the dimensions of the data
failures1.count(), len(failures1.columns)

In [None]:
# map the failure data to final feature matrix

labeled_features = (final_feat.join(failures1, ((final_feat['machineID'] == failures1['machineID']) 
                                  & (final_feat['dt_truncated'] == failures1['datetime'])), "left")
                                  .drop(failures1.machineID).drop(failures1.datetime))
labeled_features.show(5, False)
#labeled_features.count(), len(labeled_features.columns)

In [None]:
# recoding the column 'failure' to be numeric double for the pyspark classification models
labeled_features1 = (labeled_features.withColumn('failure', F.when(col('failure') == "comp1", 1.0)
                                     .otherwise(col('failure')))
                                     .withColumn('failure', F.when(col('failure') == "comp2", 2.0)
                                     .otherwise(col('failure')))
                                     .withColumn('failure', F.when(col('failure') == "comp3", 3.0)
                                     .otherwise(col('failure')))
                                     .withColumn('failure', F.when(col('failure') == "comp4", 4.0)
                                     .otherwise(col('failure'))))

labeled_features2 = labeled_features1.withColumn("failure1", labeled_features1["failure"].cast(DoubleType()))

#labeled_features2.groupBy('failure').count().show()
#labeled_features2.groupBy('failure1').count().show()

In [None]:
# check data schema
labeled_features2.dtypes

In [None]:
labeled_features3 = labeled_features2.drop('failure').fillna(0)
labeled_features3.dtypes
#labeled_features3.groupBy('failure1').count().show()

In [None]:
# build the code for backfill with all machine data
label_bfill1 = labeled_features3
label_bfill1.show(1)

In [None]:
# lag values to manually backfill label (bfill =7)
my_window = Window.partitionBy('machineID').orderBy(label_bfill1.dt_truncated.desc())

label_bfill1 = label_bfill1.withColumn("prev_value1", F.lag(label_bfill1.failure1).over(my_window)).fillna(0)
label_bfill1 = label_bfill1.withColumn("prev_value2", F.lag(label_bfill1.prev_value1).over(my_window)).fillna(0) 
label_bfill1 = label_bfill1.withColumn("prev_value3", F.lag(label_bfill1.prev_value2).over(my_window)).fillna(0) 
label_bfill1 = label_bfill1.withColumn("prev_value4", F.lag(label_bfill1.prev_value3).over(my_window)).fillna(0) 
label_bfill1 = label_bfill1.withColumn("prev_value5", F.lag(label_bfill1.prev_value4).over(my_window)).fillna(0) 
label_bfill1 = label_bfill1.withColumn("prev_value6", F.lag(label_bfill1.prev_value5).over(my_window)).fillna(0) 
label_bfill1 = label_bfill1.withColumn("prev_value7", F.lag(label_bfill1.prev_value6).over(my_window)).fillna(0) 

In [None]:
# create the label column 
label_bfill2 = (label_bfill1.withColumn('label', label_bfill1.failure1 + label_bfill1.prev_value1 
                         + label_bfill1.prev_value2 + label_bfill1.prev_value3 + label_bfill1.prev_value4 
                         + label_bfill1.prev_value5 + label_bfill1.prev_value6 + label_bfill1.prev_value7))
label_bfill2 = label_bfill2.withColumn('label_e', F.when(col('label') > 4, 4.0).otherwise(col('label')))

In [None]:
label_bfill3 = (label_bfill2.drop(label_bfill2.prev_value1).drop(label_bfill2.prev_value2)
              .drop(label_bfill2.prev_value3).drop(label_bfill2.prev_value4)
              .drop(label_bfill2.prev_value5).drop(label_bfill2.prev_value6)
              .drop(label_bfill2.prev_value7).drop(label_bfill2.label))

In [None]:
label_bfill3.show(1)

In [None]:

# you decide to partition the dataframe into three files and save them in the current folder.
# if you wish to visualize them in the run history Output Files, specify the path 
# as './outputs/multiple_files.parquet'.
#label_bfill3.coalesce(3).write.mode('overwrite').parquet('multiple_files.parquet')
label_bfill3.write.mode('overwrite').parquet('featureengineering_files.parquet')

# unlike the single file case, for multiple files we need to first delete results from the 
# previous run before uploading.
for blob in my_service.list_blobs(CONTAINER_NAME):
    if 'featureengineering_files.parquet' in blob.name:
        my_service.delete_blob(CONTAINER_NAME, blob.name)

# upload the entire folder into blob storage
for name in glob.iglob('featureengineering_files.parquet/*'):
    print(os.path.abspath(name))
    my_service.create_blob_from_path(CONTAINER_NAME, name, name)

print("Feature engineering final dataset files saved!")