In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [2]:
dataset = pd.read_csv("GMM_values.csv", header=0)

In [3]:
#get failure times 
failurePoints = []
for index, row in dataset.iterrows():
    current_state = row['State']
    if current_state > 3:
        failurePoints.append(row['Timestep'])

In [4]:
def getRUL(current, failurelst):
    nextFailure = None
    for f in failurelst:
        if f - current >= 0: 
            nextFailure = f
            break 
    if nextFailure != None:
        diff = nextFailure - current 
        return diff
    else: 
        return None

In [5]:
#investigate general trend of RUL values 
total_rul = []
timeX = dataset['Timestep'].tolist()
for instance in timeX:
    current_rul = getRUL(instance, failurePoints)
    total_rul.append(current_rul)

In [6]:
#create a new dataframe with RUL 
all_df = dataset.copy(deep=True)
all_df['RUL'] = total_rul
all_df

Unnamed: 0,Timestep,Volt,Rotate,Pressure,Vibration,State,RUL
0,0,0.379151,0.556385,0.243080,0.229504,0,9.0
1,1,0.415507,0.698561,0.242410,0.273825,1,8.0
2,2,0.339342,0.637645,0.230266,0.191588,2,7.0
3,3,0.343817,0.627732,0.233905,0.264566,2,6.0
4,4,0.310362,0.607368,0.194296,0.240200,2,5.0
...,...,...,...,...,...,...,...
7995,7995,0.333491,0.682035,0.220176,0.249060,0,
7996,7996,0.344994,0.689298,0.225088,0.218040,1,
7997,7997,0.390017,0.641338,0.211275,0.239294,1,
7998,7998,0.324577,0.656867,0.205467,0.294698,1,


In [7]:
#drop na columns 
all_df = all_df.dropna()
all_df

Unnamed: 0,Timestep,Volt,Rotate,Pressure,Vibration,State,RUL
0,0,0.379151,0.556385,0.243080,0.229504,0,9.0
1,1,0.415507,0.698561,0.242410,0.273825,1,8.0
2,2,0.339342,0.637645,0.230266,0.191588,2,7.0
3,3,0.343817,0.627732,0.233905,0.264566,2,6.0
4,4,0.310362,0.607368,0.194296,0.240200,2,5.0
...,...,...,...,...,...,...,...
7987,7987,0.325059,0.616790,0.235959,0.192175,2,4.0
7988,7988,0.388779,0.670864,0.243099,0.248306,2,3.0
7989,7989,0.339885,0.607819,0.237883,0.232814,3,2.0
7990,7990,0.525533,0.611479,0.414174,0.214225,3,1.0


In [8]:
#split data into episodes
data_split = []
chunkBySize = []
for index, row in all_df.iterrows():
    currentRUL = row['RUL']
    arr = [row['Timestep'], row['Volt'], row['Rotate'], row['Pressure'], row['Vibration'], row['RUL'], row['State']]
    if currentRUL < 1:  
        chunkBySize.append(arr)
        if (chunkBySize):
            data_split.append(chunkBySize)
        chunkBySize = []
    else:
        chunkBySize.append(arr)

In [9]:
#split in training, validation and test sets

num_episodes = len(data_split)
num_train = int(0.75*num_episodes)
num_val = int(0.125*num_episodes)

train_data = data_split[:num_train]
val_data = data_split[num_train:num_train+num_val]
test_data = data_split[num_train+num_val:]

In [10]:
#set random seed 
import tensorflow as tf
tf.random.set_seed(42)

In [11]:
train_data[0]

[[0.0, 0.379151092, 0.556385187, 0.243079506, 0.229504293, 9.0, 0.0],
 [1.0, 0.415506993, 0.698561103, 0.242410288, 0.273824561, 8.0, 1.0],
 [2.0, 0.33934205, 0.637644818, 0.230266358, 0.191588098, 7.0, 2.0],
 [3.0, 0.343817356, 0.627731997, 0.233904513, 0.264566349, 6.0, 2.0],
 [4.0, 0.310361557, 0.607368113, 0.194295643, 0.240200195, 5.0, 2.0],
 [5.0, 0.336327636, 0.531479142, 0.215693485, 0.215780325, 4.0, 2.0],
 [6.0, 0.397831149, 0.57521239, 0.190585546, 0.230599092, 3.0, 2.0],
 [7.0, 0.385974096, 0.630933953, 0.208516333, 0.247693397, 2.0, 2.0],
 [8.0, 0.376943194, 0.396047283, 0.215659658, 0.256992779, 1.0, 3.0],
 [9.0, 0.56498231, 0.648197926, 0.516587151, 0.256162471, 0.0, 4.0]]

### Using PyCaret for Module Selection

In [12]:
#format data into 2D array 
def to2DSensorReadingArray(data):
    sensorReadingData = []
    for episode in data:
        for instance in episode:
            arr = [instance[1], instance[2], instance[3], instance[4], instance[6]]
            sensorReadingData.append(arr)
    sensorReadingData_arr = np.array(sensorReadingData)
    return sensorReadingData_arr

In [13]:
#format data to numpy array
train_data_formatted = to2DSensorReadingArray(train_data)
val_data_formatted = to2DSensorReadingArray(val_data)
test_data_formatted = to2DSensorReadingArray(test_data)

In [14]:
#convert numpy array to pandas dataframe
columnNames = ['Volt', 'Rotate', 'Pressure', 'Vibration', 'State']
train_dataset = pd.DataFrame(data=train_data_formatted, columns=columnNames)
val_dataset = pd.DataFrame(data=val_data_formatted, columns=columnNames)
test_dataset = pd.DataFrame(data=test_data_formatted, columns=columnNames)

In [15]:
import pycaret
from pycaret.classification import *
from pycaret.classification import setup
from pycaret.classification import compare_models

#setup the dataset
grid = setup(data=train_dataset, target='State', train_size=0.8, preprocess=False, html=False, fold_shuffle=True, session_id=42, imputation_type='iterative')

# evaluate models and compare models
best = compare_models()
# report the best model
print(best)

Text(value="Following data types have been inferred automatically, if they are correct press enter to continue…

Unnamed: 0,Data Type
Volt,Numeric
Rotate,Numeric
Pressure,Numeric
Vibration,Numeric
State,Label



Setup Succesfully Completed!
              Description             Value
0              session_id                42
1                  Target             State
2             Target Type        Multiclass
3           Label Encoded              None
4           Original Data         (6010, 5)
5          Missing Values             False
6        Numeric Features                 4
7    Categorical Features                 0
8   Transformed Train Set         (4808, 4)
9    Transformed Test Set         (1202, 4)
10     Shuffle Train-Test              True
11    Stratify Train-Test             False
12         Fold Generator   StratifiedKFold
13            Fold Number                10
14               CPU Jobs                -1
15                Use GPU             False
16         Log Experiment             False
17        Experiment Name  clf-default-name
18                    USI              de4a
19          Fix Imbalance             False
20   Fix Imbalance Method             SMOTE
  

In [17]:
#tune model
from sklearn.ensemble import GradientBoostingClassifier

# setup the dataset
grid = setup(data=train_dataset, target='State', train_size=0.8, preprocess=False, html=False, fold_shuffle=True, session_id=42, imputation_type='iterative')

# tune model hyperparameters
best_tune = tune_model(GradientBoostingClassifier(), n_iter=30)

Text(value="Following data types have been inferred automatically, if they are correct press enter to continue…

Unnamed: 0,Data Type
Volt,Numeric
Rotate,Numeric
Pressure,Numeric
Vibration,Numeric
State,Label



Setup Succesfully Completed!
              Description             Value
0              session_id                42
1                  Target             State
2             Target Type        Multiclass
3           Label Encoded              None
4           Original Data         (6010, 5)
5          Missing Values             False
6        Numeric Features                 4
7    Categorical Features                 0
8   Transformed Train Set         (4808, 4)
9    Transformed Test Set         (1202, 4)
10     Shuffle Train-Test              True
11    Stratify Train-Test             False
12         Fold Generator   StratifiedKFold
13            Fold Number                10
14               CPU Jobs                -1
15                Use GPU             False
16         Log Experiment             False
17        Experiment Name  clf-default-name
18                    USI              6a2a
19          Fix Imbalance             False
20   Fix Imbalance Method             SMOTE
Fi

In [18]:
print(best_tune)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.082, loss='deviance', max_depth=1,
                           max_features=1.0, max_leaf_nodes=None,
                           min_impurity_decrease=0.1, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=7,
                           min_weight_fraction_leaf=0.0, n_estimators=160,
                           n_iter_no_change=None, random_state=None,
                           subsample=0.7, tol=0.0001, validation_fraction=0.1,
                           verbose=0, warm_start=False)


In [19]:
train_dataset

Unnamed: 0,Volt,Rotate,Pressure,Vibration,State
0,0.379151,0.556385,0.243080,0.229504,0.0
1,0.415507,0.698561,0.242410,0.273825,1.0
2,0.339342,0.637645,0.230266,0.191588,2.0
3,0.343817,0.627732,0.233905,0.264566,2.0
4,0.310362,0.607368,0.194296,0.240200,2.0
...,...,...,...,...,...
6005,0.385721,0.667068,0.217464,0.264458,2.0
6006,0.352979,0.621213,0.183361,0.212823,3.0
6007,0.615318,0.661905,0.234438,0.220858,3.0
6008,0.371735,0.425609,0.223836,0.251973,3.0


In [15]:
#prepare dataset
#prepare training set 
train_y = np.array(train_dataset['State'].tolist())
featuresDf = train_dataset.drop(columns=['State'])
train_x = featuresDf.to_numpy()

#prepare validation set
val_y = np.array(val_dataset['State'].tolist())
featuresDf = val_dataset.drop(columns=['State'])
val_x = featuresDf.to_numpy()

#prepare test set
test_y = np.array(test_dataset['State'].tolist())
featuresDf = test_dataset.drop(columns=['State'])
test_x = featuresDf.to_numpy()

In [21]:
from sklearn.ensemble import GradientBoostingClassifier

clf1 = GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.05, loss='deviance', max_depth=1,
                           max_features='sqrt', max_leaf_nodes=None,
                           min_impurity_decrease=0.01, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=5,
                           min_weight_fraction_leaf=0.0, n_estimators=130,
                           n_iter_no_change=None,
                           random_state=None, subsample=0.5, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
clf1.fit(train_x, train_y)

#evaluate performance
accuracytrain = clf1.score(train_x, train_y)
print("accuracy on training set is %f" % accuracytrain)
accuracyval = clf1.score(val_x, val_y)
print("accuracy on validation set is %f" % accuracyval)

accuracy on training set is 0.389850
accuracy on validation set is 0.339193


In [22]:
valPredict1 = clf1.predict(val_x)

Analyse accuracy of prediction for each machine state in the validation set

In [19]:
def findIndexesPerLabel(ylist, label):
    output = []
    for order in range(0, len(ylist)):
        if ylist[order] == label:
            output.append(order)
    return output

In [20]:
def getPredictionsForLabel(indexList, allPredictions):
    predict = []
    for order in indexList:
        predict.append(allPredictions[order])
    return predict

In [27]:
#State 0 
indexes0 = findIndexesPerLabel(val_y, 0)
actual0 = [0 for i in range(0, len(indexes0))]
predicted0 = getPredictionsForLabel(indexes0, valPredict1)
accuracy0 = accuracy_score(actual0, predicted0)
print("Validation accuracy of state 0 is %f" % accuracy0)

Validation accuracy of state 0 is 0.470297


In [29]:
from collections import Counter
Counter(predicted0)

Counter({1.0: 21, 0.0: 95, 3.0: 17, 2.0: 69})

In [28]:
#State 1 
indexes1 = findIndexesPerLabel(val_y, 1)
actual1 = [1 for i in range(0, len(indexes1))]
predicted1 = getPredictionsForLabel(indexes1, valPredict1)
accuracy1 = accuracy_score(actual1, predicted1)
print("Validation accuracy of state 1 is %f" % accuracy1)

Validation accuracy of state 1 is 0.087649


In [30]:
Counter(predicted1)

Counter({0.0: 98, 2.0: 104, 1.0: 22, 3.0: 27})

In [32]:
#State 2 
indexes2 = findIndexesPerLabel(val_y, 2)
actual2 = [2 for i in range(0, len(indexes2))]
predicted2 = getPredictionsForLabel(indexes2, valPredict1)
accuracy2 = accuracy_score(actual2, predicted2)
print("Validation accuracy of state 2 is %f" % accuracy2)

Validation accuracy of state 2 is 0.407563


In [33]:
Counter(predicted2)

Counter({0.0: 68, 3.0: 47, 2.0: 97, 1.0: 25, 4.0: 1})

In [34]:
#State 3 
indexes3 = findIndexesPerLabel(val_y, 3)
actual3 = [3 for i in range(0, len(indexes3))]
predicted3 = getPredictionsForLabel(indexes3, valPredict1)
accuracy3 = accuracy_score(actual3, predicted3)
print("Validation accuracy of state 3 is %f" % accuracy3)

Validation accuracy of state 3 is 0.404145


In [35]:
Counter(predicted3)

Counter({3.0: 78, 4.0: 20, 2.0: 56, 0.0: 30, 1.0: 9})

In [36]:
#State 4 
indexes4 = findIndexesPerLabel(val_y, 4)
actual4 = [4 for i in range(0, len(indexes4))]
predicted4 = getPredictionsForLabel(indexes4, valPredict1)
accuracy4 = accuracy_score(actual4, predicted4)
print("Validation accuracy of state 4 is %f" % accuracy4)

Validation accuracy of state 4 is 0.467532


In [37]:
Counter(predicted4)

Counter({3.0: 27, 4.0: 36, 0.0: 13, 5.0: 1})

In [38]:
#State 5 
indexes5 = findIndexesPerLabel(val_y, 5)
actual5 = [5 for i in range(0, len(indexes5))]
predicted5 = getPredictionsForLabel(indexes5, valPredict1)
accuracy5 = accuracy_score(actual5, predicted5)
print("Validation accuracy of state 5 is %f" % accuracy5)

Validation accuracy of state 5 is 0.000000


In [39]:
Counter(predicted5)

Counter({4.0: 4, 3.0: 2})

### Model 2: Artificial Neural Networks

In [16]:
from sklearn.neural_network import MLPClassifier

#train multilayer perceptron 
clf2 = MLPClassifier(hidden_layer_sizes=(256, 128, 64, 32), 
                      random_state=42, max_iter=300, batch_size=32, 
                      shuffle=False, early_stopping=True)
clf2.fit(train_x, train_y)

#evaluate performance
accuracytrain = clf2.score(train_x, train_y)
print("accuracy on training set is %f" % accuracytrain)
accuracyval = clf2.score(val_x, val_y)
print("accuracy on validation set is %f" % accuracyval)

accuracy on training set is 0.361730
accuracy on validation set is 0.350569


In [17]:
valPredict2 = clf2.predict(val_x)

In [21]:
#State 0 
indexes0 = findIndexesPerLabel(val_y, 0)
actual0 = [0 for i in range(0, len(indexes0))]
predicted0 = getPredictionsForLabel(indexes0, valPredict2)
accuracy0 = accuracy_score(actual0, predicted0)
print("Validation accuracy of state 0 is %f" % accuracy0)

Validation accuracy of state 0 is 0.287129


In [23]:
from collections import Counter
Counter(predicted0)

Counter({1.0: 77, 0.0: 58, 2.0: 51, 4.0: 11, 3.0: 5})

In [24]:
#State 1 
indexes1 = findIndexesPerLabel(val_y, 1)
actual1 = [1 for i in range(0, len(indexes1))]
predicted1 = getPredictionsForLabel(indexes1, valPredict2)
accuracy1 = accuracy_score(actual1, predicted1)
print("Validation accuracy of state 1 is %f" % accuracy1)

Validation accuracy of state 1 is 0.418327


In [25]:
Counter(predicted1)

Counter({1.0: 105, 2.0: 78, 0.0: 58, 3.0: 9, 4.0: 1})

In [26]:
#State 2 
indexes2 = findIndexesPerLabel(val_y, 2)
actual2 = [2 for i in range(0, len(indexes2))]
predicted2 = getPredictionsForLabel(indexes2, valPredict2)
accuracy2 = accuracy_score(actual2, predicted2)
print("Validation accuracy of state 2 is %f" % accuracy2)

Validation accuracy of state 2 is 0.336134


In [27]:
Counter(predicted2)

Counter({0.0: 39, 3.0: 23, 2.0: 80, 1.0: 92, 4.0: 4})

In [28]:
#State 3 
indexes3 = findIndexesPerLabel(val_y, 3)
actual3 = [3 for i in range(0, len(indexes3))]
predicted3 = getPredictionsForLabel(indexes3, valPredict2)
accuracy3 = accuracy_score(actual3, predicted3)
print("Validation accuracy of state 3 is %f" % accuracy3)

Validation accuracy of state 3 is 0.202073


In [29]:
Counter(predicted3)

Counter({2.0: 73, 4.0: 39, 3.0: 39, 1.0: 35, 0.0: 7})

In [30]:
#State 4 
indexes4 = findIndexesPerLabel(val_y, 4)
actual4 = [4 for i in range(0, len(indexes4))]
predicted4 = getPredictionsForLabel(indexes4, valPredict2)
accuracy4 = accuracy_score(actual4, predicted4)
print("Validation accuracy of state 4 is %f" % accuracy4)

Validation accuracy of state 4 is 0.740260


In [31]:
Counter(predicted4)

Counter({3.0: 16, 4.0: 57, 0.0: 4})

In [32]:
#State 5 
indexes5 = findIndexesPerLabel(val_y, 5)
actual5 = [5 for i in range(0, len(indexes5))]
predicted5 = getPredictionsForLabel(indexes5, valPredict2)
accuracy5 = accuracy_score(actual5, predicted5)
print("Validation accuracy of state 5 is %f" % accuracy5)

Validation accuracy of state 5 is 0.000000


In [33]:
Counter(predicted5)

Counter({4.0: 5, 3.0: 1})

In [34]:
#save model
from joblib import dump
fileName1 = 'machineStateClassifier_Original.joblib'
dump(clf2, fileName1)

['machineStateClassifier_Original.joblib']

### Model 3: ANN but with target classes reformatted 

Classes 0 and 1 are combined. The target classes are now 0(0 and 1), 1(2), 2(3), 3(4), 4(5)

In [42]:
def formatTargetClasses(originalTargetLst):
    output = []
    for target in originalTargetLst:
        if target < 2:
            output.append(0)
        else:
            output.append(int(target-1))
    return output

In [43]:
train_y_formatted = formatTargetClasses(train_y)
val_y_formatted = formatTargetClasses(val_y)
test_y_formatted = formatTargetClasses(test_y)

In [44]:
train_y_formatted

[0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 3,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 4,
 0,
 0,
 0,
 1,
 1,
 2,
 2,
 2,
 2,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 2,
 3,
 0,
 0,
 1,
 1,
 2,
 3,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 0,
 0,
 0,
 0,
 4,
 0,
 0,
 1,
 1,
 1,
 2,
 2,
 2,
 3,
 0,
 0,
 0,
 4,
 0,
 0,
 1,
 2,
 2,
 2,
 2,
 3,
 0,
 0,
 0,
 1,
 2,
 3,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 3,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 2,
 3,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 2,
 4,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 3,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 2,
 2,
 2,
 3,
 0,
 0,
 0,
 0,
 1,
 2,
 2,
 3,
 0,
 0,
 0,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 3,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 2,
 2,
 3,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 2,
 3,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 3,
 0,
 0,
 0,
 1,
 1,
 1,
 2,
 3,
 0,
 0,
 0,
 0,
 0,
 0,


In [45]:
#train multilayer perceptron 
clf3 = MLPClassifier(hidden_layer_sizes=(256, 128, 64, 32), 
                      random_state=42, max_iter=300, batch_size=32, 
                      shuffle=False, early_stopping=True)
clf3.fit(train_x, train_y_formatted)

#evaluate performance
accuracytrain = clf3.score(train_x, train_y_formatted)
print("accuracy on training set is %f" % accuracytrain)
accuracyval = clf3.score(val_x, val_y_formatted)
print("accuracy on validation set is %f" % accuracyval)

accuracy on training set is 0.521797
accuracy on validation set is 0.525336


In [46]:
valPredict3 = clf3.predict(val_x)

In [47]:
#State 0 (0,1)
indexes0 = findIndexesPerLabel(val_y_formatted, 0)
actual0 = [0 for i in range(0, len(indexes0))]
predicted0 = getPredictionsForLabel(indexes0, valPredict3)
accuracy0 = accuracy_score(actual0, predicted0)
print("Validation accuracy of state 0 is %f" % accuracy0)

Validation accuracy of state 0 is 0.938190


In [48]:
Counter(predicted0)

Counter({0: 425, 2: 12, 3: 16})

In [49]:
#State 1 (2)
indexes1 = findIndexesPerLabel(val_y_formatted, 1)
actual1 = [1 for i in range(0, len(indexes1))]
predicted1 = getPredictionsForLabel(indexes1, valPredict3)
accuracy1 = accuracy_score(actual1, predicted1)
print("Validation accuracy of state 1 is %f" % accuracy1)

Validation accuracy of state 1 is 0.000000


In [50]:
Counter(predicted1)

Counter({0: 215, 2: 16, 3: 7})

In [51]:
#State 2 (3)
indexes2 = findIndexesPerLabel(val_y_formatted, 2)
actual2 = [2 for i in range(0, len(indexes2))]
predicted2 = getPredictionsForLabel(indexes2, valPredict3)
accuracy2 = accuracy_score(actual2, predicted2)
print("Validation accuracy of state 2 is %f" % accuracy2)

Validation accuracy of state 2 is 0.150259


In [52]:
Counter(predicted2)

Counter({0: 127, 3: 37, 2: 29})

In [53]:
#State 3 
indexes3 = findIndexesPerLabel(val_y_formatted, 3)
actual3 = [3 for i in range(0, len(indexes3))]
predicted3 = getPredictionsForLabel(indexes3, valPredict3)
accuracy3 = accuracy_score(actual3, predicted3)
print("Validation accuracy of state 3 is %f" % accuracy3)

Validation accuracy of state 3 is 0.701299


In [54]:
Counter(predicted3)

Counter({2: 16, 3: 54, 0: 7})

In [55]:
#State 4 
indexes4 = findIndexesPerLabel(val_y_formatted, 4)
actual4 = [4 for i in range(0, len(indexes4))]
predicted4 = getPredictionsForLabel(indexes4, valPredict3)
accuracy4 = accuracy_score(actual4, predicted4)
print("Validation accuracy of state 4 is %f" % accuracy4)

Validation accuracy of state 4 is 0.000000


In [56]:
Counter(predicted4)

Counter({3: 5, 2: 1})

### Attempt 4: Further compress states

Combine states 0,1,2 to state 0. Map state 3 to state 1, state 4 to state 2 and state 5 to state 3. 

In [57]:
def formatTargetClassesCompress(originalTargetLst):
    output = []
    for target in originalTargetLst:
        if target < 3:
            output.append(0)
        else:
            output.append(int(target-2))
    return output

In [58]:
train_y_formatted_compress = formatTargetClassesCompress(train_y)
val_y_formatted_compress = formatTargetClassesCompress(val_y)
test_y_formatted_compress = formatTargetClassesCompress(test_y)

In [59]:
#train multilayer perceptron 
clf4 = MLPClassifier(hidden_layer_sizes=(256, 128, 64, 32), 
                      random_state=42, max_iter=300, batch_size=32, 
                      shuffle=False, early_stopping=True)
clf4.fit(train_x, train_y_formatted_compress)

#evaluate performance
accuracytrain = clf4.score(train_x, train_y_formatted_compress)
print("accuracy on training set is %f" % accuracytrain)
accuracyval = clf4.score(val_x, val_y_formatted_compress)
print("accuracy on validation set is %f" % accuracyval)

accuracy on training set is 0.746589
accuracy on validation set is 0.764219


In [60]:
valPredict4 = clf4.predict(val_x)

In [61]:
#State 0 (0,1,2)
indexes0 = findIndexesPerLabel(val_y_formatted_compress, 0)
actual0 = [0 for i in range(0, len(indexes0))]
predicted0 = getPredictionsForLabel(indexes0, valPredict4)
accuracy0 = accuracy_score(actual0, predicted0)
print("Validation accuracy of state 0 is %f" % accuracy0)

Validation accuracy of state 0 is 0.956585


In [62]:
Counter(predicted0)

Counter({0: 661, 1: 14, 2: 16})

In [63]:
#State 1 (3)
indexes1 = findIndexesPerLabel(val_y_formatted_compress, 1)
actual1 = [1 for i in range(0, len(indexes1))]
predicted1 = getPredictionsForLabel(indexes1, valPredict4)
accuracy1 = accuracy_score(actual1, predicted1)
print("Validation accuracy of state 1 is %f" % accuracy1)

Validation accuracy of state 1 is 0.134715


In [64]:
Counter(predicted1)

Counter({0: 132, 2: 35, 1: 26})

In [65]:
#State 2 (4)
indexes2 = findIndexesPerLabel(val_y_formatted_compress, 2)
actual2 = [2 for i in range(0, len(indexes2))]
predicted2 = getPredictionsForLabel(indexes2, valPredict4)
accuracy2 = accuracy_score(actual2, predicted2)
print("Validation accuracy of state 1 is %f" % accuracy2)

Validation accuracy of state 1 is 0.675325


In [66]:
Counter(predicted2)

Counter({1: 15, 2: 52, 0: 10})

In [67]:
#State 3 (5)
indexes3 = findIndexesPerLabel(val_y_formatted_compress, 3)
actual3 = [3 for i in range(0, len(indexes3))]
predicted3 = getPredictionsForLabel(indexes3, valPredict4)
accuracy3 = accuracy_score(actual3, predicted3)
print("Validation accuracy of state 1 is %f" % accuracy3)

Validation accuracy of state 1 is 0.000000


In [68]:
Counter(predicted3)

Counter({2: 5, 1: 1})

### Attempt 5: Group machine states

Group states 0, 1 into state 0 and states 2, 3 into state 1, state 4 to state 2, state 5 to state 3. 

In [70]:
def formatTargetClassesGroup(originalTargetLst):
    output = []
    for target in originalTargetLst:
        if target < 2:
            output.append(0)
        elif (target > 1) and (target < 4):
            output.append(1)
        else:
            output.append(int(target-2))
    return output

In [71]:
train_y_formatted_group = formatTargetClassesGroup(train_y)
val_y_formatted_group = formatTargetClassesGroup(val_y)
test_y_formatted_group = formatTargetClassesGroup(test_y)

In [72]:
#train multilayer perceptron 
clf5 = MLPClassifier(hidden_layer_sizes=(256, 128, 64, 32), 
                      random_state=42, max_iter=300, batch_size=32, 
                      shuffle=False, early_stopping=True)
clf5.fit(train_x, train_y_formatted_group)

#evaluate performance
accuracytrain = clf5.score(train_x, train_y_formatted_group)
print("accuracy on training set is %f" % accuracytrain)
accuracyval = clf5.score(val_x, val_y_formatted_group)
print("accuracy on validation set is %f" % accuracyval)

accuracy on training set is 0.603494
accuracy on validation set is 0.587384


In [73]:
valPredict5 = clf5.predict(val_x)

In [74]:
#State 0 (0,1)
indexes0 = findIndexesPerLabel(val_y_formatted_group, 0)
actual0 = [0 for i in range(0, len(indexes0))]
predicted0 = getPredictionsForLabel(indexes0, valPredict5)
accuracy0 = accuracy_score(actual0, predicted0)
print("Validation accuracy of state 0 is %f" % accuracy0)

Validation accuracy of state 0 is 0.653422


In [75]:
Counter(predicted0)

Counter({0: 296, 1: 141, 2: 16})

In [76]:
#State 1 (2, 3)
indexes1 = findIndexesPerLabel(val_y_formatted_group, 1)
actual1 = [1 for i in range(0, len(indexes1))]
predicted1 = getPredictionsForLabel(indexes1, valPredict5)
accuracy1 = accuracy_score(actual1, predicted1)
print("Validation accuracy of state 1 is %f" % accuracy1)

Validation accuracy of state 1 is 0.505800


In [77]:
Counter(predicted1)

Counter({0: 171, 1: 218, 2: 42})

In [78]:
#State 2 (4)
indexes2 = findIndexesPerLabel(val_y_formatted_group, 2)
actual2 = [2 for i in range(0, len(indexes2))]
predicted2 = getPredictionsForLabel(indexes2, valPredict5)
accuracy2 = accuracy_score(actual2, predicted2)
print("Validation accuracy of state 1 is %f" % accuracy2)

Validation accuracy of state 1 is 0.701299


In [79]:
Counter(predicted2)

Counter({1: 21, 2: 54, 0: 2})

In [80]:
#State 3 (5)
indexes3 = findIndexesPerLabel(val_y_formatted_group, 3)
actual3 = [3 for i in range(0, len(indexes3))]
predicted3 = getPredictionsForLabel(indexes3, valPredict5)
accuracy3 = accuracy_score(actual3, predicted3)
print("Validation accuracy of state 1 is %f" % accuracy3)

Validation accuracy of state 1 is 0.000000


In [81]:
Counter(predicted3)

Counter({1: 2, 2: 4})

In [82]:
fileName2 = 'machineStateClassifier_formatted.joblib'
dump(clf5, fileName2)

['machineStateClassifier_formatted.joblib']