In [None]:
# Get open ML
!pip install openml

In [None]:
import openml
import pandas as pd
import seaborn as sns
import numpy as np
import sklearn.tree as tree
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, accuracy_score, recall_score
from openml.datasets import edit_dataset, fork_dataset, get_dataset

In [None]:
# Get data
# Fetch OpenML dataset 42890 through the openml client.
dataset = openml.datasets.get_dataset(42890)

# Storing the data
# get_data() yields several values; only the first (the DataFrame) is kept and
# the rest are swallowed into `_`.
# NOTE(review): the preview below shows machine-sensor readings and failure
# flags, so the name `eeg` looks like a leftover; it is kept because every
# later cell refers to it.
eeg, *_ = dataset.get_data()



In [None]:
# Preview the first 100 rows (the rendered table elides the middle rows).
eeg.head(100)

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,M14955,M,299.0,309.0,1351,52.2,44,0,0,0,0,0,0
96,97,M14956,M,299.0,309.0,1575,35.3,47,0,0,0,0,0,0
97,98,M14957,M,298.9,308.9,1750,29.9,50,0,0,0,0,0,0
98,99,L47278,L,298.9,308.8,1529,32.7,53,0,0,0,0,0,0


In [None]:
# Cast the tool-wear column and the six failure flags to int64 in one call
# instead of seven copy-pasted assignments. Column order and values are kept;
# `eeg` is rebound to the converted frame.
eeg = eeg.astype({
    'Tool wear [min]': 'int64',
    'Machine failure': 'int64',
    'TWF': 'int64',
    'HDF': 'int64',
    'PWF': 'int64',
    'OSF': 'int64',
    'RNF': 'int64',
})

In [None]:
# Capture the per-column dtypes and print them to verify the int64 casts.
col_types = eeg.dtypes
print(col_types)

In [None]:
# For Machine failure
# Features are the sensor readings only. 'TWF' is now dropped together with the
# other failure-mode flags (it was the only flag left in X, and presumably
# feeds the overall failure label). 'UDI' is dropped too: in the preview it is
# just a running row number, and the multi-label cell further down excludes it.
df_filtered = eeg.drop(columns=['UDI', 'Product ID', 'Type', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF'])
target_cols = ['Machine failure']

X = df_filtered.drop(columns=target_cols)
y = eeg[target_cols]

In [None]:
# For TWF
# Features are the sensor readings only: every other failure flag is dropped,
# and so is 'UDI' (a running row number in the preview), matching the feature
# set of the multi-label cell further down.
df_filtered = eeg.drop(columns=['UDI', 'Product ID', 'Type', 'Machine failure', 'HDF', 'PWF', 'OSF', 'RNF'])
target_cols = ['TWF']

X = df_filtered.drop(columns=target_cols)
y = eeg[target_cols]

In [None]:
# For HDF
# Features are the sensor readings only: every other failure flag is dropped,
# and so is 'UDI' (a running row number in the preview), matching the feature
# set of the multi-label cell further down.
df_filtered = eeg.drop(columns=['UDI', 'Product ID', 'Type', 'Machine failure', 'TWF', 'PWF', 'OSF', 'RNF'])
target_cols = ['HDF']

X = df_filtered.drop(columns=target_cols)
y = eeg[target_cols]

In [None]:
# For PWF
# Features are the sensor readings only: every other failure flag is dropped,
# and so is 'UDI' (a running row number in the preview), matching the feature
# set of the multi-label cell further down.
df_filtered = eeg.drop(columns=['UDI', 'Product ID', 'Type', 'Machine failure', 'TWF', 'HDF', 'OSF', 'RNF'])
target_cols = ['PWF']

X = df_filtered.drop(columns=target_cols)
y = eeg[target_cols]

In [None]:
# For OSF
# Features are the sensor readings only: every other failure flag is dropped,
# and so is 'UDI' (a running row number in the preview), matching the feature
# set of the multi-label cell further down.
df_filtered = eeg.drop(columns=['UDI', 'Product ID', 'Type', 'Machine failure', 'TWF', 'HDF', 'PWF', 'RNF'])
target_cols = ['OSF']

X = df_filtered.drop(columns=target_cols)
y = eeg[target_cols]

In [None]:
# For RNF
# Features are the sensor readings only: every other failure flag is dropped,
# and so is 'UDI' (a running row number in the preview), matching the feature
# set of the multi-label cell further down.
df_filtered = eeg.drop(columns=['UDI', 'Product ID', 'Type', 'Machine failure', 'TWF', 'HDF', 'OSF', 'PWF'])
target_cols = ['RNF']

X = df_filtered.drop(columns=target_cols)
y = eeg[target_cols]

In [None]:
# Multi-label setup: all failure-mode flags are predicted at once.
# Features: drop the identifiers, the product type and every failure column.
df_filtered = eeg.drop(
    columns=[
        'UDI', 'Product ID', 'Type',
        'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF',
    ]
)
# Targets: drop the identifiers, the sensor readings and the overall failure
# flag; whatever columns remain in the frame become the labels.
target_cols = eeg.drop(
    columns=[
        'UDI', 'Product ID', 'Type',
        'Air temperature [K]', 'Process temperature [K]',
        'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]',
        'Machine failure',
    ]
)

X, y = df_filtered, target_cols

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows. A fixed random_state makes the split, and every
# score computed from it, reproducible across kernel restarts.
# NOTE(review): the failure flags look heavily imbalanced (all zeros in the
# preview); consider a stratified split when y is a single target column.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Testing methods

To make sure I pick a suitable classifier, I will use LazyPredict, which fits many classifiers on my dataset and reports how each one scores.

In [None]:
!pip install lazypredict

In [None]:
# Benchmark a broad set of off-the-shelf classifiers in one go with LazyPredict.
import lazypredict
from lazypredict.Supervised import LazyClassifier

# Quiet run: no per-model logging, warnings suppressed, no extra metric.
clf = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)

# fit() takes the train/test features first, then the train/test targets.
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Show the per-model results.
print(models)

# Results

Random Forest looks promising based on what I've seen across the runs, with the target set to each failure mode in turn.

There are other classifiers I could try, but I'm confident in Random Forests (they yield good results) and I find the method interesting.

In [None]:
# Our basic Random Forest
from sklearn.ensemble import RandomForestClassifier

# max_features is capped at the number of columns actually present. The
# hard-coded 6 exceeds the 5 sensor features produced by the multi-label
# feature cell, which some scikit-learn versions reject with a ValueError.
# random_state pins the forest so the printed score is reproducible.
clf = RandomForestClassifier(max_features=min(6, X_train.shape[1]), random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# NOTE(review): with a multi-column y this is exact-match (subset) accuracy,
# and with rare failures a high value says little; consider per-failure-mode
# recall as well.
print(accuracy_score(y_test, y_pred))

0.9885


In [None]:
# Checking params in use
# Display the hyperparameters of whatever estimator `clf` currently refers to.
clf.get_params()

In [None]:
!pip install keras
!pip install -q -U keras-tuner
!pip install -q tensorflow_decision_forests

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.9/128.9 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m81.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.2/475.2 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m98.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m442.0/442.0 kB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m62.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import numpy as np
from time import time
import keras_tuner as kt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import scipy.stats as stats
import tensorflow_decision_forests as tfdf
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
# In order to use the Keras tuner
# I will be building a model using a function that returns a keras model

def build_forest_model(hp):
  """Build a TF-DF gradient-boosted-trees model for one Keras Tuner trial.

  Args:
    hp: Keras Tuner hyperparameter container. `num_trees`,
      `growing_strategy` and `subsample` are registered on it here.

  Returns:
    A compiled `tfdf.keras.GradientBoostedTreesModel` that reports accuracy
    and AUC.
  """

  # Setup the model; the three tunable arguments are sampled from `hp`.
  forest_model = tfdf.keras.GradientBoostedTreesModel(
      num_trees=hp.Int('num_trees', min_value=10, max_value=710, step=25),
      growing_strategy=hp.Choice('growing_strategy', values=['BEST_FIRST_GLOBAL', 'LOCAL']),
      subsample=hp.Float('subsample', min_value=0.1, max_value=0.95, step=0.05),
      num_threads=4
  )

  # Track accuracy and AUC as two separate metrics. The AUC metric is named
  # explicitly so its validation key is 'val_auc', the name the tuner objective
  # looks up, instead of relying on the auto-generated default name.
  forest_model.compile(metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])

  # Give us back our keras model so we can tune
  return forest_model

In [None]:
# I want my tuner to use Bayesian to optimize
# NOTE(review): trial results are stored under the project directory; an
# existing directory may be reused on a re-run unless overwrite=True is passed.
# Confirm which behaviour is wanted.
forest_tuner = kt.BayesianOptimization(
    build_forest_model,  # model-building function, called once per trial
    objective=kt.Objective('val_auc', direction='max'),
    max_trials=50,
    seed=42,  # fixed seed so the sequence of sampled trials is repeatable
    project_name='RamboForest'
)

Use /tmp/tmp3n65l43l as temporary training directory


In [None]:
# Build the TF-DF datasets from the features plus ONLY the 'RNF' label column.
# Concatenating the whole y frame would turn every other target column (the
# remaining failure flags when y is multi-label) into an input feature.
# in_place=True is safe here because the concatenated frame is a temporary.
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    pd.concat([X_train, y_train[['RNF']]], axis=1), in_place=True, label='RNF')
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    pd.concat([X_test, y_test[['RNF']]], axis=1), in_place=True, label='RNF')









In [None]:
# Run the hyperparameter search; each trial is scored on test_ds, which is
# passed as the validation data.
# NOTE(review): the same test_ds is also used for the final evaluate() call, so
# that score is not an independent estimate.
forest_tuner.search(train_ds, epochs=1, validation_data=test_ds)

Trial 50 Complete [00h 00m 04s]
val_auc: 0.49399399757385254

Best val_auc So Far: 0.49399399757385254
Total elapsed time: 00h 03m 49s


In [None]:
# Print the tuner's summary of its best trials (hyperparameters and score).
forest_tuner.results_summary()

Results summary
Results in ./RamboForest
Showing 10 best trials
Objective(name="val_auc", direction="max")

Trial 09 summary
Hyperparameters:
num_trees: 310
growing_strategy: LOCAL
subsample: 0.9
Score: 0.49399399757385254

Trial 11 summary
Hyperparameters:
num_trees: 585
growing_strategy: LOCAL
subsample: 0.9
Score: 0.49399399757385254

Trial 14 summary
Hyperparameters:
num_trees: 435
growing_strategy: LOCAL
subsample: 0.9
Score: 0.49399399757385254

Trial 16 summary
Hyperparameters:
num_trees: 360
growing_strategy: LOCAL
subsample: 0.9
Score: 0.49399399757385254

Trial 17 summary
Hyperparameters:
num_trees: 360
growing_strategy: LOCAL
subsample: 0.9
Score: 0.49399399757385254

Trial 18 summary
Hyperparameters:
num_trees: 360
growing_strategy: LOCAL
subsample: 0.9
Score: 0.49399399757385254

Trial 19 summary
Hyperparameters:
num_trees: 360
growing_strategy: LOCAL
subsample: 0.9
Score: 0.49399399757385254

Trial 20 summary
Hyperparameters:
num_trees: 360
growing_strategy: LOCAL
subsamp

In [None]:
# Ask the tuner for its single best model and keep it.
best_forest_model = forest_tuner.get_best_models(num_models=1)[0]

Use /tmp/tmpgytrwprd as temporary training directory


In [None]:
# Fit the selected model on the training dataset.
best_forest_model.fit(train_ds)

Reading training dataset...
Training dataset read in 0:00:00.273166. Found 8000 examples.
Training model...
Model trained in 0:00:00.844599
Compiling model...
Model compiled.


<keras.src.callbacks.History at 0x7f0a8ad6d2a0>

In [None]:
# Score the held-out dataset; the output is one float per row.
best_forest_model.predict(test_ds, verbose=1)



array([[0.02642518],
       [0.00064139],
       [0.00045552],
       ...,
       [0.00051317],
       [0.00058276],
       [0.00077803]], dtype=float32)

In [None]:
# Evaluate on the held-out dataset. The result is a list of three values,
# presumably loss followed by the compiled metrics (accuracy, AUC); confirm
# the order against the model's metric names.
best_forest_model.evaluate(test_ds)



[0.0, 0.9990000128746033, 0.4912412464618683]



---

