In [48]:
import os
import pickle
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import tensorflow as tf
import keras as kr

In [49]:
df = pd.read_csv("predictive_maintenance.csv")
df

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,No Failure
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,No Failure
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,No Failure
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,No Failure
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,No Failure
...,...,...,...,...,...,...,...,...,...,...
9995,9996,M24855,M,298.8,308.4,1604,29.5,14,0,No Failure
9996,9997,H39410,H,298.9,308.4,1632,31.8,17,0,No Failure
9997,9998,M24857,M,299.0,308.6,1645,33.4,22,0,No Failure
9998,9999,H39412,H,299.0,308.7,1408,48.5,25,0,No Failure


In [50]:
df["Failure Type"].value_counts()

Failure Type
No Failure                  9652
Heat Dissipation Failure     112
Power Failure                 95
Overstrain Failure            78
Tool Wear Failure             45
Random Failures               18
Name: count, dtype: int64

In [51]:
df['Type'].value_counts() #Three types - L, M, H

Type
L    6000
M    2997
H    1003
Name: count, dtype: int64

> #Hot encoding 'Type'

In [52]:
df_1 = pd.get_dummies(df,columns=['Type'])
df_1[['Type_H','Type_L','Type_M']] = df_1[['Type_H','Type_L','Type_M']].astype(int)

In [53]:
df_1.head()

Unnamed: 0,UDI,Product ID,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type,Type_H,Type_L,Type_M
0,1,M14860,298.1,308.6,1551,42.8,0,0,No Failure,0,0,1
1,2,L47181,298.2,308.7,1408,46.3,3,0,No Failure,0,1,0
2,3,L47182,298.1,308.5,1498,49.4,5,0,No Failure,0,1,0
3,4,L47183,298.2,308.6,1433,39.5,7,0,No Failure,0,1,0
4,5,L47184,298.2,308.7,1408,40.0,9,0,No Failure,0,1,0


In [54]:
df_1.Target.unique() 


array([0, 1], dtype=int64)

>Target = 1 - Failed

>Target = 0 - Not Failed

In [55]:
df_1[df_1.Target==1]['Failure Type'].value_counts().sum()

339

> The above data shows the Failure Types of the Failed processess.

In [56]:
df_1.columns

Index(['UDI', 'Product ID', 'Air temperature [K]', 'Process temperature [K]',
       'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]', 'Target',
       'Failure Type', 'Type_H', 'Type_L', 'Type_M'],
      dtype='object')

In [57]:
X = df_1[['Air temperature [K]', 'Process temperature [K]',
       'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]',
       'Type_H', 'Type_L', 'Type_M']]

In [58]:
Y = df_1['Failure Type']

> Now converting Failure Type string values to int.

In [59]:
labelEncoding = {j:i for i,j in enumerate(Y.unique())}
inverse = {j:i for i,j in labelEncoding.items()}
Y = Y.map(labelEncoding)

In [60]:
Y

0       0
1       0
2       0
3       0
4       0
       ..
9995    0
9996    0
9997    0
9998    0
9999    0
Name: Failure Type, Length: 10000, dtype: int64

In [61]:
labelEncoding

{'No Failure': 0,
 'Power Failure': 1,
 'Tool Wear Failure': 2,
 'Overstrain Failure': 3,
 'Random Failures': 4,
 'Heat Dissipation Failure': 5}

In [62]:
inverse

{0: 'No Failure',
 1: 'Power Failure',
 2: 'Tool Wear Failure',
 3: 'Overstrain Failure',
 4: 'Random Failures',
 5: 'Heat Dissipation Failure'}

In [63]:
Y.map(inverse)

0       No Failure
1       No Failure
2       No Failure
3       No Failure
4       No Failure
           ...    
9995    No Failure
9996    No Failure
9997    No Failure
9998    No Failure
9999    No Failure
Name: Failure Type, Length: 10000, dtype: object

> Now balancing data using imblearn.

In [64]:
smote = SMOTE()
X_smote,Y_smote = smote.fit_resample(X,Y)

In [65]:
df_1.isnull().sum() #Checking for null values

UDI                        0
Product ID                 0
Air temperature [K]        0
Process temperature [K]    0
Rotational speed [rpm]     0
Torque [Nm]                0
Tool wear [min]            0
Target                     0
Failure Type               0
Type_H                     0
Type_L                     0
Type_M                     0
dtype: int64

In [66]:
x_train,x_test,y_train,y_test = train_test_split(X_smote,Y_smote,test_size=0.2)

In [67]:
models = [LinearRegression, LogisticRegression,
          DecisionTreeClassifier,RandomForestClassifier,
          KNeighborsClassifier,GaussianNB,
          MultinomialNB,SVC]
names = ['LinearRegression', 'LogisticRegression',
          'DecisionTreeClassifier','RandomForestClassifier',
          'KNeighborsClassifier','GaussianNB',
          'MultinomialNB','SVC']

data = []
for name,model in zip(names,models):
    print(name)
    m = model()
    m.fit(x_train, y_train)
    score = m.score(x_test, y_test)
    data.append([name, score])

LinearRegression
LogisticRegression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


DecisionTreeClassifier
RandomForestClassifier
KNeighborsClassifier
GaussianNB
MultinomialNB
SVC


In [68]:
data.sort(key = lambda x:x[1], reverse=True)
pd.DataFrame(data, columns =['Model name','Score'])

Unnamed: 0,Model name,Score
0,RandomForestClassifier,0.99577
1,DecisionTreeClassifier,0.991798
2,KNeighborsClassifier,0.949754
3,LogisticRegression,0.7669
4,GaussianNB,0.690063
5,SVC,0.669688
6,MultinomialNB,0.51895
7,LinearRegression,0.31705


> We will go ahead with the RandomForestClassifier model since it's score is close to 1.0

> We will use more n-estimators to train the model with the training data.

In [69]:
best_case_model = RandomForestClassifier(n_estimators=300)
best_case_model.fit(X_smote,Y_smote)

> Now, We will make a function to return prediction values of failure_type using X values.

In [70]:
def failure_type(x):
    df_1 = pd.get_dummies(x,columns=['Type'])
    df_1[['Type_H','Type_L','Type_M']] = df_1[['Type_H','Type_L','Type_M']].astype(int)
    return best_case_model.predict(df_1)

In [71]:
x = df[['Type', 'Air temperature [K]',
       'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]',
       'Tool wear [min]']]
print(x.shape)
prediction = failure_type(x)
print("Model Accuracy: ", float(accuracy_score(df['Failure Type'].map(labelEncoding), prediction)*100),'%')

(10000, 6)
Model Accuracy:  100.0 %


In [84]:
with open('random_forest_model.pkl', 'wb') as f:
    pickle.dump(best_case_model, f)

In [85]:
X_smote

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Type_H,Type_L,Type_M
0,298.100000,308.600000,1551,42.800000,0,0,0,1
1,298.200000,308.700000,1408,46.300000,3,0,1,0
2,298.100000,308.500000,1498,49.400000,5,0,1,0
3,298.200000,308.600000,1433,39.500000,7,0,1,0
4,298.200000,308.700000,1408,40.000000,9,0,1,0
...,...,...,...,...,...,...,...,...
57907,303.237448,311.518015,1252,64.342702,138,0,0,0
57908,302.161971,310.634976,1369,48.611257,87,0,1,0
57909,302.570254,310.870254,1362,55.197259,152,0,1,0
57910,302.468709,310.419038,1335,49.191898,148,0,1,0
