**To-Do list:**
1. Handle Missing Value
2. Encoding
3. Data Normalization (Scaling)
4. Handle Imbalanced Data
5. Outlier Detection and handling
6. Modeling (Classification)
7. Evaluation (accuracy and error testing)


### **Data Preprocessing**

In [42]:
import pandas as pd
# read csv
df = pd.read_csv('predictive_maintenance.csv')
df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,No Failure
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,No Failure
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,No Failure
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,No Failure
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,No Failure


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   UDI                      10000 non-null  int64  
 1   Product ID               10000 non-null  object 
 2   Type                     10000 non-null  object 
 3   Air temperature [K]      10000 non-null  float64
 4   Process temperature [K]  10000 non-null  float64
 5   Rotational speed [rpm]   10000 non-null  int64  
 6   Torque [Nm]              10000 non-null  float64
 7   Tool wear [min]          10000 non-null  int64  
 8   Target                   10000 non-null  int64  
 9   Failure Type             10000 non-null  object 
dtypes: float64(3), int64(4), object(3)
memory usage: 781.4+ KB


In [44]:
# Check missing value
df.isna().sum()
# Alhamdulillah datanya bersih

UDI                        0
Product ID                 0
Type                       0
Air temperature [K]        0
Process temperature [K]    0
Rotational speed [rpm]     0
Torque [Nm]                0
Tool wear [min]            0
Target                     0
Failure Type               0
dtype: int64

In [45]:
# Drop column yang gak dipake
df = df.drop(['UDI', 'Product ID'], axis = 1)

In [46]:
# Cek modus dari label
values = df['Failure Type'].value_counts()
print(values)
print(f'The Mode is: {values.idxmax()} with {values.max()} occurrences')

Failure Type
No Failure                  9652
Heat Dissipation Failure     112
Power Failure                 95
Overstrain Failure            78
Tool Wear Failure             45
Random Failures               18
Name: count, dtype: int64
The Mode is: No Failure with 9652 occurrences


In [47]:
# Lakukan Encoding untuk label (Label Encoding)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Failure Type'] = le.fit_transform(df['Failure Type'])

In [48]:
df['Failure Type'].value_counts()

Failure Type
1    9652
0     112
3      95
2      78
5      45
4      18
Name: count, dtype: int64

In [49]:
le.classes_

array(['Heat Dissipation Failure', 'No Failure', 'Overstrain Failure',
       'Power Failure', 'Random Failures', 'Tool Wear Failure'],
      dtype=object)

In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Type                     10000 non-null  object 
 1   Air temperature [K]      10000 non-null  float64
 2   Process temperature [K]  10000 non-null  float64
 3   Rotational speed [rpm]   10000 non-null  int64  
 4   Torque [Nm]              10000 non-null  float64
 5   Tool wear [min]          10000 non-null  int64  
 6   Target                   10000 non-null  int64  
 7   Failure Type             10000 non-null  int32  
dtypes: float64(3), int32(1), int64(3), object(1)
memory usage: 586.1+ KB


In [51]:
# Encode Type column One-hot encoding
df = pd.get_dummies(df, columns=['Type'], dtype=int)

In [52]:
df.head()

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type,Type_H,Type_L,Type_M
0,298.1,308.6,1551,42.8,0,0,1,0,0,1
1,298.2,308.7,1408,46.3,3,0,1,0,1,0
2,298.1,308.5,1498,49.4,5,0,1,0,1,0
3,298.2,308.6,1433,39.5,7,0,1,0,1,0
4,298.2,308.7,1408,40.0,9,0,1,0,1,0


In [34]:
# Data Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

X = df.drop(['Target', 'Failure Type'], axis = 1)
y_target = df['Target']
y_failure = df['Failure Type']

X = sc.fit_transform(X)

In [35]:
# Train test split untuk y_target
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_target, random_state=42, test_size = 0.2)

# SMOTE
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=0)
X_train, y_train = sm.fit_resample(X_train, y_train)

In [40]:
# SVM
from sklearn.svm import SVC
svm = SVC()
svm.fit(X_train, y_train)

# K-fold validation
from sklearn.model_selection import StratifiedKFold, cross_val_score
kfolds = StratifiedKFold(n_splits=10)
scores = cross_val_score(svm, X, y_target, cv = kfolds)      
print("Cross Validation Scores: ", scores)
print("Average CV Score: ", scores.mean())
print("Number of CV Scores used in Average: ", len(scores))

Cross Validation Scores:  [0.975 0.973 0.976 0.967 0.962 0.966 0.966 0.974 0.975 0.972]
Average CV Score:  0.9705999999999999
Number of CV Scores used in Average:  10


In [41]:
# Check accuracy
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

# Predicting
y_pred = svm.predict(X_test)
print('Classification report \n\n', classification_report(y_test, y_pred))
print('Accuracy Score: ', accuracy_score(y_test, y_pred))
print('MSE           : ', mean_squared_error(y_test, y_pred))


Classification report 

               precision    recall  f1-score   support

           0       1.00      0.92      0.96      1939
           1       0.26      0.87      0.40        61

    accuracy                           0.92      2000
   macro avg       0.63      0.89      0.68      2000
weighted avg       0.97      0.92      0.94      2000

Accuracy Score:  0.9195
MSE           :  0.0805


In [78]:
# Convert the encoded y_classes back to the categorical ones
y_classes = le.inverse_transform(y_pred)
print(y_classes)

In [33]:
from sklearn.svm import LinearSVC
l1_regularization_SVM = LinearSVC(penalty='l1', loss='squared_hinge', dual=False, tol = 1e-3)
l1_regularization_SVM.fit(X_train, y_train)

kfolds = StratifiedKFold(n_splits=10)
scores = cross_val_score(svm, X, y_target, cv = kfolds)

print("Cross Validation Scores: ", scores)
print("Average CV Score: ", scores.mean())
print("Number of CV Scores used in Average: ", len(scores))

Cross Validation Scores:  [0.975 0.973 0.976 0.967 0.962 0.966 0.966 0.974 0.975 0.972]
Average CV Score:  0.9705999999999999
Number of CV Scores used in Average:  10


In [34]:
y_pred_reg = l1_regularization_SVM.predict(X_test)
print('Classification report \n\n', classification_report(y_test, y_pred_reg))
print('Accuracy Score: ', accuracy_score(y_test, y_pred_reg))
print('MSE           : ', mean_squared_error(y_test, y_pred_reg))

Classification report 

               precision    recall  f1-score   support

           0       0.99      0.83      0.90      1939
           1       0.13      0.84      0.23        61

    accuracy                           0.83      2000
   macro avg       0.56      0.83      0.56      2000
weighted avg       0.97      0.83      0.88      2000

Accuracy Score:  0.8255
MSE           :  0.1745


In [35]:
# Train test split untuk y_failure
from sklearn.model_selection import train_test_split
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y_failure, random_state=42, test_size = 0.3)

# SMOTE
X_train1, y_train1 = sm.fit_resample(X_train1, y_train1)

In [36]:
# SVM
svm.fit(X_train1, y_train1)
scores1 = cross_val_score(svm, X, y_failure, cv = kfolds)

print("Cross Validation Scores: ", scores1)
print("Average CV Score: ", scores1.mean())
print("Number of CV Scores used in Average: ", len(scores1))


Cross Validation Scores:  [0.975 0.972 0.974 0.968 0.937 0.97  0.969 0.971 0.974 0.975]
Average CV Score:  0.9685
Number of CV Scores used in Average:  10


In [37]:
y_pred1 = svm.predict(X_test1)
print('Classification report \n\n', classification_report(y_test1, y_pred1))
print('Accuracy Score: ', accuracy_score(y_test1, y_pred1))
print('MSE           : ', mean_squared_error(y_test1, y_pred1))

Classification report 

               precision    recall  f1-score   support

           0       0.24      1.00      0.39        23
           1       0.99      0.85      0.92      2903
           2       0.49      0.86      0.62        22
           3       0.64      0.87      0.74        31
           4       0.01      0.14      0.01         7
           5       0.05      0.50      0.09        14

    accuracy                           0.85      3000
   macro avg       0.40      0.70      0.46      3000
weighted avg       0.97      0.85      0.90      3000

Accuracy Score:  0.848
MSE           :  1.407
