In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [18]:
data_path = "../data/data_after_eda.csv"
data = pd.read_csv(data_path, index_col = "UDI")
data.head()

Unnamed: 0_level_0,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
UDI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,14860,M,298.1,308.6,1551,42.8,0,0,No Failure
2,47181,L,298.2,308.7,1408,46.3,3,0,No Failure
3,47182,L,298.1,308.5,1498,49.4,5,0,No Failure
4,47183,L,298.2,308.6,1433,39.5,7,0,No Failure
5,47184,L,298.2,308.7,1408,40.0,9,0,No Failure


In [19]:
data_prep = data.drop(columns = "Product ID")

In [23]:
x = data_prep.drop(columns = ["Target", "Failure Type"])
y = data_prep["Target"]

In [24]:
x.head()

Unnamed: 0_level_0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min]
UDI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,M,298.1,308.6,1551,42.8,0
2,L,298.2,308.7,1408,46.3,3
3,L,298.1,308.5,1498,49.4,5
4,L,298.2,308.6,1433,39.5,7
5,L,298.2,308.7,1408,40.0,9


In [37]:
pipe = make_pipeline(
    OrdinalEncoder(handle_unknown = "use_encoded_value", unknown_value = -1),
    StandardScaler(),
    RandomForestClassifier()
)

In [38]:
cv = cross_validate(pipe, x, y, cv = 5)

In [41]:
cv_scores = cv['test_score']
cv_scores

array([0.9635, 0.969 , 0.622 , 0.971 , 0.9765])

In [42]:
cv_scores.mean(), cv_scores.std()

(0.9004, 0.13926212694052895)

In [48]:
param_grid = [
    {
        'encoder': [OrdinalEncoder(handle_unknown = "use_encoded_value", unknown_value = -1)],
        'scaler': [StandardScaler(), MinMaxScaler()],
        'classifier': [RandomForestClassifier()],
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 5, 10]
    },
    {
        'encoder': [OrdinalEncoder(handle_unknown = "use_encoded_value", unknown_value = -1)],
        'scaler': [StandardScaler(), MinMaxScaler()],
        'classifier': [SVC()],
        'classifier__kernel': ['linear', 'rbf'],
        'classifier__C': [0.1, 1, 10]
    },
    {
        'encoder': [OrdinalEncoder(handle_unknown = "use_encoded_value", unknown_value = -1)],
        'scaler': [StandardScaler(), MinMaxScaler()],
        'classifier': [LogisticRegression()],
        'classifier__C': [0.1, 1, 10]
    }
]

In [49]:
pipeline = Pipeline([
    ('encoder', OrdinalEncoder()),
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier())
])

In [50]:
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')

In [51]:
grid_search.fit(x, y)

In [52]:
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Best parameters: {'classifier': SVC(C=0.1, kernel='linear'), 'classifier__C': 0.1, 'classifier__kernel': 'linear', 'encoder': OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), 'scaler': StandardScaler()}
Best cross-validation score: 0.9661000000000002
