# Preprocessing and Modeling

## Imports and load data

In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from joblib import dump, load
from sklearn import __version__ as sklearn_version
import datetime

In [2]:
data_path = "../data/data_after_eda.csv"
data = pd.read_csv(data_path, index_col = "UDI")
data.head()

Unnamed: 0_level_0,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
UDI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,14860,M,298.1,308.6,1551,42.8,0,0,No Failure
2,47181,L,298.2,308.7,1408,46.3,3,0,No Failure
3,47182,L,298.1,308.5,1498,49.4,5,0,No Failure
4,47183,L,298.2,308.6,1433,39.5,7,0,No Failure
5,47184,L,298.2,308.7,1408,40.0,9,0,No Failure


## Select the features and the target 1

In [3]:
data_prep = data.drop(columns = "Product ID")

In [4]:
x = data_prep.drop(columns = ["Target", "Failure Type"])
y = data_prep["Target"]

In [5]:
x.head()

Unnamed: 0_level_0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min]
UDI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,M,298.1,308.6,1551,42.8,0
2,L,298.2,308.7,1408,46.3,3
3,L,298.1,308.5,1498,49.4,5
4,L,298.2,308.6,1433,39.5,7
5,L,298.2,308.7,1408,40.0,9


## Make pipeline with encoding, scaling and a model

In [6]:
pipe = make_pipeline(
    OrdinalEncoder(handle_unknown = "use_encoded_value", unknown_value = -1),
    StandardScaler(),
    RandomForestClassifier()
)

In [7]:
cv = cross_validate(pipe, x, y, cv = 5)

In [8]:
cv_scores = cv['test_score']
cv_scores

array([0.9655, 0.973 , 0.6335, 0.971 , 0.975 ])

In [9]:
cv_scores.mean(), cv_scores.std()

(0.9036, 0.13508715705055016)

## Grid Search

In [10]:
param_grid = [
    {
        'encoder': [OrdinalEncoder(handle_unknown = "use_encoded_value", unknown_value = -1)],
        'scaler': [StandardScaler(), MinMaxScaler()],
        'classifier': [RandomForestClassifier()],
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 5, 10]
    },
    {
        'encoder': [OrdinalEncoder(handle_unknown = "use_encoded_value", unknown_value = -1)],
        'scaler': [StandardScaler(), MinMaxScaler()],
        'classifier': [SVC()],
        'classifier__kernel': ['linear', 'rbf'],
        'classifier__C': [0.1, 1, 10]
    },
    {
        'encoder': [OrdinalEncoder(handle_unknown = "use_encoded_value", unknown_value = -1)],
        'scaler': [StandardScaler(), MinMaxScaler()],
        'classifier': [LogisticRegression()],
        'classifier__C': [0.1, 1, 10]
    }
]

In [11]:
pipeline = Pipeline([
    ('encoder', OrdinalEncoder()),
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier())
])

In [12]:
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')

In [13]:
grid_search.fit(x, y)

In [14]:
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Best parameters: {'classifier': SVC(C=0.1, kernel='linear'), 'classifier__C': 0.1, 'classifier__kernel': 'linear', 'encoder': OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), 'scaler': StandardScaler()}
Best cross-validation score: 0.9661000000000002


## Select the target 2

In [15]:
data["Failure Type"].value_counts()

Failure Type
No Failure                  9652
Heat Dissipation Failure     112
Power Failure                 95
Overstrain Failure            78
Tool Wear Failure             45
Random Failures               18
Name: count, dtype: int64

In [24]:
param_grid2 = [
    {
        'encoder': [OrdinalEncoder(handle_unknown = "use_encoded_value", unknown_value = -1)],
        'scaler': [StandardScaler(), MinMaxScaler()],
        'classifier': [RandomForestClassifier()],
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 5, 10]
    },
    {
        'encoder': [OrdinalEncoder(handle_unknown = "use_encoded_value", unknown_value = -1)],
        'scaler': [StandardScaler(), MinMaxScaler()],
        'classifier': [SVC()],
        'classifier__kernel': ['linear', 'rbf'],
        'classifier__C': [0.1, 1, 10]
    },
    {
        'encoder': [OrdinalEncoder(handle_unknown = "use_encoded_value", unknown_value = -1)],
        'scaler': [StandardScaler(), MinMaxScaler()],
        'classifier': [LogisticRegression(multi_class='multinomial', max_iter=1000)],
        'classifier__C': [0.1, 1, 10]
    }
]

In [25]:
y2 = data["Failure Type"]
grid_search2 = GridSearchCV(pipeline, param_grid2, cv=5, scoring='accuracy')

In [26]:
grid_search2.fit(x, y2)

In [27]:
print("Best parameters:", grid_search2.best_params_)
print("Best cross-validation score:", grid_search2.best_score_)

Best parameters: {'classifier': SVC(C=0.1, kernel='linear'), 'classifier__C': 0.1, 'classifier__kernel': 'linear', 'encoder': OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), 'scaler': MinMaxScaler()}
Best cross-validation score: 0.9652


## Save and serialize the models

In [35]:
model_failures = grid_search.best_estimator_
model_failures.version = '1.0'
model_failures.pandas_version = pd.__version__
model_failures.numpy_version = np.__version__
model_failures.sklearn_version = sklearn_version
model_failures.X_columns = [col for col in x.columns]
model_failures.build_datetime = datetime.datetime.now()

dump(model_failures, '../models/model_failures.pkl')

['../models/model_failures.pkl']

In [36]:
model_type = grid_search2.best_estimator_
model_type.version = '1.0'
model_type.pandas_version = pd.__version__
model_type.numpy_version = np.__version__
model_type.sklearn_version = sklearn_version
model_type.X_columns = [col for col in x.columns]
model_type.build_datetime = datetime.datetime.now()

dump(model_type, '../models/model_type.pkl')

['../models/model_type.pkl']