# This model uses StandardScaler to scale the data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, 
                             precision_score, 
                             recall_score, 
                             confusion_matrix, 
                             ConfusionMatrixDisplay,
                             )

#### Loading initial data

In [2]:
# Load the raw dataset from CSV; the path is relative to the notebook's working directory.
initial_data = pd.read_csv('../data/ai4i2020.csv')

#### Converting Type column into binary values for model preparation

In [3]:
# One-hot encode the categorical 'Type' column as integer 0/1 indicator columns.
# drop_first=True omits the first category so the remaining indicators are not redundant.
dummy = pd.get_dummies(initial_data['Type'], dtype=int, drop_first=True)

In [4]:
# Append the indicator columns to the original frame column-wise (axis=1).
cleaned_data = pd.concat([initial_data, dummy], axis=1)

In [5]:
# Remove the original categorical column, the identifier columns and the
# individual failure-mode flags, keeping only the features and the
# "Machine failure" target.
# Re-assign instead of using inplace=True so the frame is not mutated in place.
cleaned_data = cleaned_data.drop(
    columns=["Type", "Product ID", "UDI", "TWF", "HDF", "PWF", "OSF", "RNF"]
)

In [6]:
# Preview the first rows to check which columns remain after the drop.
cleaned_data.head()

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,L,M
0,298.1,308.6,1551,42.8,0,0,0,1
1,298.2,308.7,1408,46.3,3,0,1,0
2,298.1,308.5,1498,49.4,5,0,1,0
3,298.2,308.6,1433,39.5,7,0,1,0
4,298.2,308.7,1408,40.0,9,0,1,0


In [7]:
# Feature matrix: every column except the "Machine failure" target.
X = cleaned_data.drop(columns=["Machine failure"])
X

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],L,M
0,298.1,308.6,1551,42.8,0,0,1
1,298.2,308.7,1408,46.3,3,1,0
2,298.1,308.5,1498,49.4,5,1,0
3,298.2,308.6,1433,39.5,7,1,0
4,298.2,308.7,1408,40.0,9,1,0
...,...,...,...,...,...,...,...
9995,298.8,308.4,1604,29.5,14,0,1
9996,298.9,308.4,1632,31.8,17,0,0
9997,299.0,308.6,1645,33.4,22,0,1
9998,299.0,308.7,1408,48.5,25,0,0


In [8]:
# Target vector; value_counts() shows how many rows fall in each class.
y = cleaned_data["Machine failure"]
y.value_counts()

Machine failure
0    9661
1     339
Name: count, dtype: int64

#### Applying StandardScaler

In [9]:
# Standardise every feature column (StandardScaler removes the mean and
# scales to unit variance).
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split further down, so statistics of the held-out rows leak into
# the training features. Consider fitting on the training portion only.
std_scaler = StandardScaler()

X_scaled = std_scaler.fit_transform(X)

# fit_transform returns a bare array; rebuild the frame with the original
# column names AND the original row index so X stays aligned with y.
X = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
X.head()

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],L,M
0,-0.952389,-0.94736,0.068185,0.2822,-1.695984,-1.224745,1.528617
1,-0.902393,-0.879959,-0.729472,0.633308,-1.648852,0.816497,-0.654186
2,-0.952389,-1.014761,-0.22745,0.94429,-1.61743,0.816497,-0.654186
3,-0.902393,-0.94736,-0.590021,-0.048845,-1.586009,0.816497,-0.654186
4,-0.902393,-0.879959,-0.729472,0.001313,-1.554588,0.816497,-0.654186


#### Creating Logistic Regression Model

In [10]:
# Hold out 25% of the rows as a final test set (fixed seed for reproducibility).
# NOTE(review): the split is not stratified although the positive class is
# rare; passing stratify=y may be worth considering.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Shuffled 5-fold splitter used for cross-validation on the training portion.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Classifier evaluated in each fold; max_iter raised to 1000.
log_model = LogisticRegression(max_iter=1000)

# Per-fold metric accumulators.
accuracy_scores, precision_scores, recall_scores = [], [], []

# Class balance of the held-out test set.
y_test.value_counts()

Machine failure
0    2428
1      72
Name: count, dtype: int64

#### Training Model


In [11]:
# Reset the per-fold score lists in this cell so that re-running it does not
# append a second set of results on top of the previous run's scores.
accuracy_scores = []
precision_scores = []
recall_scores = []

for train_set, val_set in kf.split(X_train):
    # kf.split yields positional indices, hence .iloc rather than .loc.
    X_train_split, y_train_split = X_train.iloc[train_set], y_train.iloc[train_set]
    X_val_split, y_val_split = X_train.iloc[val_set], y_train.iloc[val_set]

    log_model.fit(X_train_split, y_train_split)
    model_predictions = log_model.predict(X_val_split)

    accuracy_scores.append(accuracy_score(y_val_split, model_predictions))
    # zero_division=0 scores a fold as 0 (without a warning) when the metric's
    # denominator is empty, e.g. no positive predictions or no positive labels.
    precision_scores.append(precision_score(
        y_val_split, model_predictions, zero_division=0))
    recall_scores.append(recall_score(
        y_val_split, model_predictions, zero_division=0))

#### Assessing model's performance from splits

In [12]:
# Report the mean of each cross-validation metric as a percentage.
for metric_label, fold_scores in (
    ("Accuracy", accuracy_scores),
    ("Precision", precision_scores),
    ("Recall", recall_scores),
):
    print(f"Average {metric_label} Scores: {np.mean(fold_scores) * 100}")

Average Accuracy Scores: 96.82666666666668
Average Precision Scores: 70.11788211788212
Average Recall Scores: 18.910566552202276
