# Model Building

### Importiere Bibliotheken

In [None]:
import pandas as pd
import numpy as np

# region sklearn models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
#endregion

import plotly.express as px
import plotly.graph_objects as go

from sklearn.preprocessing import label_binarize
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.model_selection import GridSearchCV

### Datensatz einlesen
Da Pandas die Datentypen der Merkmale nicht optimal erkennt, werden im Folgenden die konkreten Datentypen definiert. \
Diese können aus der Beschreibung des Datensatztes abgeleitet werden.

In [19]:
dtypes = {
    'Air temperature [K]': 'float32',
    'Process temperature [K]': 'float32',
    'Rotational speed [rpm]': 'float32',
    'Torque [Nm]': 'float32',
    'Tool wear [min]': 'float32',
    'Type_H': 'bool',
    'Type_L': 'bool',
    'Type_M': 'bool',
    'label': 'category'
}

df_train_resampled = pd.read_csv('./dataset_train_resampled.csv', dtype=dtypes)
df_test = pd.read_csv('./dataset_test.csv', dtype=dtypes)

### Definiere die Input- und Output- Merkmale

In [20]:
input_columns = ['Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]', 'Type_H', 'Type_L', 'Type_M']
output_column = ['label']

X_train = df_train_resampled[input_columns]
X_test = df_test[input_columns]
y_train = df_train_resampled[output_column].to_numpy().ravel()
y_test = df_test[output_column].to_numpy().ravel()


### Trainiere die Modelle 

In [21]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC(probability=True),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

In [22]:
results = {}
for i, (model_name, model) in enumerate(models.items(), start=1):
    print(f'Model {i} of {len(models)}: {model_name}')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    results[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

results_df = pd.DataFrame(results).T
results_df

Model 1 of 4: Logistic Regression
Model 2 of 4: Random Forest
Model 3 of 4: Support Vector Machine
Model 4 of 4: K-Nearest Neighbors


Unnamed: 0,Accuracy,Precision,Recall,F1 Score
Logistic Regression,0.876,0.97504,0.876,0.917452
Random Forest,0.97,0.976084,0.97,0.972736
Support Vector Machine,0.8695,0.975559,0.8695,0.915241
K-Nearest Neighbors,0.9155,0.963684,0.9155,0.937826


<font color='lightgreen'>Das RandomForest-Model besitzt den besten F1-Score</font>

### Visualisiere die Feature-Importance

In [23]:
feature_importances = pd.DataFrame({
    'Feature': X_test.columns,
    'Importance': models['Random Forest'].feature_importances_
}).sort_values(by='Importance', ascending=False)

fig = px.bar(
    feature_importances, x='Feature', y='Importance',
    title='Feature-Import des RandomForest',
    labels={'Importance': 'Relevanz', 'Feature': 'Merkmal'},
    text_auto=True
)

fig.show()

### Hyperparameter-Tuning des RandomForest
Braucht ca. 10 min

In [24]:
parameter_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

grid_search = GridSearchCV(
    RandomForestClassifier(),
    parameter_grid,
    cv=5,
    scoring="f1",
    n_jobs=-1
)

grid_search.fit(X_train, y_train)


One or more of the test scores are non-finite: [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]



In [None]:
model = grid_search.best_estimator_

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print('accuracy ',  accuracy)
print('precision ', precision)
print('recall ',    recall)
print('f1 ',        f1)

accuracy  0.9695
precision  0.9768997548150775
recall  0.9695
f1  0.9728233204600207


In [40]:
unique_classes = np.unique(y_test)

y_test_bin = label_binarize(y_test, classes=unique_classes)
n_classes = y_test_bin.shape[1]

y_prob = model.predict_proba(X_test)

fpr = {}
tpr = {}
roc_auc = {}

for i in range(n_classes):
    if np.any(y_test_bin[:, i]):
        fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])


fig = go.Figure()

for i in range(n_classes):
    if i in roc_auc:
        fig.add_trace(go.Scatter(
            x=fpr[i], 
            y=tpr[i], 
            mode='lines', 
            name=f'{unique_classes[i]} (AUC = {roc_auc[i]:.2f})'
        ))

fig.add_trace(go.Scatter(
    x=[0, 1], 
    y=[0, 1], 
    mode='lines', 
    line=dict(dash='dash'), 
    name='Random Guessing'
))

fig.update_layout(
    title="Receiver Operating Characteristic (ROC) Curve",
    xaxis_title="False Positive Rate",
    yaxis_title="True Positive Rate",
    xaxis=dict(scaleanchor="y", constrain="domain"),
    yaxis=dict(scaleanchor="x", constrain="domain"),
)

fig.show()