# Model Building

### Importiere Bibliotheken

In [78]:
import pandas as pd
import numpy as np

# region sklearn models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
#endregion

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc

### Datensatz einlesen
Da Pandas die Datentypen der Merkmale nicht optimal erkennt, werden im Folgenden die konkreten Datentypen definiert. \
Diese können aus der Beschreibung des Datensatzes abgeleitet werden.

In [79]:
# Explicit column dtypes handed to read_csv so pandas does not have to infer them.
# Columns are grouped by target dtype; the resulting key order matches the CSV schema
# listed here (measurements, then Type_* flags, then the label).
_FLOAT32_COLUMNS = (
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
)
_BOOL_COLUMNS = ('Type_H', 'Type_L', 'Type_M')

dtypes = {
    **dict.fromkeys(_FLOAT32_COLUMNS, 'float32'),
    **dict.fromkeys(_BOOL_COLUMNS, 'bool'),
    'label': 'category',
}

# Both splits are parsed with the same dtype mapping.
df_train_resampled, df_test = (
    pd.read_csv(csv_path, dtype=dtypes)
    for csv_path in ('./dataset_train_resampled.csv', './dataset_test.csv')
)

### Definiere die Input- und Output-Merkmale

In [80]:
# Feature columns passed to the models, and the target column (kept as a one-element list).
input_columns = [
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
    'Type_H',
    'Type_L',
    'Type_M',
]
output_column = ['label']

# Feature matrices stay DataFrames.
X_train, X_test = (frame[input_columns] for frame in (df_train_resampled, df_test))

# Selecting with a list yields a single-column frame; ravel() flattens the resulting
# 2-D array into the 1-D label vector.
y_train, y_test = (
    frame[output_column].to_numpy().ravel()
    for frame in (df_train_resampled, df_test)
)


### Trainiere die Modelle 

In [81]:
# Candidate classifiers, keyed by the display name used in the results table.
#
# RANDOM_STATE pins the stochastic estimators so that Restart & Run All reproduces the
# same metrics:
#   - RandomForestClassifier draws bootstrap samples and random feature subsets.
#   - SVC(probability=True) shuffles the data for its internal probability calibration.
# LogisticRegression (default solver) and KNeighborsClassifier are deterministic with
# their default settings, so they take no seed here.
#
# NOTE(review): LogisticRegression, SVC and KNN are sensitive to feature scale. No scaling
# happens in this notebook section — confirm the CSVs were already scaled upstream.
RANDOM_STATE = 42

models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    # probability=True enables predict_proba at the cost of slower training.
    "Support Vector Machine": SVC(probability=True, random_state=RANDOM_STATE),
    "K-Nearest Neighbors": KNeighborsClassifier()
}

In [82]:
# Fit every candidate model on the resampled training set and score it on the test set.
# Precision, recall and F1 are support-weighted averages over the classes; accuracy takes
# no averaging argument and is therefore computed separately.
weighted_scorers = {
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
}

results = {}
for position, (model_name, model) in enumerate(models.items(), start=1):
    print(f'Model {position} of {len(models)}: {model_name}')

    # fit() returns the fitted estimator, so prediction can be chained onto it.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    scores = {"Accuracy": accuracy_score(y_test, y_pred)}
    for metric_name, scorer in weighted_scorers.items():
        scores[metric_name] = scorer(y_test, y_pred, average='weighted')
    results[model_name] = scores

# One row per model, one column per metric.
results_df = pd.DataFrame(results).T
results_df

Model 1 of 4: Logistic Regression
Model 2 of 4: Random Forest
Model 3 of 4: Support Vector Machine
Model 4 of 4: K-Nearest Neighbors


Unnamed: 0,Accuracy,Precision,Recall,F1 Score
Logistic Regression,0.876,0.97504,0.876,0.917452
Random Forest,0.9725,0.976714,0.9725,0.974414
Support Vector Machine,0.8695,0.975559,0.8695,0.915241
K-Nearest Neighbors,0.9155,0.963684,0.9155,0.937826
