# Data Preparation

### Importiere Bibliotheken

In [16]:
import pandas as pd
from imblearn.combine import SMOTETomek
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# region plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# endregion

### Datensatz einlesen
Da Pandas die Datentypen der Merkmale nicht optimal erkennt, werden im Folgenden die konkreten Datentypen definiert. \
Diese können aus der Beschreibung des Datensatzes abgeleitet werden.

In [17]:
# Explicit per-column dtypes handed to the CSV reader instead of relying on
# pandas' type inference. Columns that share a dtype are grouped below.
float_features = [
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
]
failure_flags = ['Machine failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF']

dtypes = {
    'UDI': 'int32',
    'Product ID': 'str',
    'Type': 'category',
    **dict.fromkeys(float_features, 'float32'),
    **dict.fromkeys(failure_flags, 'bool'),
}

# Load the raw dataset from the notebook's working directory.
df = pd.read_csv('./dataset.csv', dtype=dtypes)

### Erstelle Dummy-Merkmale für das Merkmal Maschinen-Typ

In [18]:
# One-hot encode the 'Type' column; the resulting columns are named 'Type_<value>'.
dummy_columns = pd.get_dummies(df['Type'], prefix='Type')

# Drop any dummy columns left over from an earlier execution of this cell
# before appending the fresh ones. Without this, re-running the cell would
# add a second set of identically named 'Type_*' columns to df.
df = pd.concat(
    [df.drop(columns=dummy_columns.columns, errors='ignore'), dummy_columns],
    axis=1,
)

### Definiere die Input- und Output-Merkmale

In [19]:
# Continuous sensor readings; these are the columns that get scaled later on.
numeric_columns = [
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
]

# Target column, kept as a one-element list.
output_column = ['label']

# Model inputs: the numeric readings followed by the one-hot machine-type columns.
input_columns = [*numeric_columns, 'Type_H', 'Type_L', 'Type_M']

### Generiere Labeling

In [20]:
def get_label(row):
    """Map one record to a single class label.

    The failure flags are checked in the fixed order TWF, HDF, PWF, OSF, RNF;
    the name of the first flag that equals 1 (or True) is returned. If none
    of them is set, the label is 'no_failure'.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) that can be indexed
            by the five flag names.

    Returns:
        The name of the first set flag, or 'no_failure'.
    """
    failure_modes = ('TWF', 'HDF', 'PWF', 'OSF', 'RNF')
    # Lazy generator: flags after the first match are never looked up.
    active_modes = (mode for mode in failure_modes if row[mode] == 1)
    return next(active_modes, 'no_failure')

# Apply get_label to each row (not each column) and store the result as the target.
df['label'] = df.apply(get_label, axis='columns')

### Initialisiere Training- und Testdaten

In [21]:
# Separate the model inputs from the target column.
X, y = df[input_columns], df[output_column]

# 80/20 split with a fixed seed; stratifying on y keeps the label
# proportions the same in both partitions.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

### Implementiere Oversampling auf den Trainingsdaten
Oversampling wird gewählt, da der Datensatz zwei sehr starke Unausgeglichenheiten enthält und ein Undersampling zu einem sehr starken Datenverlust führen würde.\
Dies liegt daran, dass nur wenige Maschinen tatsächlich einen Defekt aufweisen.

In [22]:
# Resample the training partition only; the test partition is left untouched.
smote = SMOTETomek(random_state=42)
resampled_pair = smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled_pair

### Kombiniere Test- und Trainingsdatensatz

In [23]:
# Re-attach the target column to its features, once per partition.
train_parts = [X_train_resampled, y_train_resampled]
test_parts = [X_test, y_test]

df_train_resampled = pd.concat(train_parts, axis='columns')
df_test = pd.concat(test_parts, axis='columns')

### Skaliere numerische Werte mit der Standardskalierung auf Basis der Trainingsdaten

In [24]:
# Fit the scaler on the (resampled) training frame only, then apply that
# same fitted transformation to both the training and the test frame.
scaler = StandardScaler().fit(df_train_resampled[numeric_columns])
for frame in (df_train_resampled, df_test):
    frame[numeric_columns] = scaler.transform(frame[numeric_columns])

### Speichere den Trainings- und Testdatensatz

In [25]:
# Write both prepared partitions to CSV files in the working directory.
exports = {
    './dataset_train_resampled.csv': df_train_resampled,
    './dataset_test.csv': df_test,
}
for csv_path, frame in exports.items():
    frame.to_csv(csv_path)

In [26]:
# Stack the training and test frames and renumber the rows from 0 so the
# combined frame has one continuous index.
df_comb_resampled = pd.concat([df_train_resampled, df_test], ignore_index=True)

In [27]:
# Summary statistics for every column of the combined frame; include='all'
# covers the non-numeric columns as well as the numeric ones.
df_comb_resampled.describe(include='all')

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Type_H,Type_L,Type_M,label
count,48260.0,48260.0,48260.0,48260.0,48260.0,48260,48260,48260,48260
unique,,,,,,2,2,2,6
top,,,,,,False,True,False,no_failure
freq,,,,,,40453,40332,31854,9616
mean,-0.011412,-0.006209,0.003261,-0.018562,-0.022014,,,,
std,1.00617,1.009707,0.985318,0.992424,1.002724,,,,
min,-2.88459,-3.655904,-1.068038,-2.858439,-2.090615,,,,
25%,-0.816651,-0.709655,-0.495216,-0.61013,-0.860219,,,,
50%,0.027434,0.095607,-0.328031,0.119388,0.230072,,,,
75%,0.876438,0.692638,0.057681,0.694001,0.902767,,,,


In [28]:
relevant_features = ['Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']
failure_types = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF', 'no_failure']


def build_feature_figure(relevant_feature):
    """Build the two-panel overview figure for one column of df_comb_resampled.

    Left panel (70 % width): the column plotted against the row index as a
    blue line, with a small red marker on every row whose label is not
    'no_failure'. Right panel (30 % width): one box plot of the column per
    entry of failure_types.

    Args:
        relevant_feature: Name of the column in df_comb_resampled to plot.

    Returns:
        The assembled plotly figure (not yet shown).
    """
    values = df_comb_resampled[relevant_feature]
    labels = df_comb_resampled['label']
    failure_values = values[labels != 'no_failure']

    figure = make_subplots(
        rows=1,
        cols=2,
        column_widths=[0.7, 0.3],
        subplot_titles=[
            f'{relevant_feature} skaliert und Oversampelt mit Maschinen-Fehler Indikatoren',
            'Fehler-Typ Verteilung',
        ],
    )

    # Left panel, first trace: the full series as a line.
    full_series = go.Scatter(
        x=df_comb_resampled.index,
        y=values,
        mode='lines',
        line=dict(color='blue'),
        name=relevant_feature,
    )
    figure.add_trace(full_series, row=1, col=1)

    # Left panel, second trace: rows carrying any failure label.
    failure_markers = go.Scatter(
        x=failure_values.index,
        y=failure_values,
        mode='markers',
        marker=dict(color='red', size=3),
        name='Machine Failure',
    )
    figure.add_trace(failure_markers, row=1, col=1)

    # Right panel: distribution of the column for each label.
    for failure in failure_types:
        box = go.Box(
            y=values[labels == failure],
            name=failure,
            marker=dict(size=3),
        )
        figure.add_trace(box, row=1, col=2)

    figure.update_layout(
        title_text=f'{relevant_feature} skaliert und Oversampelt mit Maschinen-Fehler Indikatoren und Verteilung nach Fehler-Typ',
        xaxis_title='Datenpunkt Index',
        yaxis_title=relevant_feature,
        legend_title='Legende',
    )
    return figure


# Render one figure per numeric feature.
for relevant_feature in relevant_features:
    fig = build_feature_figure(relevant_feature)
    fig.show()