# DataUnderstanding

### 0. Import Libraries

In [1]:
import pandas as pd

# region plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# endregion

### DataFrame Description
Im Folgenden werden die Merkmale in Form einer Tabelle beschrieben.

| Variable (Name)       | Beschreibung                                                                                           | Skalenniveau | Typ (Input/Output) |
|----------------------|------------------------------------------------------------------------------------------------------|--------------|-------------------|
| UDI                  | Eindeutige Kennung (1–10.000)                                                                       | Nominal      | Input            |
| product ID           | Spezifische Seriennummer                                                                            | Nominal      | Input            |
| type                 | Produktvariante (L/M/H)                                                                             | Nominal      | Input            |
| air temperature [K]  | Lufttemperatur, normalisiert um 300 K mit Standardabweichung von 2 K                               | Metrisch     | Input            |
| process temperature [K] | Prozess-Temperatur, berechnet als Lufttemperatur + 10 K, mit Standardabweichung von 1 K       | Metrisch     | Input            |
| rotational speed [rpm] | Drehzahl, berechnet aus 2860 W mit überlagerter normalverteilter Störung                       | Metrisch     | Input            |
| torque [Nm]         | Drehmoment, normalverteilt um 40 Nm (σ = 10 Nm), keine negativen Werte                            | Metrisch     | Input            |
| tool wear [min]     | Werkzeugverschleiß, beeinflusst durch Produktqualität (H: +5 min, M: +3 min, L: +2 min)           | Metrisch     | Input            |
| machine failure     | Gibt an, ob ein Maschinenausfall vorliegt (1 = Ausfall, 0 = kein Ausfall)                         | Dichotom     | Output           |
| tool wear failure (TWF) | Werkzeugverschleißbedingter Ausfall, wenn Verschleiß zwischen 200–240 min erreicht wird     | Dichotom     | Output            |
| heat dissipation failure (HDF) | Ausfall, wenn Temperaturdifferenz < 8.6 K und Drehzahl < 1380 rpm                      | Dichotom     | Output            |
| power failure (PWF) | Ausfall, wenn Leistung < 3500 W oder > 9000 W                                                      | Dichotom     | Output            |
| overstrain failure (OSF) | Ausfall, wenn Produkt aus Werkzeugverschleiß und Drehmoment einen Schwellenwert überschreitet | Dichotom     | Output            |
| random failures (RNF) | Zufälliger Prozessausfall mit Wahrscheinlichkeit von 0,1 %                                      | Dichotom     | Output            |


In [2]:
# First pass: read the CSV without dtype hints so pandas infers every
# column type on its own, then summarise the numeric columns.
df = pd.read_csv(filepath_or_buffer='./dataset.csv')
df.describe()

Unnamed: 0,UDI,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,300.00493,310.00556,1538.7761,39.98691,107.951,0.0339,0.0046,0.0115,0.0095,0.0098,0.0019
std,2886.89568,2.000259,1.483734,179.284096,9.968934,63.654147,0.180981,0.067671,0.106625,0.097009,0.098514,0.04355
min,1.0,295.3,305.7,1168.0,3.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2500.75,298.3,308.8,1423.0,33.2,53.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5000.5,300.1,310.1,1503.0,40.1,108.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,7500.25,301.5,311.1,1612.0,46.8,162.0,0.0,0.0,0.0,0.0,0.0,0.0
max,10000.0,304.5,313.8,2886.0,76.6,253.0,1.0,1.0,1.0,1.0,1.0,1.0


Dies zeigt, dass Pandas die Datentypen der Merkmale nicht optimal bestimmen kann. Demnach wird der Datensatz im Folgenden mit konkreten Datentypen eingelesen. Diese können aus der Beschreibung des Datensatzes abgeleitet werden.

In [3]:
# Explicit dtype per column, grouped by kind. The mapping has the same keys,
# values and order as a fully spelled-out dictionary would.
dtypes = {
    'UDI': 'int32',
    'Product ID': 'str',
    'Type': 'category',
    # Metric measurement columns are stored as single-precision floats.
    **dict.fromkeys(
        [
            'Air temperature [K]',
            'Process temperature [K]',
            'Rotational speed [rpm]',
            'Torque [Nm]',
            'Tool wear [min]',
        ],
        'float32',
    ),
    # Failure indicators are read as booleans.
    **dict.fromkeys(
        ['Machine failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF'],
        'bool',
    ),
}

# Re-read the file with the explicit dtypes and summarise every column,
# including the non-numeric ones.
df = pd.read_csv('./dataset.csv', dtype=dtypes)
df.describe(include='all')

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
count,10000.0,10000,10000,10000.0,10000.0,10000.0,10000.0,10000.0,10000,10000,10000,10000,10000,10000
unique,,10000,3,,,,,,2,2,2,2,2,2
top,,M14860,L,,,,,,False,False,False,False,False,False
freq,,1,6000,,,,,,9661,9954,9885,9905,9902,9981
mean,5000.5,,,300.004913,310.005554,1538.776123,39.986908,107.950996,,,,,,
std,2886.89568,,,2.000259,1.483734,179.284103,9.968934,63.654148,,,,,,
min,1.0,,,295.299988,305.700012,1168.0,3.8,0.0,,,,,,
25%,2500.75,,,298.299988,308.799988,1423.0,33.200001,53.0,,,,,,
50%,5000.5,,,300.100006,310.100006,1503.0,40.099998,108.0,,,,,,
75%,7500.25,,,301.5,311.100006,1612.0,46.799999,162.0,,,,,,


In [4]:
# Count rows per (product type, machine-failure flag) and pivot the flag into
# columns, giving one row per type with a `False` and a `True` count column.
failure_counts = df.groupby(['Type', 'Machine failure'], observed=False).size().unstack(fill_value=0)

# Plotly Express turns wide-form column names into strings before matching
# them against `color_discrete_map`, so boolean keys would never match and the
# requested colours would be silently ignored. The frame handed to plotly
# therefore gets string column names, and the colour map uses the same strings.
# `failure_counts` itself keeps its boolean column labels.
fig = px.bar(
    failure_counts.rename(columns=str).reset_index(),
    x='Type',
    y=['False', 'True'],
    labels={'value': 'Count', 'variable': 'Machine Failure'},
    color_discrete_map={'False': 'blue', 'True': 'red'},
    barmode='stack'
)

fig.update_layout(title='Machine Failure by Type', xaxis_title='Type', yaxis_title='Count')
fig.show()

In [5]:
relevant_types = ['H', 'L', 'M']

# One pie subplot per product type. The grid is derived from `relevant_types`
# so that adding or removing a type cannot desynchronise the layout.
fig = make_subplots(
    rows=1,
    cols=len(relevant_types),
    specs=[[{'type': 'pie'} for relevant_type in relevant_types]]
)

# Subplot columns are 1-based, hence `start=1`.
for col, relevant_type in enumerate(relevant_types, start=1):
    subset = df[df['Type'] == relevant_type]

    # Number of rows of this type flagged with each failure mode
    # (summing a boolean column counts its True values).
    failure_counts_per_type = {
        'Tool Wear': subset['TWF'].sum(),
        'Heat Dissipation': subset['HDF'].sum(),
        'Power': subset['PWF'].sum(),
        'Overstrain': subset['OSF'].sum(),
        'Random': subset['RNF'].sum()
    }

    fig.add_trace(go.Pie(
        labels=list(failure_counts_per_type.keys()),
        values=list(failure_counts_per_type.values()),
        hole=0.3,
        title=f"Type: {relevant_type}"
    ), row=1, col=col)

# Give the figure a title so it can be understood on its own.
fig.update_layout(title='Failure Modes by Type')
fig.show()

In [18]:
relevant_features = ['Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']

# One figure per feature: the full series as a blue line, with the rows
# flagged as machine failures overlaid as red markers.
for relevant_feature in relevant_features:
    feature_line = go.Scatter(
        x=df.index,
        y=df[relevant_feature],
        mode='lines',
        line={'color': 'blue'},
        name=relevant_feature
    )

    # Values of this feature on the failure rows; the Series index supplies
    # the matching x positions.
    failure_values = df.loc[df['Machine failure'], relevant_feature]
    failure_markers = go.Scatter(
        x=failure_values.index,
        y=failure_values,
        mode='markers',
        marker={'color': 'red', 'size': 5},
        name='Machine Failure'
    )

    fig = go.Figure(data=[feature_line, failure_markers])
    fig.update_layout(
        title=f'{relevant_feature} with Machine Failure Indicators',
        xaxis_title='Data Point Index',
        yaxis_title=relevant_feature,
        legend_title='Status'
    )

    fig.show()