In [6]:
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
import numpy as np

# Read the processed dataset from disk.
df = pd.read_csv('../data/processed/processed_data.csv')

# Derive the engineered columns. Preprocessing already produces them; they are
# recomputed here so this cell does not depend on that step.
df = df.assign(
    # Gap between process and air temperature (both columns are in K).
    delta_temp=lambda frame: frame['Process temperature [K]'] - frame['Air temperature [K]'],
    # Torque times rotational speed; the 2*pi/60 factor turns rpm into rad/s.
    power=lambda frame: frame['Torque [Nm]'] * frame['Rotational speed [rpm]'] * (2 * np.pi / 60),
    # Product of tool wear and torque.
    overstrain=lambda frame: frame['Tool wear [min]'] * frame['Torque [Nm]'],
)

# Summary statistics for the numeric columns.
print(df.describe())

# Pairwise correlations over the numeric columns only (non-numeric columns are
# dropped before calling corr()); the engineered features are included.
numeric_cols = df.select_dtypes(include='number')
correlation_matrix = numeric_cols.corr()

# Draw the correlation matrix as an annotated heatmap.
fig = px.imshow(
    correlation_matrix,
    color_continuous_scale='Viridis',
    aspect="auto",
    text_auto=True,
)
fig.update_layout(title_text="Correlation Heatmap with Numeric Features")
fig.show()

# Count of rows per value of the 'Machine failure' column, one color per value.
fig = px.histogram(
    df,
    x='Machine failure',
    color='Machine failure',
    title="Distribution of Machine Failures",
)
fig.show()

# Total occurrences of each failure-mode flag column.
failure_modes = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']

# Column-wise sums, reshaped into a two-column table:
# one row per failure mode with its total.
failure_sum = (
    df[failure_modes]
    .sum()
    .rename_axis('Failure Mode')
    .reset_index(name='Count')
)

fig = px.bar(
    failure_sum,
    x='Failure Mode',
    y='Count',
    color='Failure Mode',
    title="Distribution of Different Failure Modes",
)
fig.show()

# One grouped histogram per failure mode: row counts per product 'Type',
# with bars split by the value of that failure-mode column.
for mode in failure_modes:
    chart_title = f"Distribution of {mode} by Product Type"
    fig = px.histogram(df, x='Type', color=mode, barmode='group', title=chart_title)
    fig.show()

# Scatter-plot matrix of the three engineered columns, colored by the
# 'Machine failure' column.
fig = px.scatter_matrix(
    df,
    dimensions=['delta_temp', 'power', 'overstrain'],
    color='Machine failure',
    title="Pairplot of Engineered Features",
)
fig.show()

# Rows matching the HDF conditions: temperature gap below 8.6 AND rotational
# speed below 1380 rpm (both comparisons are strict).
hdf_conditions = df.loc[
    (df['Rotational speed [rpm]'] < 1380) & (df['delta_temp'] < 8.6)
]

print("HDF Conditions Data Points:", len(hdf_conditions))
print(hdf_conditions.describe())

# Failure outcome counts within that subset.
fig = px.histogram(
    hdf_conditions,
    x='Machine failure',
    color='Machine failure',
    title="Machine Failures under HDF Conditions",
)
fig.show()

# Rows matching the PWF conditions: power strictly below 3500 OR strictly
# above 9000, i.e. outside the [3500, 9000] band.
pwf_conditions = df.loc[(df['power'] > 9000) | (df['power'] < 3500)]

print("PWF Conditions Data Points:", len(pwf_conditions))
print(pwf_conditions.describe())

# Failure outcome counts within that subset.
fig = px.histogram(
    pwf_conditions,
    x='Machine failure',
    color='Machine failure',
    title="Machine Failures under PWF Conditions",
)
fig.show()

# Exploring OSF conditions
# Each product type has its own overstrain limit; a row is selected when its
# 'Type' matches and its overstrain is strictly above that type's limit.
osf_thresholds = {'L': 11000, 'M': 12000, 'H': 13000}
osf_colors = {'L': 'blue', 'M': 'orange', 'H': 'green'}

osf_conditions = {
    product_type: df[(df['Type'] == product_type) & (df['overstrain'] > limit)]
    for product_type, limit in osf_thresholds.items()
}

# Keep the original per-type names in case other cells reference them.
osf_conditions_L = osf_conditions['L']
osf_conditions_M = osf_conditions['M']
osf_conditions_H = osf_conditions['H']

for product_type, subset in osf_conditions.items():
    print(f"OSF Conditions Data Points ({product_type}):", subset.shape[0])

# One histogram trace per product type, all drawn on the same axes.
fig = go.Figure()
for product_type, subset in osf_conditions.items():
    fig.add_trace(go.Histogram(
        x=subset['Machine failure'],
        name=f"{product_type} Type OSF",
        marker_color=osf_colors[product_type],
    ))

fig.update_layout(title="Machine Failures under OSF Conditions by Product Type",
                  xaxis_title="Machine Failure",
                  yaxis_title="Count",
                  barmode='overlay')

# In overlay mode the bars are drawn on top of each other; without reduced
# opacity the later traces hide the earlier ones in any shared bin.
fig.update_traces(opacity=0.75)

fig.show()


               UDI  Air temperature [K]  Process temperature [K]  \
count  10000.00000         10000.000000             10000.000000   
mean    5000.50000           300.004930               310.005560   
std     2886.89568             2.000259                 1.483734   
min        1.00000           295.300000               305.700000   
25%     2500.75000           298.300000               308.800000   
50%     5000.50000           300.100000               310.100000   
75%     7500.25000           301.500000               311.100000   
max    10000.00000           304.500000               313.800000   

       Rotational speed [rpm]   Torque [Nm]  Tool wear [min]  Machine failure  \
count            10000.000000  10000.000000     10000.000000     10000.000000   
mean              1538.776100     39.986910       107.951000         0.033900   
std                179.284096      9.968934        63.654147         0.180981   
min               1168.000000      3.800000         0.000000   

HDF Conditions Data Points: 115
               UDI  Air temperature [K]  Process temperature [K]  \
count   115.000000           115.000000               115.000000   
mean   4405.947826           302.560870               310.788696   
std     287.417993             0.601853                 0.644513   
min    3237.000000           300.800000               309.400000   
25%    4199.000000           302.100000               310.300000   
50%    4418.000000           302.500000               310.700000   
75%    4627.500000           303.050000               311.300000   
max    4852.000000           303.700000               312.200000   

       Rotational speed [rpm]  Torque [Nm]  Tool wear [min]  Machine failure  \
count              115.000000   115.000000       115.000000            115.0   
mean              1337.260870    53.166957       107.191304              1.0   
std                 34.745967     6.223494        63.629257              0.0   
min               1212.000000    41

PWF Conditions Data Points: 95
               UDI  Air temperature [K]  Process temperature [K]  \
count    95.000000            95.000000                95.000000   
mean   4306.136842           300.075789               309.954737   
std    2787.525505             2.147127                 1.600450   
min      51.000000           295.700000               306.200000   
25%    1787.500000           298.400000               308.700000   
50%    4293.000000           300.400000               310.200000   
75%    6580.000000           301.800000               311.000000   
max    9975.000000           304.000000               313.200000   

       Rotational speed [rpm]  Torque [Nm]  Tool wear [min]  Machine failure  \
count               95.000000    95.000000        95.000000             95.0   
mean              1763.968421    48.514737       101.884211              1.0   
std                620.829138    26.788653        64.355704              0.0   
min               1200.000000     3.

OSF Conditions Data Points (L): 87
OSF Conditions Data Points (M): 9
OSF Conditions Data Points (H): 2
