# About Dataset
Please note that this is the original dataset with additional information and proper attribution. There is at least one other version of this dataset on Kaggle that was uploaded without permission. Please be fair and attribute the original author.
This synthetic dataset is modeled after an existing milling machine and consists of 10,000 data points stored as rows, with 14 features in columns:

UDI: unique identifier ranging from 1 to 10000

product ID: consisting of a letter L, M, or H for low (50% of all products), medium (30%) and high (20%) as product quality variants and a variant-specific serial number

type: just the product type L, M or H from column 2

air temperature [K]: generated using a random walk process later normalized to a standard deviation of 2 K around 300 K
process temperature [K]: generated using a random walk process normalized to a standard deviation of 1 K, added to the air temperature plus 10 K.

rotational speed [rpm]: calculated from a power of 2860 W, overlaid with a normally distributed noise

torque [Nm]: torque values are normally distributed around 40 Nm with a SD = 10 Nm and no negative values.

tool wear [min]: The quality variants H/M/L add 5/3/2 minutes of tool wear to the used tool in the process.

machine failure: a label that indicates whether the machine has failed at this particular data point because at least one of the following failure modes is true.
### The machine failure consists of five independent failure modes

###### 1) tool wear failure (TWF): the tool will be replaced or fail at a randomly selected tool wear time between 200 and 240 mins (120 times in our dataset). At this point in time, the tool is replaced 69 times and fails 51 times (randomly assigned).

###### 2) heat dissipation failure (HDF): heat dissipation causes a process failure if the difference between air and process temperature is below 8.6 K and the tool's rotational speed is below 1380 rpm. This is the case for 115 data points.

###### 3) power failure (PWF): the product of torque and rotational speed (in rad/s) equals the power required for the process. If this power is below 3500 W or above 9000 W, the process fails, which is the case 95 times in our dataset.

###### 4) overstrain failure (OSF): if the product of tool wear and torque exceeds 11,000 minNm for the L product variant (12,000 M, 13,000 H), the process fails due to overstrain. This is true for 98 datapoints.

###### 5) random failures (RNF): each process has a chance of 0.1 % to fail regardless of its process parameters. This is the case for only 5 data points, fewer than would be expected for 10,000 data points in our dataset.

#####  If at least one of the above failure modes is true, the process fails and the 'machine failure' label is set to 1. It is therefore not transparent to the machine learning method, which of the failure modes has caused the process to fail.

# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read Data

In [None]:
# Load ai4i2020.csv (expected in the current working directory) into a DataFrame.
df=pd.read_csv("ai4i2020.csv")

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Preview the last rows of the raw data.
df.tail()

In [None]:
# List the column labels.
df.columns

# Data preprocessing

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Show column dtypes and non-null counts.
df.info()

In [None]:
# Count distinct values per column.
df.nunique()

In [None]:
# UDI and Product ID are per-row identifiers (every value is unique), so they
# are removed before modelling.
identifier_columns = ['UDI', 'Product ID']
df.drop(columns=identifier_columns, inplace=True)

In [None]:
# Class balance of each failure-mode flag in the raw data, one panel per mode.
failure_columns = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']

plt.figure(figsize=(10, 6))

# Panels sit on a 2 x 3 grid; subplot positions are 1-based.
for panel, col in enumerate(failure_columns, start=1):
    plt.subplot(2, 3, panel)
    sns.countplot(data=df, x=col)
    plt.title(f'{col} Distribution')
    plt.xlabel('')
    plt.ylabel('Count')
    plt.xticks(ticks=[0, 1], labels=['No Failure', 'Failure'], rotation=0)

plt.tight_layout()
plt.show()

In [None]:
# Boolean flag: True where the machine-failure label is 0.
df['No failure'] = df['Machine failure'].eq(0)

In [None]:
# Convert the flag to 0/1 integers with an explicit cast. Going through
# replace({True: 1, False: 0}) leaves the resulting dtype to replace()'s dtype
# inference; astype pins it to int64 and is a no-op if the column is already 0/1.
df['No failure'] = df['No failure'].astype('int64')

In [None]:
# Collapse the five failure flags plus the 'No failure' flag into one label per row.
# idxmax returns the first column holding the row maximum, so a row with several
# flags set is labelled with the earliest one in the column order below.
# NOTE(review): a row in which all six flags are 0 would be labelled 'TWF' —
# verify whether such rows exist in the data.
target_columns = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF', 'No failure']
targets = df[target_columns]
df['merged_target'] = targets.idxmax(axis=1)
df['merged_target'].value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the merged target labels as integers and print the label -> code mapping.
lb = LabelEncoder().fit(df['merged_target'])

print(lb.classes_)      # original categories
print(lb.transform(lb.classes_))  # matching integer codes
df['merged_target'] = lb.transform(df['merged_target'])

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the product type as integers and print the label -> code mapping.
# Note that this rebinds `lb`, replacing the encoder fitted on merged_target.
lb = LabelEncoder().fit(df['Type'])
print(lb.classes_)      # original categories
print(lb.transform(lb.classes_))  # matching integer codes
df['Type'] = lb.transform(df['Type'])
 

In [None]:
# The individual flags are now summarised by merged_target, so drop them.
flag_columns = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF', 'Machine failure', 'No failure']
df.drop(columns=flag_columns, inplace=True)

In [None]:
# Separate the predictor columns from the encoded label.
x = df.drop(columns='merged_target')
target = df['merged_target']

In [None]:
from imblearn.over_sampling import SMOTE, SMOTENC

# 'Type' is a label-encoded category. Plain SMOTE interpolates it like a number,
# so synthetic rows can receive a product type that is only an artefact of
# averaging category codes. SMOTENC treats the column as categorical and assigns
# each synthetic row a category taken from its neighbours.
# NOTE(review): resampling happens before the train/test split further down, so
# synthetic rows derived from test rows can end up in the training set.
type_position = x.columns.get_loc('Type')
sm = SMOTENC(categorical_features=[type_position], random_state=42)
X, target_res = sm.fit_resample(x, target)
X = pd.DataFrame(X)
target_res = pd.DataFrame(target_res)
target_res.value_counts()

In [None]:
# Translate the integer codes back to label names. The mapping is hard-coded and
# must match the classes_ order printed when merged_target was encoded.
code_to_label = {0: 'HDF', 1: 'No failure', 2: 'OSF', 3: 'PWF', 4: 'RNF', 5: 'TWF'}
target_res = target_res.replace(code_to_label)

In [None]:
# Class counts after resampling, now shown with label names.
target_res.value_counts()

In [None]:

# Rebuild the working frame from the resampled features and labels, then turn
# the product-type codes back into their letters.
type_names = {0: 'H', 1: 'L', 2: 'M'}
df = pd.concat([X, target_res], axis=1)
df['Type'] = df['Type'].replace(type_names)

In [None]:
# Preview the resampled frame.
df.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder 
# One-hot encode merged_target into one 0/1 column per class label.
# Note that `X` is rebound here to the dense one-hot matrix.
onehotencoder = OneHotEncoder()
X = onehotencoder.fit_transform(df[['merged_target']]).toarray()
for position, label in enumerate(onehotencoder.categories_[0]):
    df[label] = X[:, position]

In [None]:
# merged_target is now represented by the one-hot columns, so remove it.
df.drop(columns=['merged_target'],inplace=True)

In [None]:
# Check dtypes and non-null counts after resampling and encoding.
df.info()

# Visualization 

In [None]:
# Class balance of each failure-mode column after resampling, one panel per mode.
failure_columns = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']

plt.figure(figsize=(10, 6))

# Panels sit on a 2 x 3 grid; subplot positions are 1-based.
for panel, col in enumerate(failure_columns, start=1):
    plt.subplot(2, 3, panel)
    sns.countplot(data=df, x=col)
    plt.title(f'{col} Distribution')
    plt.xlabel('')
    plt.ylabel('Count')
    plt.xticks(ticks=[0, 1], labels=['No Failure', 'Failure'], rotation=0)

plt.tight_layout()
plt.show()

In [None]:
failure_columns = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']
plt.figure(figsize=(10, 6))

# One torque-vs-speed scatter per failure mode, coloured by that mode's flag.
for panel, col in enumerate(failure_columns, start=1):
    plt.subplot(2, 3, panel)  # 2 rows, 3 columns grid
    sns.scatterplot(
        x='Torque [Nm]',
        y='Rotational speed [rpm]',
        hue=col,
        data=df,
        palette='viridis',
        alpha=0.7,
    )
    plt.title(f'{col} by Torque and RPM')
    # Anchor the legend outside the axes so it does not cover the points.
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Tool-wear density of the rows flagged with each failure mode, with the
# per-mode mean marked by a dashed line.
plt.figure(figsize=(12, 8))

failure_types = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']
colors = ['blue', 'green', 'red', 'purple', 'orange']

for failure, color in zip(failure_types, colors):
    # Values of the plotted column for rows where this mode's flag is 1.
    values = df.loc[df[failure] == 1, 'Tool wear [min]']
    sns.kdeplot(values, label=f'{failure} Failure', fill=True, color=color, alpha=0.5)

    plt.axvline(values.mean(), color=color, linestyle='--', linewidth=2,
                label=f'{failure} Mean')

plt.xlabel('Tool wear [min]')
plt.ylabel('Density')
plt.title('Tool Wear Distribution by Failure Type')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Torque density of the rows flagged with each failure mode, with the per-mode
# mean marked by a dashed line.
plt.figure(figsize=(12, 8))

failure_types = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']
colors = ['blue', 'green', 'red', 'purple', 'orange']

for failure, color in zip(failure_types, colors):
    # Values of the plotted column for rows where this mode's flag is 1.
    values = df.loc[df[failure] == 1, 'Torque [Nm]']
    sns.kdeplot(values, label=f'{failure} Failure', fill=True, color=color, alpha=0.5)

    plt.axvline(values.mean(), color=color, linestyle='--', linewidth=2,
                label=f'{failure} Mean')

plt.xlabel('Torque [Nm]')
plt.ylabel('Density')
plt.title('Torque Distribution by Failure Type')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:
# Rotational-speed density of the rows flagged with each failure mode, with the
# per-mode mean marked by a dashed line.
plt.figure(figsize=(12, 8))

failure_types = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']
colors = ['blue', 'green', 'red', 'purple', 'orange']

for failure, color in zip(failure_types, colors):
    # Values of the plotted column for rows where this mode's flag is 1.
    values = df.loc[df[failure] == 1, 'Rotational speed [rpm]']
    sns.kdeplot(values, label=f'{failure} Failure', fill=True, color=color, alpha=0.5)

    plt.axvline(values.mean(), color=color, linestyle='--', linewidth=2,
                label=f'{failure} Mean')

plt.xlabel('Rotational speed [rpm]')
plt.ylabel('Density')
plt.title('Rotational speed Distribution by Failure Type')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:
# Process-temperature density of the rows flagged with each failure mode, with
# the per-mode mean marked by a dashed line.
plt.figure(figsize=(12, 8))

failure_types = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']
colors = ['blue', 'green', 'red', 'purple', 'orange']

for failure, color in zip(failure_types, colors):
    # Values of the plotted column for rows where this mode's flag is 1.
    values = df.loc[df[failure] == 1, 'Process temperature [K]']
    sns.kdeplot(values, label=f'{failure} Failure', fill=True, color=color, alpha=0.5)

    plt.axvline(values.mean(), color=color, linestyle='--', linewidth=2,
                label=f'{failure} Mean')

plt.xlabel('Process temperature [K]')
plt.ylabel('Density')
plt.title('Process temperature Distribution by Failure Type')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:
# Air-temperature density of the rows flagged with each failure mode, with the
# per-mode mean marked by a dashed line.
plt.figure(figsize=(12, 8))

failure_types = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']
colors = ['blue', 'green', 'red', 'purple', 'orange']

for failure, color in zip(failure_types, colors):
    # Values of the plotted column for rows where this mode's flag is 1.
    values = df.loc[df[failure] == 1, 'Air temperature [K]']
    sns.kdeplot(values, label=f'{failure} Failure', fill=True, color=color, alpha=0.5)

    plt.axvline(values.mean(), color=color, linestyle='--', linewidth=2,
                label=f'{failure} Mean')

plt.xlabel('Air temperature [K]')
plt.ylabel('Density')
plt.title('Air temperature Distribution by Failure Type')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:
failure_columns = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']
plt.figure(figsize=(10, 6))

# Row counts per product type, split by each failure mode's flag (one panel per mode).
for panel, col in enumerate(failure_columns, start=1):
    plt.subplot(2, 3, panel)  # 2 rows, 3 columns grid
    sns.countplot(x='Type', hue=col, data=df)
    plt.title(f'{col} by type')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()


In [None]:
# Box plot of torque, used to eyeball outliers.
torque = df['Torque [Nm]']
plt.figure(figsize=(8, 5))
sns.boxplot(data=torque)
plt.title("Box Plot - Detecting Outliers")
plt.show()

In [None]:
# Box plot of rotational speed, used to eyeball outliers.
rotational_speed = df['Rotational speed [rpm]']
plt.figure(figsize=(8, 5))
sns.boxplot(data=rotational_speed)
plt.title("Box Plot - Detecting Outliers")
plt.show()

In [None]:
# Box plot of process temperature, used to eyeball outliers.
process_temperature = df['Process temperature [K]']
plt.figure(figsize=(8, 5))
sns.boxplot(data=process_temperature)
plt.title("Box Plot - Detecting Outliers")
plt.show()

In [None]:
# Density histogram of torque (30 bins) with a KDE overlay.
plt.figure(figsize=(10, 6))
sns.histplot(data=df, x='Torque [Nm]', kde=True, stat="density", bins=30)

In [None]:
# Density histogram of rotational speed (30 bins) with a KDE overlay.
plt.figure(figsize=(10, 6))
sns.histplot(data=df, x='Rotational speed [rpm]', kde=True, stat="density", bins=30)

In [None]:
# Density histogram of process temperature (30 bins) with a KDE overlay.
plt.figure(figsize=(10, 6))
sns.histplot(data=df, x='Process temperature [K]', kde=True, stat="density", bins=30)

In [None]:
# Density histogram of tool wear (30 bins) with a KDE overlay.
plt.figure(figsize=(10, 6))
sns.histplot(data=df, x='Tool wear [min]', kde=True, stat="density", bins=30)

In [None]:
from sklearn.preprocessing import LabelEncoder
import pickle
# Label-encode every remaining object column and pickle one encoder per column
# (file name: "<column>_label_encoder.pkl") so the mapping can be reused later.
for col in df.select_dtypes(include='object').columns:
    lb = LabelEncoder()
    df[col] = lb.fit_transform(df[col])
    with open(f"{col}_label_encoder.pkl", 'wb') as f:
        pickle.dump(lb, f)

In [None]:
# Confirm the dtypes after label-encoding the object columns.
df.info()

In [None]:
# Pairwise correlation of all columns, annotated to two decimals.
correlations = df.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlations, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)

plt.title("Correlation Heatmap")
plt.show()

In [None]:
# y holds the five failure-mode columns; features is everything else except
# the 'No failure' column.
label_columns = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']
y = df[label_columns]
features = df.drop(columns=label_columns + ['No failure'])

In [None]:
from sklearn.preprocessing import MinMaxScaler
import pickle
# Rescale each feature with its own MinMaxScaler and pickle that scaler
# (file name: "<column>_MinMaxScaler.pkl").
# NOTE(review): the scalers are fitted on all rows, before the train/test split
# performed later in the notebook.
for col in list(features.columns):
    my_scaler = MinMaxScaler()
    features[col] = my_scaler.fit_transform(features[[col]])
    with open(f"{col}_MinMaxScaler.pkl", 'wb') as f:
        pickle.dump(my_scaler, f)

In [None]:
# Summary statistics of the scaled features.
features.describe()

# Split Data 

In [None]:
import numpy as np
from collections import defaultdict

def multilabel_train_test_split(X, y, test_size=0.2, random_state=None):
    """Split X and y so every label combination is divided in the same ratio.

    Rows are grouped by their exact combination of label values. Each group is
    shuffled; its first ``int(len(group) * (1 - test_size))`` rows go to the
    training set and the remainder to the test set. A group containing a
    single row therefore ends up entirely in the test set.

    Args:
        X: DataFrame of features, row-aligned with ``y``.
        y: DataFrame with one column per label.
        test_size: Fraction of each group assigned to the test set.
        random_state: Seed for the shuffles. ``None`` draws from NumPy's global
            generator; any integer, including 0, gives a reproducible split.

    Returns:
        ``(X_train, X_test, y_train, y_test)``, selected positionally from the
        inputs.
    """
    # `is not None` matters: a seed of 0 is valid but falsy. A private
    # RandomState yields the same shuffles as np.random.seed(seed) would,
    # without reseeding the global generator as a side effect.
    if random_state is not None:
        rng = np.random.RandomState(random_state)
    else:
        rng = np.random

    # One string key per row, e.g. "0_1_0", identifying its label combination.
    label_combos = y.astype(str).apply('_'.join, axis=1)
    combo_values = label_combos.to_numpy()

    train_idx, test_idx = [], []

    # Split each label combination separately so the ratio holds per group.
    for combo in label_combos.value_counts().index:
        combo_indices = np.flatnonzero(combo_values == combo)
        rng.shuffle(combo_indices)

        split_point = int(len(combo_indices) * (1 - test_size))
        train_idx.extend(combo_indices[:split_point])
        test_idx.extend(combo_indices[split_point:])

    return X.iloc[train_idx], X.iloc[test_idx], y.iloc[train_idx], y.iloc[test_idx]

# Usage: hold out 20% of every label combination as the test set, seeded with 42.
X_train, X_test, y_train, y_test = multilabel_train_test_split(
    features, y, test_size=0.2, random_state=42
)

# Train & Evaluation Models

In [None]:
from sklearn.metrics import precision_score,f1_score,recall_score,accuracy_score,confusion_matrix,classification_report
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Candidate classifiers, each wrapped so that one model is fitted per failure label.
models = {
    "LogisticRegression": MultiOutputClassifier(
        LogisticRegression(random_state=1, class_weight='balanced')
    ),
    "SVM": MultiOutputClassifier(SVC()),
    "KNeighborsClassifier": MultiOutputClassifier(KNeighborsClassifier()),
    "GaussianNB": MultiOutputClassifier(GaussianNB()),
    # criterion may be one of {"gini", "entropy", "log_loss"}
    "DecisionTreeClassifier": MultiOutputClassifier(
        DecisionTreeClassifier(random_state=1, criterion='entropy', max_depth=28,
                               class_weight='balanced')
    ),
    "RandomForestClassifier": MultiOutputClassifier(
        RandomForestClassifier(random_state=1, n_estimators=20, criterion='entropy',
                               max_depth=50, class_weight='balanced')
    ),
}

# Fit each candidate on the training split and score it on the test split.
# y_test has several label columns, so accuracy_score counts a row as correct
# only when every label matches.
Overall_Accuracy = []
for model_name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    Overall_Accuracy.append(accuracy_score(y_test, y_pred))

performance = pd.DataFrame({'Overall_Accuracy': Overall_Accuracy}, index=list(models.keys()))

performance

# Parameter Tuning 

In [None]:
from sklearn.model_selection import GridSearchCV
# Grid-search a random forest over tree count, depth and split criterion with
# 4-fold cross-validation.
# NOTE(review): the search runs on the bare forest `rf`; the MultiOutputClassifier
# wrapper assigned to `model` is not the estimator handed to GridSearchCV.
rf = RandomForestClassifier(random_state=1)
model = MultiOutputClassifier(rf)
param_grid = {
    "n_estimators": range(18, 24, 2),
    'max_depth': range(45, 55, 2),
    #'min_samples_leaf': range(1, 10, 1),
    #'min_samples_split': range(2, 10, 1),
    'criterion': ["entropy", "gini"],
    'class_weight': ['balanced'],
}
grid_search = GridSearchCV(rf, param_grid, cv=4)
grid_search.fit(X_train, y_train)
print("best accuracy using GridSearchCV", grid_search.best_score_)
print(grid_search.best_estimator_)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
# Take the best forest found by the grid search and refit it as one independent
# classifier per failure label.
rf = grid_search.best_estimator_
model = MultiOutputClassifier(rf).fit(X_train, y_train)

# Hard 0/1 predictions for the held-out rows, one column per label.
y_pred = model.predict(X_test)

In [None]:
# Average the per-label forests' importances into a single score per feature.
feature_names = features.columns
per_label_importances = np.array([est.feature_importances_ for est in model.estimators_])
importances = per_label_importances.mean(axis=0)

# Most important feature first.
feature_imp_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values('Importance', ascending=False)

plt.figure(figsize=(8, 5))
sns.barplot(data=feature_imp_df, x='Importance', y='Feature', palette='viridis')
plt.title('Average Feature Importance Across All Outputs')
plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import precision_score, f1_score, recall_score, accuracy_score, confusion_matrix, classification_report, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
# Per-label and overall evaluation of `model` on the held-out rows.
# `features` is deliberately not reassigned in this cell, so it keeps referring
# to the scaled feature frame used by the cells above.
# The labels are listed in the column order of y_test / y_pred.
targets = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']

# 1. Classification report, accuracy and confusion matrix for each failure type
for i, target in enumerate(targets):
    y_true_col = y_test[target]
    y_pred_col = y_pred[:, i]

    print(f"\nEvaluation for {target}:")
    print(classification_report(y_true_col, y_pred_col))
    print(f"Accuracy: {accuracy_score(y_true_col, y_pred_col):.4f}")

    # labels=[0, 1] keeps the matrix 2x2, matching the tick labels, even when
    # one class is missing from the test rows or the predictions.
    cm = confusion_matrix(y_true_col, y_pred_col, labels=[0, 1])
    plt.figure(figsize=(5, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Negative', 'Positive'],
                yticklabels=['Negative', 'Positive'])
    plt.title(f'Confusion Matrix for {target}')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()

# 2. ROC curves for all failure types
# A ROC curve sweeps a threshold over a continuous score. Hard 0/1 predictions
# give a single operating point, so the positive-class probability is used.
# predict_proba returns one (n_samples, n_classes) array per label.
y_score = model.predict_proba(X_test)

plt.figure(figsize=(10, 8))

for i, target in enumerate(targets):
    # Locate the probability column that belongs to the positive class.
    positive_col = list(model.estimators_[i].classes_).index(1)
    fpr, tpr, _ = roc_curve(y_test[target], y_score[i][:, positive_col])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{target} (AUC = {roc_auc:.2f})')

plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Each Failure Type')
plt.legend(loc="lower right")
plt.show()

# 3. Combined evaluation: a row counts as correct only when all labels match
print("\nOverall Accuracy:")
print(f"{accuracy_score(y_test, y_pred):.4f}")

# Deployment 

In [None]:
# Persist the fitted multi-output model so it can be loaded for deployment.
filename = "RandomForestClassifier.pkl"
with open(filename, "wb") as f:
    pickle.dump(model, f)

<br><br>
<div style="text-align: center; font-size: 48px; color:white; font-weight: bold;">
    THANK YOU!
</div>
<br><br>