In [20]:
import pandas as pd
import numpy as np
import joblib
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report

# Location of the AI4I 2020 CSV file on the local machine.
# NOTE(review): this is an absolute, machine-specific Windows path; adjust it
# when running the notebook anywhere else.
path = r"C:\Users\PC\Desktop\Ai4i2020\ai4i2020.csv"

# Load the dataset into a pandas DataFrame named 'ai4i'.
ai4i = pd.read_csv(path)

# Show column names, dtypes and non-null counts.
print("Summary information about the DataFrame including data types and non-null values\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
ai4i.info()

# Feature engineering

def engineer_features(df):
    """Return a copy of ``df`` extended with the engineered feature columns.

    The input frame is left untouched; all new columns are added to a copy so
    that calling this function twice, or reusing the raw frame afterwards,
    never sees half-engineered data.

    Columns read:
        'Process temperature [K]', 'Air temperature [K]', 'Torque [Nm]',
        'Rotational speed [rpm]', 'Tool wear [min]', 'Type'.

    Columns added (in this order):
        Temp_diff    Process temperature minus air temperature.
        Power [W]    Torque * rotational speed, with rpm converted to rad/s.
        Wear_Torque  Tool wear * torque.
        TWF_risk     1 when tool wear lies in [200, 240] (inclusive), else 0.
        OSF_risk     1 when Wear_Torque exceeds the threshold for the row's
                     product type (L: 11000, M: 12000, H: 13000), else 0.
                     Rows with any other 'Type' value stay 0.

    Args:
        df: pandas DataFrame containing the columns listed under
            "Columns read".

    Returns:
        A new pandas DataFrame with the original columns plus the five
        engineered columns.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    # Work on a copy so the caller's DataFrame is not mutated as a side effect.
    out = df.copy()

    # Temperature difference between process and air temperature.
    out['Temp_diff'] = out['Process temperature [K]'] - out['Air temperature [K]']

    # Mechanical power P = T * omega; rpm -> rad/s via the factor 2*pi/60.
    out['Power [W]'] = out['Torque [Nm]'] * out['Rotational speed [rpm]'] * (2 * np.pi / 60)

    out['Wear_Torque'] = out['Tool wear [min]'] * out['Torque [Nm]']

    # Tool Wear Failure risk window: 200 <= tool wear <= 240.
    in_twf_window = (out['Tool wear [min]'] >= 200) & (out['Tool wear [min]'] <= 240)
    out['TWF_risk'] = in_twf_window.astype(int)

    # Overstrain Failure risk: strict '>' against a per-product-type threshold.
    osf_thresholds = {'L': 11000, 'M': 12000, 'H': 13000}
    out['OSF_risk'] = 0
    for product_type, limit in osf_thresholds.items():
        over_limit = (out['Type'] == product_type) & (out['Wear_Torque'] > limit)
        out.loc[over_limit, 'OSF_risk'] = 1

    return out

# Build the frame that carries the engineered feature columns.
ai4i_X_eFeatures = engineer_features(ai4i)

# Show column names, non-null counts and dtypes of the engineered dataset.
print("information about the engineered features dataset:\n")
# DataFrame.info() prints its own report and returns None; calling it directly
# avoids the stray "None" line that print(df.info()) would emit.
ai4i_X_eFeatures.info()

# One-hot encoder for the categorical "Type" column:
# - handle_unknown="ignore": categories unseen during fit encode as all zeros
# - sparse_output=False: dense output instead of a sparse matrix
# - set_output(transform="pandas"): transform returns a DataFrame
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False).set_output(transform="pandas")

# Fit on, and encode, the "Type" column (one binary column per category).
enc = encoder.fit_transform(ai4i_X_eFeatures[["Type"]])

# Display the encoded data
print(f'\nEncode Data("Type Categories"): \n{enc}')

# Append the one-hot columns and drop the original categorical column.
ai4i_X_eFeatures = pd.concat([ai4i_X_eFeatures, enc], axis=1).drop(columns="Type")

# Columns used as model inputs.
#
# The failure-mode indicator columns 'TWF', 'HDF', 'PWF', 'OSF' and 'RNF' are
# deliberately NOT included. Per the AI4I 2020 dataset description, the
# 'Machine failure' label is set when one of these failure modes is active, so
# feeding them to the model leaks the target: the reported metrics would be
# inflated and the model could not be used before a failure has been diagnosed.
features = [
    'Air temperature [K]',          # Raw sensor measurement of air temperature
    'Process temperature [K]',      # Raw sensor measurement of process temperature
    'Rotational speed [rpm]',       # Raw sensor measurement of rotational speed
    'Torque [Nm]',                  # Raw sensor measurement of torque
    'Tool wear [min]',              # Raw measurement of tool wear time
    'Temp_diff',                    # Engineered feature: temperature difference
    'Power [W]',                    # Engineered feature: calculated power
    'Wear_Torque',                  # Engineered feature: tool wear * torque
    'TWF_risk',                     # Engineered feature: TWF risk indicator
    'OSF_risk',                     # Engineered feature: OSF risk indicator
    'Type_H', 'Type_L', 'Type_M',   # One-hot encoded product type columns
]

# Target variable for prediction
target = 'Machine failure'

# How many rows are labelled as a machine failure (label value 1)?
machine_failure_count = ai4i_X_eFeatures["Machine failure"].eq(1).sum()
print(f"\nNumber of Machine failure in the dataset: {machine_failure_count}")

# Model inputs (feature columns) and the label column to predict.
X = ai4i_X_eFeatures[features]
y = ai4i_X_eFeatures[target]

# Hold-out split: 80% train / 20% test.
# - stratify=y keeps the class proportions of y in both partitions
# - random_state=12 makes the split reproducible
split_options = {"test_size": 0.2, "stratify": y, "random_state": 12}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the partition shapes as a sanity check on the split.
print(
    "\nShapes of the training and testing sets to verify the split",
    f"Features - Test: {X_test.shape}, Train: {X_train.shape}",
    f"Target - Test: {y_test.shape}, Train: {y_train.shape}",
    sep="\n",
)


# SMOTE oversampler:
# - sampling_strategy=0.5: grow the minority class to 50% of the majority class size
# - random_state=12: reproducible synthetic samples
smote = SMOTE(random_state=12, sampling_strategy=0.5)

# Oversample the TRAINING partition only; the test partition is not resampled.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Class counts before and after oversampling, for comparison.
print("\nclass distribution before and after resampling")
for stage_label, labels in (
    ("Before SMOTE:", y_train),
    ("After SMOTE:", y_train_resampled),
):
    print(stage_label, labels.value_counts())

# Random Forest classifier:
# - n_estimators=200: number of trees
# - class_weight="balanced": weight classes inversely to their frequency
# - max_depth=10: cap tree depth to limit overfitting
# - random_state=12: reproducible training
model = RandomForestClassifier(n_estimators=200, class_weight="balanced", max_depth=10, random_state=12)

# Train on the resampled training data
model.fit(X_train_resampled, y_train_resampled)

# Hard class predictions on the untouched test set (used for accuracy,
# confusion matrix and classification report).
y_pred = model.predict(X_test)

# Predicted probability of the positive class. predict_proba orders its columns
# like model.classes_, so for 0/1 labels column 1 is the failure class.
y_score = model.predict_proba(X_test)[:, 1]

# Evaluate model performance
print("\n=== RandomForest Results (Clean) ===")
# Accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.3f}")

# ROC AUC must be computed from continuous scores: passing the hard 0/1
# predictions collapses the ROC curve to a single operating point and reports
# a different (threshold-dependent) number instead of the ranking quality.
auc = roc_auc_score(y_test, y_score)
print(f"roc_auc_score: {auc:.3f}")

# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Per-class precision / recall / F1
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


# Bundle the fitted model with the column order and dtypes it was trained on,
# so a consumer can rebuild the input frame in the expected layout.
model_data = {
    "model": model,
    "features_order": X_train.columns.tolist(),
    "features_types": X_train.dtypes.to_dict(),
}

# Single source of truth for the output file name, so the confirmation message
# can never disagree with the file that was actually written.
# NOTE(review): the ".plk" extension looks like a transposition of ".pkl"; the
# on-disk name is kept unchanged so existing loaders keep working. Rename here
# (and in every loader) if ".pkl" was intended.
model_path = "ai4i2020_rfc_M2.plk"

# Save model to disk
joblib.dump(model_data, model_path)
# Confirm model has been saved
print(f"\nModel is saved as: {model_path} ")




Summary information about the DataFrame including data types and non-null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   UID                      10000 non-null  int64  
 1   Product ID               10000 non-null  object 
 2   Type                     10000 non-null  object 
 3   Air temperature [K]      10000 non-null  float64
 4   Process temperature [K]  10000 non-null  float64
 5   Rotational speed [rpm]   10000 non-null  int64  
 6   Torque [Nm]              10000 non-null  float64
 7   Tool wear [min]          10000 non-null  int64  
 8   Machine failure          10000 non-null  int64  
 9   TWF                      10000 non-null  int64  
 10  HDF                      10000 non-null  int64  
 11  PWF                      10000 non-null  int64  
 12  OSF                      10000 non-null  int64  
