# Machine Failure Prediction

## 0. Data Loading

### 0.1 Module Importing

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from src.modules.outlier_detector import detect_outliers
from src.modules.outlier_imputer import OutlierImputer
from src.modules.feature_scaler import preprocessor


In [3]:
# Read the raw CSV export into a DataFrame
df = pd.read_csv(filepath_or_buffer="data/IndFD-PM-DT dataset.csv")

# Show the frame as the cell result to confirm the load worked
df

Unnamed: 0,Datetime,Vibration_Level,Temperature_Readings,Pressure_Data,Acoustic_Signals,Humidity_Levels,Motor_Speed,Torque_Data,Energy_Consumption,Production_Rate,...,Controller_Setpoints,Actual_vs_Setpoint_Values,Alarm_Trigger_Data,Repair_Logs,Spare_Part_Usage,Anomaly_Scores,Fault_Probability,Operator_Shift_Data,Quality_Control_Test_Results,Fault_Diagnosis
0,2021-06-21 08:00:00,0.336129,23.865396,15.346143,42.709897,31.404880,59.812632,3.677243,1.994422,71.472236,...,55.040202,53.633334,0,0,4.326138,31.806763,26.218982,2,89.347791,1
1,2019-03-07 10:00:00,0.441680,27.332215,19.648924,56.146535,23.516423,65.572156,1.057276,7.086933,79.280461,...,56.245287,56.292362,0,0,3.006478,16.828544,10.021797,2,89.013490,0
2,2023-01-13 08:00:00,0.349080,29.483668,10.466149,80.487979,27.537668,53.951904,0.881063,8.155945,100.994780,...,47.621152,44.103209,0,0,0.468073,30.532711,45.273987,3,86.990711,0
3,2021-08-13 23:00:00,0.507108,19.005118,32.012730,63.602072,14.888160,36.715289,1.372567,0.974947,108.291454,...,48.786144,51.057235,0,0,3.270788,2.718529,28.625665,3,79.234826,0
4,2022-09-21 07:00:00,1.402247,32.096101,22.620923,47.574150,22.703902,52.411045,0.437824,1.401210,95.616224,...,54.382062,55.865060,0,0,4.598529,35.209996,18.994673,3,86.246880,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43820,2019-09-19 01:00:00,0.347451,32.978580,7.315431,73.282519,8.003138,66.676623,2.635279,10.126611,148.029482,...,53.836874,54.727018,0,0,1.645819,12.286389,7.149823,3,87.035059,2
43821,2020-04-15 04:00:00,1.058969,26.184765,23.259838,38.305330,21.218103,80.000000,3.679083,3.105544,86.109704,...,51.818563,50.712255,0,0,1.675778,20.070351,29.528974,1,91.843367,1
43822,2023-05-09 22:00:00,0.250290,34.318023,23.191210,39.245290,42.714236,44.781710,2.794093,3.967244,104.457433,...,46.538447,47.757877,0,0,4.841985,13.628070,15.906455,1,85.005856,0
43823,2019-02-05 20:00:00,0.747001,36.644425,24.684167,50.495128,27.355317,41.558678,1.049931,12.275970,146.353829,...,51.130096,52.198700,0,0,1.790334,9.445317,28.129942,3,81.650147,0


### 0.2 Data Exploration

In [4]:
# Report the frame dimensions as (rows, columns)
print("Dataset shape:", (len(df.index), len(df.columns)))

Dataset shape: (43825, 38)


In [5]:
# Check data types of each column and non-null value count.
# Per the output below, Datetime is read as `object` (plain strings), not a
# datetime dtype, and every listed column reports 43825 non-null values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43825 entries, 0 to 43824
Data columns (total 38 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Datetime                       43825 non-null  object 
 1   Vibration_Level                43825 non-null  float64
 2   Temperature_Readings           43825 non-null  float64
 3   Pressure_Data                  43825 non-null  float64
 4   Acoustic_Signals               43825 non-null  float64
 5   Humidity_Levels                43825 non-null  float64
 6   Motor_Speed                    43825 non-null  float64
 7   Torque_Data                    43825 non-null  float64
 8   Energy_Consumption             43825 non-null  float64
 9   Production_Rate                43825 non-null  float64
 10  Tool_Wear_Rate                 43825 non-null  float64
 11  Machine_Utilization_Rate       43825 non-null  float64
 12  Cycle_Time_Per_Operation       43825 non-null 

In [7]:
# Tally null entries column by column (isna is the pandas alias of isnull)
missing_counts = df.isna().sum()
print("Missing values per column:\n", missing_counts)

Missing values per column:
 Datetime                         0
Vibration_Level                  0
Temperature_Readings             0
Pressure_Data                    0
Acoustic_Signals                 0
Humidity_Levels                  0
Motor_Speed                      0
Torque_Data                      0
Energy_Consumption               0
Production_Rate                  0
Tool_Wear_Rate                   0
Machine_Utilization_Rate         0
Cycle_Time_Per_Operation         0
Idle_Time                        0
Machine_Load_Percentage          0
Ambient_Temperature              0
Humidity                         0
Air_Quality_Index                0
Machine_Health_Index             0
Failure_Mode_Indicators          0
Maintenance_Logs                 0
Previous_Fault_Occurrences       0
Predictive_Maintenance_Scores    0
Component_Degradation_Index      0
Real_Time_Performance_Index      0
Machine_Start_Stop_Events        0
Downtime_Incidents               0
Fault_Trigger_Timestamps   

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; the object-typed Datetime column does not appear in the result.
df.describe()

Unnamed: 0,Vibration_Level,Temperature_Readings,Pressure_Data,Acoustic_Signals,Humidity_Levels,Motor_Speed,Torque_Data,Energy_Consumption,Production_Rate,Tool_Wear_Rate,...,Controller_Setpoints,Actual_vs_Setpoint_Values,Alarm_Trigger_Data,Repair_Logs,Spare_Part_Usage,Anomaly_Scores,Fault_Probability,Operator_Shift_Data,Quality_Control_Test_Results,Fault_Diagnosis
count,43825.0,43825.0,43825.0,43825.0,43825.0,43825.0,43825.0,43825.0,43825.0,43825.0,...,43825.0,43825.0,43825.0,43825.0,43825.0,43825.0,43825.0,43825.0,43825.0,43825.0
mean,0.499235,30.013493,28.468127,49.818038,20.170006,49.909403,3.003601,9.957493,100.073372,24.995389,...,49.993859,49.982764,0.049606,0.096817,2.006614,20.087401,29.950859,1.901905,89.981169,0.656361
std,0.499789,5.003203,15.879884,14.957175,12.173021,9.966962,2.124429,9.983611,25.000217,14.382023,...,5.006638,5.389119,0.217133,0.295712,1.416941,12.145394,13.77334,0.8307,4.997179,1.15615
min,3e-06,10.277032,0.21625,-11.435987,0.079138,1.70564,0.006696,0.000449,-4.730857,0.058636,...,27.579604,23.542815,0.0,0.0,0.00399,0.029079,1.050927,1.0,68.295099,0.0
25%,0.143893,26.618437,16.071721,39.766143,10.829864,43.109726,1.441208,2.867996,83.192927,13.802988,...,46.637394,46.356892,0.0,0.0,0.969281,10.699885,19.571182,1.0,86.606247,0.0
50%,0.346396,30.037904,26.422164,49.815537,18.062967,49.901249,2.522669,6.896966,100.151089,22.971233,...,50.021118,49.995034,0.0,0.0,1.689532,18.023829,28.554473,2.0,89.986887,0.0
75%,0.690445,33.411046,38.813255,59.909451,27.454614,56.668254,4.03798,13.777403,117.00805,34.047212,...,53.365112,53.595967,0.0,0.0,2.697366,27.450155,38.955632,3.0,93.336861,1.0
max,5.244292,45.0,91.929697,119.250894,88.364871,80.0,21.818879,115.162662,213.169597,87.356111,...,69.490124,71.476751,1.0,1.0,12.994366,83.724162,87.808973,3.0,111.086828,4.0


In [8]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted)
duplicate_rows = df.duplicated(keep="first").sum()
print("Number of duplicate rows:", duplicate_rows)

Number of duplicate rows: 0


## 1. Data Processing Pipeline

In [21]:
# Load the persisted train/test split

# Separate the features from the target label.
X = df.drop("Fault_Diagnosis", axis=1)
y = df["Fault_Diagnosis"]

# Positional row indices stored on disk, so the same partition is reused on
# every run instead of being re-drawn.
# NOTE(review): presumably produced by a separate split step against this
# exact CSV — the code that wrote these files is not shown here; confirm.
train_idx = np.load("data/train_idx.npy")
test_idx  = np.load("data/test_idx.npy")

# Guard against leakage: no row position may appear in both partitions.
# Assumes the files hold integer positions (not boolean masks) — TODO confirm.
shared_rows = np.intersect1d(train_idx, test_idx)
assert shared_rows.size == 0, (
    f"{shared_rows.size} row positions appear in both train and test indices"
)

# .iloc selects by position, matching the default RangeIndex of df.
X_train = X.iloc[train_idx]
X_test  = X.iloc[test_idx]
y_train = y.iloc[train_idx]
y_test  = y.iloc[test_idx]

## 2. Models