# Machine Failure Prediction

## 0. Data Loading

### 0.1 Module Importing

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from src.modules.outlier_detector import detect_outliers
from src.modules.outlier_imputer import OutlierImputer
from src.modules.feature_scaler import preprocessor
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from scipy.stats import uniform, randint
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold
from sklearn.metrics import mean_absolute_percentage_error, r2_score, make_scorer
from src.modules.time_extractor import TimeFeatureExtractor


In [None]:
# Read the raw dataset from the project's data folder
dataset_path = "data/IndFD-PM-DT dataset.csv"
df = pd.read_csv(dataset_path)

# Bare name as the last expression so the notebook renders a preview,
# which doubles as a quick check that the load worked
df

### 0.2 Data Exploration

In [None]:
# Report the (rows, columns) dimensions of the loaded dataframe
n_rows, n_cols = df.shape
print("Dataset shape:", (n_rows, n_cols))

In [None]:
# Print a per-column summary of the dataframe: each column's data type and
# how many of its entries are non-null
df.info()

In [None]:
# Build a boolean mask of missing cells, then total it column by column
null_mask = df.isnull()
missing_counts = null_mask.sum()
print("Missing values per column:\n", missing_counts)

In [None]:
# Summary statistics of the dataframe; the bare name on the last line makes
# the notebook render the resulting table
summary_stats = df.describe()
summary_stats

In [None]:
# Flag rows that are exact duplicates, then count how many were flagged
is_duplicate = df.duplicated()
duplicate_rows = is_duplicate.sum()
print("Number of duplicate rows:", duplicate_rows)

## 1. Data Processing Pipeline

In [None]:
# Separate features from the target and apply the pre-computed train/test split
target_col = "Fault_Diagnosis"
X = df.drop(target_col, axis=1)
y = df[target_col]

# Row indices of each split, saved to disk beforehand
train_idx = np.load("data/train_idx.npy")
test_idx = np.load("data/test_idx.npy")

# .iloc: the stored indices are used as row positions, not index labels
X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

In [None]:
# Preprocessing pipeline. Steps run in the listed order: the outlier imputer
# (multiplier 1.5), the time-feature extractor reading the 'Datetime' column,
# and finally the imported `preprocessor`, registered under the name 'scaler'.
preprocessing_steps = [
    ("outlier_imputer", OutlierImputer(multiplier=1.5)),
    ("time_extractor", TimeFeatureExtractor(column="Datetime")),
    ("scaler", preprocessor),
]
pipeline = Pipeline(steps=preprocessing_steps)

In [None]:
# Fit the preprocessing pipeline on the training split only, so nothing is
# learned from the test rows.
# The fitted object gets its own name on purpose: rebinding `preprocessor`
# here would shadow the imported scaler step that the `pipeline` definition
# refers to, and re-running that cell afterwards would nest the pipeline
# inside itself as its own 'scaler' step.
fitted_pipeline = pipeline.fit(X_train)

# Apply the train-fitted transformations to both splits
X_train_proc = fitted_pipeline.transform(X_train)
X_test_proc = fitted_pipeline.transform(X_test)

In [None]:
# Scorers used during cross-validation.
# MAPE is an error (lower is better): greater_is_better=False makes the scorer
# report its negative, so a search that maximises the score minimises MAPE.
mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)
# R2 is already "higher is better", so it is used as-is.
r2_scorer = make_scorer(r2_score, greater_is_better=True)

# Keys become the metric names in the search results (e.g. "mean_test_R2")
scoring = dict(MAPE=mape_scorer, R2=r2_scorer)

## 2. Models

### 2.1. XGBoost

In [None]:
# End-to-end model: the preprocessing pipeline followed by an XGBoost
# regressor trained with squared-error loss and a fixed seed
xgb_regressor = XGBRegressor(objective="reg:squarederror", random_state=42)

# Step names prefix the hyperparameter names used in the search space
# ("regressor__..."), so they are kept exactly as spelled
xgb_pipeline = Pipeline(steps=[
    ("preproccess_pipeline", pipeline),
    ("regressor", xgb_regressor),
])

In [None]:
# Hyperparameter search space for the "regressor" step of the pipeline.
# randint(low, high) draws integers from [low, high - 1];
# uniform(loc, scale) draws floats from [loc, loc + scale].
param_dist = {
    # number of boosted trees: 500..1199
    "regressor__n_estimators": randint(low=500, high=1200),
    # maximum depth of each tree: 4..9
    "regressor__max_depth": randint(low=4, high=10),
    # step size shrinkage: 0.005..0.1
    "regressor__learning_rate": uniform(loc=0.005, scale=0.095),
    # fraction of training rows sampled per tree: 0.7..1.0
    "regressor__subsample": uniform(loc=0.7, scale=0.3),
    # fraction of columns sampled when constructing each tree: 0.7..1.0
    "regressor__colsample_bytree": uniform(loc=0.7, scale=0.3),
    # L1 regularisation weight: 0.0..1.0
    "regressor__reg_alpha": uniform(loc=0.0, scale=1.0),
    # L2 regularisation weight: 0.5..2.0
    "regressor__reg_lambda": uniform(loc=0.5, scale=1.5),
    # minimum sum of instance weight needed in a child: 1..9
    "regressor__min_child_weight": randint(low=1, high=10),
    # minimum loss reduction required to make a further split: 0.0..0.5
    "regressor__gamma": uniform(loc=0.0, scale=0.5),
}

In [None]:
# Randomised hyperparameter search over the XGBoost pipeline.
# Folds are shuffled with a fixed seed so the 5-fold split is reproducible.
cv_splitter = KFold(n_splits=5, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    estimator=xgb_pipeline,
    param_distributions=param_dist,
    n_iter=50,                 # sample 50 random parameter combinations
    scoring=scoring,           # evaluate every candidate on both MAPE and R2
    refit="MAPE",              # pick and refit the best candidate by MAPE
    cv=cv_splitter,
    random_state=42,           # seed for the parameter sampling
    return_train_score=True,   # also record scores on the training folds
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the search on the raw training split; preprocessing is part of the
# pipeline being searched, so it is fitted inside the search itself
search.fit(X_train, y_train)

# Collect the results for the winning candidate
best_idx = search.best_index_
# The MAPE scorer reports negated values, so flip the sign back for display
best_cv_mape = -search.best_score_
best_cv_r2 = search.cv_results_["mean_test_R2"][best_idx]

print("Best params:", search.best_params_)
print("Best CV MAPE:", best_cv_mape)
print("Corresponding CV R2:", best_cv_r2)

In [None]:
# Hold-out evaluation on the test split

# Best pipeline found by the search
best_model = search.best_estimator_

# Predict the target for the test rows
y_pred_test = best_model.predict(X_test)

# Score the predictions with the same two metrics used during the search
hold_mape = mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred_test)
hold_r2 = r2_score(y_true=y_test, y_pred=y_pred_test)

print(f"Hold‑out MAPE: {hold_mape:.4f}")
print(f"Hold‑out R²: {hold_r2:.4f}")