In [1]:
from pathlib import Path
import os

# Project root: the absolute, symlink-resolved working directory of the
# kernel at the moment this cell runs.
PROJECT_ROOT = Path.cwd().resolve()

# Standard project subfolders, all anchored at PROJECT_ROOT
DATA_DIR = PROJECT_ROOT.joinpath("data")
MODELS_DIR = PROJECT_ROOT.joinpath("models")
OUTPUTS_DIR = PROJECT_ROOT.joinpath("outputs")

# Create the two folders this notebook writes into (a no-op when they already
# exist). DATA_DIR is intentionally not created here.
MODELS_DIR.mkdir(parents=True, exist_ok=True)
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)


In [3]:
# 03_modeling_rul_xgboost.ipynb
# Purpose: Train an XGBoost Regressor to predict Remaining Useful Life (RUL)

# --------------------------------------------
# Step 1: Load Processed Sensor Data
# --------------------------------------------
# What: Load the preprocessed dataset with RUL, rolling features, and scaled values
# Why: This is our training-ready data
# How: Using pandas


import pandas as pd

# Build the path from DATA_DIR (defined in the setup cell at the top of this
# notebook) rather than a user-specific absolute path, so the notebook runs on
# any machine as long as the kernel is started from the project root.
file_path = DATA_DIR / "processed_sensor_data.csv"

# Fail early with an actionable message instead of a bare pandas error.
if not file_path.is_file():
    raise FileNotFoundError(
        f"Processed sensor data not found at {file_path}. "
        "Start the notebook from the project root so that DATA_DIR points at "
        "the folder containing processed_sensor_data.csv."
    )

# Load the data
df = pd.read_csv(file_path)

# Quick preview: (rows, columns) followed by the first five rows
print(df.shape)
print(df.head())


# --------------------------------------------------
# Step 2: Separate Features and Label (RUL)
# --------------------------------------------------
# What: Pick the predictor columns and the RUL label out of the loaded frame
# Why: Supervised models take inputs and target as separate objects
# How: Every column that is not an identifier or the label is a feature

target = 'RUL'
exclude_cols = ['unit', 'cycle', 'RUL']

# Mask the column index against the exclusion list; the surviving labels keep
# their original order.
features = df.columns[~df.columns.isin(exclude_cols)].tolist()

X, y = df[features], df[target]

# -----------------------------------------------------------
# Step 3: Train-Test Split (by Unit to Avoid Data Leakage)
# -----------------------------------------------------------
# What: Assign whole engines (units) to either the training or the test group
# Why: Rows from one engine must never appear on both sides of the split
# How: Split the array of unique unit IDs, then select rows by membership

from sklearn.model_selection import train_test_split

# 80/20 split over unit IDs (not over rows); the seed makes it repeatable.
units = df['unit'].unique()
train_units, test_units = train_test_split(units, random_state=42, test_size=0.2)

# Row subsets belonging to each group of units
train_df = df.loc[df['unit'].isin(train_units)]
test_df = df.loc[df['unit'].isin(test_units)]

X_train, y_train = train_df[features], train_df[target]
X_test, y_test = test_df[features], test_df[target]

# -------------------------------------------
# Step 4: Train XGBoost Regressor Model
# -------------------------------------------
# What: Fit a gradient-boosted decision tree regressor on the training units
# Why: XGBoost is powerful for tabular regression tasks
# How: XGBRegressor from the xgboost library

from xgboost import XGBRegressor

# Hyperparameters gathered in one place so they are easy to read and tweak.
xgb_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 5,
    "random_state": 42,
}

model = XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

# ---------------------------------------------
# Step 5: Evaluate Model Performance
# ---------------------------------------------
# What: Score the model's RUL predictions on the held-out units
# Why: To quantify accuracy and error
# How: Mean absolute error (MAE) and root mean squared error (RMSE)

import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

y_pred = model.predict(X_test)

# RMSE is the square root of the mean squared error.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

# One line per metric, rounded to two decimals.
print(f"MAE: {mae:.2f}", f"RMSE: {rmse:.2f}", sep="\n")

# ----------------------------------------------------------
# Step 6: Save Model and Export Predictions (Optional)
# ----------------------------------------------------------
# What: Persist the trained model, the test-set predictions, and the metrics
# Why: For future reuse, deployment, or dashboard visualization
# How: Use joblib for the model, pandas for the CSV outputs

import os

import joblib
import pandas as pd

# Write under MODELS_DIR / OUTPUTS_DIR (defined in the setup cell at the top of
# this notebook) instead of cwd-relative strings, so artifacts land in the
# project folders even if the working directory changes after setup.
os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(OUTPUTS_DIR, exist_ok=True)

# Save model.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
joblib.dump(model, MODELS_DIR / "xgb_rul_model.pkl")

# Save test predictions: one row per (unit, cycle) in the test set, with the
# true and the predicted RUL side by side.
results = test_df[['unit', 'cycle']].copy()
results['actual_RUL'] = y_test.values  # .values drops the index so rows align by position
results['predicted_RUL'] = y_pred

results.to_csv(OUTPUTS_DIR / "rul_predictions.csv", index=False)
print("‚úÖ Test predictions saved.")

# Save the evaluation metrics as a single-row CSV.
metrics_df = pd.DataFrame({"MAE": [mae], "RMSE": [rmse]})
metrics_df.to_csv(OUTPUTS_DIR / "rul_metrics.csv", index=False)
print("‚úÖ Metrics saved to outputs/rul_metrics.csv")


# ---------------------------------------------
# End of Step 6 - XGBoost RUL Regression Done
# ---------------------------------------------


(20231, 48)
   unit  cycle  RUL  op_setting_1  op_setting_2  op_setting_3  sensor_2  \
0     1      5  187      0.390805      0.333333           0.0  0.349398   
1     1      6  186      0.252874      0.416667           0.0  0.268072   
2     1      7  185      0.557471      0.583333           0.0  0.382530   
3     1      8  184      0.304598      0.750000           0.0  0.406627   
4     1      9  183      0.545977      0.583333           0.0  0.274096   

   sensor_3  sensor_4  sensor_7  ...  sensor_14_mean  sensor_14_std  \
0  0.257467  0.404625  0.668277  ...        0.172934       0.250773   
1  0.292784  0.272113  0.776167  ...        0.166440       0.076297   
2  0.463920  0.261985  0.723027  ...        0.167374       0.043770   
3  0.259865  0.316003  0.644122  ...        0.164943       0.095942   
4  0.434707  0.211850  0.618357  ...        0.155780       0.307891   

   sensor_15_mean  sensor_15_std  sensor_17_mean  sensor_17_std  \
0        0.264082       0.473851        0.2