# Week 3 — Health Index (HI) Baseline

**Learning Goals:**
- Build a physics-inspired health index from sensor data
- Understand degradation as a monotonic signal
- Compare HI-based RUL with ML baselines from Week 2

**Methods:** PCA-based HI, Weighted sensor sum, Degradation curve fitting

In [None]:
# Numerical / plotting stack plus the scikit-learn and SciPy pieces used below:
# MinMaxScaler and PCA build the health index, curve_fit fits the HI -> RUL map.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from scipy.optimize import curve_fit
import warnings
# NOTE: this silences *every* warning for the rest of the session, so numerical
# warnings raised while fitting will not be shown.
warnings.filterwarnings('ignore')

import sys
# Put '../../src' first on the module search path; this must run before the
# `data.*` and `train` imports below (presumably project modules that live
# there -- confirm against the repository layout).
sys.path.insert(0, '../../src')
from data.data_loader import load_train, load_test
from data.preprocess import INFORMATIVE_SENSORS_FD001, fit_scaler, apply_scaler
from train import compute_metrics

# Matplotlib style sheet applied to every figure created afterwards.
plt.style.use('seaborn-v0_8-whitegrid')
print('Imports OK')

In [None]:
# Sensor columns used by every later cell (scaling, PCA, test transform).
sensors = INFORMATIVE_SENSORS_FD001

# Training set; rul_cap=None requests the raw RUL used for HI fitting.
df_train = load_train(
    fd_number=1,
    rul_cap=None,
)

# Test set together with rul_true, which the evaluation cell scores against.
df_test, rul_true = load_test(fd_number=1)

print(f'Data loaded: {df_train.shape[0]} train rows, {df_test.shape[0]} test rows')

## 1. PCA-based Health Index

In [None]:
# Rescale every sensor column with a MinMaxScaler fitted on the training data.
# `scaler` is kept because the test-set cell reuses it.
scaler = MinMaxScaler()
df_train_scaled = df_train.copy()
df_train_scaled[sensors] = scaler.fit_transform(df_train[sensors])

# The score along the first principal component is the raw health index.
pca = PCA(n_components=1)
df_train_scaled['HI_pca'] = pca.fit_transform(df_train_scaled[sensors])

# Per engine: orient the index so it falls over the engine's rows
# (1 = healthy, 0 = failed), then min-max rescale it to [0, 1].
unit_ids = df_train_scaled['unit_id']
for uid in unit_ids.unique():
    mask = unit_ids == uid
    raw_hi = df_train_scaled.loc[mask, 'HI_pca'].to_numpy()

    # A positive correlation with row position means the index rises over
    # the engine's rows, so mirror it to make it decrease instead.
    trend = np.corrcoef(np.arange(raw_hi.size), raw_hi)[0, 1]
    oriented = -raw_hi if trend > 0 else raw_hi

    lowest, highest = oriented.min(), oriented.max()
    if highest > lowest:
        rescaled = (oriented - lowest) / (highest - lowest)
    else:
        # Flat index for this engine: no range to normalize over.
        rescaled = 0.5
    df_train_scaled.loc[mask, 'HI_pca'] = rescaled

print(f'PCA explained variance ratio: {pca.explained_variance_ratio_[0]:.3f}')

In [None]:
# Two panels side by side: HI trajectories for a few engines (left) and the
# HI-vs-RUL cloud over all training rows (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 5))
trajectory_ax, scatter_ax = axes

# Left panel: one line per sample engine.
for uid in (1, 25, 50, 75, 100):
    unit = df_train_scaled.loc[df_train_scaled['unit_id'] == uid]
    trajectory_ax.plot(unit['cycle'], unit['HI_pca'], label=f'Engine {uid}', alpha=0.8)

trajectory_ax.set(
    xlabel='Cycle',
    ylabel='Health Index (PCA)',
    title='PCA Health Index Over Engine Lifetime',
)
trajectory_ax.legend()

# Right panel: small, transparent markers keep the dense cloud readable.
scatter_ax.scatter(df_train_scaled['HI_pca'], df_train_scaled['RUL'], alpha=0.1, s=1)
scatter_ax.set(
    xlabel='Health Index (PCA)',
    ylabel='RUL',
    title='Health Index vs RUL',
)

plt.tight_layout()
plt.show()

## 2. Degradation Curve Fitting (HI → RUL)

In [None]:
# Exponential degradation model mapping a health index to RUL.
def exp_model(hi, a, b, c):
    """Evaluate ``a * exp(b * hi) + c``.

    Args:
        hi: Health-index value(s); a scalar or anything ``np.exp`` accepts.
        a: Scale applied to the exponential term.
        b: Rate multiplying ``hi`` inside the exponential.
        c: Constant offset added to the result.

    Returns:
        The modelled RUL, with the same shape as ``hi``.
    """
    growth = np.exp(b * hi)
    return c + a * growth

# Fitting data: drop rows whose HI or RUL is NaN or +/-inf (curve_fit rejects
# non-finite input), then subsample to keep the fit fast. The sample size is
# capped at the number of usable rows so small datasets do not make
# DataFrame.sample raise; with >= 5000 rows the draw is unchanged.
sample = (
    df_train_scaled[['HI_pca', 'RUL']]
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)
sample = sample.sample(n=min(5000, len(sample)), random_state=42)

# Only the fit itself is guarded: curve_fit raises RuntimeError when the
# optimizer does not converge, ValueError on invalid input and TypeError when
# there are fewer points than parameters. Plotting lives in the `else` branch
# so a plotting problem can neither be misreported as a fitting failure nor
# discard a valid `popt`.
try:
    popt, pcov = curve_fit(exp_model, sample['HI_pca'], sample['RUL'],
                           p0=[100, 2, 0], maxfev=10000)
except (RuntimeError, ValueError, TypeError) as e:
    print(f'Curve fitting failed: {e}')
    print('Using linear mapping instead.')
    # popt = None tells the evaluation cell to use its linear fallback.
    popt = None
else:
    print(f'Fitted parameters: a={popt[0]:.2f}, b={popt[1]:.2f}, c={popt[2]:.2f}')

    # Evaluate the fitted curve over the normalized HI range [0, 1].
    hi_range = np.linspace(0, 1, 100)
    rul_pred_curve = exp_model(hi_range, *popt)

    plt.figure(figsize=(10, 5))
    plt.scatter(sample['HI_pca'], sample['RUL'], alpha=0.1, s=1, label='Data')
    plt.plot(hi_range, rul_pred_curve, 'r-', linewidth=2, label='Fitted curve')
    plt.xlabel('Health Index')
    plt.ylabel('RUL')
    plt.title('Degradation Curve: HI → RUL')
    plt.legend()
    plt.show()

## 3. Evaluate HI Baseline on Test Set

In [None]:
# Apply the scaler and PCA that were fitted on the training data to the test data.
df_test_scaled = df_test.copy()
df_test_scaled[sensors] = scaler.transform(df_test[sensors])

# Last observation per test engine: this is the point RUL is predicted for.
test_last = df_test_scaled.groupby('unit_id').tail(1).copy()
test_hi_raw = pca.transform(test_last[sensors]).ravel()

# The HI -> RUL mapping was fitted on a health index that had been oriented
# (1 = healthy, 0 = failed) and min-max normalized per training engine.
# Raw PCA scores are on a different scale and may have the opposite sign, so
# feeding them to the mapping directly gives meaningless RUL values.
# Test trajectories are truncated before failure and therefore cannot be
# normalized by their own min/max; instead reuse the orientation and the
# typical per-engine bounds observed on the training engines.
train_hi_raw = pca.transform(df_train_scaled[sensors]).ravel()

# Orientation: the health index must increase with remaining useful life.
rul_corr = np.corrcoef(train_hi_raw, df_train_scaled['RUL'].to_numpy())[0, 1]
hi_sign = -1.0 if rul_corr < 0 else 1.0

# Median of the per-engine minimum / maximum of the oriented raw score.
engine_bounds = (
    pd.Series(hi_sign * train_hi_raw)
    .groupby(df_train_scaled['unit_id'].to_numpy())
    .agg(['min', 'max'])
)
hi_floor = engine_bounds['min'].median()
hi_ceil = engine_bounds['max'].median()

if hi_ceil > hi_floor:
    # Clip because a test engine can fall outside the typical training range.
    test_hi = np.clip(
        (hi_sign * test_hi_raw - hi_floor) / (hi_ceil - hi_floor), 0.0, 1.0
    )
else:
    # Degenerate range: mirror the 0.5 placeholder used for training engines.
    test_hi = np.full_like(test_hi_raw, 0.5)

# Predict RUL; predictions are clipped to [0, 200] cycles.
if popt is not None:
    rul_pred_hi = np.clip(exp_model(test_hi, *popt), 0, 200)
else:
    # Linear fallback when the exponential fit was not available.
    from sklearn.linear_model import LinearRegression
    lr = LinearRegression()
    lr.fit(df_train_scaled['HI_pca'].values.reshape(-1, 1), df_train_scaled['RUL'].values)
    rul_pred_hi = np.clip(lr.predict(test_hi.reshape(-1, 1)), 0, 200)

# Evaluate against the true RUL of each test engine.
metrics_hi = compute_metrics(rul_true, rul_pred_hi)
print('Health Index Baseline (FD001 Test):')
print(f'  MAE:        {metrics_hi["MAE"]:.2f}')
print(f'  RMSE:       {metrics_hi["RMSE"]:.2f}')
print(f'  NASA Score: {metrics_hi["NASA_Score"]:.0f}')

In [None]:
# Summary of the HI baseline, to be read next to the Week 2 results file.
# One print call with a newline separator emits the same three lines.
print(
    '\n=== Comparison ===',
    f'Health Index:       MAE={metrics_hi["MAE"]:.2f}, RMSE={metrics_hi["RMSE"]:.2f}',
    '(Compare with Week 2 results in reports/baseline_results.csv)',
    sep='\n',
)