# Week 2 — Simple ML Baselines + Feature Engineering

**Learning Goals:**
- Build reproducible baselines with classical ML (scikit-learn)
- Feature engineering for time series → tabular format
- Establish metric baselines: MAE, RMSE, NASA Score

**Models:** Linear Regression, Ridge, Random Forest, Gradient Boosting

In [None]:
# Numerical, tabular and plotting stack.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# scikit-learn regressors used as baselines, plus two error metrics.
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import warnings
# Suppresses every warning from here on (this also hides deprecation and
# convergence warnings, so remove it when debugging).
warnings.filterwarnings('ignore')

import sys
# Put the project source directory first on the import path so the `data`
# and `train` modules below resolve. The path is relative, so it depends on
# the directory the notebook is run from.
sys.path.insert(0, '../../src')
from data.data_loader import load_train, load_test
from data.preprocess import INFORMATIVE_SENSORS_FD001, train_val_split, fit_scaler, apply_scaler
from data.features import extract_features, extract_windowed_features
from train import compute_metrics

# Use the seaborn white-grid matplotlib style for all figures in this notebook.
plt.style.use('seaborn-v0_8-whitegrid')
print('Imports OK')

## 1. Load & Prepare Data

In [None]:
# --- Data loading ---
# Training frame for subset 1, loaded with rul_cap=125.
df_train_raw = load_train(rul_cap=125, fd_number=1)
# Test frame together with the ground-truth RUL values returned alongside it.
df_test_raw, rul_true = load_test(fd_number=1)

# Hold out 20% for validation with a fixed seed so the split is reproducible.
# (The original note states the split is done by engine — see train_val_split.)
df_train, df_val = train_val_split(
    df_train_raw,
    random_state=42,
    val_fraction=0.2,
)

n_train_engines = df_train["unit_id"].nunique()
n_val_engines = df_val["unit_id"].nunique()
print(f'Train engines: {n_train_engines}, Val engines: {n_val_engines}')

## 2. Feature Engineering

In [None]:
# Turn each split into a tabular feature frame using the same sensor list and
# window length. (Original note: one row per engine, built from the last
# WINDOW cycles — confirm against data.features.extract_features.)
WINDOW = 30
sensors = INFORMATIVE_SENSORS_FD001

feat_train, feat_val, feat_test = (
    extract_features(frame, sensors, window=WINDOW)
    for frame in (df_train, df_val, df_test_raw)
)

# Report the resulting shapes, then preview the first ten column names.
print(f'Train features shape: {feat_train.shape}')
print(f'Val features shape:   {feat_val.shape}')
print(f'Test features shape:  {feat_test.shape}')

n_columns = len(feat_train.columns)
column_preview = list(feat_train.columns[:10])
print(f'\nFeature columns ({n_columns}): {column_preview}...')

In [None]:
# Build the model inputs: every column except the identifier and the target
# is treated as a feature, in the column order of feat_train.
feature_cols = [col for col in feat_train.columns if col not in ('unit_id', 'RUL')]

# Design matrices for the three splits, all using the same column selection.
X_train, X_val, X_test = (
    frame[feature_cols].values
    for frame in (feat_train, feat_val, feat_test)
)

# Targets: train/val come from the feature frames, test from the loader's
# ground-truth RUL values.
y_train, y_val = feat_train['RUL'].values, feat_val['RUL'].values
y_test = rul_true

print(f'X_train: {X_train.shape}, y_train: {y_train.shape}')
print(f'X_val:   {X_val.shape}, y_val: {y_val.shape}')
print(f'X_test:  {X_test.shape}, y_test: {y_test.shape}')

## 3. Train Baseline Models

In [None]:
# Baseline estimators, keyed by the display name used in the tables and plots.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge (alpha=10)': Ridge(alpha=10),
    'Random Forest': RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        random_state=42,
        n_jobs=-1,
    ),
    'Gradient Boosting': GradientBoostingRegressor(
        n_estimators=200,
        max_depth=5,
        learning_rate=0.1,
        random_state=42,
    ),
}

results = []       # one metrics row per model, in training order
predictions = {}   # model name -> test-set predictions

# Fit every model on the training split, then score it on validation and test.
for name, model in models.items():
    print(f'\n--- {name} ---')
    model.fit(X_train, y_train)

    # Validation first, then test, each scored with the project's metric helper.
    y_val_pred = model.predict(X_val)
    val_metrics = compute_metrics(y_val, y_val_pred)
    y_test_pred = model.predict(X_test)
    test_metrics = compute_metrics(y_test, y_test_pred)

    val_mae, val_rmse = val_metrics['MAE'], val_metrics['RMSE']
    print(f'  Val  — MAE: {val_mae:.2f}, RMSE: {val_rmse:.2f}')

    test_mae, test_rmse = test_metrics['MAE'], test_metrics['RMSE']
    test_nasa = test_metrics['NASA_Score']
    print(f'  Test — MAE: {test_mae:.2f}, RMSE: {test_rmse:.2f}, NASA: {test_nasa:.0f}')

    results.append(dict(zip(
        ('Model', 'Val MAE', 'Val RMSE', 'Test MAE', 'Test RMSE', 'Test NASA Score'),
        (name, val_mae, val_rmse, test_mae, test_rmse, test_nasa),
    )))
    predictions[name] = y_test_pred

# Collect the rows into a table; the bare name displays it in the notebook.
results_df = pd.DataFrame(results)
results_df

## 4. Visualize Predictions

In [None]:
# One predicted-vs-true scatter panel per model, laid out on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

for idx, (name, y_pred) in enumerate(predictions.items()):
    grid_row, grid_col = divmod(idx, 2)
    ax = axes[grid_row, grid_col]

    ax.scatter(y_test, y_pred, alpha=0.6, s=30, edgecolors='none')

    # Dashed diagonal marks perfect prediction; extend it 10 past the largest value.
    upper = max(y_test.max(), y_pred.max()) + 10
    lims = [0, upper]
    ax.plot(lims, lims, 'r--', linewidth=1)

    ax.set_xlabel('True RUL')
    ax.set_ylabel('Predicted RUL')

    # Recompute the test metrics so each panel title carries its own scores.
    metrics = compute_metrics(y_test, y_pred)
    mae, rmse = metrics["MAE"], metrics["RMSE"]
    ax.set_title(f'{name}\nMAE={mae:.1f}, RMSE={rmse:.1f}')

fig.suptitle('Baseline Models — Predicted vs True RUL (FD001 Test)', fontsize=14, y=1.01)
fig.tight_layout()
plt.show()

In [None]:
# Error analysis: which engines are hardest to predict?
# Choose the model by validation MAE, not test MAE: picking the winner on the
# test set lets test data drive model selection and makes the reported test
# errors optimistically biased.
best_model_name = results_df.loc[results_df['Val MAE'].idxmin(), 'Model']
best_preds = predictions[best_model_name]
# Signed error: positive means the predicted RUL is above the true value.
errors = best_preds - y_test

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: signed error per test row (red = pred > true, blue otherwise).
# NOTE(review): the x positions are 0-based row positions, not unit_id values —
# confirm they line up with the engine IDs before reading IDs off this axis.
axes[0].bar(range(len(errors)), errors, color=['red' if e > 0 else 'blue' for e in errors], alpha=0.7)
axes[0].set_xlabel('Engine ID')
axes[0].set_ylabel('Prediction Error (pred - true)')
axes[0].set_title(f'{best_model_name} — Error per Engine')
axes[0].axhline(0, color='black', linewidth=0.5)

# Right panel: distribution of the signed errors with a zero reference line.
axes[1].hist(errors, bins=20, edgecolor='black', alpha=0.7, color='steelblue')
axes[1].axvline(0, color='red', linestyle='--')
axes[1].set_xlabel('Prediction Error')
axes[1].set_ylabel('Count')
axes[1].set_title(f'{best_model_name} — Error Distribution')

plt.tight_layout()
plt.show()

## 5. Feature Importance (Random Forest)

In [None]:
# Rank the features by the fitted Random Forest's importances.
rf_model = models['Random Forest']
importance = rf_model.feature_importances_
feat_imp = pd.Series(data=importance, index=feature_cols)
feat_imp = feat_imp.sort_values(ascending=False)

# Horizontal bar chart of the 20 highest-ranked features.
top_features = feat_imp.head(20)
fig, ax = plt.subplots(figsize=(12, 8))
top_features.plot.barh(ax=ax, color='steelblue', edgecolor='black')
ax.set_xlabel('Feature Importance')
ax.set_title('Random Forest — Top 20 Feature Importances')
# barh draws the first row at the bottom; flip so the top feature is on top.
ax.invert_yaxis()
fig.tight_layout()
plt.show()

## 6. Save Results

In [None]:
# Save results table
from pathlib import Path

results_path = Path('../../reports/baseline_results.csv')
# Create the reports directory if it is missing; to_csv does not create
# parent directories and would otherwise fail on a fresh checkout.
results_path.parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(results_path, index=False)
print('Baseline results saved to reports/baseline_results.csv')
print()

# to_markdown needs the optional `tabulate` package; fall back to a plain-text
# table instead of failing the cell after the CSV has already been written.
try:
    results_table = results_df.to_markdown(index=False)
except ImportError:
    results_table = results_df.to_string(index=False)
print(results_table)