In [21]:
# ============================================================================
# CELL 1: Imports
# ============================================================================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
import xgboost as xgb
from imodels import RuleFitRegressor  # pip install imodels
import mlflow
import mlflow.sklearn
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

print("✓ Libraries imported successfully")

✓ Libraries imported successfully


In [22]:
# ============================================================================
# CELL 2: Setup MLflow
# ============================================================================
mlflow.set_tracking_uri("file:../mlruns")
mlflow.set_experiment("energy-consumption-forecasting")

print(f"MLflow tracking URI: {mlflow.get_tracking_uri()}")
print(f"MLflow experiment: {mlflow.get_experiment_by_name('energy-consumption-forecasting')}")

MLflow tracking URI: file:../mlruns
MLflow experiment: <Experiment: artifact_location='file:c:/Users/AABDC5/repo/PythonML/dev/mlops-homework/notebooks/../mlruns/135180177654306881', creation_time=1760065591810, experiment_id='135180177654306881', last_update_time=1760065591810, lifecycle_stage='active', name='energy-consumption-forecasting', tags={}>


In [23]:
# ============================================================================
# CELL 3: Load Processed Data
# ============================================================================
X = pd.read_csv('../data/processed/X_features.csv')
y = pd.read_csv('../data/processed/y_target.csv').values.ravel()

print(f"✓ Data loaded successfully")
print(f"  Features shape: {X.shape}")
print(f"  Target shape: {y.shape}")
print(f"  Feature names: {list(X.columns[:5])}... ({len(X.columns)} total)")


✓ Data loaded successfully
  Features shape: (35740, 33)
  Target shape: (35740,)
  Feature names: ['num_skew__Lagging_Current_Reactive.Power_kVarh', 'num_skew__Leading_Current_Reactive_Power_kVarh', 'num_skew__CO2(tCO2)', 'num_skew__reactive_power_total', 'num_skew__NSM']... (33 total)


In [24]:
# ============================================================================
# CELL 4: Train-Test Split
# ============================================================================
# SPLIT STRATEGY JUSTIFICATION:
# ==============================
# test_size=0.2 (80/20 split):
#   - Industry standard for datasets with 5000+ samples
#   - Provides enough test data (1600 samples) for reliable evaluation
#   - Maintains sufficient training data (6400 samples) for learning
# 
# random_state=42:
#   - Ensures reproducible splits across experiments
#   - Critical for MLOps: same test set = fair model comparison
#   - The number 42 is arbitrary but conventional (Hitchhiker's Guide reference)
#
# No stratification:
#   - Regression task (stratify only applies to classification)
#   - Random split adequate for time-series-like energy data
#   - Consider TimeSeriesSplit for production if temporal ordering matters

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"\n✓ Train-test split completed (80/20)")
print(f"  Train set: {X_train.shape} ← Used for learning patterns")
print(f"  Test set:  {X_test.shape}  ← Held-out for unbiased evaluation")
print(f"  Split ensures: same test data for all 8 models = fair comparison")



✓ Train-test split completed (80/20)
  Train set: (28592, 33) ← Used for learning patterns
  Test set:  (7148, 33)  ← Held-out for unbiased evaluation
  Split ensures: same test data for all 8 models = fair comparison


In [25]:
# ============================================================================
# CELL 5: Helper Function - Evaluate Model
# ============================================================================
def evaluate_model(y_true, y_pred, model_name):
    """
    Calculate regression metrics with business interpretation
    
    METRIC SELECTION RATIONALE:
    ===========================
    1. MSE (Mean Squared Error):
       - Penalizes large errors heavily (squared term)
       - Useful when large deviations are costly (e.g., capacity planning)
       - Units: (kWh)² - harder to interpret
    
    2. RMSE (Root Mean Squared Error):
       - Same units as target (kWh) - easier interpretation
       - Shows typical prediction error magnitude
       - Business use: "Model predicts within ±X kWh on average"
    
    3. MAE (Mean Absolute Error):
       - Average absolute deviation in kWh
       - More robust to outliers than RMSE
       - Business use: "Average prediction error is X kWh"
    
    4. R² (R-squared / Coefficient of Determination):
       - Percentage of variance explained (0-1 scale)
       - 0.8 = model explains 80% of consumption variability
       - Business use: Model quality indicator for stakeholders
    
    5. MAPE (Mean Absolute Percentage Error):
       - Error as percentage of actual value
       - Scale-independent, easy to communicate
       - Business use: "Model is off by X% on average"
       - Preferred by business analysts for forecasting accuracy
    """
    metrics = {
        'mse': mean_squared_error(y_true, y_pred),
        'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
        'mae': mean_absolute_error(y_true, y_pred),
        'r2_score': r2_score(y_true, y_pred),
        'mape': np.mean(np.abs((y_true - y_pred) / (y_true + 1e-10))) * 100  # Add small epsilon to avoid division by zero
    }
    
    print(f"\n{model_name} Results:")
    print(f"  MSE:  {metrics['mse']:.4f}  ← Penalizes large errors")
    print(f"  RMSE: {metrics['rmse']:.4f} kWh  ← Typical prediction error")
    print(f"  MAE:  {metrics['mae']:.4f} kWh  ← Average absolute error")
    print(f"  R²:   {metrics['r2_score']:.4f}  ← Variance explained ({metrics['r2_score']*100:.1f}%)")
    print(f"  MAPE: {metrics['mape']:.2f}%  ← Average % error (business-friendly)")
    
    return metrics

In [26]:
# ============================================================================
# CELL 6: Model 1 - Linear Regression
# ============================================================================
print("\n" + "="*80)
print("MODEL 1: LINEAR REGRESSION")
print("="*80)

with mlflow.start_run(run_name="linear_regression_v1"):
    
    # Log parameters with justification
    params = {
        'model_type': 'LinearRegression',
        'fit_intercept': True,   # REASON: Allow baseline consumption (y-intercept) 
                                  # when all features are zero - represents base load
        'normalize': False        # REASON: Data already scaled in feature engineering step
                                  # (StandardScaler applied), no need for additional normalization
    }
    mlflow.log_params(params)
    
    print("📋 Parameter Justification:")
    print("  • fit_intercept=True: Captures baseline energy consumption")
    print("  • normalize=False: Features pre-scaled in preprocessing phase")
    
    # Train model
    model_lr = LinearRegression()
    model_lr.fit(X_train, y_train)
    
    # Predictions
    y_pred = model_lr.predict(X_test)
    
    # Evaluate
    metrics = evaluate_model(y_test, y_pred, "Linear Regression")
    
    # Log metrics
    mlflow.log_metrics(metrics)
    
    # Log model
    mlflow.sklearn.log_model(model_lr, "model")
    
    # Log feature names
    mlflow.log_dict({'features': list(X.columns)}, 'features.json')


MODEL 1: LINEAR REGRESSION
📋 Parameter Justification:
  • fit_intercept=True: Captures baseline energy consumption
  • normalize=False: Features pre-scaled in preprocessing phase

Linear Regression Results:
  MSE:  37451.1270  ← Penalizes large errors
  RMSE: 193.5229 kWh  ← Typical prediction error
  MAE:  55.0160 kWh  ← Average absolute error
  R²:   0.1904  ← Variance explained (19.0%)
  MAPE: 714.12%  ← Average % error (business-friendly)




In [27]:
# ============================================================================
# CELL 7: Model 2 - K-Nearest Neighbors (KNN)
# ============================================================================
print("\n" + "="*80)
print("MODEL 2: K-NEAREST NEIGHBORS")
print("="*80)

with mlflow.start_run(run_name="knn_v1"):
    
    # Parameters with justification
    params = {
        'model_type': 'KNeighborsRegressor',
        'n_neighbors': 5,      # REASON: Square root rule (√n ≈ 90 for 8000 samples)
                               # Balances bias-variance: too few=overfitting, too many=underfitting
                               # 5 is a good starting point for most datasets
        'weights': 'uniform',  # REASON: Equal weight to all neighbors (baseline approach)
                               # Alternative 'distance' gives more weight to closer neighbors
        'algorithm': 'auto',   # REASON: Let sklearn choose best algorithm (ball_tree/kd_tree/brute)
                               # based on data structure - optimal for performance
        'metric': 'minkowski'  # REASON: Generalized distance metric (p=2 gives Euclidean)
                               # Works well with scaled features
    }
    mlflow.log_params(params)
    
    print("📋 Parameter Justification:")
    print("  • n_neighbors=5: Balances local pattern detection vs generalization")
    print("  • weights=uniform: Equal importance to all 5 nearest neighbors")
    print("  • algorithm=auto: Automatically selects most efficient search algorithm")
    print("  • metric=minkowski: Standard distance measure for scaled numerical data")
    
    # Train model
    model_params = {k: v for k, v in params.items() if k != 'model_type'}
    model_knn = KNeighborsRegressor(**model_params)
    model_knn.fit(X_train, y_train)
    
    # Predictions
    y_pred = model_knn.predict(X_test)
    
    # Evaluate
    metrics = evaluate_model(y_test, y_pred, "K-Nearest Neighbors")
    
    # Log metrics
    mlflow.log_metrics(metrics)
    
    # Log model
    mlflow.sklearn.log_model(model_knn, "model")


MODEL 2: K-NEAREST NEIGHBORS
📋 Parameter Justification:
  • n_neighbors=5: Balances local pattern detection vs generalization
  • weights=uniform: Equal importance to all 5 nearest neighbors
  • algorithm=auto: Automatically selects most efficient search algorithm
  • metric=minkowski: Standard distance measure for scaled numerical data

K-Nearest Neighbors Results:
  MSE:  24177.1116  ← Penalizes large errors
  RMSE: 155.4899 kWh  ← Typical prediction error
  MAE:  13.9523 kWh  ← Average absolute error
  R²:   0.4773  ← Variance explained (47.7%)
  MAPE: 31.24%  ← Average % error (business-friendly)




In [28]:
# ============================================================================
# CELL 8: Model 3 - Random Forest
# ============================================================================
print("\n" + "="*80)
print("MODEL 3: RANDOM FOREST")
print("="*80)

with mlflow.start_run(run_name="random_forest_v1"):
    
    # Parameters with justification
    params = {
        'model_type': 'RandomForestRegressor',
        'n_estimators': 100,      # REASON: 100 trees balances performance vs computation time
                                   # More trees = better performance but diminishing returns after 100
                                   # Industry standard for initial experimentation
        'max_depth': 20,          # REASON: Limits tree depth to prevent overfitting
                                   # 20 allows complex patterns without memorizing training data
                                   # Energy data has ~50 features, depth=20 captures interactions
        'min_samples_split': 5,   # REASON: Minimum 5 samples required to split a node
                                   # Prevents creating splits on noise/outliers
                                   # Higher value = more generalization, less overfitting
        'min_samples_leaf': 2,    # REASON: At least 2 samples in leaf nodes
                                   # Ensures predictions based on multiple observations
                                   # Reduces variance in leaf predictions
        'max_features': 'sqrt',   # REASON: Consider √(n_features) for each split
                                   # Adds randomness to decorrelate trees
                                   # Standard practice for regression (classification uses 'log2')
        'random_state': 42,       # REASON: Reproducibility - ensures same results across runs
                                   # Critical for MLOps pipeline consistency
        'n_jobs': -1              # REASON: Use all CPU cores for parallel training
                                   # Significantly speeds up training time
    }
    mlflow.log_params(params)
    
    print("📋 Parameter Justification:")
    print("  • n_estimators=100: Standard ensemble size balancing accuracy & speed")
    print("  • max_depth=20: Deep enough for complex patterns, shallow enough to avoid overfitting")
    print("  • min_samples_split=5: Prevents splitting on noise, improves generalization")
    print("  • min_samples_leaf=2: Ensures robust predictions from multiple samples")
    print("  • max_features='sqrt': Decorrelates trees for better ensemble diversity")
    print("  • n_jobs=-1: Parallel processing for faster training")
    
    # Train model
    model_params = {k: v for k, v in params.items() if k != 'model_type'}
    model_rf = RandomForestRegressor(**model_params)
    model_rf.fit(X_train, y_train)
    
    # Predictions
    y_pred = model_rf.predict(X_test)
    
    # Evaluate
    metrics = evaluate_model(y_test, y_pred, "Random Forest")
    
    # Log metrics
    mlflow.log_metrics(metrics)
    
    # Log model
    mlflow.sklearn.log_model(model_rf, "model")
    
    # Log feature importance
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': model_rf.feature_importances_
    }).sort_values('importance', ascending=False)
    
    mlflow.log_dict(feature_importance.to_dict(), 'feature_importance.json')
    
    print("\nTop 10 Important Features:")
    for idx, row in feature_importance.head(10).iterrows():
        print(f"  {row['feature']}: {row['importance']:.4f}")


MODEL 3: RANDOM FOREST
📋 Parameter Justification:
  • n_estimators=100: Standard ensemble size balancing accuracy & speed
  • max_depth=20: Deep enough for complex patterns, shallow enough to avoid overfitting
  • min_samples_split=5: Prevents splitting on noise, improves generalization
  • min_samples_leaf=2: Ensures robust predictions from multiple samples
  • max_features='sqrt': Decorrelates trees for better ensemble diversity
  • n_jobs=-1: Parallel processing for faster training





Random Forest Results:
  MSE:  8714.8328  ← Penalizes large errors
  RMSE: 93.3533 kWh  ← Typical prediction error
  MAE:  9.4844 kWh  ← Average absolute error
  R²:   0.8116  ← Variance explained (81.2%)
  MAPE: 39.35%  ← Average % error (business-friendly)





Top 10 Important Features:
  num_skew__nsm_per_kwh: 0.3269
  num_skew__co2_per_kwh: 0.1664
  num_lin__reactive_power_diff: 0.0586
  num_lin__Lagging_Current_Power_Factor: 0.0547
  num_skew__NSM: 0.0488
  num_lin__reactive_power_ratio: 0.0440
  num_skew__Lagging_Current_Reactive.Power_kVarh: 0.0395
  num_lin__power_factor_ratio: 0.0370
  num_lin__avg_power_factor: 0.0363
  num_skew__reactive_power_total: 0.0348


In [29]:
# ============================================================================
# CELL 9: Model 4 - XGBoost
# ============================================================================
print("\n" + "="*80)
print("MODEL 4: XGBOOST")
print("="*80)

with mlflow.start_run(run_name="xgboost_v1"):
    
    # Parameters with justification
    params = {
        'model_type': 'XGBRegressor',
        'n_estimators': 100,       # REASON: 100 boosting rounds - standard starting point
                                    # XGBoost converges faster than RF, 100 is often sufficient
                                    # Can be tuned with early_stopping in production
        'learning_rate': 0.1,      # REASON: Controls step size at each boosting iteration
                                    # 0.1 is conservative, prevents overfitting
                                    # Lower = more robust but needs more estimators
                                    # Higher = faster training but risk of overfitting
        'max_depth': 6,            # REASON: Shallower than RF (6 vs 20) because boosting builds sequentially
                                    # Depth 6 = up to 64 leaf nodes, sufficient for most patterns
                                    # XGBoost default, proven effective across domains
        'min_child_weight': 1,     # REASON: Minimum sum of instance weight in child node
                                    # Controls overfitting similar to min_samples_leaf
                                    # 1 = allow smaller partitions for detailed patterns
        'subsample': 0.8,          # REASON: Use 80% of samples for each tree (row sampling)
                                    # Prevents overfitting by introducing randomness
                                    # 0.8 balances variance reduction with training stability
        'colsample_bytree': 0.8,   # REASON: Use 80% of features for each tree (column sampling)
                                    # Similar to RF's max_features, adds diversity
                                    # Reduces correlation between trees
        'gamma': 0,                # REASON: Minimum loss reduction to make split (0 = no constraint)
                                    # 0 for initial model, can increase for regularization
                                    # Acts as pruning parameter
        'reg_alpha': 0,            # REASON: L1 regularization (Lasso) - not applied initially
                                    # 0 = no feature selection penalty
                                    # Can be tuned if many irrelevant features suspected
        'reg_lambda': 1,           # REASON: L2 regularization (Ridge) - mild regularization
                                    # 1 = default, prevents extreme weights
                                    # Helps with generalization without being too restrictive
        'random_state': 42,        # REASON: Reproducibility for MLOps pipeline
        'n_jobs': -1               # REASON: Parallel processing for faster training
    }
    mlflow.log_params(params)
    
    print("📋 Parameter Justification:")
    print("  • n_estimators=100: Sufficient boosting rounds for convergence")
    print("  • learning_rate=0.1: Conservative step size prevents overfitting")
    print("  • max_depth=6: Optimal depth for boosted trees (shallower than RF)")
    print("  • subsample=0.8: Row sampling adds randomness, reduces overfitting")
    print("  • colsample_bytree=0.8: Column sampling decorrelates trees")
    print("  • reg_lambda=1: L2 regularization for weight control & generalization")
    
    # Train model
    model_params = {k: v for k, v in params.items() if k != 'model_type'}
    model_xgb = xgb.XGBRegressor(**model_params)
    model_xgb.fit(X_train, y_train)
    
    # Predictions
    y_pred = model_xgb.predict(X_test)
    
    # Evaluate
    metrics = evaluate_model(y_test, y_pred, "XGBoost")
    
    # Log metrics
    mlflow.log_metrics(metrics)
    
    # Log model
    mlflow.sklearn.log_model(model_xgb, "model")
    
    # Log feature importance
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': model_xgb.feature_importances_
    }).sort_values('importance', ascending=False)
    
    mlflow.log_dict(feature_importance.to_dict(), 'feature_importance.json')
    
    print("\nTop 10 Important Features:")
    for idx, row in feature_importance.head(10).iterrows():
        print(f"  {row['feature']}: {row['importance']:.4f}")



MODEL 4: XGBOOST
📋 Parameter Justification:
  • n_estimators=100: Sufficient boosting rounds for convergence
  • learning_rate=0.1: Conservative step size prevents overfitting
  • max_depth=6: Optimal depth for boosted trees (shallower than RF)
  • subsample=0.8: Row sampling adds randomness, reduces overfitting
  • colsample_bytree=0.8: Column sampling decorrelates trees
  • reg_lambda=1: L2 regularization for weight control & generalization





XGBoost Results:
  MSE:  13937.5805  ← Penalizes large errors
  RMSE: 118.0575 kWh  ← Typical prediction error
  MAE:  7.1857 kWh  ← Average absolute error
  R²:   0.6987  ← Variance explained (69.9%)
  MAPE: 20.17%  ← Average % error (business-friendly)





Top 10 Important Features:
  num_lin__is_high_consumption: 0.1420
  num_skew__nsm_per_kwh: 0.1412
  cat_ord__Load_Type: 0.1087
  num_skew__NSM: 0.0779
  num_skew__Lagging_Current_Reactive.Power_kVarh: 0.0768
  num_skew__co2_per_kwh: 0.0676
  num_lin__power_factor_ratio: 0.0616
  num_lin__reactive_power_diff: 0.0601
  num_lin__Lagging_Current_Power_Factor: 0.0599
  num_skew__reactive_power_total: 0.0545


In [30]:
# ============================================================================
# CELL 10: Model 5 - RuleFit (Rule-Based Regression)
# ============================================================================
print("\n" + "="*80)
print("MODEL 5: RULEFIT (RULE-BASED REGRESSION)")
print("="*80)

with mlflow.start_run(run_name="rulefit_v1"):
    
    # Parameters with justification
    params = {
        'model_type': 'RuleFitRegressor',
        'max_rules': 30,           # REASON: Limits number of rules for interpretability
                                    # 30 rules = manageable for human review & business validation
                                    # Too many rules = loss of interpretability
                                    # Too few = may miss important patterns
        'tree_size': 4,            # REASON: Maximum tree depth for rule extraction
                                    # Depth 4 = rules with up to 4 conditions (IF A AND B AND C AND D)
                                    # Shallow trees = simpler, more interpretable rules
                                    # Deeper than 6 creates overly complex rules
        'random_state': 42,        # REASON: Reproducibility for consistent rule generation
        'exp_rand_tree_size': True # REASON: Exponentially sample tree depths (1, 2, 4, 8...)
                                    # Creates diverse rule complexity (simple + complex)
                                    # Improves ensemble diversity and pattern coverage
    }
    mlflow.log_params(params)
    
    print("📋 Parameter Justification:")
    print("  • max_rules=30: Balances coverage with interpretability for stakeholders")
    print("  • tree_size=4: Generates understandable rules (max 4 conditions per rule)")
    print("  • exp_rand_tree_size=True: Creates diverse rule complexity for better coverage")
    print("  • Business Value: Rules can be validated by domain experts and implemented in systems")
    
    # Train model
    model_params = {k: v for k, v in params.items() if k != 'model_type'}
    model_rf = RuleFitRegressor(**model_params)
    model_rf.fit(X_train, y_train)
    
    # Predictions
    y_pred = model_rf.predict(X_test)
    
    # Evaluate
    metrics = evaluate_model(y_test, y_pred, "RuleFit")
    
    # Log metrics
    mlflow.log_metrics(metrics)
    
    # Log model
    mlflow.sklearn.log_model(model_rf, "model")
    
    # Extract and log rules
    try:
        rules = model_rf.get_rules()
        if rules is not None and len(rules) > 0:
            rules_df = pd.DataFrame(rules)
            top_rules = rules_df.nlargest(10, 'importance') if 'importance' in rules_df.columns else rules_df.head(10)
            mlflow.log_dict(top_rules.to_dict(), 'top_rules.json')
            
            print(f"\n✓ Generated {len(rules)} rules")
            print(f"\nTop 5 Most Important Rules:")
            for idx, row in top_rules.head(5).iterrows():
                if 'rule' in row and 'importance' in row:
                    print(f"  Rule: {row['rule']}")
                    print(f"  Importance: {row['importance']:.4f}\n")
    except Exception as e:
        print(f"\n⚠️  Could not extract rules: {e}")
        print("  Model trained successfully, but rule extraction unavailable")



MODEL 5: RULEFIT (RULE-BASED REGRESSION)
📋 Parameter Justification:
  • max_rules=30: Balances coverage with interpretability for stakeholders
  • tree_size=4: Generates understandable rules (max 4 conditions per rule)
  • exp_rand_tree_size=True: Creates diverse rule complexity for better coverage
  • Business Value: Rules can be validated by domain experts and implemented in systems

RuleFit Results:
  MSE:  3189.9299  ← Penalizes large errors
  RMSE: 56.4795 kWh  ← Typical prediction error
  MAE:  8.6153 kWh  ← Average absolute error
  R²:   0.9310  ← Variance explained (93.1%)
  MAPE: 65.25%  ← Average % error (business-friendly)





⚠️  Could not extract rules: 'RuleFitRegressor' object has no attribute 'get_rules'
  Model trained successfully, but rule extraction unavailable


In [31]:
# ============================================================================
# CELL 11: Model 6 - PCA + Linear Regression
# ============================================================================
print("\n" + "="*80)
print("MODEL 6: PCA + LINEAR REGRESSION")
print("="*80)

with mlflow.start_run(run_name="pca_linear_regression_v1"):
    
    # Parameters with justification
    params = {
        'model_type': 'PCA_LinearRegression',
        'n_components': 0.95,  # REASON: Retain 95% of variance - standard in literature
                               # Reduces dimensionality while preserving most information
                               # Helps with multicollinearity in energy data (correlated features)
                               # 95% balances information retention vs dimensionality reduction
        'whiten': False        # REASON: No whitening (variance scaling) of PCA components
                               # Features already standardized in preprocessing
                               # Whitening can reduce interpretability of components
                               # False = preserve relative importance of principal components
    }
    mlflow.log_params(params)
    
    print("📋 Parameter Justification:")
    print("  • n_components=0.95: Keeps 95% of data variance while reducing dimensions")
    print("  • whiten=False: Features pre-scaled, whitening unnecessary")
    print("  • Business Benefit: Addresses multicollinearity in correlated energy metrics")
    
    # Create pipeline with PCA and Linear Regression
    pipeline = Pipeline([
        ('pca', PCA(n_components=params['n_components'], whiten=params['whiten'])),
        ('regressor', LinearRegression())
    ])
    
    # Train pipeline
    pipeline.fit(X_train, y_train)
    
    # Predictions
    y_pred = pipeline.predict(X_test)
    
    # Get number of components selected
    n_components_selected = pipeline.named_steps['pca'].n_components_
    explained_variance = pipeline.named_steps['pca'].explained_variance_ratio_.sum()
    
    mlflow.log_param('n_components_selected', n_components_selected)
    mlflow.log_param('explained_variance', explained_variance)
    
    print(f"\n✓ PCA reduced features from {X.shape[1]} to {n_components_selected}")
    print(f"  Explained variance: {explained_variance:.4f}")
    
    # Evaluate
    metrics = evaluate_model(y_test, y_pred, "PCA + Linear Regression")
    
    # Log metrics
    mlflow.log_metrics(metrics)
    
    # Log model
    mlflow.sklearn.log_model(pipeline, "model")



MODEL 6: PCA + LINEAR REGRESSION
📋 Parameter Justification:
  • n_components=0.95: Keeps 95% of data variance while reducing dimensions
  • whiten=False: Features pre-scaled, whitening unnecessary
  • Business Benefit: Addresses multicollinearity in correlated energy metrics

✓ PCA reduced features from 33 to 12
  Explained variance: 0.9547

PCA + Linear Regression Results:
  MSE:  44514.1979  ← Penalizes large errors
  RMSE: 210.9839 kWh  ← Typical prediction error
  MAE:  33.0836 kWh  ← Average absolute error
  R²:   0.0377  ← Variance explained (3.8%)
  MAPE: 307.68%  ← Average % error (business-friendly)




In [None]:
# ============================================================================
# CELL 12: Model 7 - PCA + Random Forest
# ============================================================================
print("\n" + "="*80)
print("MODEL 7: PCA + RANDOM FOREST")
print("="*80)

with mlflow.start_run(run_name="pca_random_forest_v1"):
    
    # Parameters with justification
    params = {
        'model_type': 'PCA_RandomForest',
        'n_components': 0.95,  # REASON: 95% variance retention (consistent with PCA+LR)
                               # Tests if PCA helps tree-based models (usually less effective)
                               # RF handles high dimensions well, but PCA can speed training
        'n_estimators': 100,   # REASON: Same as standalone RF for fair comparison
                               # 100 trees is sufficient even with reduced dimensions
        'max_depth': 20,       # REASON: Same depth as standalone RF
                               # PCA components are linear combinations, may need depth
        'random_state': 42     # REASON: Reproducibility
    }
    mlflow.log_params(params)
    
    print("📋 Parameter Justification:")
    print("  • n_components=0.95: Tests if dimensionality reduction helps RF performance")
    print("  • n_estimators=100: Consistent with standalone RF for comparison")
    print("  • max_depth=20: Maintains complexity despite fewer features")
    print("  • Use Case: May improve speed, unlikely to improve accuracy for RF")
    
    # Create pipeline
    pipeline = Pipeline([
        ('pca', PCA(n_components=params['n_components'])),
        ('regressor', RandomForestRegressor(
            n_estimators=params['n_estimators'],
            max_depth=params['max_depth'],
            random_state=params['random_state'],
            n_jobs=-1
        ))
    ])
    
    # Train pipeline
    pipeline.fit(X_train, y_train)
    
    # Predictions
    y_pred = pipeline.predict(X_test)
    
    # Get PCA info
    n_components_selected = pipeline.named_steps['pca'].n_components_
    explained_variance = pipeline.named_steps['pca'].explained_variance_ratio_.sum()
    
    mlflow.log_param('n_components_selected', n_components_selected)
    mlflow.log_param('explained_variance', explained_variance)
    
    print(f"\n✓ PCA reduced features from {X.shape[1]} to {n_components_selected}")
    print(f"  Explained variance: {explained_variance:.4f}")
    
    # Evaluate
    metrics = evaluate_model(y_test, y_pred, "PCA + Random Forest")
    
    # Log metrics
    mlflow.log_metrics(metrics)
    
    # Log model
    mlflow.sklearn.log_model(pipeline, "model")

In [32]:
# ============================================================================
# CELL 13: Model 8 - PCA + XGBoost
# ============================================================================
print("\n" + "="*80)
print("MODEL 8: PCA + XGBOOST")
print("="*80)

with mlflow.start_run(run_name="pca_xgboost_v1"):
    
    # Parameters with justification
    params = {
        'model_type': 'PCA_XGBoost',
        'n_components': 0.95,   # REASON: 95% variance retention (consistent across PCA models)
                                # XGBoost handles high dimensions, but PCA may help with:
                                # 1) Training speed (fewer features to evaluate per split)
                                # 2) Reducing noise from low-variance components
        'n_estimators': 100,    # REASON: Same as standalone XGBoost for fair comparison
                                # Boosting may converge faster with uncorrelated PCA components
        'learning_rate': 0.1,   # REASON: Same conservative rate as standalone XGBoost
                                # PCA doesn't change optimal learning rate significantly
        'max_depth': 6,         # REASON: XGBoost standard depth maintained
                                # PCA components are continuous, depth 6 still appropriate
        'random_state': 42      # REASON: Reproducibility across experiments
    }
    mlflow.log_params(params)
    
    print("📋 Parameter Justification:")
    print("  • n_components=0.95: Reduces feature space while keeping 95% information")
    print("  • n_estimators=100: Consistent boosting rounds for fair comparison")
    print("  • learning_rate=0.1: Conservative rate unchanged by PCA transformation")
    print("  • max_depth=6: Standard XGBoost depth works well with PCA components")
    print("  • Expected Benefit: Faster training, potentially better generalization")
    
    # Create pipeline
    pipeline = Pipeline([
        ('pca', PCA(n_components=params['n_components'])),
        ('regressor', xgb.XGBRegressor(
            n_estimators=params['n_estimators'],
            learning_rate=params['learning_rate'],
            max_depth=params['max_depth'],
            random_state=params['random_state'],
            n_jobs=-1
        ))
    ])
    
    # Train pipeline
    pipeline.fit(X_train, y_train)
    
    # Predictions
    y_pred = pipeline.predict(X_test)
    
    # Get PCA info
    n_components_selected = pipeline.named_steps['pca'].n_components_
    explained_variance = pipeline.named_steps['pca'].explained_variance_ratio_.sum()
    
    mlflow.log_param('n_components_selected', n_components_selected)
    mlflow.log_param('explained_variance', explained_variance)
    
    print(f"\n✓ PCA reduced features from {X.shape[1]} to {n_components_selected}")
    print(f"  Explained variance: {explained_variance:.4f}")
    
    # Evaluate
    metrics = evaluate_model(y_test, y_pred, "PCA + XGBoost")
    
    # Log metrics
    mlflow.log_metrics(metrics)
    
    # Log model
    mlflow.sklearn.log_model(pipeline, "model")



MODEL 8: PCA + XGBOOST
📋 Parameter Justification:
  • n_components=0.95: Reduces feature space while keeping 95% information
  • n_estimators=100: Consistent boosting rounds for fair comparison
  • learning_rate=0.1: Conservative rate unchanged by PCA transformation
  • max_depth=6: Standard XGBoost depth works well with PCA components
  • Expected Benefit: Faster training, potentially better generalization





✓ PCA reduced features from 33 to 12
  Explained variance: 0.9547

PCA + XGBoost Results:
  MSE:  45859.0867  ← Penalizes large errors
  RMSE: 214.1473 kWh  ← Typical prediction error
  MAE:  23.8554 kWh  ← Average absolute error
  R²:   0.0086  ← Variance explained (0.9%)
  MAPE: 89.76%  ← Average % error (business-friendly)




In [33]:
# ============================================================================
# CELL 14: Compare All Models
# ============================================================================
print("\n" + "="*80)
print("MODEL COMPARISON SUMMARY")
print("="*80)

# Get all runs from the experiment
experiment = mlflow.get_experiment_by_name("energy-consumption-forecasting")
runs_df = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

# Display key metrics
comparison_df = runs_df[['tags.mlflow.runName', 'metrics.rmse', 'metrics.mae', 'metrics.r2_score', 'metrics.mape']].copy()
comparison_df.columns = ['Model', 'RMSE', 'MAE', 'R²', 'MAPE (%)']
comparison_df = comparison_df.sort_values('RMSE', ascending=True)

print("\n📊 Performance Comparison (sorted by RMSE):")
print(comparison_df.to_string(index=False))

# Find best model
best_model_idx = comparison_df['RMSE'].idxmin()
best_model = comparison_df.loc[best_model_idx, 'Model']
best_rmse = comparison_df.loc[best_model_idx, 'RMSE']
best_r2 = comparison_df.loc[best_model_idx, 'R²']

print("\n" + "="*80)
print("BEST MODEL")
print("="*80)
print(f"🏆 Model: {best_model}")
print(f"   RMSE: {best_rmse:.4f} kWh")
print(f"   R²:   {best_r2:.4f} ({best_r2*100:.1f}% variance explained)")

print("\n" + "="*80)
print("MODEL SELECTION INSIGHTS")
print("="*80)
print("""
EXPECTED PERFORMANCE PATTERNS:
================================
1. LINEAR REGRESSION: 
   - Baseline model, assumes linear relationships
   - Good if energy consumption is linear function of features
   - Expected: Moderate performance, high interpretability

2. K-NEAREST NEIGHBORS:
   - Captures local patterns and non-linearity
   - Sensitive to feature scaling (already handled)
   - Expected: Good for similar time periods/conditions

3. RANDOM FOREST:
   - Handles non-linear relationships and interactions
   - Robust to outliers, provides feature importance
   - Expected: Strong performance, slower predictions

4. XGBOOST:
   - Often best performer for tabular data
   - Handles complex patterns via sequential boosting
   - Expected: Top 2 model, efficient predictions

5. RULEFIT:
   - Generates interpretable rules for stakeholders
   - Balances accuracy with explainability
   - Expected: Good performance + business insights

6. PCA + LINEAR REGRESSION:
   - Addresses multicollinearity issues
   - Reduces overfitting via dimensionality reduction
   - Expected: Better than plain LR if features correlated

7. PCA + RANDOM FOREST:
   - Tests if PCA helps tree-based models
   - May speed up training, unlikely to improve accuracy
   - Expected: Similar to RF, faster training

8. PCA + XGBOOST:
   - Combines dimensionality reduction with boosting
   - May improve generalization
   - Expected: Competitive with pure XGBoost

SELECTION CRITERIA:
===================
• PRODUCTION (Speed + Accuracy): Choose XGBoost or RF
• INTERPRETABILITY (Business): Choose RuleFit or Linear Regression  
• DEPLOYMENT (Low latency): Choose Linear Regression or KNN
• RESEARCH (Best RMSE): Choose top performer regardless of complexity
""")

print("\n" + "="*80)
print("MLFLOW UI")
print("="*80)
print("\nTo view all runs and compare models in MLflow UI:")
print("👉 Run: mlflow ui --backend-store-uri file:../mlruns")
print("👉 Open: http://localhost:5000")
print("\n" + "="*80)
print("\n✅ MODEL TRAINING PIPELINE COMPLETED SUCCESSFULLY")
print("="*80)


MODEL COMPARISON SUMMARY

📊 Performance Comparison (sorted by RMSE):
                   Model       RMSE       MAE        R²   MAPE (%)
              rulefit_v1  56.479465  8.615257  0.931039  65.247918
        random_forest_v1  93.353269  9.484375  0.811600  39.352744
              xgboost_v1 118.057531  7.185674  0.698693  20.165720
                  knn_v1 155.489908 13.952253  0.477332  31.237594
              xgboost_v1 178.845295 15.054670  0.308525  99.029948
              xgboost_v1 178.845295 15.054670  0.308525  99.029948
              xgboost_v1 178.845295 15.054670  0.308525  99.029948
        random_forest_v1 192.748294 22.110672  0.196839 117.423930
        random_forest_v1 192.748294 22.110672  0.196839 117.423930
        random_forest_v1 192.748294 22.110672  0.196839 117.423930
    linear_regression_v1 193.522937 55.016029  0.190371 714.122902
pca_linear_regression_v1 210.983881 33.083637  0.037679 307.679102
          pca_xgboost_v1 214.147348 23.855385  0.008605  89