# 03 - Model Training & Evaluation
This notebook trains multiple regressors to predict life expectancy, performs hyperparameter tuning, and compares model performance.

## Workflow
1. Load the cleaned dataset from `data/processed/`.
2. Build training and evaluation utilities (metrics, plotting helpers).
3. Define preprocessing + model pipelines for Linear Regression, Random Forest, and Gradient Boosting.
4. Tune models with `GridSearchCV`, evaluate on the hold-out set, and visualise predictions.
5. Save the best-performing pipeline (highest hold-out R2) to `models/final_model.pkl` and write the comparison table to `models/model_performance.csv`.

In [None]:

from pathlib import Path
from typing import Dict
import sys

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Resolve the parent of the current working directory and put it on sys.path
# so the project-local `streamlit_app` package below can be imported.
# NOTE(review): this assumes the notebook is run from a direct subdirectory of
# the project root - confirm for other launch locations.
PROJECT_ROOT = Path('..').resolve()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Must come after the sys.path tweak above.
from streamlit_app import utils as app_utils

# Re-export the shared column definitions, paths and loaders under short
# notebook-level names so the cells below use the same values as the app.
CATEGORICAL_FEATURES = app_utils.CATEGORICAL_FEATURES
FEATURE_COLUMNS = app_utils.FEATURE_COLUMNS
NUMERIC_FEATURES = app_utils.NUMERIC_FEATURES
PROCESSED_PATH = app_utils.PROCESSED_PATH
TARGET_COLUMN = app_utils.TARGET_COLUMN
load_clean_data = app_utils.load_clean_data
prepare_clean_dataframe = app_utils.prepare_clean_dataframe

# Output directory for the serialized model and metrics; created on first run.
MODELS_DIR = PROJECT_ROOT / 'models'
MODELS_DIR.mkdir(parents=True, exist_ok=True)


# Alias the private `rcParams._get` to the public `rcParams.get` when the
# installed matplotlib does not provide it.
# NOTE(review): presumably a compatibility shim for a plotting library that
# calls `rcParams._get` - confirm which dependency needs it.
import matplotlib as mpl
if not hasattr(mpl.rcParams, '_get'):
    mpl.rcParams._get = mpl.rcParams.get


In [None]:

def _dense_one_hot_encoder() -> OneHotEncoder:
    """Return a dense-output OneHotEncoder that ignores unseen categories.

    scikit-learn 1.2 renamed the ``sparse`` keyword to ``sparse_output`` and
    1.4 removed the old spelling, so try the current name first and fall back
    to the legacy one on older installations.
    """
    try:
        return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        # Older scikit-learn (< 1.2) does not know `sparse_output`.
        return OneHotEncoder(handle_unknown='ignore', sparse=False)


def build_preprocessor() -> ColumnTransformer:
    """Build a fresh, unfitted preprocessing transformer for the feature columns.

    Numeric columns (``NUMERIC_FEATURES``) are imputed with a 5-neighbour
    ``KNNImputer`` and then standardised. Categorical columns
    (``CATEGORICAL_FEATURES``) are imputed with the most frequent value and
    one-hot encoded into a dense array; categories unseen during fitting are
    ignored at transform time instead of raising.

    Returns:
        A new ``ColumnTransformer``; each call returns an independent instance
        so every model pipeline fits its own preprocessing.
    """
    numeric_features = list(NUMERIC_FEATURES)
    categorical_features = list(CATEGORICAL_FEATURES)

    numeric_transformer = Pipeline([
        ('imputer', KNNImputer(n_neighbors=5)),
        ('scaler', StandardScaler())
    ])
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', _dense_one_hot_encoder())
    ])

    return ColumnTransformer([
        ('numeric', numeric_transformer, numeric_features),
        ('categorical', categorical_transformer, categorical_features)
    ])


def regression_metrics(y_true, y_pred) -> Dict[str, float]:
    """Compute MAE, RMSE and R2 for a set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        A dict with the keys ``'MAE'``, ``'RMSE'`` and ``'R2'``.
    """
    mae = mean_absolute_error(y_true, y_pred)
    # `mean_squared_error(..., squared=False)` was deprecated in scikit-learn
    # 1.4 and removed in 1.6. Taking the root of the per-output MSE and then
    # averaging reproduces what that flag computed, on every version.
    per_output_mse = mean_squared_error(y_true, y_pred, multioutput='raw_values')
    rmse = np.mean(np.sqrt(per_output_mse))
    r2 = r2_score(y_true, y_pred)
    return {'MAE': mae, 'RMSE': rmse, 'R2': r2}


def plot_predictions(y_true, y_pred, title: str) -> None:
    """Show a scatter plot of predicted versus actual values.

    A dashed red diagonal marks where predicted equals actual, drawn between
    the smallest and largest value found in either input.

    Args:
        y_true: Actual target values; must provide ``.min()`` and ``.max()``.
        y_pred: Predicted values; must provide ``.min()`` and ``.max()``.
        title: Text used as the plot title.
    """
    plt.figure(figsize=(6, 6))
    plt.scatter(y_true, y_pred, alpha=0.6)

    # End points of the reference line: the joint range of both inputs.
    lowest = min(y_true.min(), y_pred.min())
    highest = max(y_true.max(), y_pred.max())
    plt.plot([lowest, highest], [lowest, highest], 'r--')

    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.title(title)
    plt.show()


In [None]:

# Load the processed dataset through the shared app utilities.
df = load_clean_data()

# If any feature column or the target is missing from the loaded frame,
# rebuild it with the shared helper and overwrite the processed CSV so later
# runs start from a complete file.
expected_cols = set(FEATURE_COLUMNS) | {TARGET_COLUMN}
if not expected_cols.issubset(df.columns):
    # The message originally contained a mis-decoded en dash ("â€“").
    print('Detected missing columns in processed data – regenerating with shared utils...')
    df = prepare_clean_dataframe()
    PROCESSED_PATH.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(PROCESSED_PATH, index=False)

feature_cols = list(FEATURE_COLUMNS)
target_col = TARGET_COLUMN

X = df[feature_cols]
y = df[target_col]

# 80/20 shuffled hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)
# Notebook cell output: shapes of the two feature partitions.
X_train.shape, X_test.shape


In [None]:
# Candidate regressors and their hyperparameter grids, keyed by display name.
# Grid keys carry the `regressor__` prefix because each estimator is placed
# in a pipeline step named 'regressor'.
models_config = {}

# Ordinary least squares: only the intercept choice is searched.
models_config['Linear Regression'] = {
    'model': LinearRegression(),
    'params': {
        'regressor__fit_intercept': [True, False],
    },
}

# Random forest: ensemble size, tree depth and split threshold.
models_config['Random Forest'] = {
    'model': RandomForestRegressor(random_state=42),
    'params': {
        'regressor__n_estimators': [200, 300],
        'regressor__max_depth': [None, 10, 20],
        'regressor__min_samples_split': [2, 5],
    },
}

# Gradient boosting: number of stages, learning rate and tree depth.
models_config['Gradient Boosting'] = {
    'model': GradientBoostingRegressor(random_state=42),
    'params': {
        'regressor__n_estimators': [150, 200],
        'regressor__learning_rate': [0.05, 0.1],
        'regressor__max_depth': [2, 3],
    },
}

In [None]:
# Grid-search every configured model on the training split, score the refit
# best estimator on the hold-out split, and collect one result row per model.
results = []
best_models = {}

for model_name, cfg in models_config.items():
    print(f'\nTraining {model_name}...')

    # Each model gets its own preprocessing instance inside the pipeline.
    pipeline = Pipeline(steps=[
        ('preprocessor', build_preprocessor()),
        ('regressor', cfg['model']),
    ])

    # 5-fold cross-validated search, ranked by (negated) mean absolute error.
    grid = GridSearchCV(
        estimator=pipeline,
        param_grid=cfg['params'],
        scoring='neg_mean_absolute_error',
        cv=5,
        n_jobs=-1,
    )
    grid.fit(X_train, y_train)

    # Hold-out evaluation of the best parameter combination.
    y_pred = grid.predict(X_test)
    metrics = {
        **regression_metrics(y_test, y_pred),
        'Model': model_name,
        'Best Params': grid.best_params_,
    }
    results.append(metrics)
    best_models[model_name] = grid.best_estimator_

    plot_predictions(y_test, y_pred, title=f'{model_name} Predictions')

# One row per model; shown as the cell output.
results_df = pd.DataFrame(results)
results_df

In [None]:
# Reshape to long form (one row per model/metric pair) for a grouped bar chart.
results_long = pd.melt(
    results_df,
    id_vars=['Model', 'Best Params'],
    value_vars=['MAE', 'RMSE', 'R2'],
    var_name='Metric',
    value_name='Value',
)

# Bars are grouped by model and coloured by metric.
fig = px.bar(
    data_frame=results_long,
    x='Model',
    y='Value',
    color='Metric',
    barmode='group',
    title='Model Comparison',
)
fig.show()

In [None]:
# Pick the model with the highest hold-out R2 and look up its fitted pipeline.
ranked_by_r2 = results_df.sort_values(by='R2', ascending=False)
best_model_name = ranked_by_r2.iloc[0]['Model']
best_pipeline = best_models[best_model_name]

# Output locations inside the models directory.
model_path = MODELS_DIR / 'final_model.pkl'
metrics_path = MODELS_DIR / 'model_performance.csv'

# Persist the winning pipeline and the full comparison table.
joblib.dump(best_pipeline, model_path)
results_df.to_csv(metrics_path, index=False)

print(f'Saved best model ({best_model_name}) to {model_path}')
# Cell output: the headline metrics without the parameter column.
results_df[['Model', 'MAE', 'RMSE', 'R2']]