# 02 - Data Preprocessing
This notebook focuses on preparing the life expectancy dataset for modelling by handling quality issues, engineering informative features, and exporting train/test splits.

## Pipeline Overview
1. Load and standardise the raw WHO export.
2. Remove duplicate country/year/gender records and rows whose life expectancy falls outside the plausible 0–120 range.
3. Impute missing data using KNN for numeric columns and the mode for categoricals.
4. Engineer temporal and regional features.
5. Apply OneHotEncoder + scaling inside a `ColumnTransformer`.
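6. Perform an 80/20 train-test split (carried out before the `ColumnTransformer` is fitted, so the transformer only learns from training rows), then export the processed artefacts to `data/processed/` and the fitted preprocessor to `models/`.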

In [None]:

import json
import re
import sys
from pathlib import Path
from typing import Dict, List, Tuple

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Resolve the parent of the current working directory as the project root and
# put it at the front of the import path (once) so project-local packages can
# be imported from this notebook.
PROJECT_ROOT = Path('..').resolve()
_project_root_entry = str(PROJECT_ROOT)
if _project_root_entry not in sys.path:
    sys.path.insert(0, _project_root_entry)

from streamlit_app import utils as app_utils

# Column-role definitions come from the Streamlit app's utils module; alias
# them here rather than redefining them in the notebook.
TARGET_COLUMN = app_utils.TARGET_COLUMN
FEATURE_COLUMNS = app_utils.FEATURE_COLUMNS
NUMERIC_FEATURES = app_utils.NUMERIC_FEATURES
CATEGORICAL_FEATURES = app_utils.CATEGORICAL_FEATURES

# Input file and output directories, all located under the project root.
_data_dir = PROJECT_ROOT / 'data'
DATA_PATH = _data_dir / 'life_expectancy.csv'
PROCESSED_DIR = _data_dir / 'processed'
MODELS_DIR = PROJECT_ROOT / 'models'

# Create the output directories up front; existing ones are left untouched.
for _output_dir in (PROCESSED_DIR, MODELS_DIR):
    _output_dir.mkdir(parents=True, exist_ok=True)


In [None]:
def load_life_expectancy_data(data_path: Path = DATA_PATH) -> pd.DataFrame:
    """Load the raw life expectancy export into a DataFrame.

    The format is detected from the content rather than the file extension:
    when the stripped text begins with ``{`` it is parsed as a JSON document
    and the list under its ``value`` key (empty if the key is absent) becomes
    the rows; any other content is handed to ``pd.read_csv``.

    Args:
        data_path: Location of the raw export. Defaults to ``DATA_PATH``.

    Returns:
        The raw, uncleaned records.
    """
    raw_text = data_path.read_text(encoding='utf-8').strip()
    if not raw_text.startswith('{'):
        # Not a JSON object: let pandas parse the file as CSV.
        return pd.read_csv(data_path)
    document = json.loads(raw_text)
    return pd.DataFrame(document.get('value', []))


def to_snake_case(value: str) -> str:
    """Convert a CamelCase or punctuated label to ``snake_case``.

    Underscores are inserted at the word boundaries implied by capital
    letters, every run of non-alphanumeric characters becomes a separator,
    and the result is lower-cased with empty segments removed. Falsy input
    (``None`` or ``''``) yields an empty string.
    """
    text = value or ''
    # Boundary before a capitalised word: "ParentLocation" -> "Parent_Location".
    text = re.sub(r'(.)([A-Z][a-z]+)', r'\1_\2', text)
    # Boundary between a lowercase letter or digit and a capital: "getHTTP" -> "get_HTTP".
    text = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', text)
    # Anything that is not a letter or digit acts as a separator.
    text = re.sub(r'[^0-9a-zA-Z]+', '_', text)
    # Splitting and re-joining collapses repeated, leading and trailing underscores.
    return '_'.join(segment for segment in text.lower().split('_') if segment)


def clean_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with every column label passed through ``to_snake_case``.

    The input frame is not modified.
    """
    renamed = df.copy()
    renamed.columns = list(map(to_snake_case, renamed.columns))
    return renamed


def map_gender(value: str) -> str:
    """Translate a sex dimension code into a readable label.

    Matching is case-insensitive. Non-string input (for example ``None`` or
    NaN) falls back to ``'Both sexes'``; strings that are not recognised codes
    are returned unchanged.
    """
    if not isinstance(value, str):
        return 'Both sexes'
    labels = {
        'sex_mle': 'Male',
        'sex_fmle': 'Female',
        'sex_btsx': 'Both sexes',
    }
    code = value.lower()
    return labels[code] if code in labels else value


def enrich_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Derive the working columns from a snake_cased raw export.

    Operates on a copy; ``df`` itself is left untouched. The following columns
    are added from their raw counterparts: ``gender`` (from ``dim1``),
    ``country_code`` (``spatial_dim``), ``continent_code``
    (``parent_location_code``), ``continent`` (``parent_location``), ``year``
    (``time_dim``), ``life_expectancy`` / ``life_expectancy_low`` /
    ``life_expectancy_high`` (``numeric_value`` / ``low`` / ``high``), the
    parsed timestamps ``record_date``, ``period_start`` and ``period_end``,
    and ``value_range`` (high minus low). Raw dimension and metadata columns
    are then dropped.

    Args:
        df: Frame whose column labels have already been snake_cased. Only
            ``time_dim`` is required; every other source column is optional
            and yields an empty (``None``/NaN) derived column when absent.

    Returns:
        A new frame containing the original columns (minus the dropped ones)
        plus the derived columns.

    Raises:
        KeyError: If ``time_dim`` is missing, since ``year`` cannot be derived.
    """
    if 'time_dim' not in df.columns:
        # Fail with a clear message instead of an AttributeError on ``None``.
        raise KeyError("enrich_columns requires a 'time_dim' column to derive 'year'")

    df = df.copy()

    if 'dim1' in df.columns:
        df['gender'] = df['dim1'].apply(map_gender)
    else:
        # ``df.get('dim1', 'SEX_BTSX')`` would return a plain string here, which
        # has no ``.apply``. Use the label that the 'SEX_BTSX' default maps to.
        df['gender'] = 'Both sexes'

    df['country_code'] = df.get('spatial_dim')
    df['continent_code'] = df.get('parent_location_code')
    df['continent'] = df.get('parent_location')
    df['year'] = df['time_dim'].astype(int)
    df['life_expectancy'] = df.get('numeric_value')
    df['life_expectancy_low'] = df.get('low')
    df['life_expectancy_high'] = df.get('high')

    # Unparseable or missing timestamps become NaT/None rather than raising.
    df['record_date'] = pd.to_datetime(df.get('date'), errors='coerce')
    df['period_start'] = pd.to_datetime(df.get('time_dimension_begin'), errors='coerce')
    df['period_end'] = pd.to_datetime(df.get('time_dimension_end'), errors='coerce')

    # Coerce before subtracting so that absent bounds (columns of ``None``)
    # give NaN instead of a TypeError; numeric columns pass through unchanged.
    df['value_range'] = (
        pd.to_numeric(df['life_expectancy_high'], errors='coerce')
        - pd.to_numeric(df['life_expectancy_low'], errors='coerce')
    )

    # 'odata_context' is what to_snake_case produces for '@odata.context' (the
    # '@' is stripped); the original '@odata_context' spelling is kept as well.
    drop_cols = [
        '@odata_context', 'odata_context', 'dim1', 'dim1_type', 'dim2', 'dim2_type',
        'dim3', 'dim3_type', 'time_dimension_value', 'value'
    ]
    df.drop(columns=[c for c in drop_cols if c in df.columns], inplace=True)
    return df

In [None]:
# Load the raw export, normalise its column names and derive the working columns.
raw_df = load_life_expectancy_data()
df = enrich_columns(clean_column_names(raw_df))
print(f'Initial shape: {df.shape}')

# Keep only rows whose target lies within 0-120 (inclusive). ``between`` is
# False for NaN, so rows with a missing target are removed here as well.
valid_mask = df[TARGET_COLUMN].between(0, 120)

# Keep the first record per country/year/gender combination. The explicit
# ``copy()`` makes ``df`` an independent frame, so the column assignments in
# later cells do not trigger pandas' SettingWithCopyWarning.
df = df[valid_mask].drop_duplicates(subset=['country_code', 'year', 'gender']).copy()
print(f'Shape after quality filters: {df.shape}')


In [None]:
# Integer code per continent; ``cat.codes`` is -1 where the continent is missing.
df['continent_encoded'] = df['continent'].astype('category').cat.codes

# Rescale the year to [0, 1]. When every row shares a single year the span is
# zero and the plain formula would yield NaN (0/0), so use 0.0 in that case.
year_min = df['year'].min()
year_max = df['year'].max()
year_span = year_max - year_min
if year_span:
    df['year_normalized'] = (df['year'] - year_min) / year_span
else:
    df['year_normalized'] = 0.0

# NOTE(review): both group means are computed from the target over every row,
# i.e. before the train/test split. If either column is listed in
# FEATURE_COLUMNS, test-set targets leak into the features - confirm, and if
# so derive these means from the training rows only.
df['continent_life_expectancy_mean'] = df.groupby('continent')[TARGET_COLUMN].transform('mean')
df['country_life_expectancy_mean'] = df.groupby('country_code')[TARGET_COLUMN].transform('mean')

df.head()


In [None]:
# Names of the model inputs and of the prediction target.
target_col = TARGET_COLUMN
feature_cols = list(FEATURE_COLUMNS)

# Independent copies of the inputs and the target.
X, y = df[feature_cols].copy(), df[target_col].copy()

# Shuffled hold-out with a fixed seed: 20% of the rows form the test set.
split_options = {'test_size': 0.2, 'random_state': 42, 'shuffle': True}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
numeric_features = list(NUMERIC_FEATURES)
categorical_features = list(CATEGORICAL_FEATURES)

# Numeric columns: fill gaps from the 5 nearest neighbours, then standardise.
# NOTE(review): the imputer runs on unscaled values, so neighbour distances
# are dominated by large-magnitude columns - confirm this is acceptable.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', KNNImputer(n_neighbors=5)),
        ('scaler', StandardScaler()),
    ]
)

# scikit-learn 1.2 renamed OneHotEncoder's ``sparse`` argument to
# ``sparse_output`` and 1.4 removed the old name, so ``sparse=False`` raises a
# TypeError on current releases. Prefer the new name and fall back for older
# installations.
try:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fitting are encoded as all zeros.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', one_hot_encoder),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, numeric_features),
        ('categorical', categorical_transformer, categorical_features),
    ]
)

# Fit on the training rows only, then apply the fitted transform to both splits.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Keep the source row labels so the processed frames stay aligned with
# ``y_train`` / ``y_test``.
feature_names = preprocessor.get_feature_names_out()
X_train_processed_df = pd.DataFrame(
    X_train_processed, columns=feature_names, index=X_train.index
)
X_test_processed_df = pd.DataFrame(
    X_test_processed, columns=feature_names, index=X_test.index
)


In [None]:
# Destination files for the cleaned table and the processed splits.
clean_path = PROCESSED_DIR / 'life_expectancy_clean.csv'
train_path = PROCESSED_DIR / 'X_train_processed.csv'
test_path = PROCESSED_DIR / 'X_test_processed.csv'
y_train_path = PROCESSED_DIR / 'y_train.csv'
y_test_path = PROCESSED_DIR / 'y_test.csv'

# Tables are written without their index column.
for _frame, _destination in (
    (df, clean_path),
    (X_train_processed_df, train_path),
    (X_test_processed_df, test_path),
):
    _frame.to_csv(_destination, index=False)

# Targets are written the same way, with the header row requested explicitly.
for _target, _destination in ((y_train, y_train_path), (y_test, y_test_path)):
    _target.to_csv(_destination, index=False, header=True)

# Persist the fitted transformer next to the models so it can be reloaded later.
joblib.dump(preprocessor, MODELS_DIR / 'preprocessor.pkl')
print('Exported cleaned dataset, processed splits, and fitted preprocessor.')

In [None]:
# Row and column counts of each exported split (the targets are single-column).
pd.DataFrame(
    [
        ('X_train_processed', len(X_train_processed_df), X_train_processed_df.shape[1]),
        ('X_test_processed', len(X_test_processed_df), X_test_processed_df.shape[1]),
        ('y_train', len(y_train), 1),
        ('y_test', len(y_test), 1),
    ],
    columns=['dataset', 'rows', 'columns'],
)