# 01 - Exploratory Data Analysis
This notebook investigates the WHO Global Health Observatory life expectancy records and documents key exploratory findings that will inform the downstream preprocessing and modelling steps.

## Objectives
- Load the raw WHO API export and convert it into an analysis-friendly pandas DataFrame.
- Inspect structure, missingness, and categorical coverage (countries, years, genders).
- Visualise numerical and categorical distributions, correlation patterns, and outliers.
- Produce aggregated insights at country and year levels that will guide feature engineering.

In [None]:
import json
import re
from pathlib import Path
from typing import Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

# Plot styling: seaborn theme first, then the default matplotlib figure size.
sns.set_theme(context='notebook', style='whitegrid')
plt.rcParams.update({'figure.figsize': (10, 5)})
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

# Relative path to the raw WHO export that the loader reads by default.
DATA_PATH = Path('..') / 'data' / 'life_expectancy.csv'

In [None]:
def load_life_expectancy_data(data_path: Path = DATA_PATH) -> pd.DataFrame:
    '''Load the WHO dataset whether it is stored as CSV text or JSON payload.

    The file is read once as text to detect its format: content whose first
    non-whitespace character is ``{`` is parsed as a JSON object and the list
    under its ``value`` key becomes the rows; anything else is handed to
    ``pandas.read_csv``.

    Args:
        data_path: Location of the export. Defaults to ``DATA_PATH``.

    Returns:
        A DataFrame with one row per record. A JSON object without a
        ``value`` key yields an empty DataFrame.

    Raises:
        OSError: If the file cannot be read.
        json.JSONDecodeError: If the content starts with ``{`` but is not
            valid JSON.
    '''
    data_path = data_path.resolve()
    # 'utf-8-sig' drops a leading byte-order mark (str.strip() does not), so a
    # BOM-prefixed JSON export is still recognised by the check below instead
    # of silently falling through to the CSV parser.
    text = data_path.read_text(encoding='utf-8-sig').strip()
    if text.startswith('{'):
        payload = json.loads(text)
        return pd.DataFrame(payload.get('value', []))
    return pd.read_csv(data_path)


def to_snake_case(value: str) -> str:
    '''Convert arbitrary column names to snake_case.

    Word boundaries inside camelCase / PascalCase names are split first
    (``SpatialDim`` -> ``spatial_dim``, ``Dim1Type`` -> ``dim1_type``), then
    every run of non-alphanumeric characters collapses to a single
    underscore, leading/trailing underscores are removed and the result is
    lower-cased.

    Args:
        value: Column label. Falsy labels (``None``, ``''``) give ``''``;
            other non-string labels are converted with ``str`` first.

    Returns:
        The snake_case form of ``value``.
    '''
    value = str(value or '')
    # Insert '_' at lower/digit -> upper transitions and before the last
    # capital of an acronym that is followed by a lowercase letter
    # ('HTTPResponse' -> 'HTTP_Response'). Without this step PascalCase names
    # are merely lower-cased and lose their word boundaries.
    value = re.sub(r'(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])', '_', value)
    value = re.sub(r'[^0-9a-zA-Z]+', '_', value)
    return value.strip('_').lower()


def clean_column_names(df: pd.DataFrame) -> pd.DataFrame:
    '''Return a copy of ``df`` with every column label run through ``to_snake_case``.

    The input DataFrame is left unmodified.
    '''
    renamed = df.copy()
    renamed.columns = list(map(to_snake_case, renamed.columns))
    return renamed


def map_gender(value: str) -> str:
    '''Translate a sex dimension code into a readable gender label.

    Known codes are matched case-insensitively. A string that is not a known
    code is returned unchanged, and any non-string input (for example a
    missing value) is labelled ``'Both sexes'``.
    '''
    if isinstance(value, str):
        labels_by_code = {
            'sex_mle': 'Male',
            'sex_fmle': 'Female',
            'sex_btsx': 'Both sexes',
        }
        return labels_by_code.get(value.lower(), value)
    return 'Both sexes'


def enrich_columns(df: pd.DataFrame) -> pd.DataFrame:
    '''Derive analysis-friendly columns from the snake_case WHO columns.

    Works on a copy of ``df``. Each derived column is added only when its
    source column is present; the source columns themselves are kept unless
    they appear in the drop list. ``value_range`` (upper bound minus lower
    bound) is always added and is NaN wherever either bound is unavailable.

    Args:
        df: DataFrame whose column names are already snake_case.

    Returns:
        The enriched copy.
    '''
    def _label_gender(series):
        return series.apply(map_gender)

    def _as_int(series):
        return series.astype(int)

    def _as_datetime(series):
        # Unparseable values become NaT instead of raising.
        return pd.to_datetime(series, errors='coerce')

    # (source column, derived column, transform); None copies the column as-is.
    # The order determines the order of the appended columns.
    derivations = [
        ('dim1', 'gender', _label_gender),
        ('spatial_dim', 'country_code', None),
        ('parent_location_code', 'continent_code', None),
        ('parent_location', 'continent', None),
        ('time_dim', 'year', _as_int),
        ('numeric_value', 'life_expectancy', None),
        ('low', 'life_expectancy_low', None),
        ('high', 'life_expectancy_high', None),
        ('date', 'record_date', _as_datetime),
        ('time_dimension_begin', 'period_start', _as_datetime),
        ('time_dimension_end', 'period_end', _as_datetime),
    ]

    enriched = df.copy()
    for source, target, transform in derivations:
        if source not in enriched.columns:
            continue
        column = enriched[source]
        enriched[target] = column if transform is None else transform(column)

    # Raw columns that are no longer needed; names absent from the frame are skipped.
    obsolete = [
        '@odata_context', 'dim1', 'dim1_type', 'dim2', 'dim2_type', 'dim3', 'dim3_type',
        'time_dimension_value', 'value'
    ]
    enriched.drop(columns=obsolete, errors='ignore', inplace=True)

    # Width of the reported interval; a missing bound falls back to NaN.
    upper = enriched.get('life_expectancy_high', np.nan)
    lower = enriched.get('life_expectancy_low', np.nan)
    enriched['value_range'] = upper - lower
    return enriched


def summarize_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    '''Report the columns of ``df`` that contain missing values.

    Returns:
        A DataFrame indexed by column name with ``missing_count`` and
        ``missing_percent`` (share of rows, 0-100), limited to columns with
        at least one missing value and sorted by count, largest first.
    '''
    null_counts = df.isna().sum()
    report = null_counts.to_frame(name='missing_count')
    report['missing_percent'] = null_counts.div(len(df)).mul(100)
    has_gaps = report['missing_count'] > 0
    return report.loc[has_gaps].sort_values(by='missing_count', ascending=False)


def get_unique_values(df: pd.DataFrame, column: str) -> Dict[str, object]:
    '''Summarise the distinct non-missing values of one column.

    Args:
        df: DataFrame to inspect.
        column: Name of the column to summarise.

    Returns:
        A dict with ``count`` (number of distinct non-missing values) and
        ``sample`` (the first ten values in sorted order).

    Raises:
        KeyError: If ``column`` is not present in ``df``.
    '''
    values = df[column].dropna().unique()
    try:
        ordered = sorted(values)
    except TypeError:
        # Columns mixing incomparable types (e.g. str and int) cannot be
        # sorted natively; order by string form so the overview still renders.
        ordered = sorted(values, key=str)
    return {'count': len(values), 'sample': ordered[:10]}

In [None]:
# Load the raw export, normalise the column names, then derive the analysis columns.
raw_df = load_life_expectancy_data()
clean_df = raw_df.pipe(clean_column_names).pipe(enrich_columns)

# Compare dimensions before and after cleaning, then inspect dtypes and the first rows.
print(f"Raw shape: {raw_df.shape}")
print(f"Clean shape: {clean_df.shape}")
clean_df.info()
clean_df.head()

In [None]:
# Columns with at least one missing value, most affected first.
missing_summary = clean_df.pipe(summarize_missing_values)
missing_summary

In [None]:
# Distinct-value count and a short sample for each key dimension.
overview_columns = {'countries': 'country_code', 'years': 'year', 'genders': 'gender'}
unique_overview = {
    label: get_unique_values(clean_df, column)
    for label, column in overview_columns.items()
}
# Transpose so that each dimension becomes one row.
pd.DataFrame(unique_overview).T

In [None]:
# Summary statistics for every column (numeric and non-numeric), one row per column.
clean_df.describe(include='all').T

In [None]:
# Visualise the null mask for a reproducible random sample of at most 500 rows.
sample_size = min(500, len(clean_df))
sample_for_missing = clean_df.sample(n=sample_size, random_state=42)
plt.figure(figsize=(14, 6))
heatmap_ax = sns.heatmap(sample_for_missing.isna(), cbar=False, yticklabels=False)
heatmap_ax.set_title('Missing Value Heatmap (sample)')
plt.show()

In [None]:
numeric_cols = [
    'life_expectancy', 'life_expectancy_low', 'life_expectancy_high', 'value_range', 'year'
]
# One histogram with a KDE overlay per numeric column; columns that are not
# present in clean_df are skipped.
for col in (name for name in numeric_cols if name in clean_df.columns):
    plt.figure(figsize=(8, 4))
    hist_ax = sns.histplot(clean_df[col].dropna(), kde=True, bins=30)
    hist_ax.set_title(f'Distribution of {col}')
    hist_ax.set_xlabel(col)
    plt.show()

In [None]:
categorical_cols = ['gender', 'continent', 'country_code']
# Bar chart of the 15 most frequent values per categorical column; columns that
# are not present in clean_df are skipped.
for col in (name for name in categorical_cols if name in clean_df.columns):
    top_counts = (
        clean_df[col]
        .value_counts()
        .head(15)
        .rename_axis(col)
        .reset_index(name='count')
    )
    fig = px.bar(top_counts, x=col, y='count', title=f'{col.title()} distribution (top 15)')
    fig.update_layout(xaxis_tickangle=-45, height=400)
    fig.show()

In [None]:
# One life-expectancy boxplot per grouping column: (column, title, x-tick rotation).
# A rotation of None leaves the tick labels at their default orientation.
boxplot_specs = [
    ('continent', 'Life Expectancy by Continent (Boxplot)', 45),
    ('gender', 'Life Expectancy by Gender (Boxplot)', None),
]
for group_col, plot_title, tick_rotation in boxplot_specs:
    plt.figure(figsize=(12, 6))
    sns.boxplot(data=clean_df, x=group_col, y='life_expectancy')
    if tick_rotation is not None:
        plt.xticks(rotation=tick_rotation)
    plt.title(plot_title)
    plt.show()

In [None]:
corr_columns = ['life_expectancy', 'life_expectancy_low', 'life_expectancy_high', 'value_range', 'year']
# The derived columns exist only when their source columns were present in the
# raw data, so restrict the selection to the columns that are available (the
# histogram cell applies the same guard) instead of failing with a KeyError.
corr_columns = [col for col in corr_columns if col in clean_df.columns]
# Pairwise correlation between the numeric analysis columns.
corr_matrix = clean_df[corr_columns].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='viridis')
plt.title('Correlation Matrix (Seaborn)')
plt.show()

# Same matrix as an interactive Plotly figure.
fig = px.imshow(corr_matrix, text_auto=True, aspect='auto', title='Correlation Matrix (Plotly)')
fig.show()

In [None]:
# Mean life expectancy per country code (all rows for the code pooled), highest first.
country_agg = (
    clean_df.groupby('country_code', as_index=False)['life_expectancy']
    .mean()
    .sort_values(by='life_expectancy', ascending=False)
)
country_agg.head(10)

In [None]:
# World map coloured by the per-country mean computed in country_agg;
# regions are matched through the country_code column.
choropleth_options = {
    'locations': 'country_code',
    'color': 'life_expectancy',
    'color_continuous_scale': 'viridis',
    'projection': 'natural earth',
    'title': 'Average Life Expectancy by Country',
}
fig = px.choropleth(country_agg, **choropleth_options)
fig.update_layout(height=500)
fig.show()

In [None]:
# Unweighted mean of life_expectancy over all rows of each year.
yearly_trend = clean_df.groupby('year', as_index=False)['life_expectancy'].mean()
fig = px.line(
    yearly_trend,
    x='year',
    y='life_expectancy',
    markers=True,
    title='Global Life Expectancy Trend',
)
fig.show()

### Key Insights
- Life expectancy ranges widely across countries, with noticeable regional clustering (e.g., Western Pacific on the higher end, African region on the lower end).
- Gender gaps are visible but modest for most countries; `Both sexes` estimates closely track the midpoint between male and female records.
- Correlation analysis shows that the reported numeric value moves closely with its lower and upper confidence bounds, which supports the internal consistency of the data.
- The global temporal trend shows a steady increase from the early 2000s through the mid-2010s, supporting the inclusion of `year`-based engineered features.
- Countries with wider uncertainty intervals (`value_range`) typically belong to regions with lower data coverage, which may influence model confidence.