# WHO Life Expectancy — EDA Notebook

**Author:** You

**Dataset:** WHO / Kaggle life expectancy dataset

**Purpose:** Clean, analyze, visualize, and draw insights about life expectancy and its drivers across countries.

---

In [None]:
# Install dependencies (uncomment if needed).
# Use the %pip magic rather than !pip so packages are installed into the
# environment of the kernel that is running this notebook.
# %pip install pandas numpy matplotlib seaborn plotly missingno scikit-learn openpyxl


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import missingno as msno
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Global plot styling: seaborn's white-grid theme and a 10x6 default figure size.
sns.set_theme(style='whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})


In [None]:
# Load data (place 'life_expectancy.csv' in the same folder as this notebook)
DATA_FILE = 'life_expectancy.csv'
df = pd.read_csv(DATA_FILE)

# Quick sanity check on size, then preview the first rows as the cell output.
print('Rows, Columns:', df.shape)
df.head()

In [None]:
# Basic information and missing values
# Headers are trimmed, lower-cased and have spaces replaced by underscores so
# later cells can refer to columns by a predictable name.
cleaned_names = []
for raw_name in df.columns:
    cleaned_names.append(raw_name.strip().lower().replace(' ', '_'))
df.columns = cleaned_names

print(df.dtypes)
missing_per_column = df.isnull().sum()
print('\nMissing values per column:\n', missing_per_column)

# Visual overview of where values are missing.
msno.matrix(df)
plt.title('Missing Values Matrix')
plt.show()

In [None]:
# Data cleaning / imputation
# Convert year to int (it is used later for sorting and decade bucketing).
if df['year'].dtype not in [np.int64, np.int32]:
    df['year'] = df['year'].astype(int)

num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = df.select_dtypes(include=['object']).columns.tolist()

# Fill numeric gaps with the country's median, then with the global median.
# The per-country median is used strictly as a *fill value*: assigning the
# result of groupby().transform() straight back to the column would overwrite
# observed values with NaN for any row whose `country` is itself missing
# (such rows belong to no group), and those would then be replaced by the
# global median.
# NOTE(review): a numeric target column (e.g. life_expectancy) is part of
# num_cols and is therefore imputed as well — confirm this is intended before
# using it as a regression target.
for col in num_cols:
    if col == 'year' or not df[col].isnull().any():
        continue
    country_median = df.groupby('country')[col].transform('median')
    df[col] = df[col].fillna(country_median)
    # Countries with no observed value at all still have gaps here.
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())

# Fill categorical with mode
for col in cat_cols:
    if df[col].isnull().any():
        mode_values = df[col].mode()
        # An all-null column has no mode; leave it untouched instead of raising.
        if not mode_values.empty:
            df[col] = df[col].fillna(mode_values.iloc[0])

print('\nMissing values after imputation:\n', df.isnull().sum())


In [None]:
# Feature engineering
# Expose `gdp` under the name `gdp_per_capita` and flag rows above its median.
if 'gdp' in df.columns:
    df['gdp_per_capita'] = df['gdp']
    gdp_median = df['gdp_per_capita'].median()
    df['high_gdp_flag'] = df['gdp_per_capita'].gt(gdp_median).astype(int)

# Bucket each year into the decade it starts in (e.g. 2007 -> 2000).
df['decade'] = df['year'].floordiv(10).mul(10)
df.head()

In [None]:
# Univariate analysis: life expectancy distribution and top countries
# Histogram with a KDE overlay of all non-missing life expectancy values.
life_exp_values = df['life_expectancy'].dropna()
fig, ax = plt.subplots()
sns.histplot(life_exp_values, kde=True, ax=ax)
ax.set_title('Life Expectancy Distribution')
ax.set_xlabel('Life Expectancy (years)')
plt.show()

# Ten countries with the highest mean life expectancy across all their rows.
country_means = df.groupby('country')['life_expectancy'].mean()
top10 = country_means.sort_values(ascending=False).head(10)
fig, ax = plt.subplots()
sns.barplot(x=top10.values, y=top10.index, ax=ax)
ax.set_title('Top 10 Countries by Average Life Expectancy')
ax.set_xlabel('Average Life Expectancy')
plt.show()

In [None]:
# Bivariate analysis: GDP vs Life Expectancy and Schooling vs Life Expectancy
if 'gdp_per_capita' in df.columns:
    # Colour by development status only when that column is present, matching
    # the guard used by the Developed-vs-Developing cell; otherwise draw a
    # single-colour scatter instead of failing on a missing column.
    hue_col = 'status' if 'status' in df.columns else None
    plt.figure()
    sns.scatterplot(data=df, x='gdp_per_capita', y='life_expectancy', hue=hue_col, alpha=0.7)
    # GDP is plotted on a log axis, as the title states.
    plt.xscale('log')
    plt.title('GDP per Capita vs Life Expectancy (log scale)')
    plt.show()

if 'schooling' in df.columns:
    plt.figure()
    sns.scatterplot(data=df, x='schooling', y='life_expectancy', alpha=0.6)
    # Overlay only the fitted regression line (scatter=False) on the points above.
    sns.regplot(data=df, x='schooling', y='life_expectancy', scatter=False, color='red')
    plt.title('Schooling vs Life Expectancy')
    plt.show()

In [None]:
# Correlation heatmap
# Put the target first, followed by every other numeric column except `year`.
# `num_cols` already contains 'life_expectancy', so it must be excluded from
# the tail of the list; otherwise the column is selected twice and the matrix
# shows a duplicated row and column.
target_col = 'life_expectancy'
corr_cols = [target_col] + [c for c in num_cols if c not in ('year', target_col)]
plt.figure(figsize=(12,10))
sns.heatmap(df[corr_cols].corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Trend analysis for sample countries
sample_countries = ['United States', 'India', 'China', 'Japan', 'Sweden']
available_countries = set(df['country'].unique())
existing = [c for c in sample_countries if c in available_countries]

# Report names that are not in the data rather than dropping them silently;
# a spelling mismatch (the dataset may use a longer official country name —
# verify against df['country']) would otherwise go unnoticed.
not_found = [c for c in sample_countries if c not in available_countries]
if not_found:
    print('Sample countries not found in data (skipped):', not_found)

plt.figure()
for country in existing:
    country_rows = df[df['country'] == country].sort_values('year')
    plt.plot(country_rows['year'], country_rows['life_expectancy'], marker='o', label=country)
# Only draw a legend when at least one line was plotted.
if existing:
    plt.legend()
plt.title('Life Expectancy over Time (sample countries)')
plt.xlabel('Year')
plt.ylabel('Life Expectancy')
plt.show()

In [None]:
# Developed vs Developing comparison
if 'status' in df.columns:
    # Summary table: mean, median and standard deviation per status group.
    status_stats = df.groupby('status')['life_expectancy'].agg(['mean','median','std'])
    display(status_stats)

    # Box plot of the same split.
    fig, ax = plt.subplots()
    sns.boxplot(data=df, x='status', y='life_expectancy', ax=ax)
    ax.set_title('Life Expectancy: Developed vs Developing')
    plt.show()

In [None]:
# Simple linear regression baseline
# Use whichever of the candidate predictors are present in the frame.
candidate_features = ['gdp_per_capita','schooling','adult_mortality','total_expenditure','bmi','alcohol']
features = [f for f in candidate_features if f in df.columns]

if len(features) >= 2:
    # Keep only rows where every predictor and the target are present.
    model_df = df.dropna(subset=[*features, 'life_expectancy'])
    X, y = model_df[features], model_df['life_expectancy']

    # 80/20 split with a fixed seed so the reported metrics are reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    lr = LinearRegression().fit(X_train, y_train)
    y_pred = lr.predict(X_test)

    print('MSE:', mean_squared_error(y_test, y_pred))
    print('R2:', r2_score(y_test, y_pred))

    # Coefficients ordered by absolute size.
    # NOTE(review): the features are not scaled in this cell, so magnitudes are
    # in each feature's own units and are not directly comparable.
    coef_df = pd.DataFrame({'feature': features, 'coefficient': lr.coef_})
    coef_df = coef_df.sort_values(by='coefficient', key=abs, ascending=False)
    display(coef_df)

In [None]:
# Save cleaned dataset and summary
df.to_csv('life_expectancy_cleaned.csv', index=False)

# Build the per-country aggregation from the columns that actually exist.
# A summary column is simply omitted when its source column is absent: the
# previous fallbacks either referenced a column that could not exist
# ('gdp' when 'gdp_per_capita' is missing) or stored the mean *year* under
# the label 'avg_schooling'.
agg_spec = {'avg_life_expectancy': ('life_expectancy', 'mean')}
gdp_source = next((c for c in ('gdp_per_capita', 'gdp') if c in df.columns), None)
if gdp_source is not None:
    agg_spec['avg_gdp'] = (gdp_source, 'mean')
if 'schooling' in df.columns:
    agg_spec['avg_schooling'] = ('schooling', 'mean')

summary = df.groupby('country').agg(**agg_spec).reset_index()
# Writing .xlsx relies on an Excel engine such as openpyxl being installed.
summary.to_excel('who_health_summary.xlsx', index=False)
print('Saved: life_expectancy_cleaned.csv and who_health_summary.xlsx')

## Notes for Presentation

- Start the notebook with a 2–3 bullet executive summary of the main findings.
- Keep code cells short and add markdown explanations for each analysis section.
- Include 4–6 strong visuals with captions, plus one slide or report with the top 3 actionable recommendations.
