# WHO Models

## Importing libraries and features

In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import statsmodels.tools
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Ridge
from statsmodels.tools.tools import pinv_extended 
from numpy.linalg import cond

In [8]:
# Limit NumPy's print precision to 5 decimal places for array output.
# NOTE(review): the metric prints recorded in this notebook still show full
# float precision, so this option does not appear to shorten the f-string
# output of the training/evaluation functions — confirm whether it is needed.
np.set_printoptions(precision=5)

In [9]:
def load_indexed_csv(path):
    """Read a CSV and restore its saved row index.

    The file's 'Unnamed: 0' column becomes the index and is renamed to
    'Index'. Nothing is mutated in place, so re-running the cell always
    rebuilds the frame from the file.

    Parameters
    ----------
    path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The file contents indexed by the former 'Unnamed: 0' column.
    """
    return pd.read_csv(path).set_index('Unnamed: 0').rename_axis('Index')


# Loads feature-engineered data, ready for modelling
X_train_scale = load_indexed_csv('X_train_scale.csv')
X_test_scale = load_indexed_csv('X_test_scale.csv')

X_train_scale2 = load_indexed_csv('X_train_scale2.csv')
X_test_scale2 = load_indexed_csv('X_test_scale2.csv')

# Take only the y Series from each csv; the restored index is kept so the
# targets stay aligned with the matching X frames.
y_train = load_indexed_csv('y_train.csv')['Life_expectancy']
y_test = load_indexed_csv('y_test.csv')['Life_expectancy']


## Full Model

In [10]:
def train_model(X_train_scale, y_train):
    """Fit an OLS regression with an intercept and print training-set metrics.

    Parameters
    ----------
    X_train_scale : pandas.DataFrame
        Scaled training predictors. An intercept column is added with
        ``sm.add_constant`` before fitting.
    y_train : pandas.Series
        Training target values.

    Returns
    -------
    The fitted statsmodels OLS results object.

    Notes
    -----
    ``eval_measures.vare`` is the variance of the prediction error
    (residuals), not the share of variance explained, so it is printed as
    "Variance of Error". R-squared is available from the model summary.
    """
    design = sm.add_constant(X_train_scale)  # Adds constant to fully transformed data
    model = sm.OLS(y_train, design).fit()  # Train linear regression model

    # Predict once and reuse the result for every metric.
    fitted = model.predict(design)

    # Metrics
    measures = statsmodels.tools.eval_measures
    train_rmse = measures.rmse(y_train, fitted)
    train_mae = measures.meanabs(y_train, fitted)
    train_vare = measures.vare(y_train, fitted)
    print(f'Train Root Mean Squared Error: {train_rmse}')
    print(f'Train Mean Absolute Error: {train_mae}')
    print(f'Train Variance of Error: {train_vare}')

    return model

def evaluate_model(model, X_test_scale, y_test):
    """Score a fitted OLS model on held-out data and print the test metrics.

    Parameters
    ----------
    model : fitted statsmodels results object
        Must expose ``predict`` and have been trained on a design matrix
        that includes an added constant column.
    X_test_scale : pandas.DataFrame
        Scaled test predictors, without the intercept column.
    y_test : pandas.Series
        True target values for the test rows.

    Returns
    -------
    pandas.DataFrame
        Columns 'Actual' and 'Predicted' for the test rows.

    Notes
    -----
    ``eval_measures.vare`` is the variance of the prediction error, not the
    share of variance explained, so it is printed as "Variance of Error".
    """
    # has_constant='add' always appends the intercept column. The default
    # ('skip') omits it when some feature happens to be constant in the test
    # split, which would leave the design matrix out of step with training.
    design = sm.add_constant(X_test_scale, has_constant='add')
    predictions = model.predict(design)
    results = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})

    measures = statsmodels.tools.eval_measures
    rmse = measures.rmse(y_test, predictions)
    mae = measures.meanabs(y_test, predictions)
    vare = measures.vare(y_test, predictions)

    print(f'Root Mean Squared Error: {rmse}')
    print(f'Mean Absolute Error: {mae}')
    print(f'Variance of Error: {vare}')

    return results

In [11]:
# Fit the OLS model on the full scaled feature set, print train and test
# metrics, then display the statsmodels regression summary as the cell output.
full_linreg_model = train_model(X_train_scale, y_train)
evaluate_model(full_linreg_model, X_test_scale, y_test)

full_linreg_model.summary()

Train Root Mean Squared Error: 1.3483679667582404
Train Mean Absolute Error: 1.067455255875636
Train Variance Explained: 1.8180961737797514
Root Mean Squared Error: 1.3518598668637192
Mean Absolute Error: 1.0813251131141326
Variance Explained: 1.8246898035138661


0,1,2,3
Dep. Variable:,Life_expectancy,R-squared:,0.98
Model:,OLS,Adj. R-squared:,0.98
Method:,Least Squares,F-statistic:,9184.0
Date:,"Mon, 14 Jul 2025",Prob (F-statistic):,0.0
Time:,10:13:26,Log-Likelihood:,-3935.6
No. Observations:,2291,AIC:,7897.0
Df Residuals:,2278,BIC:,7972.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,76.3813,0.379,201.468,0.000,75.638,77.125
GDP_per_capita_log,3.9510,0.283,13.942,0.000,3.395,4.507
Adult_mortality,-30.8633,0.461,-67.011,0.000,-31.766,-29.960
Economy_status_Developed,0.8520,0.115,7.400,0.000,0.626,1.078
Under_five_deaths,-16.8442,0.401,-41.972,0.000,-17.631,-16.057
BMI,-2.4692,0.263,-9.399,0.000,-2.984,-1.954
Schooling,0.9995,0.239,4.176,0.000,0.530,1.469
Thinness_ten_nineteen_years,-1.3312,0.246,-5.402,0.000,-1.814,-0.848
Year,0.4487,0.095,4.728,0.000,0.263,0.635

0,1,2,3
Omnibus:,25.804,Durbin-Watson:,1.972
Prob(Omnibus):,0.0,Jarque-Bera (JB):,35.16
Skew:,0.14,Prob(JB):,2.32e-08
Kurtosis:,3.538,Cond. No.,44.4


## Slim Model (non-medical records)

In [12]:
# Same train/evaluate/summary steps as the full model, run on the second
# ("slim") scaled feature set with the same targets.
slim_linreg_model = train_model(X_train_scale2, y_train)
evaluate_model(slim_linreg_model, X_test_scale2, y_test)

slim_linreg_model.summary()

Train Root Mean Squared Error: 1.382004782100395
Train Mean Absolute Error: 1.0899239205263547
Train Variance Explained: 1.9099372177483598
Root Mean Squared Error: 1.3969218961815861
Mean Absolute Error: 1.1051908515665594
Variance Explained: 1.9507410218906178


0,1,2,3
Dep. Variable:,Life_expectancy,R-squared:,0.979
Model:,OLS,Adj. R-squared:,0.979
Method:,Least Squares,F-statistic:,15000.0
Date:,"Mon, 14 Jul 2025",Prob (F-statistic):,0.0
Time:,10:13:34,Log-Likelihood:,-3992.0
No. Observations:,2291,AIC:,8000.0
Df Residuals:,2283,BIC:,8046.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,75.6883,0.188,402.628,0.000,75.320,76.057
GDP_per_capita_log,3.4130,0.258,13.223,0.000,2.907,3.919
Adult_mortality,-29.9728,0.303,-98.792,0.000,-30.568,-29.378
Economy_status_Developed,1.2061,0.113,10.694,0.000,0.985,1.427
Under_five_deaths,-16.9706,0.331,-51.233,0.000,-17.620,-16.321
Schooling,0.6785,0.234,2.903,0.004,0.220,1.137
Year,0.3419,0.096,3.561,0.000,0.154,0.530
Alcohol_consumption,1.0791,0.197,5.484,0.000,0.693,1.465

0,1,2,3
Omnibus:,25.21,Durbin-Watson:,1.966
Prob(Omnibus):,0.0,Jarque-Bera (JB):,40.328
Skew:,0.057,Prob(JB):,1.75e-09
Kurtosis:,3.64,Cond. No.,21.1


## Ridge Model

The following cells run a Ridge model over the original data, scaled with RobustScaler(), to verify the metrics.

In [144]:
# Load the original dataset; it is split into features and target below and
# passed to the Ridge pipeline.
df = pd.read_csv('Life Expectancy Data.csv')

In [145]:
# Splits features and target

def create_ft(df):
    """Separate a DataFrame into predictor columns and the target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Life_expectancy' column.

    Returns
    -------
    tuple
        ``(features, target)``: every column except 'Life_expectancy', and
        the 'Life_expectancy' column itself. ``df`` is not modified.
    """
    label = 'Life_expectancy'
    predictors = df.drop(columns=[label])
    response = df[label]
    return predictors, response
    

In [146]:
# Split the loaded DataFrame into the feature columns and the
# 'Life_expectancy' target for the Ridge pipeline.
features, target = create_ft(df)

In [147]:
def ridge_model(features, target, alpha=0.1, test_size=0.2, random_state=42):
    """Split, scale, and fit a Ridge regression, printing test-set metrics.

    Parameters
    ----------
    features : pandas.DataFrame
        Predictor columns before feature engineering and scaling.
    target : pandas.Series
        Target values aligned with ``features``.
    alpha : float, default 0.1
        Regularisation strength passed to ``Ridge``.
    test_size : float, default 0.2
        Fraction of rows held out by ``train_test_split``.
    random_state : int, default 42
        Seed for the train/test split.

    Returns
    -------
    tuple
        ``(model, x_scaler, y_scaler, X_train_scaled, X_test_scaled,
        y_test_scaled)``. ``y_test_scaled`` is the *scaled* test target as a
        Series carrying the test rows' index. The function previously
        returned the unscaled ``y_test`` in this position, although the
        caller unpacks it as ``y_test_scaled`` and ``evaluate_ridge_model``
        inverse-transforms it.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    # NOTE(review): feature_eng is not defined in the visible part of this
    # notebook; it must exist in the kernel before this function is called.
    X_train = feature_eng(X_train)
    X_test = feature_eng(X_test)

    x_scaler = RobustScaler()
    y_scaler = RobustScaler()

    # Scalers are fitted on the training split only and then applied to the
    # test split.
    X_train_scaled = x_scaler.fit_transform(X_train)
    X_test_scaled = x_scaler.transform(X_test)

    y_train_scaled = y_scaler.fit_transform(y_train.values.reshape(-1, 1)).flatten()
    y_test_scaled = y_scaler.transform(y_test.values.reshape(-1, 1)).flatten()

    model = Ridge(alpha=alpha)
    model.fit(X_train_scaled, y_train_scaled)

    # Map predictions back to the target's original scale so the RMSE is
    # comparable with the unscaled y_test.
    predictions_scaled = model.predict(X_test_scaled)
    predictions = y_scaler.inverse_transform(predictions_scaled.reshape(-1, 1)).flatten()

    rmse = statsmodels.tools.eval_measures.rmse(y_test, predictions)
    print(f'Ridge Root Mean Squared Error: {rmse:.5f}')

    r_squared = model.score(X_test_scaled, y_test_scaled)
    print(f'Ridge R^2 (scaled): {r_squared:.5f}')

    condition_number = np.linalg.cond(X_train_scaled)
    print(f'Ridge Condition Number: {condition_number:.5f}')

    # Return the scaled target as a Series so it keeps the test index and
    # still exposes `.values`.
    y_test_scaled = pd.Series(y_test_scaled, index=y_test.index, name=y_test.name)

    return model, x_scaler, y_scaler, X_train_scaled, X_test_scaled, y_test_scaled

def evaluate_ridge_model(model, X_test_scaled, y_test_scaled, y_scaler):
    """Evaluate a fitted Ridge model in the target's original scale.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict``; it is called on ``X_test_scaled``.
    X_test_scaled : array-like
        Scaled test predictors.
    y_test_scaled : pandas.Series or numpy.ndarray
        Scaled test target. Both types are accepted; the values are
        converted with ``np.asarray`` before being inverse-transformed.
    y_scaler : fitted scaler
        Its ``inverse_transform`` maps scaled targets and predictions back
        to the original scale.

    Returns
    -------
    pandas.DataFrame
        Columns 'Actual' and 'Predicted', both in the original scale.

    Notes
    -----
    ``eval_measures.vare`` is the variance of the prediction error, not the
    share of variance explained, so it is printed as "Variance of Error".
    """
    predictions_scaled = model.predict(X_test_scaled)

    predictions = y_scaler.inverse_transform(predictions_scaled.reshape(-1, 1)).flatten()
    # np.asarray handles both a Series and a plain ndarray; the former
    # `.values` access raised AttributeError for ndarray input.
    actual = y_scaler.inverse_transform(np.asarray(y_test_scaled).reshape(-1, 1)).flatten()

    measures = statsmodels.tools.eval_measures
    rmse = measures.rmse(actual, predictions)
    mae = measures.meanabs(actual, predictions)
    vare = measures.vare(actual, predictions)

    print(f'Ridge Test Root Mean Squared Error: {rmse:.5f}')
    print(f'Ridge Test Mean Absolute Error: {mae:.5f}')
    print(f'Ridge Test Variance of Error: {vare:.5f}')

    return pd.DataFrame({'Actual': actual, 'Predicted': predictions})

In [148]:
# Bind the fitted estimator to its own name: unpacking into `ridge_model`
# would overwrite the function, so re-running this cell would fail.
ridge_reg, x_scaler, y_scaler, X_train_scaled, X_test_scaled, y_test_scaled = ridge_model(features, target)

Ridge Root Mean Squared Error: 1.34905
Ridge R^2 (scaled): 0.97807
Ridge Condition Number: 81.81233
