# Sanitation imputation quick test
So sanh noi suy tuyen tinh + median fallback voi hoi quy tuyen tinh (year + GDP + dien + nuoc).


In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)


In [3]:
# Load and rename columns
df = pd.read_csv('../data/worldbank_2000_2024.csv')
df.columns = [
    'country_name', 'country_code', 'year', 'population', 'poverty_ratio',
    'pop_growth', 'life_expectancy', 'gdp_per_capita', 'gdp_growth',
    'sanitation', 'electricity', 'water_access', 'co2_emissions',
    'slum_population', 'labor_force'
]

# Drop high-missing columns
df = df.drop(columns=['poverty_ratio', 'slum_population'])


In [4]:
# Fill other numeric columns by skewness (exclude sanitation)
numeric_cols = df.select_dtypes(include=[np.number]).columns
exclude_cols = ['sanitation']
cols_to_fill = [c for c in numeric_cols if c not in exclude_cols]
for c in cols_to_fill:
    if df[c].isna().any():
        skew = df[c].skew()
        fill = df[c].mean() if -0.5 <= skew <= 0.5 else df[c].median()
        df[c] = df[c].fillna(fill)


In [5]:
def interpolate_by_year(frame, target, group_col='country_code', sort_col='year'):
    def _apply(group):
        group = group.sort_values(sort_col)
        group[target] = group[target].interpolate(method='linear', limit_direction='both')
        return group
    return frame.groupby(group_col, group_keys=False).apply(_apply)

def impute_interpolate_median(frame, target='sanitation'):
    work = frame.copy()
    work = interpolate_by_year(work, target)
    country_median = work.groupby('country_code')[target].transform('median')
    work[target] = work[target].fillna(country_median)
    work[target] = work[target].fillna(work[target].median())
    return work

def impute_linear_regression(frame, target='sanitation', predictors=None, min_samples=5):
    if predictors is None:
        predictors = ['year', 'gdp_per_capita', 'electricity', 'water_access']
    work = frame.copy()
    known = work[work[target].notna()]
    if known.empty:
        return work
    global_model = LinearRegression()
    global_model.fit(known[predictors], known[target])

    for code, idx in work.groupby('country_code').groups.items():
        country = work.loc[idx]
        missing = country[country[target].isna()]
        if missing.empty:
            continue
        country_known = country[country[target].notna()]
        if len(country_known) >= min_samples:
            model = LinearRegression()
            model.fit(country_known[predictors], country_known[target])
        else:
            model = global_model
        work.loc[missing.index, target] = model.predict(missing[predictors])
    return work


In [6]:
# Mask some known values to evaluate imputation quality
mask_rate = 0.2
known_idx = df[df['sanitation'].notna()].index
mask_size = int(len(known_idx) * mask_rate)
mask_idx = np.random.choice(known_idx, size=mask_size, replace=False)

df_masked = df.copy()
true_values = df_masked.loc[mask_idx, 'sanitation'].copy()
df_masked.loc[mask_idx, 'sanitation'] = np.nan


In [7]:
def eval_imputation(imputed, true, idx):
    pred = imputed.loc[idx, 'sanitation']
    mae = mean_absolute_error(true, pred)
    rmse = np.sqrt(mean_squared_error(true, pred))
    return mae, rmse

# Method A: interpolation + median fallback
imputed_a = impute_interpolate_median(df_masked)
mae_a, rmse_a = eval_imputation(imputed_a, true_values, mask_idx)

# Method B: linear regression (year + GDP + electricity + water)
imputed_b = impute_linear_regression(df_masked)
mae_b, rmse_b = eval_imputation(imputed_b, true_values, mask_idx)

print('Method A (interpolate + median): MAE', round(mae_a, 4), 'RMSE', round(rmse_a, 4))
print('Method B (linear regression):    MAE', round(mae_b, 4), 'RMSE', round(rmse_b, 4))


  return frame.groupby(group_col, group_keys=False).apply(_apply)


Method A (interpolate + median): MAE 0.0841 RMSE 0.3043
Method B (linear regression):    MAE 0.4538 RMSE 1.7139


In [13]:
imputed_b['country_name'].unique()

array(['Aruba', 'Afghanistan', 'Angola', 'Albania', 'Andorra',
       'United Arab Emirates', 'Argentina', 'Armenia', 'American Samoa',
       'Antigua and Barbuda', 'Australia', 'Austria', 'Azerbaijan',
       'Burundi', 'Belgium', 'Benin', 'Burkina Faso', 'Bangladesh',
       'Bulgaria', 'Bahrain', 'Bahamas, The', 'Bosnia and Herzegovina',
       'Belarus', 'Belize', 'Bermuda', 'Bolivia', 'Brazil', 'Barbados',
       'Brunei Darussalam', 'Bhutan', 'Botswana',
       'Central African Republic', 'Canada', 'Switzerland',
       'Channel Islands', 'Chile', 'China', "Cote d'Ivoire", 'Cameroon',
       'Congo, Dem. Rep.', 'Congo, Rep.', 'Colombia', 'Comoros',
       'Cabo Verde', 'Costa Rica', 'Cuba', 'Curacao', 'Cayman Islands',
       'Cyprus', 'Czechia', 'Germany', 'Djibouti', 'Dominica', 'Denmark',
       'Dominican Republic', 'Algeria', 'Ecuador', 'Egypt, Arab Rep.',
       'Eritrea', 'Spain', 'Estonia', 'Ethiopia', 'Finland', 'Fiji',
       'France', 'Faroe Islands', 'Micronesia, Fed

In [18]:
imputed_b[imputed_b['country_name'] == 'Georgia']['sanitation']

1725    42.614288
1726    41.880343
1727    41.184892
1728    40.136015
1729    39.423502
1730    38.558766
1731    37.691783
1732    36.868457
1733    35.951176
1734    35.077741
1735    34.201923
1736    33.323943
1737    32.443509
1738    31.561104
1739    30.647446
1740    29.586024
1741    28.805948
1742    27.876421
1743    26.940672
1744    26.202059
1745    25.117428
1746    24.552508
1747    24.077853
1748    23.247393
1749    22.526937
Name: sanitation, dtype: float64