In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing  import StandardScaler
from sklearn.model_selection import GroupShuffleSplit
from sklearn.impute import SimpleImputer
import matplotlib as mpl

In [2]:
# ──────────────────────────────────────────────────────────────
# 1. Load raw data and derive helper columns
# ----------------------------------------------------------------
# Read the CSV, parse `date` (values that cannot be parsed become NaT),
# then append a `year` column taken from the parsed timestamps.
df = (
    pd.read_csv("world_bank.csv")
      .assign(date=lambda frame: pd.to_datetime(frame['date'], errors='coerce'))
      .assign(year=lambda frame: frame['date'].dt.year)
)

In [3]:
# ──────────────────────────────────────────────────────────────
# Block 2  ▸  Core cleaning (years, sparse columns, inactive countries)
# -----------------------------------------------------------------
# NOTE: this cell rebinds `df`, so it is not idempotent — re-run the load
# cell before re-running it (a second run fails in 2-B because the columns
# are already gone).

# 2-A  Keep only 2017–2023 (`between` is inclusive on both ends; rows whose
#      year is NaN are dropped as well)
df = df[df['year'].between(2017, 2023)].copy()

# 2-B  Drop ultra-sparse / discontinued or irrelevant features.
#      Spellings such as 'goverment' presumably mirror the CSV header —
#      verify against the file before "correcting" them.
drop_cols = [
    'doing_business', 'time_to_get_operation_license',
    'multidimensional_poverty_headcount_ratio%', 'human_capital_index',
    'risk_premium_on_lending', 'electric_power_consumption',
    'logistic_performance_index', 'statistical_performance_indicators',
    'avg_precipitation',
    'goverment_effectiveness_std', 'regulatory_quality_std',
    'voice_and_accountability_std', 'control_of_corruption_std',
    'rule_of_law_std', 'political_stability_std'
]
# Rebind instead of `inplace=True`: same result, but no hidden mutation of
# the frame object. A missing column still raises KeyError, so a typo in
# the list above is not silently ignored.
df = df.drop(columns=drop_cols)

# 2-C  Remove countries with ≥ 90 % missing data.
#      For each country: fraction of NaN cells per column, then averaged
#      across *all* columns — the helper columns (country/date/year) are
#      part of that average.
pct_missing = df.isna().groupby(df['country']).mean().mean(axis=1)
sparse_countries = pct_missing[pct_missing >= 0.9].index
df = df[~df['country'].isin(sparse_countries)].copy()

# 2-D  Remove any country with no Gini observations
#      (`count` ignores NaN, so 0 means the country never reports a Gini).
has_gini = df.groupby('country')['gini_index'].transform('count') > 0
df = df[has_gini].copy()


In [4]:
# ──────────────────────────────────────────────────────────────
# Block 3  ▸  Impute missing values (country mean → global mean)
# -----------------------------------------------------------------
# Every column except the identifiers and the Gini column is imputed.
# (`Index.difference` returns the names sorted.)
impute_cols = df.columns.difference(['country', 'date', 'year', 'gini_index'])
df_imp = df.sort_values(['country', 'date']).copy()

# 3-A  Fill NaNs with that country’s mean.
#      `transform('mean')` broadcasts each country's column means back onto
#      its rows in one vectorised pass (no Python lambda per group/column);
#      `fillna` with a DataFrame aligns on both index and columns.
#      The mean is taken over all of the country's rows in the frame.
country_means = df_imp.groupby('country')[impute_cols].transform('mean')
df_imp[impute_cols] = df_imp[impute_cols].fillna(country_means)

# 3-B  Fill any remaining NaNs with the global mean
#      (columns a country never reports are still NaN after 3-A).
global_means = df_imp[impute_cols].mean()
df_imp[impute_cols] = df_imp[impute_cols].fillna(global_means)


In [5]:
# ──────────────────────────────────────────────────────────────
# Block 4  ▸  Feature engineering: rolling stats, lags, next-year target
# -----------------------------------------------------------------
# NOTE: the rolling windows and lags below are positional (row-based)
# within each country, so they are "5-year" / "k-year" statistics only
# when a country has exactly one row per consecutive year.
df_feat = df_imp.sort_values(['country', 'date']).copy()

# 4-A  5-row rolling mean & std for each feature (NaN until 5 rows exist)
rolled     = df_feat.groupby('country')[impute_cols].rolling(5, min_periods=5)
# groupby-rolling prepends the country as index level 0; dropping it
# restores the original row labels so the concat aligns row-for-row.
roll_mean  = rolled.mean().reset_index(level=0, drop=True).add_suffix('_rollmean')
roll_std   = rolled.std().reset_index(level=0, drop=True).add_suffix('_rollstd')
df_feat    = pd.concat([df_feat, roll_mean, roll_std], axis=1)

# 4-B  Create 1–4 row lag features for the raw columns.
#      All lag frames are built first and concatenated once.
by_country = df_feat.groupby('country')
lag_frames = [
    by_country[impute_cols].shift(lag).add_suffix(f'_lag{lag}')
    for lag in range(1, 5)
]
df_feat = pd.concat([df_feat, *lag_frames], axis=1)

# 4-C  Define the prediction target: next-year Gini.
#      shift(-1) returns the *following row* of the same country. Keep it
#      only when that row really is year + 1; if a year is missing the
#      target becomes NaN (and the row is dropped in 4-D) instead of being
#      silently labelled with a later year's Gini.
by_country     = df_feat.groupby('country')
following_gini = by_country['gini_index'].shift(-1)
following_year = by_country['year'].shift(-1)
df_feat['gini_next'] = following_gini.where(following_year == df_feat['year'] + 1)

# 4-D  Clean up and filter:
#      • Drop original Gini column
#      • Remove rows without a next-year Gini
#      • Keep only rows from 2021 onward now that all features are ready
#        (the index is reset before the year filter, so the surviving rows
#        keep non-contiguous labels)
df_feat = df_feat.drop(columns='gini_index')
df_feat = df_feat[df_feat['gini_next'].notna()].reset_index(drop=True)
df_feat = df_feat[df_feat['year'] >= 2021].copy()

# Preview only the first rows instead of rendering the whole frame.
df_feat.head(10)

Unnamed: 0,country,date,agricultural_land%,forest_land%,land_area,trade_in_services%,control_of_corruption_estimate,access_to_electricity%,renewvable_energy_consumption%,CO2_emisions,...,real_interest_rate_lag4,regulatory_quality_estimate_lag4,renewvable_energy_consumption%_lag4,research_and_development_expenditure%_lag4,rule_of_law_estimate_lag4,rural_population_lag4,tax_revenue%_lag4,trade_in_services%_lag4,voice_and_accountability_estimate_lag4,gini_next
8,Argentina,2021-01-01,43.102346,10.400764,2736690.0,4.632097,-0.422273,100.0,10.3675,169714.95,...,0.455893,-0.1963,10.37,0.55631,-0.238357,3634137.0,10.939624,6.324743,0.528667,40.7
13,Armenia,2021-01-01,58.827538,11.530032,28470.0,22.145771,0.048978,100.0,10.6025,6006.425,...,11.99734,0.228408,12.56,0.22788,-0.209941,1052274.0,20.780653,33.693061,-0.560795,27.9
19,Bangladesh,2021-01-01,77.34501,14.468772,130170.0,4.40598,-0.985861,98.991028,27.0325,90435.175,...,4.27654,-0.826628,28.01,1.102477,-0.681922,103777884.0,6.997142,4.210319,-0.615031,33.4
29,Bhutan,2021-01-01,13.450446,71.501311,38140.0,5.722658,1.507121,100.0,87.45,1308.0,...,2.786647,-0.339258,83.63,1.102477,0.630079,452410.0,11.859194,14.338403,0.010955,28.5
38,Brazil,2021-01-01,28.639094,59.270527,8358140.0,5.450982,-0.498798,99.46212,47.4775,434495.1,...,41.713808,-0.130867,45.33,1.1175,-0.305652,28546414.0,13.605229,5.240724,0.449007,52.0
51,Chad,2021-01-01,39.976175,3.33651,1259200.0,27.354818,-1.495754,11.268559,73.74,1549.4,...,4.259767,-1.230408,72.84,1.102477,-1.321413,11637553.0,16.95962,27.354818,-1.371381,37.4
53,Chile,2021-01-01,14.250277,24.657473,743532.0,7.664255,0.956733,100.0,25.39,87615.35,...,-0.262655,1.325201,24.1,0.35679,0.979843,2297909.0,17.505795,8.689948,1.000429,43.0
61,Colombia,2021-01-01,38.502028,53.125396,1109500.0,7.020611,-0.365435,99.999107,31.44,78318.375,...,8.134491,0.257449,32.38,0.26109,-0.366342,9454686.0,14.850818,7.550977,0.112487,54.8
67,Costa Rica,2021-01-01,35.468077,59.757932,51060.0,20.678456,0.470483,100.0,34.0775,7815.975,...,8.26983,0.433457,33.33,0.42587,0.428042,1070680.0,13.244103,20.548219,1.121756,47.2
90,Dominican Republic,2021-01-01,51.10349,45.279712,47531.0,13.286975,-0.590034,98.1,16.1,24393.45,...,9.338161,-0.038595,16.87,1.102477,-0.43001,2099956.0,13.030751,15.205121,0.159245,37.0


In [6]:
# ──────────────────────────────────────────────────────────────
# Block 5  ▸  Prepare demo set: separate X / y, scale, sanity-check
# -----------------------------------------------------------------
# GroupShuffleSplit and StandardScaler are already imported in the first
# cell and are not used here; only joblib is needed locally.
import joblib

# 5-A  Separate features (X) and target (y), and save country labels
demo_X       = df_feat.drop(columns='gini_next')
demo_y       = df_feat['gini_next']
groups_demo  = df_feat['country']
groups_demo.to_csv('groups_demo.csv', index=False)

# 5-B  Identify numeric columns for scaling
#      (this includes the numeric helper column `year`)
num_cols = demo_X.select_dtypes(include='number').columns

# 5-C  Load the pre-fitted scaler and apply it to the numeric columns.
#      SECURITY: joblib.load unpickles the file, which can execute
#      arbitrary code — only load a scaler.pkl you produced yourself.
#      NOTE(review): the scaler was presumably fitted on these same
#      columns in the same order — confirm against the training notebook.
scaler = joblib.load('scaler.pkl')
demo_X[num_cols] = scaler.transform(demo_X[num_cols])

# 5-D  Confirm no NaNs remain after scaling; name the offending columns
#      so a failure is actionable.
nan_counts    = demo_X.isna().sum()
cols_with_nan = nan_counts[nan_counts > 0].index.tolist()
assert not cols_with_nan, f"NaNs remain after scaling in columns: {cols_with_nan}"

print(f"Scaled demo features shape: {demo_X.shape}")


Scaled demo features shape: (26, 227)


In [7]:
# ──────────────────────────────────────────────────────────────
# Block 6  ▸  Build three demo feature‐sets (in memory only)
# -----------------------------------------------------------------
# demo_X holds every engineered & scaled column. Its columns are split
# by name suffix into three (overlapping) views:
#     1) raw year-t features
#     2) raw + rolling statistics   (*_rollmean / *_rollstd)
#     3) raw + lag features         (*_lag1 … *_lag4)

# A) Helper columns that must never be used as model inputs
helper_cols = ['country', 'date', 'year']

# B) Rolling-stat and lag columns, recognised by their suffixes
roll_suffixes = ('_rollmean', '_rollstd')
lag_suffixes  = ('_lag1', '_lag2', '_lag3', '_lag4')
roll_cols = [name for name in demo_X.columns if name.endswith(roll_suffixes)]
lag_cols  = [name for name in demo_X.columns if name.endswith(lag_suffixes)]

# C) Base-year features = whatever is left, in demo_X's column order
excluded  = set(helper_cols) | set(roll_cols) | set(lag_cols)
base_cols = [name for name in demo_X.columns if name not in excluded]

# D) Slice out the three datasets
X1_demo = demo_X.loc[:, base_cols]
X2_demo = demo_X.loc[:, base_cols + roll_cols]
X3_demo = demo_X.loc[:, base_cols + lag_cols]


In [8]:
# ──────────────────────────────────────────────────────────────
# Block 7  ▸  Save the three demo sets to CSV
# -----------------------------------------------------------------
# Each object goes to its own file in the notebook’s folder; the row
# index is not written.
demo_outputs = {
    'X1_demo.csv': X1_demo,
    'X2_demo.csv': X2_demo,
    'X3_demo.csv': X3_demo,
    'y_demo.csv': demo_y,
}
for csv_name, table in demo_outputs.items():
    table.to_csv(csv_name, index=False)

print("✓ Demo datasets saved")


✓ Demo datasets saved
