In [None]:
import pandas as pd
import numpy as np

from pathlib import Path

In [None]:
# ----- Load the same dataset as the one used for the FE regression -----

# Root folder of the project; the input path below is derived from it.
PROJECT_ROOT = Path("/files/sustainability-economic-performance")

# Processed panel CSV (same file the FE regression reads).
DATA_PATH = PROJECT_ROOT.joinpath("data", "processed", "panel_FE_regression.csv")

print("Using data from:", DATA_PATH)

# Read the panel and show its size, its column index and the first rows
# as a quick visual check of what was loaded.
df = pd.read_csv(DATA_PATH)
print("Shape:", df.shape)
print(df.columns)
df.head()

In [None]:
# ----- Define column names only -----
# Nothing is selected from the data here; this cell only fixes which column
# is the target and which columns are the features.

# Target (y): the column the models predict.
y_col = "gdp_growth"

# Features (X, numerical only): the three *_index columns first, followed by
# the remaining four numeric columns. Order matters: it is the column order
# of the feature matrix built later.
X_cols = [
    *("ENV_index", "SOC_index", "GOV_index"),
    *("gdp_per_capita", "inflation", "fdi_inflows", "R&D_expenditure"),
]

In [None]:
# ----- Missing values -----

# Restrict the frame to the modelling columns (features followed by target)
# and discard every row in which at least one of those columns is missing.
ml_df = df.loc[:, [*X_cols, y_col]].dropna(how="any")
print("Shape:", ml_df.shape)
ml_df.head(10)

In [None]:
# ----- Save the final dataset for the Machine Learning section -----

# Project root (same location as in the loading cell).
PROJECT_ROOT = Path("/files/sustainability-economic-performance")

# Destination folder; created (with parents) if it does not exist yet.
processed_dir = PROJECT_ROOT.joinpath("data", "processed")
processed_dir.mkdir(parents=True, exist_ok=True)

output_path = processed_dir.joinpath("panel_machine_learning.csv")

# index=False: the row index is not written as a column in the CSV.
ml_df.to_csv(output_path, index=False)

print(f"Final machine learning dataset saved to: {output_path}")

In [None]:
# ----- Define X and y -----

# Target: a single column, i.e. a 1D Series of length n_samples.
y = ml_df.loc[:, y_col]

# Features: a 2D frame (n_samples x n_features) holding the columns
# listed in X_cols, in that order.
X = ml_df.loc[:, X_cols]

In [None]:
# ----- Split the data -----

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the size of each of the four pieces.
for _label, _part in (
    ("X_train:", X_train),
    ("X_test :", X_test),
    ("y_train:", y_train),
    ("y_test :", y_test),
):
    print(_label, _part.shape)

In [None]:
# ----- 1st ML model: Linear Regression -----

In [None]:
# Scaling
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training features only ...
scaler = StandardScaler().fit(X_train)

# ... then apply that same fitted scaler to both splits. The test set is
# only transformed, never used for fitting.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Sanity check: per-column mean and standard deviation of the scaled
# training features.
print(X_train_scaled.mean(axis=0))
print(X_train_scaled.std(axis=0))

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np

# Create the model
lin_reg = LinearRegression()

# Cross-validation on TRAIN only.
# The scaler is placed inside a pipeline and fed the *unscaled* training
# features, so it is re-fitted on the training part of every fold. Running CV
# on X_train_scaled instead would let each validation fold influence the
# scaling statistics (preprocessing leakage).
cv = KFold(n_splits=5, shuffle=True, random_state=42)
lin_reg_cv = make_pipeline(StandardScaler(), LinearRegression())

# One cross_validate call evaluates both metrics on the same fitted folds,
# instead of fitting every fold twice with two cross_val_score calls.
cv_scores = cross_validate(
    lin_reg_cv,
    X_train,
    y_train,
    cv=cv,
    scoring=("r2", "neg_root_mean_squared_error"),
)
cv_r2 = cv_scores["test_r2"]
cv_rmse_neg = cv_scores["test_neg_root_mean_squared_error"]

# The scorer returns negated RMSE (higher is better); flip the sign back.
cv_rmse = -cv_rmse_neg

print("LINEAR REGRESSION RESULTS\n")

print("Cross-Validation (train only):")
print(f"R² mean = {cv_r2.mean():.3f} (std = {cv_r2.std():.3f})")
print(f"CV RMSE mean = {cv_rmse.mean():.3f}, std = {cv_rmse.std():.3f}\n")

# Train (fit) on the full scaled training set
lin_reg.fit(X_train_scaled, y_train)

# Predict on both splits (train predictions are used to compare fit vs.
# generalisation)
y_train_pred = lin_reg.predict(X_train_scaled)
y_test_pred  = lin_reg.predict(X_test_scaled)

# Evaluate
r2_train = r2_score(y_train, y_train_pred)
r2_test  = r2_score(y_test, y_test_pred)

# RMSE = square root of the mean squared error
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_test  = np.sqrt(mean_squared_error(y_test, y_test_pred))

print("TRAIN:")
print(f"R² = {r2_train:.3f}, RMSE = {rmse_train:.3f}")

print("\nTEST:")
print(f"R² = {r2_test:.3f}, RMSE = {rmse_test:.3f}")

In [None]:
# ----- 2nd model: Random Forest -----

In [None]:
# Quick look at the (unscaled) splits that the Random Forest will use.
# Only the shapes are printed; this does not itself verify the scaling.
for _label, _part in (
    ("X_train:", X_train),
    ("X_test :", X_test),
    ("y_train:", y_train),
    ("y_test :", y_test),
):
    print(_label, _part.shape)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score, cross_validate, KFold
import numpy as np

# Create the model
rf = RandomForestRegressor(
    n_estimators=300,     # number of trees
    max_depth=6,          # how deep each tree can go
    min_samples_leaf=10,  # minimum observations per leaf
    random_state=42,      # reproducibility
    n_jobs=-1             # use all CPU cores
)

# Cross-validation on TRAIN only (unscaled features).
# A single cross_validate call scores both metrics on the same fitted folds;
# two separate cross_val_score calls would fit the forest twice per fold.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores_rf = cross_validate(
    rf,
    X_train,
    y_train,
    cv=cv,
    scoring=("r2", "neg_root_mean_squared_error"),
    n_jobs=-1,
)

# NOTE: cv_r2 / cv_rmse_neg / cv_rmse are the same names the Linear
# Regression cell assigns, so after this cell they hold the Random Forest
# values.
cv_r2 = cv_scores_rf["test_r2"]
cv_rmse_neg = cv_scores_rf["test_neg_root_mean_squared_error"]

# The scorer returns negated RMSE (higher is better); flip the sign back.
cv_rmse = -cv_rmse_neg

print("RANDOM FOREST RESULTS\n")

print("Cross-Validation (train only):")
print(f"R² mean = {cv_r2.mean():.3f} (std = {cv_r2.std():.3f})")
print(f"RMSE mean = {cv_rmse.mean():.3f} (std = {cv_rmse.std():.3f})\n")

# Train (fit) on the full training set
rf.fit(X_train, y_train)

# Predict on both splits
y_train_pred_rf = rf.predict(X_train)
y_test_pred_rf = rf.predict(X_test)

# Evaluate
r2_train_rf = r2_score(y_train, y_train_pred_rf)
r2_test_rf  = r2_score(y_test, y_test_pred_rf)

# RMSE = square root of the mean squared error
rmse_train_rf = np.sqrt(mean_squared_error(y_train, y_train_pred_rf))
rmse_test_rf  = np.sqrt(mean_squared_error(y_test, y_test_pred_rf))

print("TRAIN:")
print(f"R² = {r2_train_rf:.3f}, RMSE = {rmse_train_rf:.3f}")

print("\nTEST:")
print(f"R² = {r2_test_rf:.3f}, RMSE = {rmse_test_rf:.3f}")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Feature importances bar chart for Random Forest

# Importances of the fitted forest, multiplied by 100 so the axis reads in
# percent (see the y-label below).
importances = rf.feature_importances_ * 100
feature_names = X.columns

# Label each importance with its feature name and sort, largest first.
fi = pd.Series(importances, index=feature_names).sort_values(ascending=False)

plt.figure(figsize=(6, 4))
fi.plot(kind="bar")

plt.ylabel("Feature importance (%)")
plt.xticks(rotation=45, ha="right")

plt.tight_layout()

# save the figure
output_path = "sustainability-economic-performance/results/machine_learning/RF_features_importance.pdf"
# savefig does not create missing folders; make sure the target directory
# exists so a fresh checkout does not fail with FileNotFoundError.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
plt.savefig(output_path, bbox_inches="tight")
plt.show()
plt.close()

print(f"Saved Random Forest features importance histogram to: {output_path}")

In [None]:
# ----- 3rd model: Gradient Boosting -----

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold, cross_val_score, cross_validate
import numpy as np

# Create the model
gbr = GradientBoostingRegressor(
    n_estimators=800,
    learning_rate=0.03,   # slower learning
    max_depth=2,          # how deep each tree can go
    subsample=0.7,        # stochastic GB (reduces overfitting)
    min_samples_leaf=20,
    max_features=0.7,
    random_state=42
)

# Cross-validation on TRAIN only (unscaled features).
# A single cross_validate call scores both metrics on the same fitted folds;
# two separate cross_val_score calls would fit the 800-stage model twice per
# fold.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores_gbr = cross_validate(
    gbr,
    X_train,
    y_train,
    cv=cv,
    scoring=("r2", "neg_root_mean_squared_error"),
)
cv_r2_gbr = cv_scores_gbr["test_r2"]
cv_rmse_neg_gbr = cv_scores_gbr["test_neg_root_mean_squared_error"]

# The scorer returns negated RMSE (higher is better); flip the sign back.
cv_rmse_gbr = -cv_rmse_neg_gbr

print("GRADIENT BOOSTING RESULTS\n")

print("Cross-Validation (train only):")
print(f"R² mean = {cv_r2_gbr.mean():.3f} (std = {cv_r2_gbr.std():.3f})")
print(f"RMSE mean = {cv_rmse_gbr.mean():.3f} (std = {cv_rmse_gbr.std():.3f})")

# Train (fit) on the full training set
gbr.fit(X_train, y_train)

# Predict on both splits
y_train_pred_gbr = gbr.predict(X_train)
y_test_pred_gbr  = gbr.predict(X_test)

# Evaluate
r2_train_gbr = r2_score(y_train, y_train_pred_gbr)
r2_test_gbr  = r2_score(y_test, y_test_pred_gbr)

# RMSE = square root of the mean squared error
rmse_train_gbr = np.sqrt(mean_squared_error(y_train, y_train_pred_gbr))
rmse_test_gbr  = np.sqrt(mean_squared_error(y_test, y_test_pred_gbr))

print("\nTRAIN:")
print(f"R² = {r2_train_gbr:.3f}, RMSE = {rmse_train_gbr:.3f}")

print("\nTEST:")
print(f"R² = {r2_test_gbr:.3f}, RMSE = {rmse_test_gbr:.3f}")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Feature importances bar chart for Gradient Boosting

# Importances of the fitted model, multiplied by 100 so the axis reads in
# percent (see the y-label below).
importances_gbr = gbr.feature_importances_ * 100
feature_names = X.columns

# Label each importance with its feature name and sort, largest first.
fi_gbr = (
    pd.Series(importances_gbr, index=feature_names)
    .sort_values(ascending=False)
)

plt.figure(figsize=(6, 4))

fi_gbr.plot(kind="bar")

plt.ylabel("Importance (%)")
plt.xticks(rotation=45, ha="right")

plt.tight_layout()

# save the figure
output_path = "sustainability-economic-performance/results/machine_learning/GB_features_importance.pdf"
# savefig does not create missing folders; make sure the target directory
# exists so a fresh checkout does not fail with FileNotFoundError.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
plt.savefig(output_path, bbox_inches="tight")
plt.show()
plt.close()

print(f"Saved Gradient Boosting features importance histogram to: {output_path}")

In [None]:
import pandas as pd

# ----- Create a table summarizing and comparing the results of all 3 Machine Learning models -----

# One row per model; the values follow the column order given in `columns`.
# NOTE(review): these figures are hard-coded literals, presumably copied from
# the printed output of the model cells. They will NOT update if the models
# are re-run; confirm they still match before reusing the table.
results = pd.DataFrame(
    [
        ("Linear Regression", 0.022, 0.082, 3.464, 0.359, 0.094, 3.367, 0.143, 3.137),
        ("Random Forest",     0.166, 0.058, 3.200, 0.334, 0.431, 2.669, 0.327, 2.781),
        ("Gradient Boosting", 0.103, 0.033, 3.318, 0.302, 0.570, 2.318, 0.358, 2.715),
    ],
    columns=[
        "Model",
        "CV R² (mean)",
        "CV R² (std)",
        "CV RMSE (mean)",
        "CV RMSE (std)",
        "Train R²",
        "Train RMSE",
        "Test R²",
        "Test RMSE",
    ],
)

results

In [None]:
import matplotlib.pyplot as plt

# ----- Save the table as a pdf -----

# Header labels with manual line breaks so the columns stay narrow.
col_labels = [
    "Model",
    "CV R²\n(mean)",
    "CV R²\n(std)",
    "CV RMSE\n(mean)",
    "CV RMSE\n(std)",
    "Train R²",
    "Train\nRMSE",
    "Test R²",
    "Test\nRMSE"
]

# Build a display-only copy: numbers rounded to 3 decimals and model names
# broken over two lines. `results` itself is left untouched, so the summary
# table keeps its original one-line model names for any other use.
table_df = results.round(3).assign(
    Model=[
        "Linear\nRegression",
        "Random\nForest",
        "Gradient\nBoosting"
    ]
)

fig, ax = plt.subplots(figsize=(8, 8))
ax.axis("off")  # only the table is drawn, no axes frame or ticks

table = ax.table(
    cellText=table_df.values,
    colLabels=col_labels,
    cellLoc="center",
    loc="center"
)

# Fixed font size, with taller rows so the two-line labels fit.
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1, 1.6)

# savefig does not create missing folders; make sure the target directory
# exists so a fresh checkout does not fail with FileNotFoundError.
table_path = Path(
    "sustainability-economic-performance/results/machine_learning/model_comparison.pdf"
)
table_path.parent.mkdir(parents=True, exist_ok=True)

plt.tight_layout()
plt.savefig(
    table_path,
    bbox_inches="tight"
)
plt.close()

print("Table saved to : results/machine_learning/model_comparison.pdf")