In [2]:
# Setup and Paths

import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


# The project layout is derived from the kernel's working directory:
# the project root is taken to be the parent of that directory.
NOTEBOOK_DIR = Path.cwd().resolve()
PROJECT_ROOT = NOTEBOOK_DIR.parent

# Input data folder and the two report output folders under the project root.
DATA_PROCESSED_DIR = PROJECT_ROOT.joinpath("data", "processed")
FIGURES_DIR = PROJECT_ROOT.joinpath("reports", "figures")
TABLES_DIR = PROJECT_ROOT.joinpath("reports", "tables")

# Only the output folders are created here; the data folder is not.
for output_dir in (FIGURES_DIR, TABLES_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so a wrong working directory is easy to spot.
for label, location in (
    ("Project root:", PROJECT_ROOT),
    ("Data processed:", DATA_PROCESSED_DIR),
    ("Figures:", FIGURES_DIR),
    ("Tables:", TABLES_DIR),
):
    print(label, location)

Project root: D:\DS-Project\Demographic-Change_Understanding-Fertility-Through-Data
Data processed: D:\DS-Project\Demographic-Change_Understanding-Fertility-Through-Data\data\processed
Figures: D:\DS-Project\Demographic-Change_Understanding-Fertility-Through-Data\reports\figures
Tables: D:\DS-Project\Demographic-Change_Understanding-Fertility-Through-Data\reports\tables


In [3]:
# Locations of the train and test panels inside the processed-data folder.
train_path = DATA_PROCESSED_DIR.joinpath("panel_country_split_train.csv")
test_path = DATA_PROCESSED_DIR.joinpath("panel_country_split_test.csv")

# Read both CSVs with pandas defaults (train first, then test).
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Quick sanity check on the (rows, columns) of each split.
print(train_df.shape, test_df.shape)

(9917, 6) (2532, 6)


In [4]:
TARGET_COL = "fertility"

# Every column of the training frame other than "Country", "Year" and the
# target is used as a predictor; the training frame's column order is kept.
excluded_cols = {"Country", "Year", TARGET_COL}
feature_cols = [col for col in train_df.columns if col not in excluded_cols]

# The same feature list is applied to both splits so the matrices line up.
X_train, y_train = train_df[feature_cols], train_df[TARGET_COL]
X_test, y_test = test_df[feature_cols], test_df[TARGET_COL]

In [5]:
# Three candidate regressors; the tree-based ones share a fixed seed (42)
# so repeated runs build the same trees.
lin_reg = LinearRegression(fit_intercept=True, n_jobs=-1)
dt_reg = DecisionTreeRegressor(random_state=42)
rf_reg = RandomForestRegressor(
    n_estimators=200, max_depth=None, random_state=42, n_jobs=-1
)

# (display name, estimator) pairs, in the order they are trained and evaluated.
models = list(
    zip(
        ("Linear Regression", "Decision Tree", "Random Forest"),
        (lin_reg, dt_reg, rf_reg),
    )
)

# Fit each estimator in place on the training split.
for name, model in models:
    model.fit(X_train, y_train)
    print(f"Trained: {name}")

Trained: Linear Regression
Trained: Decision Tree
Trained: Random Forest


In [6]:
def evaluate_regression_model(model, X_train, y_train, X_test, y_test, model_name):
    """Score an already-fitted regressor on the training and test splits.

    Parameters
    ----------
    model
        Object exposing ``predict(X)``; it is only used for prediction here.
    X_train, y_train
        Training features and the corresponding true target values.
    X_test, y_test
        Test features and the corresponding true target values.
    model_name
        Label stored under the ``"Model"`` key of the result.

    Returns
    -------
    dict
        ``"Model"`` followed by ``RMSE_<split>``, ``MAE_<split>`` and
        ``R2_<split>`` for ``train`` and then ``test``.
    """
    # Predict on both splits up front (train first, then test).
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    scored_splits = (
        ("train", y_train, train_predictions),
        ("test", y_test, test_predictions),
    )

    report = {"Model": model_name}
    for split, actual, predicted in scored_splits:
        # RMSE is the square root of the mean squared error.
        report[f"RMSE_{split}"] = np.sqrt(mean_squared_error(actual, predicted))
        report[f"MAE_{split}"] = mean_absolute_error(actual, predicted)
        report[f"R2_{split}"] = r2_score(actual, predicted)
    return report

In [7]:
# Collect one metrics record per trained model.
results = []
for name, model in models:
    metrics = evaluate_regression_model(
        model,
        X_train,
        y_train,
        X_test,
        y_test,
        model_name=name,
    )
    results.append(metrics)

# One row per model, best (lowest) test RMSE first.
results_df = pd.DataFrame(results).sort_values(by="RMSE_test", ascending=True)
results_df

Unnamed: 0,Model,RMSE_train,MAE_train,R2_train,RMSE_test,MAE_test,R2_test
2,Random Forest,0.309676,0.201968,0.976129,0.806688,0.535723,0.844449
1,Decision Tree,0.001136,1.6e-05,1.0,1.095804,0.565289,0.712969
0,Linear Regression,1.493275,1.231832,0.444949,1.513461,1.253247,0.452473


In [8]:
# Write the comparison table to the tables folder, without the index column.
performance_path = TABLES_DIR.joinpath("model_performance_regression.csv")
results_df.to_csv(performance_path, index=False)
print("Saved:", performance_path)

Saved: D:\DS-Project\Demographic-Change_Understanding-Fertility-Through-Data\reports\tables\model_performance_regression.csv
