
# Regression Training (Linear / Ridge / Lasso via SGD)

**Goal:** Train models to predict `life_expectancy`, save each model to `.pkl`, and plot **Epoch vs Loss** (Train & Validation).

**Notes:**
- `X_train.csv` & `y_train.csv` are assumed **preprocessed** for numeric features.
- Only `country_name` and `country_code` are one-hot encoded; all numeric columns are **passthrough** (no extra scaling).
- If `X_test.csv` & `y_test.csv` exist, they are used as validation; otherwise an 90/10 split is created from training data.


## 1. Setup & Paths

In [19]:

import os
import joblib
import copy
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import SGDRegressor, LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.base import clone
from sklearn.model_selection import train_test_split

# Paths (edit if needed)
DATA_DIR = "../data/processed"  # change to your paths if different
X_TRAIN_PATH = f"{DATA_DIR}/X_train.csv"
Y_TRAIN_PATH = f"{DATA_DIR}/y_train.csv"
X_TEST_PATH  = f"{DATA_DIR}/X_test.csv"
Y_TEST_PATH  = f"{DATA_DIR}/y_test.csv"

MODEL_DIR = "../model/1_linear_regression"  # output models & plots will be stored here
os.makedirs(MODEL_DIR, exist_ok=True)

print('Paths set.')


Paths set.


## 2. Load Data

In [20]:

# Read training data
X_train = pd.read_csv(X_TRAIN_PATH)
y_train = pd.read_csv(Y_TRAIN_PATH).squeeze("columns")

# Use provided test as validation if available; else split from train
if os.path.exists(X_TEST_PATH) and os.path.exists(Y_TEST_PATH):
    X_val = pd.read_csv(X_TEST_PATH)
    y_val = pd.read_csv(Y_TEST_PATH).squeeze("columns")
    HAS_VAL = True
else:
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.1, random_state=42
    )
    HAS_VAL = False

print("Train shape:", X_train.shape, "| Val shape:", X_val.shape)
display(X_train.head(5))
display(y_train.head(5))


Train shape: (4882, 14) | Val shape: (543, 14)


Unnamed: 0,country_name,country_code,year,population,poverty_ratio,pop_growth,gdp_per_capita,gdp_growth,sanitation,electricity,water_access,co2_emissions,slum_population,labor_force
0,Northern Mariana Islands,MNP,2009,-0.251841,-0.728122,-2.815921,-0.055375,-3.599555,1.231196,0.619765,0.667126,-0.546646,-0.861598,7.014949e-16
1,Estonia,EST,2014,-0.242198,-0.698243,-0.962903,0.197888,-0.007659,1.347967,0.619765,0.725428,1.428233,-0.961443,0.01824257
2,Bangladesh,BGD,2018,0.999992,-0.337463,-0.290063,-0.561088,0.679336,-0.922636,0.156854,0.60691,-0.47025,0.871098,-0.1934273
3,Cabo Verde,CPV,2009,-0.248389,0.567734,-0.209854,-0.492006,-0.837283,-0.180633,-0.295456,-0.279821,-0.380493,0.887766,-0.4989862
4,Afghanistan,AFG,2012,-0.018236,0.996113,1.714338,-0.614651,1.613214,-0.852976,-0.47214,-1.951212,-0.508793,1.386267,-1.376172


0    77.903000
1    77.034146
2    72.122000
3    72.142000
4    61.735000
Name: life_expectancy, dtype: float64

## 3. Build Preprocess (One-Hot only for `country_name`, `country_code`)

In [21]:

# Identify columns
cat_cols = ['country_name', 'country_code']
num_cols = [c for c in X_train.columns if c not in cat_cols]

# ColumnTransformer: passthrough numeric, one-hot categorical
preprocess = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ],
    remainder='drop'
)

# Fit on train and transform both train & val
preprocess.fit(X_train)
Xtr = preprocess.transform(X_train)
Xva = preprocess.transform(X_val)

print("Transformed shapes:", Xtr.shape, Xva.shape)


Transformed shapes: (4882, 446) (543, 446)


## 4. Train Loop (Epochs) & Save Model + Plot

In [22]:

def train_with_epochs(
    model_name: str,
    sgd,                     # SGDRegressor mẫu
    Xtr, ytr, Xva, yva,
    epochs: int = 60,
    model_dir: str = MODEL_DIR,
    early_stopping: bool = True,
    patience: int = 10,
    plot_extra: bool = True   # vẽ thêm MAE và R2 nếu muốn
):
    est = clone(sgd)  # dùng fresh estimator
    tr_rmse, va_rmse = [], []
    tr_mae,  va_mae  = [], []
    tr_r2,   va_r2   = [], []

    best_val_rmse = np.inf
    best_state = None
    bad_epochs = 0

    for ep in range(1, epochs + 1):
        est.partial_fit(Xtr, ytr)

        ytr_pred = est.predict(Xtr)
        yva_pred = est.predict(Xva)

        # --- Metrics ---
        rmse_tr = mean_squared_error(ytr, ytr_pred) if 'squared' not in mean_squared_error.__code__.co_varnames else mean_squared_error(ytr, ytr_pred, squared=False)
        rmse_va = mean_squared_error(yva, yva_pred) if 'squared' not in mean_squared_error.__code__.co_varnames else mean_squared_error(yva, yva_pred, squared=False)
        # ensure rmse is sqrt if returned MSE
        if rmse_tr > 0 and rmse_tr == mean_squared_error(ytr, ytr_pred):
            rmse_tr = math.sqrt(rmse_tr)
        if rmse_va > 0 and rmse_va == mean_squared_error(yva, yva_pred):
            rmse_va = math.sqrt(rmse_va)

        mae_tr  = mean_absolute_error(ytr, ytr_pred)
        mae_va  = mean_absolute_error(yva, yva_pred)
        r2_tr   = r2_score(ytr, ytr_pred)
        r2_va   = r2_score(yva, yva_pred)

        tr_rmse.append(rmse_tr); va_rmse.append(rmse_va)
        tr_mae.append(mae_tr);   va_mae.append(mae_va)
        tr_r2.append(r2_tr);     va_r2.append(r2_va)

        # Early stopping theo val RMSE
        if early_stopping:
            if rmse_va + 1e-9 < best_val_rmse:
                best_val_rmse = rmse_va
                best_state = copy.deepcopy(est)
                bad_epochs = 0
            else:
                bad_epochs += 1
                if bad_epochs >= patience:
                    if best_state is not None:
                        est = copy.deepcopy(best_state)
                    break

    # Lưu model (kèm preprocess)
    bundle = {"preprocess": preprocess, "estimator": est}
    pkl_path = os.path.join(model_dir, f"{model_name}.pkl")
    joblib.dump(bundle, pkl_path)

    # Lưu CSV metrics
    history = pd.DataFrame({
        "epoch": np.arange(1, len(tr_rmse)+1),
        "train_rmse": tr_rmse, "val_rmse": va_rmse,
        "train_mae":  tr_mae,  "val_mae":  va_mae,
        "train_r2":   tr_r2,   "val_r2":   va_r2
    })
    csv_path = os.path.join(model_dir, f"{model_name}_metrics.csv")
    history.to_csv(csv_path, index=False)

    # Plot RMSE
    x = np.arange(1, len(tr_rmse)+1)
    plt.figure()
    plt.plot(x, tr_rmse, label="Train RMSE")
    plt.plot(x, va_rmse, label="Validation RMSE")
    plt.xlabel("Epoch"); plt.ylabel("RMSE"); plt.title(f"{model_name}: RMSE vs Epoch")
    plt.legend()
    rmse_png = os.path.join(model_dir, f"{model_name}_rmse.png")
    plt.savefig(rmse_png, bbox_inches="tight"); plt.close()

    mae_png = r2_png = None
    if plot_extra:
        # Plot MAE
        plt.figure()
        plt.plot(x, tr_mae, label="Train MAE")
        plt.plot(x, va_mae, label="Validation MAE")
        plt.xlabel("Epoch"); plt.ylabel("MAE"); plt.title(f"{model_name}: MAE vs Epoch")
        plt.legend()
        mae_png = os.path.join(model_dir, f"{model_name}_mae.png")
        plt.savefig(mae_png, bbox_inches="tight"); plt.close()

        # Plot R²
        plt.figure()
        plt.plot(x, tr_r2, label="Train R²")
        plt.plot(x, va_r2, label="Validation R²")
        plt.xlabel("Epoch"); plt.ylabel("R²"); plt.title(f"{model_name}: R² vs Epoch")
        plt.legend()
        r2_png = os.path.join(model_dir, f"{model_name}_r2.png")
        plt.savefig(r2_png, bbox_inches="tight"); plt.close()

    # Backwards-compatible return: (pkl_path, rmse_png, tr_rmse_list, va_rmse_list)
    return pkl_path, rmse_png, tr_rmse, va_rmse

print("Train function ready.")


Train function ready.


## 5. Define Models (SGD equivalents)

In [23]:

# Linear (no regularization)
lin = SGDRegressor(
    loss="squared_error",
    penalty=None,
    learning_rate="optimal",
    eta0=0.01,
    random_state=0
)

# Ridge (L2)
ridge = SGDRegressor(
    loss="squared_error",
    penalty="l2",
    alpha=1e-4,
    learning_rate="optimal",
    eta0=0.01,
    random_state=0
)

# Lasso (L1)
lasso = SGDRegressor(
    loss="squared_error",
    penalty="l1",
    alpha=1e-5,
    learning_rate="optimal",
    eta0=0.01,
    random_state=0
)

# ElasticNet
elastic = SGDRegressor(
    loss="squared_error",
    penalty="elasticnet",
    alpha=1e-4,
    l1_ratio=0.15,
    learning_rate="optimal",
    eta0=0.01,
    random_state=0
)

print("Models ready.")


Models ready.


## 6. Train, Save `.pkl`, Plot Loss

In [24]:

results = {}

for name, est in [
    ("linear_sgd", lin),
    ("ridge_sgd", ridge),
    ("lasso_sgd", lasso),
    ("elastic_sgd", elastic),
]:
    pkl, png, tr_losses, va_losses = train_with_epochs(
        name, est, Xtr, y_train, Xva, y_val, epochs=10
    )
    results[name] = {
        "pkl": pkl,
        "png": png,
        "final_train_rmse": tr_losses[-1],
        "final_val_rmse": va_losses[-1],
    }
    print(f"{name}:")
    print("  model ->", pkl)
    print("  plot  ->", png)
    print(f"  last RMSE (train/val) -> {tr_losses[-1]:.4f} / {va_losses[-1]:.4f}")


linear_sgd:
  model -> ../model/1_linear_regression\linear_sgd.pkl
  plot  -> ../model/1_linear_regression\linear_sgd_rmse.png
  last RMSE (train/val) -> 60838130521508968.0000 / 60768264774596864.0000
ridge_sgd:
  model -> ../model/1_linear_regression\ridge_sgd.pkl
  plot  -> ../model/1_linear_regression\ridge_sgd_rmse.png
  last RMSE (train/val) -> 69749031177675032.0000 / 69752563391715360.0000
ridge_sgd:
  model -> ../model/1_linear_regression\ridge_sgd.pkl
  plot  -> ../model/1_linear_regression\ridge_sgd_rmse.png
  last RMSE (train/val) -> 69749031177675032.0000 / 69752563391715360.0000
lasso_sgd:
  model -> ../model/1_linear_regression\lasso_sgd.pkl
  plot  -> ../model/1_linear_regression\lasso_sgd_rmse.png
  last RMSE (train/val) -> 576657826935785856.0000 / 576814076225687168.0000
lasso_sgd:
  model -> ../model/1_linear_regression\lasso_sgd.pkl
  plot  -> ../model/1_linear_regression\lasso_sgd_rmse.png
  last RMSE (train/val) -> 576657826935785856.0000 / 576814076225687168.000

## 7. Load a Saved Model & Predict (Later)

In [25]:

# Example to use later:
# bundle = joblib.load("model/linear_sgd.pkl")
# preprocess_loaded = bundle["preprocess"]
# estimator_loaded = bundle["estimator"]
# X_new should be a DataFrame with the same columns as X_train
# y_pred = estimator_loaded.predict(preprocess_loaded.transform(X_new))
# print("Sample prediction:", y_pred[:5])
print("To use later: joblib.load('../model/1_linear_regression/<name>.pkl') and call estimator.predict(preprocess.transform(X_new)).")


To use later: joblib.load('../model/1_linear_regression/<name>.pkl') and call estimator.predict(preprocess.transform(X_new)).
