# SVM Regression (L1 & L2)

Two linear SVM-style regressors to predict `life_expectancy`.


In [12]:
import os, joblib, numpy as np, pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVR
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

DATA_DIR = "../data/processed"
X_TRAIN_PATH = f"{DATA_DIR}/X_train.csv"
Y_TRAIN_PATH = f"{DATA_DIR}/y_train.csv"
X_TEST_PATH  = f"{DATA_DIR}/X_test.csv"
Y_TEST_PATH  = f"{DATA_DIR}/y_test.csv"

MODEL_DIR = "../model/2_SVM_regression"
os.makedirs(MODEL_DIR, exist_ok=True)
print("Paths ready.")

Paths ready.


In [13]:
X_train = pd.read_csv(X_TRAIN_PATH)
y_train = pd.read_csv(Y_TRAIN_PATH).squeeze("columns")
HAS_TEST = os.path.exists(X_TEST_PATH) and os.path.exists(Y_TEST_PATH)
if HAS_TEST:
    X_test = pd.read_csv(X_TEST_PATH)
    y_test = pd.read_csv(Y_TEST_PATH).squeeze("columns")
    print("Train:", X_train.shape, "| Test:", X_test.shape)
else:
    print("Train:", X_train.shape, "| Test: (not provided)")
X_train.head(3)

Train: (4882, 14) | Test: (543, 14)


Unnamed: 0,country_name,country_code,year,population,poverty_ratio,pop_growth,gdp_per_capita,gdp_growth,sanitation,electricity,water_access,co2_emissions,slum_population,labor_force
0,Northern Mariana Islands,MNP,2009,-0.251841,-0.728122,-2.815921,-0.055375,-3.599555,1.231196,0.619765,0.667126,-0.546646,-0.861598,7.014949e-16
1,Estonia,EST,2014,-0.242198,-0.698243,-0.962903,0.197888,-0.007659,1.347967,0.619765,0.725428,1.428233,-0.961443,0.01824257
2,Bangladesh,BGD,2018,0.999992,-0.337463,-0.290063,-0.561088,0.679336,-0.922636,0.156854,0.60691,-0.47025,0.871098,-0.1934273


In [14]:
cat_cols = ['country_name', 'country_code']
num_cols = [c for c in X_train.columns if c not in cat_cols]
preprocess = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ],
    remainder='drop'
)
print("Numeric cols:", len(num_cols), "| Categorical cols:", len(cat_cols))

Numeric cols: 12 | Categorical cols: 2


In [15]:
def report_metrics(y_true, y_pred, prefix=""):
    # mean_squared_error(..., squared=False) is only available in newer sklearn versions.
    # Fall back to sqrt(MSE) if the 'squared' kwarg is unsupported.
    try:
        rmse = mean_squared_error(y_true, y_pred, squared=False)
    except TypeError:
        mse = mean_squared_error(y_true, y_pred)
        rmse = float(np.sqrt(mse))
    mae  = mean_absolute_error(y_true, y_pred)
    r2   = r2_score(y_true, y_pred)
    print(f"{prefix}RMSE={rmse:.4f} | MAE={mae:.4f} | R2={r2:.4f}")
    return {"rmse": rmse, "mae": mae, "r2": r2}

def save_metrics_csv(path, metrics_dict):
    pd.DataFrame([metrics_dict]).to_csv(path, index=False)
    return path

In [16]:
def train_svm_l2(do_cv: bool = True, model_dir: str = MODEL_DIR):
    pipe = Pipeline([('prep', preprocess), ('svm', LinearSVR(random_state=0, max_iter=20000))])
    if do_cv:
        param_grid = {
            'svm__C': [0.1, 1.0, 10.0],
            'svm__epsilon': [0.05, 0.1, 0.2, 0.5],
            'svm__loss': ['epsilon_insensitive', 'squared_epsilon_insensitive']
        }
        cv = KFold(n_splits=5, shuffle=True, random_state=42)
        gs = GridSearchCV(pipe, param_grid=param_grid, cv=cv, scoring='neg_root_mean_squared_error', n_jobs=-1, verbose=0)
        gs.fit(X_train, y_train)
        model = gs.best_estimator_
        print("[L2-SVM] Best params:", gs.best_params_)
        print("[L2-SVM] CV best RMSE:", -gs.best_score_)
    else:
        model = pipe.fit(X_train, y_train)
        print("[L2-SVM] Trained with default params.")
    test_csv = None
    if HAS_TEST:
        y_pred = model.predict(X_test)
        met = report_metrics(y_test, y_pred, prefix="[L2-SVM Test] " )
        test_csv = os.path.join(model_dir, "svm_l2_test_metrics.csv")
        save_metrics_csv(test_csv, met)
    pkl_path = os.path.join(model_dir, "svm_l2_linear.pkl")
    joblib.dump(model, pkl_path)
    print("[L2-SVM] Saved ->", pkl_path)
    if test_csv: print("[L2-SVM] Test metrics CSV ->", test_csv)
    return model

In [17]:
def train_svm_l1(do_cv: bool = True, model_dir: str = MODEL_DIR):
    pipe = Pipeline([('prep', preprocess),
                     ('svm', SGDRegressor(loss="epsilon_insensitive",
                                          penalty="l1",
                                          max_iter=5000, tol=1e-3,
                                          learning_rate="optimal",
                                          random_state=0))])
    if do_cv:
        param_grid = {
            'svm__alpha': [1e-5, 1e-4, 1e-3],
            'svm__epsilon': [0.05, 0.1, 0.2, 0.5],
        }
        cv = KFold(n_splits=5, shuffle=True, random_state=42)
        gs = GridSearchCV(pipe, param_grid=param_grid, cv=cv, scoring='neg_root_mean_squared_error', n_jobs=-1, verbose=0)
        gs.fit(X_train, y_train)
        model = gs.best_estimator_
        print("[L1-SVM] Best params:", gs.best_params_)
        print("[L1-SVM] CV best RMSE:", -gs.best_score_)
    else:
        model = pipe.fit(X_train, y_train)
        print("[L1-SVM] Trained with default params.")
    test_csv = None
    if HAS_TEST:
        y_pred = model.predict(X_test)
        met = report_metrics(y_test, y_pred, prefix="[L1-SVM Test] " )
        test_csv = os.path.join(model_dir, "svm_l1_test_metrics.csv")
        save_metrics_csv(test_csv, met)
    pkl_path = os.path.join(model_dir, "svm_l1_linear.pkl")
    joblib.dump(model, pkl_path)
    print("[L1-SVM] Saved ->", pkl_path)
    if test_csv: print("[L1-SVM] Test metrics CSV ->", test_csv)
    return model

In [18]:
# Set do_cv=True for 5-fold CV; False to just fit once
l2_model = train_svm_l2(do_cv=True)
l1_model = train_svm_l1(do_cv=True)



[L2-SVM] Best params: {'svm__C': 0.1, 'svm__epsilon': 0.5, 'svm__loss': 'epsilon_insensitive'}
[L2-SVM] CV best RMSE: 3.905733205911863
[L2-SVM Test] RMSE=3.4599 | MAE=2.6480 | R2=0.8243
[L2-SVM] Saved -> ../model/2_SVM_regression\svm_l2_linear.pkl
[L2-SVM] Test metrics CSV -> ../model/2_SVM_regression\svm_l2_test_metrics.csv
[L1-SVM] Best params: {'svm__alpha': 0.001, 'svm__epsilon': 0.5}
[L1-SVM] CV best RMSE: 2859.149284686668
[L1-SVM Test] RMSE=25563.8478 | MAE=25563.6703 | R2=-9592148.2107
[L1-SVM] Saved -> ../model/2_SVM_regression\svm_l1_linear.pkl
[L1-SVM] Test metrics CSV -> ../model/2_SVM_regression\svm_l1_test_metrics.csv
