In [15]:
# import models and fit
import pandas as pd

from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error, r2_score

In [4]:
# Load data
X_train_scaled = pd.read_csv('../data/processed/X_train_scaled.csv')
X_test_scaled = pd.read_csv('../data/processed/X_test_scaled.csv')
y_train = pd.read_csv('../data/processed/y_train.csv')
y_test = pd.read_csv('../data/processed/y_test.csv')

X_train_scaled.describe()

Unnamed: 0,GDP_per_capita,social_support,healthy_life_expectancy,freedom_of_choice,generosity,costofliving_index,rent_index,col_plus_rent_index,groceries_index,restaurant_price_index,local_purchase_power_index
count,85.0,85.0,85.0,85.0,85.0,85.0,85.0,85.0,85.0,85.0,85.0
mean,1.259124e-15,-9.717717e-16,-2.847396e-15,-1.577823e-15,-4.4408920000000007e-17,-3.683328e-16,2.442491e-16,2.429429e-16,1.149407e-16,-2.220446e-16,-3.134747e-16
std,1.005935,1.005935,1.005935,1.005935,1.005935,1.005935,1.005935,1.005935,1.005935,1.005935,1.005935
min,-4.155727,-2.988632,-2.701178,-3.311316,-2.061494,-1.533774,-1.20622,-1.450991,-1.569581,-1.265004,-1.407434
25%,-0.5445485,-0.396952,-0.4749214,-0.448859,-0.7322077,-0.8087431,-0.6528445,-0.7895353,-0.7164162,-0.8260346,-0.7707839
50%,0.04419892,0.3687715,0.09178268,0.07069645,-0.1272631,-0.2505337,-0.299047,-0.2681527,-0.238402,-0.1370187,-0.349003
75%,0.7835336,0.7319994,0.7260109,0.7765077,0.8677115,0.6156532,0.3178308,0.4088665,0.3545776,0.4408656,0.5900563
max,1.802669,1.458455,1.621045,1.433304,2.260676,3.746758,4.672262,3.513817,3.972963,3.413636,3.155227


In [12]:
# Ridge Regression (L2)
ridge_model = Ridge(alpha=1.0)  # alpha is the regularization strength
ridge_model.fit(X_train_scaled, y_train)

y_train_ridge = ridge_model.predict(X_train_scaled)
y_test_ridge = ridge_model.predict(X_test_scaled)

# Lasso Regression (L1)
lasso_model = Lasso(alpha=1.0)
lasso_model.fit(X_train_scaled, y_train)

y_train_lasso = lasso_model.predict(X_train_scaled)
y_test_lasso = lasso_model.predict(X_test_scaled)

In [14]:
train_mse_ridge = mean_squared_error(y_train, y_train_ridge)
train_r2_ridge = r2_score(y_train, y_train_ridge)

mse_ridge = mean_squared_error(y_test, y_test_ridge)
r2_ridge = r2_score(y_test, y_test_ridge)

ridge_metrics = {'MSE':mse_ridge, 'R2':r2_ridge}

print(f'Train MSE: \t {train_mse_ridge}')
print(f'Test MSE: \t {mse_ridge}')
print(f'Train R2: \t {train_r2_ridge}')
print(f'Test R2: \t {r2_ridge}')

Train MSE: 	 0.16756935391332417
Test MSE: 	 0.16429258534304636
Train R2: 	 0.8359779862515065
Test R2: 	 0.8567334823029755


In [5]:
#Linear Regression
LR_model = LinearRegression()
LR_model.fit(X_train_scaled, y_train)

y_train_LR = LR_model.predict(X_train_scaled)
y_test_LR = LR_model.predict(X_test_scaled)

train_mse_LR = mean_squared_error(y_train, y_train_LR)
train_r2_LR = r2_score(y_train, y_train_LR)

mse_LR = mean_squared_error(y_test, y_test_LR)
r2_LR = r2_score(y_test, y_test_LR)

LR_metrics = {'MSE':mse_LR, 'R2':r2_LR}

print(f'Train MSE: \t {train_mse_LR}')
print(f'Test MSE: \t {mse_LR}')
print(f'Train R2: \t {train_r2_LR}')
print(f'Test R2: \t {r2_LR}')



Train MSE: 	 0.1621316220371805
Test MSE: 	 0.14469695049134182
Train R2: 	 0.841300604688114
Test R2: 	 0.8738212794266512


In [6]:
# XGBoots
xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(X_train_scaled, y_train)

y_train_xgb = xgb_model.predict(X_train_scaled)
y_test_xgb = xgb_model.predict(X_test_scaled)

train_mse_xgb = mean_squared_error(y_train, y_train_xgb)
train_r2_xgb = r2_score(y_train, y_train_xgb)

mse_xgb = mean_squared_error(y_test, y_test_xgb)
r2_xgb = r2_score(y_test, y_test_xgb)

XGB_metrics = {'MSE':mse_xgb, 'R2':r2_xgb}

print(f'Train MSE: \t {train_mse_xgb}')
print(f'Test MSE: \t {mse_xgb}')
print(f'Train R2: \t {train_r2_xgb}')
print(f'Test R2: \t {r2_xgb}')

Train MSE: 	 4.918704321774255e-07
Test MSE: 	 0.2588026276067454
Train R2: 	 0.9999995185421624
Test R2: 	 0.7743187792033396


In [7]:
# Decision Tree
tree_model = DecisionTreeRegressor(random_state=42)
tree_model.fit(X_train_scaled, y_train)

y_train_tree = tree_model.predict(X_train_scaled)
y_test_tree = tree_model.predict(X_test_scaled)

train_mse_tree = mean_squared_error(y_train, y_train_tree)
train_r2_tree = r2_score(y_train, y_train_tree)

mse_tree = mean_squared_error(y_test, y_test_tree)
r2_tree = r2_score(y_test, y_test_tree)

DTree_metrics = {'MSE':mse_tree, 'R2':r2_tree}

print(f'Train MSE: \t {train_mse_tree}')
print(f'Test MSE: \t {mse_tree}')
print(f'Train R2: \t {train_r2_tree}')
print(f'Test R2: \t {r2_tree}')

Train MSE: 	 0.0
Test MSE: 	 0.30768786363636363
Train R2: 	 1.0
Test R2: 	 0.731689846691645


In [8]:
# Random Forest
forest_model = RandomForestRegressor(random_state=42)
forest_model.fit(X_train_scaled, y_train)

y_train_forest = forest_model.predict(X_train_scaled)
y_test_forest = forest_model.predict(X_test_scaled)

train_mse_forest = mean_squared_error(y_train, y_train_forest)
train_r2_forest = r2_score(y_train, y_train_forest)

mse_forest = mean_squared_error(y_test, y_test_forest)
r2_forest = r2_score(y_test, y_test_forest)

Forest_metrics = {'MSE':mse_forest, 'R2':r2_forest}

print(f'Train MSE: \t {train_mse_forest}')
print(f'Test MSE: \t {mse_forest}')
print(f'Train R2: \t {train_r2_forest}')
print(f'Test R2: \t {r2_forest}')

Train MSE: 	 0.04231499002941153
Test MSE: 	 0.25673043334091006
Train R2: 	 0.9585807922851959
Test R2: 	 0.776125775275853


  forest_model.fit(X_train_scaled, y_train)


In [9]:
# SVR
svr_model = SVR(kernel='rbf')
svr_model.fit(X_train_scaled, y_train)

y_train_svr = svr_model.predict(X_train_scaled)
y_test_svr = svr_model.predict(X_test_scaled)

train_mse_svr = mean_squared_error(y_train, y_train_svr)
train_r2_svr = r2_score(y_train, y_train_tree)

mse_svr = mean_squared_error(y_test, y_test_svr)
r2_svr = r2_score(y_test, y_test_svr)

SVR_metrics = {'MSE':mse_svr, 'R2':r2_svr}

print(f'Train MSE: \t {train_mse_svr}')
print(f'Test MSE: \t {mse_svr}')
print(f'Train R2: \t {train_r2_svr}')
print(f'Test R2: \t {r2_svr}')

Train MSE: 	 0.15843753400973856
Test MSE: 	 0.2779567950484619
Train R2: 	 1.0
Test R2: 	 0.7576159507523135


  y = column_or_1d(y, warn=True)


#### Cross-Validation for Regularization Parameter Tuning

In [19]:
# Define a grid of alpha values for Ridge
param_grid = {'alpha': [0.01, 0.1, 1.0, 10, 100]}
ridge_cv = GridSearchCV(Ridge(), param_grid, cv=5)
ridge_cv.fit(X_train_scaled, y_train)

# Best alpha
print("Best alpha:", ridge_cv.best_params_)

# Evaluate the best model
best_ridge_model = ridge_cv.best_estimator_
print("Best Ridge Train R²:", best_ridge_model.score(X_train_scaled, y_train))
print("Best Ridge Test R²:", best_ridge_model.score(X_test_scaled, y_test))

Best alpha: {'alpha': 10}
Best Ridge Train R²: 0.8301067235378031
Best Ridge Test R²: 0.8452075328700599
