In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from tqdm import tqdm

number_of_folds = 10

data = pd.read_csv("./bodyfat.csv")
X = data.drop(columns=["BodyFat", "Density"])
y = data["BodyFat"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [3]:
# (i)
# Linear Regression:

linear_regression = LinearRegression()
fitted_linear_regression = linear_regression.fit(X_train, y_train)
train_mean_squared_error = np.mean((fitted_linear_regression.predict(X_train) - y_train) ** 2)
test_mean_squared_error = np.mean((fitted_linear_regression.predict(X_test) - y_test) ** 2)

print(
    "Erro de treino do modelo de regressão linear: ",
    train_mean_squared_error,
    "\n",
    "Erro de teste do modelo de regressão linear: ",
    test_mean_squared_error,
)

Erro de treino do modelo de regressão linear:  18.090133641655054 
 Erro de teste do modelo de regressão linear:  15.910489104854424


In [5]:
# (ii)
# Ridge Regression:
alphas = 10 ** np.linspace(3, -2, 100)

ridge_cv_mse = []
for alpha in tqdm(alphas):
    cv_MSE = []
    folds = KFold(n_splits=number_of_folds, shuffle=True, random_state=2023).split(X_train, y_train)
    for train_idx, val_idx in folds:
        ridge_pipeline = make_pipeline(StandardScaler(), Ridge(alpha=alpha))
        ridge_pipeline[1].fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
        y_hat = ridge_pipeline[1].predict(X_train.iloc[val_idx])
        cv_MSE.append(np.mean(y_hat - y_train.iloc[val_idx]) ** 2)

    ridge_cv_mse.append(np.mean(cv_MSE))

optimal_alpha = alphas[np.argmin(ridge_cv_mse)]

ridge_pipeline = make_pipeline(StandardScaler(), Ridge(alpha=optimal_alpha))
ridge_pipeline[1].fit(X_train, y_train)

y_hat_ridge = ridge_pipeline[1].predict(X_test) 
ridge_test_mse = np.mean((y_hat_ridge - y_test) ** 2)
print(ridge_test_mse)

100%|██████████| 100/100 [00:03<00:00, 28.33it/s]

15.910432432652605





In [11]:
optimal_alpha

0.01

In [5]:
# (iii)

kernels = ["linear", "polynomial", "rbf", "laplacian"]
gammas = [10**-3]
alphas = 10 ** np.linspace(3, -2, 100)
hyperparams = [(kernel, gamma, alpha) for kernel in kernels for gamma in gammas for alpha in alphas]

ridge_cv_mse = []
for hyperparam in tqdm(hyperparams):
    cv_MSE = []
    folds = KFold(n_splits=number_of_folds, shuffle=True, random_state=2023).split(X_train, y_train)
    for train_idx, val_idx in folds:
        ridge_pipeline = make_pipeline(StandardScaler(), KernelRidge(alpha=hyperparam[2], kernel=hyperparam[0], gamma=hyperparam[1]))
        ridge_pipeline[1].fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
        y_hat = ridge_pipeline[1].predict(X_train.iloc[val_idx])
        cv_MSE.append(np.mean(y_hat - y_train.iloc[val_idx]) ** 2)

    ridge_cv_mse.append(np.mean(cv_MSE))

optimal_hyperparam = hyperparams[np.argmin(ridge_cv_mse)]

ridge_pipeline = make_pipeline(StandardScaler(), KernelRidge(alpha=optimal_hyperparam[2], kernel=optimal_hyperparam[0], gamma=optimal_hyperparam[1]))
ridge_pipeline[1].fit(X_train, y_train)

y_hat_ridge = ridge_pipeline[1].predict(X_test) 
ridge_test_mse = np.mean((y_hat_ridge - y_test) ** 2)
print(ridge_test_mse)
print(optimal_hyperparam)

100%|██████████| 400/400 [03:00<00:00,  2.22it/s]

18.459461584351413



