# 4 - Regression on a given dataset

### Imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

### Loading the dataset

In [2]:
# Training split: feature matrix and the matching target vector
X_train, y_train = (
    np.load('./data/regression/X_train.npy'),
    np.load('./data/regression/y_train.npy'),
)

# Testing split: loaded the same way, from the sibling files
X_test, y_test = (
    np.load('./data/regression/X_test.npy'),
    np.load('./data/regression/y_test.npy'),
)

## Regression

### Ridge

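For the Ridge regression, we have to find the regularization strength $\lambda$ (the `alpha` parameter in scikit-learn) that gives the best generalization performance. To that end, we use the Optuna library to search a specific range, sampled on a log scale, for the $\lambda$ that maximizes the 5-fold cross-validated `R2` score on the training set. After finding the best value for the parameter, we use the `R2` score on the test set as a metric to evaluate the performance of the Ridge regression.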

In [3]:
# Hyperparameter search with Optuna

import optuna
from sklearn.model_selection import cross_val_score, KFold

def objective(trial):
    """Optuna objective: mean 5-fold cross-validated score of a Ridge model.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``alpha`` on a log scale in [1e-10, 1e5].

    Returns
    -------
    float
        Mean of the per-fold scores returned by ``cross_val_score`` on
        ``(X_train, y_train)``. No ``scoring`` is passed, so the estimator's
        default score is used (R2 for scikit-learn regressors).
    """
    # Define the hyperparameter configuration space for the Ridge regressor
    alpha = trial.suggest_float('alpha', 1e-10, 1e5, log=True)

    # Define the Ridge regressor. It is deliberately not fitted here:
    # cross_val_score clones the estimator and fits a fresh copy on each
    # training fold, so a fit on the full training set would be thrown away.
    ridge_regressor = Ridge(alpha=alpha)

    # Define the cross-validation folds (fixed seed: every trial sees the same splits)
    cv = KFold(n_splits=5, shuffle=True, random_state=42)

    # Evaluate the model
    cross_validation_score = cross_val_score(ridge_regressor, X_train, y_train, cv=cv, n_jobs=-1).mean()

    return cross_validation_score

# Lower Optuna's verbosity before creating the study, so that the INFO-level
# "A new study created in memory" message is suppressed as well
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Create a study object; the objective returns a cross-validated score,
# so the study has to maximize it (Optuna minimizes by default)
study = optuna.create_study(direction='maximize')

# Start the optimization
study.optimize(objective, n_trials=500, n_jobs=5, show_progress_bar=True)

# Print the optimized parameter values
print(study.best_params)
print(study.best_value)

[I 2023-06-20 16:33:31,766] A new study created in memory with name: no-name-f4e99142-27ad-49bc-857d-cc0c1db58fad


  0%|          | 0/500 [00:00<?, ?it/s]

{'alpha': 2.0801118914092473}
0.5568448762057682


In [4]:
# Refit Ridge on the whole training set with the tuned hyperparameters
# (fit returns the estimator itself, so construction and fitting can be chained)
ridge = Ridge(**study.best_params).fit(X_train, y_train)

# Score the predictions on the held-out test set
y_pred = ridge.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"R2 score: {r2:.2f}")

R2 score: 0.59


### Lasso

We use the same approach as for the Ridge regression, applied to the Lasso regression. We also use the `R2` score as a metric to evaluate the performance of the Lasso regression.

In [5]:
# Same with Lasso
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated score of a Lasso model.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``alpha`` on a log scale in [1e-10, 1e5].

    Returns
    -------
    float
        Mean of the per-fold scores returned by ``cross_val_score`` on
        ``(X_train, y_train)``. No ``scoring`` is passed, so the estimator's
        default score is used (R2 for scikit-learn regressors).
    """
    # Define the hyperparameter configuration space for the Lasso regressor
    alpha = trial.suggest_float('alpha', 1e-10, 1e5, log=True)

    # Define the Lasso regressor. It is deliberately not fitted here:
    # cross_val_score clones the estimator and fits a fresh copy on each
    # training fold, so a fit on the full training set would be thrown away.
    lasso_regressor = Lasso(alpha=alpha, max_iter=10000, tol=1e-2)

    # Define the cross-validation folds (fixed seed: every trial sees the same splits)
    cv = KFold(n_splits=5, shuffle=True, random_state=42)

    # Evaluate the model
    cross_validation_score = cross_val_score(lasso_regressor, X_train, y_train, cv=cv, n_jobs=-1).mean()

    return cross_validation_score

# Fresh study for the Lasso search; the objective is a score, hence 'maximize'
study = optuna.create_study(direction='maximize')

# From here on, only let Optuna report errors
optuna.logging.set_verbosity(optuna.logging.ERROR)

# Run the search
study.optimize(
    objective,
    n_trials=500,
    n_jobs=5,
    show_progress_bar=True,
)

# Report the best hyperparameters, then the score they achieved
print(study.best_params, study.best_value, sep='\n')

  0%|          | 0/500 [00:00<?, ?it/s]

{'alpha': 0.009022172041213472}
0.8718600551752118


In [6]:
# Refit Lasso on the whole training set with the tuned hyperparameters
# (fit returns the estimator itself, so construction and fitting can be chained)
# NOTE(review): the search objective built Lasso with max_iter=10000, tol=1e-2,
# whereas this refit uses the library defaults — confirm this is intended.
lasso = Lasso(**study.best_params).fit(X_train, y_train)

# Score the predictions on the held-out test set
y_pred = lasso.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"R2 score: {r2:.2f}")

R2 score: 0.88


## Results

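We can conclude that the best-performing regression is the Lasso regression: it reaches an `R2` score of 0.88 on the test set, compared with 0.59 for the Ridge regression. Indeed, this `R2` score is as good as the one obtained by the Bayes estimator.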

<!-- random_forest_regressor =  -->