In [None]:
# Import 
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from sklearn.linear_model import Ridge, Lasso, SGDRegressor, ElasticNet
from sklearn.model_selection import ParameterGrid
from sklearn.utils.validation import column_or_1d

# R2 score
from sklearn.metrics import r2_score

from warnings import filterwarnings
filterwarnings('ignore')

from tqdm.auto import tqdm

In [None]:
# Notebook-wide configuration
# Fixed seed: makes NumPy's global random generator reproducible across runs
SEED = 1234
np.random.seed(seed=SEED)
# Base directory that the dataset paths below are built from
PATH = "../data/regression/"

In [None]:
# Read the four dataset arrays from disk (features first, then targets)
X_train, X_test, y_train, y_test = (
    np.load(PATH + filename)
    for filename in ("X_train.npy", "X_test.npy", "y_train.npy", "y_test.npy")
)

# Validate the target shapes with column_or_1d (avoids shape warnings later on)
y_train, y_test = column_or_1d(y_train), column_or_1d(y_test)

# Show the training feature matrix shape as the cell output
X_train.shape

In [None]:
# Candidate regressors, each built with its default constructor arguments,
# keyed by the label used in progress bars and plot titles.
models = {
    label: estimator_cls()
    for label, estimator_cls in (
        ("ElasticNet", ElasticNet),
        ("SGDRegressor", SGDRegressor),
        ("Ridge", Ridge),
        ("Lasso", Lasso),
    )
}

In [None]:
def regression(model: object, X_train: np.ndarray, X_test: np.ndarray, y_train: np.ndarray, y_test: np.ndarray, hyperparameters: dict = None) -> tuple:
    """Exhaustively search a hyperparameter grid for ``model`` and report the best R2.

    Each parameter combination is applied with ``model.set_params``, the model is
    fitted on ``(X_train, y_train)`` and scored with ``r2_score`` on
    ``(X_test, y_test)``.

    NOTE(review): the winning combination is selected on the same test split
    that produces the reported score, so ``best_score`` is a selection-biased
    number rather than an independent estimate. Consider a separate validation
    split if an unbiased figure is needed.

    Args:
        model: Estimator exposing ``set_params``, ``fit`` and ``predict``. It is
            mutated in place; on return it is configured with the best
            parameters found and refitted on the training data.
        X_train: Training features.
        X_test: Features used for scoring.
        y_train: Training targets.
        y_test: Targets used for scoring.
        hyperparameters: Optional grid specification passed to ``ParameterGrid``
            (parameter name -> iterable of values). When omitted, the default
            grid of 100 ``alpha`` values in [0.001, 0.6] and 5 ``max_iter``
            values in [1, 50] is used.

    Returns:
        A ``(best_score, params_save)`` tuple, where ``params_save`` is a list
        of ``(params, score)`` pairs in grid order.
    """
    # Fall back to the default search space when the caller does not supply one
    if hyperparameters is None:
        hyperparameters = {
            "alpha": np.linspace(0.001, 0.6, 100),
            "max_iter": np.linspace(1, 50, 5, dtype=int),
        }
    grid = ParameterGrid(hyperparameters)

    best_score, best_param = -np.inf, None
    # Every evaluated (params, score) pair, kept for later plotting
    params_save = []

    bar = tqdm(grid)
    for params in bar:
        bar.set_description(f"Testing {params}")
        model.set_params(**params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        score = r2_score(y_test, y_pred)
        # Strict comparison: on ties the earliest combination in grid order wins
        if score > best_score:
            best_score = score
            best_param = params
        params_save.append((params, score))

    # Without this the estimator stays configured/fitted with the LAST grid
    # point, which does not match the score reported below. best_param is None
    # only if no score ever compared greater than -inf (e.g. all-NaN scores).
    if best_param is not None:
        model.set_params(**best_param)
        # NOTE(review): for estimators with unseeded randomness this refit may
        # not reproduce best_score exactly.
        model.fit(X_train, y_train)

    print(f"Best R2 score for model {model.__class__.__name__}: {best_score} with params {best_param}")
    return best_score, params_save

In [None]:
def compare_models(models: dict, X_train: np.ndarray, X_test: np.ndarray, y_train: np.ndarray, y_test: np.ndarray) -> tuple:
    """Run ``regression`` on every model and gather the results.

    Args:
        models: Mapping of display name -> estimator; iterated in dict order.
        X_train, X_test, y_train, y_test: Data forwarded unchanged to
            ``regression`` for each estimator.

    Returns:
        ``(r2, model_params)``: two lists aligned with the iteration order of
        ``models``, holding the first and second element of each
        ``regression`` result respectively.
    """
    outcomes = []
    progress = tqdm(models.items())
    for label, estimator in progress:
        progress.set_description(f"Training {label}")
        outcomes.append(regression(estimator, X_train, X_test, y_train, y_test))

    # Split the collected (score, history) pairs into two parallel lists
    r2 = [best for best, _ in outcomes]
    model_params = [history for _, history in outcomes]
    return r2, model_params

In [None]:
# Grid-search every candidate model; keep the best scores and the full search histories
r2_scores, model_params = compare_models(
    models=models,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

In [None]:
# Plot Models Parameters
def plot_model_params(model_params: list, models: dict) -> None:
    """Draw one 3D scatter of (alpha, max_iter, R2) per model.

    Args:
        model_params: One entry per model, in the same order as ``models``;
            each entry is a list of ``(params, score)`` pairs where ``params``
            has the keys ``"alpha"`` and ``"max_iter"``.
        models: Mapping whose keys are used as subplot titles.

    The subplots are laid out two per row. At least two rows are always
    allocated (the original 2x2 layout); further rows are added when more than
    four models are given, instead of failing on the fifth subplot.
    """
    n_cols = 2
    # Ceil-divide the model count by the column count, never going below 2 rows
    n_rows = max(2, (len(models) + n_cols - 1) // n_cols)
    # 5 inches of height per row keeps the original 15x10 figure for two rows
    fig = plt.figure(figsize=(15, 5 * n_rows))
    for i, name in enumerate(models.keys()):
        ax = fig.add_subplot(n_rows, n_cols, i + 1, projection='3d')
        history = model_params[i]
        alphas = [param["alpha"] for param, _ in history]
        max_iters = [param["max_iter"] for param, _ in history]
        scores = [score for _, score in history]
        ax.scatter(alphas, max_iters, scores)
        ax.set_title(name)
        ax.set_xlabel("Alpha")
        ax.set_ylabel("Max Iter")
        ax.set_zlabel("R2")
    plt.tight_layout()
    plt.show()

# Visualise how R2 varies over the searched (alpha, max_iter) grid for each model
plot_model_params(model_params=model_params, models=models)

In [None]:
# Bar chart of the best R2 per model
def plot_r2_scores(r2_scores: list, models: dict) -> None:
    """Draw a bar chart with one bar per model.

    Args:
        r2_scores: Bar heights, in the same order as the keys of ``models``.
        models: Mapping whose keys label the bars.
    """
    _, axis = plt.subplots(nrows=1, ncols=1, figsize=(15, 5))
    axis.bar(models.keys(), r2_scores)
    # Title and axis labels applied in a single call
    axis.set(title="R2 scores", xlabel="Model", ylabel="R2")

# Compare the best R2 reached by each model side by side
plot_r2_scores(r2_scores=r2_scores, models=models)