In [8]:
import numpy as np
import matplotlib.pyplot as plt
from implementations import *
from crossvalidation import *
import os
from helpers import *
from itertools import product

In [12]:
# Loading the data
#data_path = os.path.join(os.getcwd(), "dataset")
# or
data_path = data_path = "/Users/louistschanz/Documents/EPFL/MA1/ML/Project 1/ml-project-1-ai-huasca-1/dataset"
x_train, x_test, y_train, train_ids, test_ids = load_csv_data(data_path)
print("Data loaded successfully!")

# Basic settings for reproducibility
np.random.seed(42)

FileNotFoundError: /Users/louistschanz/Documents/EPFL/MA1/ML/Project 1/ml-project-1-ai-huasca-1/dataset/y_train.csv not found.

In [None]:
# Define Models and Hyperparameter Grids
# Dictionary of model functions
models = {
    "mean_squared_error_gd": mean_squared_error_gd,
    "mean_squared_error_sgd": mean_squared_error_sgd,
    "least_squares": least_squares,
    "ridge_regression": ridge_regression,
    "logistic_regression": logistic_regression,
    "reg_logistic_regression": reg_logistic_regression
}

# Define hyperparameter grids for each model
param_grid = {
    "mean_squared_error_gd": {"initial_w": [np.zeros(x_train.shape[1])], "max_iters": [500, 1000], "gamma": [0.01, 0.1, 0.5]},
    "mean_squared_error_sgd": {"initial_w": [np.zeros(x_train.shape[1])], "max_iters": [500, 1000], "gamma": [0.01, 0.1, 0.5]},
    "least_squares": {},  # No hyperparameter for least squares
    "ridge_regression": {"lambda_": [0.01, 0.1, 1, 10]},
    "logistic_regression": {"initial_w": [np.zeros(x_train.shape[1])], "max_iters": [500, 1000], "gamma": [0.01, 0.1, 0.5]},
    "reg_logistic_regression": {"initial_w": [np.zeros(x_train.shape[1])], "max_iters": [500, 1000], "gamma": [0.01, 0.1, 0.5], "lambda_": [0.01, 0.1, 1, 10]} # lambda only for reg_logistic_regression
}

In [None]:
# Cross validate and tune hyperparameters for each model
# Dictionary to store best parameters and scores for each model
tuning_results = {}

# Loop through each model for cross-validation and tuning
for model_name, model_fn in models.items():
    print(f"\nCross-validating and tuning {model_name}...")
    
    # Retrieve hyperparameters for the current model
    model_params = param_grid[model_name]
    
    best_score = float('inf')
    best_params = None
    scores = []  # Store scores for plotting

    # Perform grid search over hyperparameter combinations
    param_names, param_values = zip(*model_params.items())
    for param_combo in product(*param_values):
        params = dict(zip(param_names, param_combo))
        
        # Cross-validate with the current parameter combination
        cv_score = cross_validate(model_fn, X=x_train, y=y_train, k=5, **params)
        scores.append((params, cv_score))
        print(f"Params: {params}, Cross-validated loss: {cv_score}")
        
        # Update best score and parameters if current score is better
        if cv_score < best_score:
            best_score = cv_score
            best_params = params
    
    # Store the best result for the current model
    tuning_results[model_name] = {"best_params": best_params, "best_score": best_score, "scores": scores}

In [None]:
# Code que j'avais fais avant qd c'était dans le run, 
# mtn méthode differente, car run de crossvalidation dans le notebook crossvalidation


# ***************** Cross validation ***************** #

# Flexible crossvalidation on which function from implementations you want to validate
# there are dynamical arguments and you can change the function you want to validate by changing the function name

# 0) define the parameters
initial_w = np.zeros(x_train.shape[1])  # Initial weights
max_iters = 1000                       # Number of iterations
gamma = 0.1                             # Learning rate
lambda_ = 0.1                           # Regularization parameter

# 1) mean_squared_error_gd
cv_loss_gd = cross_validate(model_fn=mean_squared_error_gd, X=x_train, Y=y_train, k=5, initial_w=initial_w, max_iters=max_iters, gamma=gamma)
print("Cross-validated loss for mean_squared_error_gd:", cv_loss_gd)

# 2) mean_squared_error_sgd
cv_loss_sgd = cross_validate(model_fn=mean_squared_error_sgd, X=x_train, Y=y_train, k=5, initial_w=initial_w, max_iters=max_iters, gamma=gamma)
print("Cross-validated loss for mean_squared_error_sgd:", cv_loss_gd)

# 3) least_squares
cv_loss_ls = cross_validate(model_fn=least_squares, X=x_train, Y=y_train, k=5)
print("Cross-validated loss for least_squares:", cv_loss_ls)

# 4) ridge_regression
#lambda_ = 0.1
#cv_loss_rr = cross_validate(ridge_regression, X=x_train, Y=y_train, k=5, lambda_=lambda_)
#print("Cross-validated loss for ridge_regression:", cv_loss_rr)

# 5) logistic_regression
cv_loss_lr = cross_validate(
    model_fn=logistic_regression,  # Pass the logistic regression function
    X=x_train,
    y=y_train,
    k=5,
    initial_w=initial_w,
    max_iters=max_iters,
    gamma=gamma
    )
print("Cross-validated loss for logistic_regression:", cv_loss_lr)

# 6) reg_logistic_regression
cv_loss_rlr = cross_validate(
    model_fn=reg_logistic_regression,  # Pass the regularized logistic regression function
    X=x_train,
    y=y_train,
    k=5, initial_w=initial_w, max_iters=max_iters, gamma=gamma,lambda_=lambda_
)
print("Cross-validated loss for reg_logistic_regression:", cv_loss_rlr)

# 4) Ridge Regression : cross validate + fine tuning of parameters
# Define ranges for hyperparameters
gamma_values = [0.01, 0.1, 0.5]
lambda_values = [0.01, 0.1, 1, 10]

best_score = float('inf')
best_params = {}

# Perform grid search over gamma and lambda_
for gamma in gamma_values:
    for lambda_ in lambda_values:
        cv_loss_rr = cross_validate(model_fn=ridge_regression,X=x_train,y=y_train,k=5,gamma=gamma,lambda_=lambda_)
        print(f"Cross-validated loss with gamma={gamma}, lambda_={lambda_}: {cv_loss_rr}")
        
        # Update best score and parameters
        if cv_loss_rr < best_score:
            best_score = cv_loss_rr
            best_params = {'gamma': gamma, 'lambda_': lambda_}

print(f"The best parameters: {best_params} yield a cross-validated loss of: {best_score}")


# LOOCV: To perform Leave-One-Out Cross-Validation --> just call cross_validate with k=len(y).