In [1]:
import numpy as np
import matplotlib.pyplot as plt
from implementations import *
import os
from helpers import *

from crossvalidation import cross_validate
from crossvalidation import tune_hyperparameters
#from crossvalidation import tune_hyperparameters1
#from crossvalidation import tune_hyperparameters2

In [None]:
# Load the dataset a single time.
# The CSV files are expected under <current working directory>/data/dataset;
# adjust data_path if the data lives somewhere else.
data_path = os.path.join(os.getcwd(), "data", "dataset")
x_train, x_test, y_train, train_ids, test_ids = load_csv_data(data_path)
print("Data loaded successfully!")

# Basic settings for reproducibility: seed NumPy's global random generator
np.random.seed(42)

In [28]:
# TEST CODE
# NOTE: this cell rebinds x_train / y_train to synthetic random arrays, so every
# later cell that uses those names works on this data, not on a dataset loaded from disk.
N_TEST_SAMPLES = 150    # number of synthetic rows
N_TEST_FEATURES = 200   # number of synthetic feature columns

# Create test arrays
x_train = np.random.rand(N_TEST_SAMPLES, N_TEST_FEATURES)  # (150, 200) features, uniform in [0, 1)
y_train = np.random.rand(N_TEST_SAMPLES)                   # (150,) target values, uniform in [0, 1)
initial_weights = np.zeros(x_train.shape[1])               # one zero weight per feature column

In [None]:
# Sanity check: report whether the features or the targets contain any
# NaN or infinite entries.
print("NaNs in x_train:", np.any(np.isnan(x_train)))
print("Infs in x_train:", np.any(np.isinf(x_train)))
print("NaNs in y_train:", np.any(np.isnan(y_train)))
print("Infs in y_train:", np.any(np.isinf(y_train)))

In [30]:
# Models and hyperparameter grids
# -------------------------------
# `models` maps a model name to the function that implements it.
models = {
    "mean_squared_error_gd": mean_squared_error_gd,
    "mean_squared_error_sgd": mean_squared_error_sgd,
    "least_squares": least_squares,
    "ridge_regression": ridge_regression,
    "logistic_regression": logistic_regression,
    "reg_logistic_regression": reg_logistic_regression,
}

# `param_grid` uses the same keys as `models`; each entry maps a hyperparameter
# name to the list of candidate values to try for that model.
param_grid = {
    "mean_squared_error_gd": {
        "max_iters": [10],
        "gamma": [0.01, 0.001, 0.0001],
    },
    "mean_squared_error_sgd": {
        "max_iters": [10],
        "gamma": [0.01, 0.001, 0.0001],
    },
    # Least squares has no hyperparameters to tune.
    "least_squares": {},
    "ridge_regression": {
        "lambda_": [0.2, 0.1, 0.01, 0.001, 0.0001],
    },
    "logistic_regression": {
        "max_iters": [10],
        "gamma": [0.01, 0.001, 0.0001],
    },
    "reg_logistic_regression": {
        "max_iters": [10],
        "gamma": [0.01, 0.001, 0.0001],
        "lambda_": [0.2, 0.1, 0.01, 0.001, 0.0001],
    },
}

In [None]:
# Hyperparameter tuning
# Runs tune_hyperparameters over every model in `models` with the candidate
# values from `param_grid`, passing k=5.
tuning_results = tune_hyperparameters(
    models,
    param_grid,
    x_train,
    y_train,
    initial_weights,
    k=5,
)

# Report, per model, the best parameter combination and the score it achieved.
for model_name, result in tuning_results.items():
    print(f"{model_name}: Best params: {result['best_params']}, Best score: {result['best_score']}")

In [8]:
# 0) Parameters used by the cross-validation call(s) below

# Initial weights: a zero vector with one entry per feature column
initial_w = np.zeros(x_train.shape[1])
# Number of iterations
max_iters = 100
# Learning rate
gamma = 0.1
# Regularization parameter
lambda_ = 0.1

In [None]:
# 1) mean_squared_error_gd: cross-validate with k=5 using the parameters defined above
cv_loss_gd = cross_validate(
    model_fn=mean_squared_error_gd,
    X=x_train,
    y=y_train,
    k=5,
    initial_w=initial_w,
    max_iters=max_iters,
    gamma=gamma,
)
print("Cross-validated loss for mean_squared_error_gd:", cv_loss_gd)

In [None]:
# Legacy code, written back when cross-validation lived in the run script.
# The approach has since changed: cross-validation is now run from the
# crossvalidation notebook.


# ***************** Cross validation ***************** #

# Flexible cross-validation: choose which function from `implementations` to
# validate by changing `model_fn`; the arguments are dynamic, so each call only
# lists the hyperparameters that the chosen model needs.

# 0) define the parameters
initial_w = np.zeros(x_train.shape[1])  # Initial weights (one per feature column)
max_iters = 100                         # Number of iterations
gamma = 0.1                             # Learning rate
lambda_ = 0.1                           # Regularization parameter

# 1) mean_squared_error_gd
cv_loss_gd = cross_validate(model_fn=mean_squared_error_gd, X=x_train, y=y_train, k=5, initial_w=initial_w, max_iters=max_iters, gamma=gamma)
print("Cross-validated loss for mean_squared_error_gd:", cv_loss_gd)

# 2) mean_squared_error_sgd
cv_loss_sgd = cross_validate(model_fn=mean_squared_error_sgd, X=x_train, y=y_train, k=5, initial_w=initial_w, max_iters=max_iters, gamma=gamma)
# Report the SGD loss (this line previously printed cv_loss_gd by mistake).
print("Cross-validated loss for mean_squared_error_sgd:", cv_loss_sgd)

# 3) least_squares
# The targets are passed as `y=` like in every other call (was `Y=`).
cv_loss_ls = cross_validate(model_fn=least_squares, X=x_train, y=y_train, k=5)
print("Cross-validated loss for least_squares:", cv_loss_ls)

# 4) ridge_regression: cross-validation + tuning of lambda_
# Only lambda_ is searched: it is the only ridge hyperparameter in param_grid,
# so no gamma is passed to the model.
lambda_values = [0.01, 0.1, 1, 10]

best_score = float('inf')
best_params = {}

# A dedicated loop variable keeps the shared `lambda_` defined in step 0 intact
# for the regularized logistic regression call below.
for lambda_candidate in lambda_values:
    cv_loss_rr = cross_validate(model_fn=ridge_regression, X=x_train, y=y_train, k=5, lambda_=lambda_candidate)
    print(f"Cross-validated loss with lambda_={lambda_candidate}: {cv_loss_rr}")

    # Keep the candidate with the lowest cross-validated loss seen so far
    if cv_loss_rr < best_score:
        best_score = cv_loss_rr
        best_params = {'lambda_': lambda_candidate}

print(f"The best parameters: {best_params} yield a cross-validated loss of: {best_score}")

# 5) logistic_regression
cv_loss_lr = cross_validate(
    model_fn=logistic_regression,  # Pass the logistic regression function
    X=x_train,
    y=y_train,
    k=5,
    initial_w=initial_w,
    max_iters=max_iters,
    gamma=gamma,
)
print("Cross-validated loss for logistic_regression:", cv_loss_lr)

# 6) reg_logistic_regression
cv_loss_rlr = cross_validate(
    model_fn=reg_logistic_regression,  # Pass the regularized logistic regression function
    X=x_train,
    y=y_train,
    k=5,
    initial_w=initial_w,
    max_iters=max_iters,
    gamma=gamma,
    lambda_=lambda_,
)
print("Cross-validated loss for reg_logistic_regression:", cv_loss_rlr)


# LOOCV: To perform Leave-One-Out Cross-Validation --> just call cross_validate with k=len(y).