In [2]:
import numpy as np
import matplotlib.pyplot as plt
import os
os.chdir("..")
from helpers import *

from crossvalidation import cross_validate
from crossvalidation import tune_hyperparameters
#from crossvalidation import tune_hyperparameters1
#from crossvalidation import tune_hyperparameters2

from helpers import *
from helpers_perso import *
from preprocessing.nan_imputation import *
from preprocessing.binary_encoding import *
from implementations import *
from preprocessing.standardization import *
from preprocessing.class_balancing import *

In [None]:
# Build the dataset directory path relative to the current working directory.
data_path = os.path.join(os.getcwd(), "data", "dataset")
# load_csv_data is a project helper brought in by a star import; its result is
# unpacked into five values: train features, test features, train targets and
# the two id arrays (names as unpacked here).
x_train, x_test, y_train, train_ids, test_ids = load_csv_data(data_path)
print("Data loaded successfully!")


In [28]:
# TEST CODE
# Create synthetic test arrays. Running this cell rebinds x_train / y_train,
# replacing whatever was previously stored under those names in the kernel.
# NOTE: no random seed is set, so the values differ on every run.
x_train = np.random.rand(150, 200)  # 150 x 200 array of features, uniform in [0, 1)
y_train = np.random.rand(150)   # 150 target values, uniform in [0, 1)
initial_weights = np.zeros(x_train.shape[1])  # Define initial weights based on feature count

In [None]:
# Balance the classes.
# NOTE(review): x_balanced, y_balanced and deleted_ids are not read again in the
# visible notebook; every following step works on x_train / y_train. Confirm
# whether the pipeline below was meant to run on the balanced data.
x_balanced, y_balanced, deleted_ids = balance_classes(x_train, y_train, 1)

# Drop features from the training set (0.8 is presumably a NaN-ratio threshold — TODO confirm),
# then delete the same column indices from the test set so both keep matching columns.
x_train_cleaned, deleted_indices = remove_nan_features(x_train, 0.8)
adapted_x_test = np.delete(x_test, deleted_indices, axis=1)


# Partition the remaining columns into integer-valued and non-integer ones;
# together the two index lists must account for every column.
integer_columns, non_integer_columns = identify_integer_columns(x_train_cleaned)
assert x_train_cleaned.shape[1] == len(integer_columns) + len(non_integer_columns)

# Training set: fill NaNs in the integer columns first, then in the continuous
# columns (both passes use the 'mode' replacement).
x_train_cleaned_without_nans = encode_nan_continuous_columns(
    encode_nan_integer_columns(x_train_cleaned, replacement_value='mode'),
    replacement_value='mode',
)
assert not np.isnan(x_train_cleaned_without_nans).any()
assert x_train_cleaned_without_nans.shape == x_train_cleaned.shape

# Test set: same two imputation passes, same invariants.
adapted_x_test_without_nans = encode_nan_continuous_columns(
    encode_nan_integer_columns(adapted_x_test, replacement_value='mode'),
    replacement_value='mode',
)
assert not np.isnan(adapted_x_test_without_nans).any()
assert adapted_x_test_without_nans.shape == adapted_x_test.shape

# Integer columns with at most `categorical_threshold` distinct values are
# treated as categorical; every other column ends up in the non-categorical list.
categorical_threshold = 5
# NOTE(review): the counts are taken on x_train_cleaned, i.e. before NaN
# imputation, so a NaN present in a column is counted by np.unique too —
# confirm this is intended rather than counting on the imputed matrix.
unique_value_counts = np.array([len(np.unique(x_train_cleaned[:, col])) for col in integer_columns])
indexes_categorical_features = [
    col for col, count in zip(integer_columns, unique_value_counts)
    if count <= categorical_threshold
]
# Select the complement directly by count instead of scanning the categorical
# list for every column (the previous `not in` test was quadratic).
indexes_non_categorical_features = [
    col for col, count in zip(integer_columns, unique_value_counts)
    if count > categorical_threshold
]
assert len(indexes_categorical_features) + len(indexes_non_categorical_features) == len(unique_value_counts)
assert unique_value_counts.size == len(integer_columns)
# Non-integer columns are never categorical, so they join the non-categorical list.
indexes_non_categorical_features.extend(non_integer_columns)

# Standardize the non-categorical columns of the imputed train and test matrices.
# NOTE(review): each matrix is passed to standardize_columns on its own, so the
# test set is presumably scaled with its own statistics — confirm this is wanted.
x_standardized = standardize_columns(x_train_cleaned_without_nans, indexes_non_categorical_features)
x_test_standardized = standardize_columns(adapted_x_test_without_nans, indexes_non_categorical_features)

# Encode the categorical columns of both matrices in a single call.
encoded_x_train, encoded_x_test = consistent_binary_encode(x_standardized, x_test_standardized, indexes_categorical_features)

In [None]:
# Sanity check: report whether the training features / targets hold NaN or Inf.
# NOTE(review): this inspects x_train and y_train as currently bound, not
# encoded_x_train — confirm that is the array you want to check.
for array_name, array_values in (("x_train", x_train), ("y_train", y_train)):
    print(f"NaNs in {array_name}:", np.isnan(array_values).any())
    print(f"Infs in {array_name}:", np.isinf(array_values).any())

In [16]:
# Models and hyperparameter grids
# `models` maps a model name to its training function; commented-out entries
# are currently disabled.
models = {
    # "mean_squared_error_gd": mean_squared_error_gd,
    # "mean_squared_error_sgd": mean_squared_error_sgd,
    # "least_squares": least_squares,
    # "ridge_regression": ridge_regression,
    "logistic_regression": logistic_regression,
    # "reg_logistic_regression": reg_logistic_regression
}

# `param_grid` maps a model name to {hyperparameter name: list of candidate values}.
# It also holds grids for models that are disabled above.
param_grid = {
    "mean_squared_error_gd": {
        "max_iters": [15],
        "gamma": np.linspace(0.0001, 0.00001, 5).tolist(),
    },
    "mean_squared_error_sgd": {
        "max_iters": [15],
        "gamma": np.linspace(0.0001, 0.00001, 5).tolist(),
    },
    # Least squares has no hyperparameters
    "least_squares": {},
    "ridge_regression": {
        "lambda_": [0.2, 0.1, 0.01, 0.001, 0.0001],
    },
    "logistic_regression": {
        "max_iters": [15],
        "gamma": np.linspace(0.1, 0.00001, 100).tolist(),
    },
    "reg_logistic_regression": {
        "max_iters": [15],
        "gamma": np.linspace(0.0001, 0.00001, 5).tolist(),
        "lambda_": [0.2, 0.1, 0.01, 0.001, 0.0001],
    },
}

In [None]:
# Run hyperparameter tuning on the preprocessed training features.
# Uses the `models` and `param_grid` dictionaries defined in an earlier cell.
# NOTE: x_train is rebound to encoded_x_train here, so every later cell that
# reads x_train sees the encoded matrix rather than the raw data.
x_train = encoded_x_train
initial_weights = np.zeros(x_train.shape[1])  # Initial weights

# TODO create loss for regression (-log)

tuning_results = tune_hyperparameters(models, param_grid, x_train, y_train, initial_weights, k=5)
#tuning_results = tune_hyperparameters1(models, param_grid, x_train, y_train, initial_weights, k=5)
#tuning_results = tune_hyperparameters2(models, param_grid, x_train, y_train, k=5)

# Print best results for each model: each entry of tuning_results is indexed
# by 'best_params' and 'best_score'.
for model_name, result in tuning_results.items():
    print(f"{model_name}: Best params: {result['best_params']}, Best score: {result['best_score']}")

In [8]:
# 0) Shared settings for the cross-validation runs
# Number of iterations, learning rate and regularization parameter, in that order.
max_iters, gamma, lambda_ = 100, 0.1, 0.1
# Zero initial weights, sized from the second dimension of x_train.
initial_w = np.zeros(x_train.shape[1])

In [None]:
# 1) mean_squared_error_gd: 5-fold cross-validation with the shared settings
cv_loss_gd = cross_validate(
    model_fn=mean_squared_error_gd,
    X=x_train,
    y=y_train,
    k=5,
    initial_w=initial_w,
    max_iters=max_iters,
    gamma=gamma,
)
print("Cross-validated loss for mean_squared_error_gd:", cv_loss_gd)

In [None]:
# Code I wrote earlier, back when this lived in the run script.
# The approach is different now: cross-validation is run from the crossvalidation notebook.


# ***************** Cross validation ***************** #

# Flexible cross-validation of whichever function from implementations you want to validate:
# the arguments are dynamic, and you switch the function being validated by changing the function name.

# 0) Shared settings for the cross-validation runs below
# Number of iterations, learning rate and regularization parameter, in that order.
max_iters, gamma, lambda_ = 100, 0.1, 0.1
# Zero initial weights, sized from the second dimension of x_train.
initial_w = np.zeros(x_train.shape[1])

# 1) mean_squared_error_gd: 5-fold cross-validation with the shared settings
cv_loss_gd = cross_validate(
    model_fn=mean_squared_error_gd,
    X=x_train,
    y=y_train,
    k=5,
    initial_w=initial_w,
    max_iters=max_iters,
    gamma=gamma,
)
print("Cross-validated loss for mean_squared_error_gd:", cv_loss_gd)

# 2) mean_squared_error_sgd: 5-fold cross-validation with the shared settings
cv_loss_sgd = cross_validate(model_fn=mean_squared_error_sgd, X=x_train, y=y_train, k=5, initial_w=initial_w, max_iters=max_iters, gamma=gamma)
# Report the SGD result itself; the GD loss (cv_loss_gd) was printed here by mistake.
print("Cross-validated loss for mean_squared_error_sgd:", cv_loss_sgd)

# 3) least_squares: no hyperparameters, so only the data and k are passed.
# The targets go in as lowercase `y`, the keyword every other active
# cross_validate call in this notebook uses (this call previously used `Y`).
cv_loss_ls = cross_validate(model_fn=least_squares, X=x_train, y=y_train, k=5)
print("Cross-validated loss for least_squares:", cv_loss_ls)

# 4) ridge_regression
#lambda_ = 0.1
#cv_loss_rr = cross_validate(ridge_regression, X=x_train, Y=y_train, k=5, lambda_=lambda_)
#print("Cross-validated loss for ridge_regression:", cv_loss_rr)

# 5) logistic_regression: 5-fold cross-validation with the shared settings
cv_loss_lr = cross_validate(
    model_fn=logistic_regression, X=x_train, y=y_train, k=5,
    initial_w=initial_w, max_iters=max_iters, gamma=gamma,
)
print("Cross-validated loss for logistic_regression:", cv_loss_lr)

# 6) reg_logistic_regression: same settings as above plus the regularization parameter
cv_loss_rlr = cross_validate(
    model_fn=reg_logistic_regression, X=x_train, y=y_train, k=5,
    initial_w=initial_w, max_iters=max_iters, gamma=gamma, lambda_=lambda_,
)
print("Cross-validated loss for reg_logistic_regression:", cv_loss_rlr)

# 4) Ridge regression: cross-validate and tune the regularization parameter.
# Only lambda_ is searched. The ridge_regression entry of this notebook's
# param_grid and the commented-out ridge call above both use lambda_ alone, so
# the gamma that was previously searched and forwarded here is dropped.
lambda_values = [0.01, 0.1, 1, 10]

best_score = float('inf')
best_params = {}

# Grid search over lambda_. The loop variable has its own name so the shared
# `gamma` / `lambda_` settings defined earlier are not overwritten.
for lambda_candidate in lambda_values:
    cv_loss_rr = cross_validate(model_fn=ridge_regression, X=x_train, y=y_train, k=5, lambda_=lambda_candidate)
    print(f"Cross-validated loss with lambda_={lambda_candidate}: {cv_loss_rr}")

    # Keep the candidate with the lowest cross-validated loss
    if cv_loss_rr < best_score:
        best_score = cv_loss_rr
        best_params = {'lambda_': lambda_candidate}

print(f"The best parameters: {best_params} yield a cross-validated loss of: {best_score}")


# LOOCV: To perform Leave-One-Out Cross-Validation --> just call cross_validate with k=len(y).