In [1]:
#load libraries and previous functions:
using Downloads
using DelimitedFiles
using Plots
using MLJ
using MLJModels
using MLJMultivariateStatsInterface
using MLJLinearModels
using MLJDecisionTreeInterface
using MLJNaiveBayesInterface
using MLJLIBSVMInterface
using Statistics
using Flux
using Flux: Losses
using Printf
using Random
using NearestNeighborModels
using CSV
using DataFrames
using DataFramesMeta
import MultivariateStats
include("unit2-multilayer-perceptron.jl")
include("unit3-overfitting.jl")
include("unit4-metrics.jl")
include("unit5-crossvalidation.jl")
include("unit6-modelcrossvalidation.jl")
include("preprocess_utils.jl")

universalCrossValidation_PCA

In [2]:
"""
    stratified_holdOut(y, p_val, p_test)

    Split the indices, while keeping the proportions of the classes in "y" for each set
    Also seed is fixed by default, as for reproducibility
"""
function stratified_holdOut(targets::AbstractVector, p_val::Real, p_test::Real; seed::Int=1234) 
    Random.seed!(seed)
    N = length(targets)
    classes = unique(targets)
    
    # Final Indices
    idx_train = Int[]
    idx_val   = Int[]
    idx_test  = Int[]
    
    # Loop iterating over all classes
    for c in classes
        # Get the index for a given class
        idx_class = findall(x -> x == c, targets)
        n_class = length(idx_class)
        
        # Shuffle inside the class
        shuffle!(idx_class)
        
        # Measure how many are going into each set
        n_val  = round(Int, n_class * p_val)
        n_test = round(Int, n_class * p_test)
        n_train = n_class - n_val - n_test
        
        # Distibute the data
        
        append!(idx_val,   idx_class[1:n_val])
        append!(idx_test,  idx_class[n_val+1 : n_val+n_test])
        append!(idx_train, idx_class[n_val+n_test+1 : end])
    end
    
    # Shuffle final indices, avoiding order by class
    shuffle!(idx_train)
    shuffle!(idx_val)
    shuffle!(idx_test)
    
    return idx_train, idx_val, idx_test
end



"""
    load_and_clean_data(path::String)

Carga el dataset de enfermedades cardíacas, gestiona los valores nulos según
la lógica del 'Approach 1' (imputación categórica + eliminación de filas numéricas)
y devuelve los datos listos junto con los metadatos de las columnas.

# Returns
- `data`: DataFrame limpio.
- `num_col`: Vector de símbolos con columnas numéricas.
- `cat_col`: Vector de símbolos con columnas categóricas.
- `target_col`: Símbolo de la columna objetivo.
"""
function load_and_clean_data(path::String)
    println(">>> Loading data from: $path")
    
    # 1. Load data
    data = DataFrame()
    try
        data = CSV.read(path, DataFrame)
    catch e
        error("Error while loading file. Check path.\nDetails: $e")
    end

    # 2. Drop irrelevant features
    select!(data, Not([:id, :dataset])) 

    println("  Original Size: $(size(data))")

    # ---------------------------------------------------------
    # 3. Mange Nulls (CAT) -> "missingval"
    # ---------------------------------------------------------
    cat_col_null = [:fbs, :restecg, :exang, :slope, :thal, :ca]
    
    for col in cat_col_null
        
        data[!, col] = replace(data[!, col], missing => "missingval")
    end
    println(" Categorical Null values replaced with ---> 'missingval'.")

    # ---------------------------------------------------------
    # 4. manage Nulls (NUM) -> Drop rows if null < given percentage
    # ---------------------------------------------------------
    
    rows = nrow(data)
    desc_df = describe(data, :nmissing)
    aux_miss = DataFrame(
        Column = desc_df.variable, 
        Percent = (desc_df.nmissing ./ rows) .* 100
    )

    #  Drop rows if null < given percentage
    cols_to_clean_df = @rsubset(aux_miss, 0 < :Percent < 7.5)
    cols_to_clean = Symbol.(cols_to_clean_df.Column)

    if !isempty(cols_to_clean)
        dropmissing!(data, cols_to_clean)
        println("  Deleted rows in features: $cols_to_clean")
    end

    # ---------------------------------------------------------
    # 5. Final sort
    # ---------------------------------------------------------
    num_col = [:age, :trestbps, :chol, :thalch, :oldpeak]
    cat_col = [:sex, :cp, :fbs, :restecg, :exang, :slope, :ca, :thal]
    target_col = :num
    # disallowmissing, transform clean features to single types
    data = select(data, num_col, cat_col, target_col)
    disallowmissing!(data) 

    println("  Final shape: $(size(data))")
    println("------------------------")

    return data, num_col, cat_col, target_col
end


""" 
    check_class_distribution(y_train, y_val, y_test)

    Auxiliar function to check the class distribution and verify stratification
"""
function check_class_distribution(y_train, y_val, y_test)
    
    all_classes = sort(unique(vcat(y_train, y_val, y_test)))
    
    println("\n--- Class Distribution Analisys ---")
    @printf("%-10s | %-15s | %-15s | %-15s\n", "Class", "Train % (N)", "Val % (N)", "Test % (N)")
    println("-"^65)
    
    for c in all_classes
        
        n_tr = count(x -> x == c, y_train)
        n_val = count(x -> x == c, y_val)
        n_te = count(x -> x == c, y_test)
        
        p_tr = (n_tr / length(y_train)) * 100
        p_val = (n_val / length(y_val)) * 100
        p_te = (n_te / length(y_test)) * 100
        
        @printf("Class %d    | %5.2f%% (%3d)    | %5.2f%% (%3d)    | %5.2f%% (%3d)\n", 
                c, p_tr, n_tr, p_val, n_val, p_te, n_te)
    end
    println("-"^65)
    println("Total |         %-6d    |         %-6d    |         %-6d\n", 
            length(y_train), length(y_val), length(y_test))
end







"""
    prepare_data(clean_data, num_col, cat_col, target_col; ...)

    Take a clean DataFrame without NULL values and prepare the data to feed the models:
    1. Split Train/Validation/Test (stratified_holdOut)
    2. Split X/Y, input and output 
    3. Normalization (MinMax or Z-score) numerical features
    4. One-Hot Encoding Categorical features
    5. Combine the matrices
    6. Process the target Y (MLJ.categorical and OHE)
"""
function prepare_data(clean_data::DataFrame, 
                                    num_col::Vector{Symbol}, #name of the numerical features
                                    cat_col::Vector{Symbol}, #name of the categorical features
                                    target_col::Symbol; #name of the target feature
                                    Pval::Real=0.15, #percent for split  val set
                                    Ptest::Real=0.15, #percent for split test set
                                    norm_method::Symbol=:minmax) #normalization method, either :minmax or :zscore
    
    println("\n--- init Preprocess ---")
    println("   Normalization: $norm_method")

    # --- 1. Data Split (HoldOut) ---
    rows, columns = size(clean_data)
    N = rows
    
    (train_indices, val_indices, test_indices) = stratified_holdOut(data[!, target_col], Pval, Ptest; seed = 1234)
    
    train_data = clean_data[train_indices, :]
    val_data = clean_data[val_indices, :]
    test_data = clean_data[test_indices, :]
    println("    Stratigfied HoldOut split: $(size(train_data,1)) train, $(size(val_data,1)) val, $(size(test_data,1)) test")

    # --- 2. Features/Target Split ---
    x_train_df = select(train_data, Not(target_col))
    y_train_vec = train_data[!, target_col]
    x_val_df = select(val_data, Not(target_col))
    y_val_vec = val_data[!, target_col]
    x_test_df = select(test_data, Not(target_col))
    y_test_vec = test_data[!, target_col]

    # --- 3. Normalization of numerical features ---
    println("    Normalizing numerical features...")
    x_train_num_mat = Matrix{Float64}(x_train_df[!, num_col])
    x_test_num_mat = Matrix{Float64}(x_test_df[!, num_col])
    x_val_num_mat = Matrix{Float64}(x_val_df[!, num_col])
    
    norm_param = nothing #Init the variable 

    if norm_method == :minmax
        norm_param = calculateMinMaxNormalizationParameters(x_train_num_mat)
        normalizeMinMax!(x_train_num_mat, norm_param)
        normalizeMinMax!(x_test_num_mat, norm_param)
        normalizeMinMax!(x_val_num_mat, norm_param)
    elseif norm_method == :zscore
        norm_param = calculateZeroMeanNormalizationParameters(x_train_num_mat)
        normalizeZeroMean!(x_train_num_mat, norm_param)
        normalizeZeroMean!(x_test_num_mat, norm_param)
        normalizeZeroMean!(x_val_num_mat, norm_param)
    else
        error("Normalization method not clear: '$norm_method' . Use :minmax or :zscore.")
    end
    println("    ...Normalization completed.")

    # --- 4. One-Hot Encoding Categorial features ---
    println("    Encoding categorical features (OHE)...")
    
    x_train_cat_mat = BitArray{2}(undef, size(x_train_df, 1), 0)
    x_test_cat_mat  = BitArray{2}(undef, size(x_test_df, 1), 0)
    x_val_cat_mat = BitArray{2}(undef, size(x_val_df, 1), 0)
    
    ohe_classes_map = Dict{Symbol, Vector{Any}}() # Store classes

    for col in cat_col
        feature_train = x_train_df[!, col]
        feature_test  = x_test_df[!, col]
        feature_val = x_val_df[!, col]
        
        learn_classes = unique(feature_train)
        
        # Manage unseen clases, ex. if missingval is only present in tets and validation due to the randomness in split
        for val in unique(vcat(feature_test, feature_val))
            if !(val in learn_classes)
                push!(learn_classes, val)
                println("        -> Warning: Feature '$col': Class '$val' aded (Not present in train).")
            end
        end
        
        ohe_classes_map[col] = learn_classes # Save the classes
        
        encoded_train = oneHotEncoding(feature_train, learn_classes)
        encoded_test  = oneHotEncoding(feature_test, learn_classes)
        encoded_val   = oneHotEncoding(feature_val, learn_classes)
        
        x_train_cat_mat = hcat(x_train_cat_mat, encoded_train)
        x_test_cat_mat  = hcat(x_test_cat_mat, encoded_test)
        x_val_cat_mat   = hcat(x_val_cat_mat, encoded_val) 
    end
    println("    ...OHE completed.")

    # --- 5. Combine the matrices ---
    println("    Concatenate numerical and categorical matrices...")
    x_train_final = hcat(x_train_num_mat, x_train_cat_mat)
    x_test_final  = hcat(x_test_num_mat, x_test_cat_mat)
    x_val_final = hcat(x_val_num_mat, x_val_cat_mat)
    
    # --- 6. Process the targets ---
    target_classes = sort(unique(clean_data[!, target_col]))
    println("    Classes stored for the target: $target_classes")
    
    # (SVM, DT, kNN)
    y_train_cat = MLJ.categorical(y_train_vec)
    y_test_cat = MLJ.categorical(y_test_vec)
    y_val_cat = MLJ.categorical(y_val_vec)
    
    # For ANN (OHE)
    y_train_ohe = oneHotEncoding(y_train_vec, target_classes)
    y_test_ohe  = oneHotEncoding(y_test_vec, target_classes)
    y_val_ohe   = oneHotEncoding(y_val_vec, target_classes)
    
    println("--- PREPROCESS END SUCCESFULLY ---")

    # --- 7. Return data ---
    return (
        x_train = x_train_final,
        y_train_cat = y_train_cat, # For MLJ
        y_train_ohe = y_train_ohe, # For ANN
        
        x_val = x_val_final,
        y_val_cat = y_val_cat,     # For MLJ
        y_val_ohe = y_val_ohe,     # For ANN
        
        x_test = x_test_final,
        y_test_cat = y_test_cat,   # For MLJ
        y_test_ohe = y_test_ohe,   # For ANN
        
        norm_params = norm_param,
        ohe_classes = ohe_classes_map
    )
end




prepare_data

In [3]:
# -----------------------------------------------------------------
# (Aproach 1) minmax
# -----------------------------------------------------------------

data_path = "heart_disease_uci.csv"

data, num_col, cat_col, target_col = load_and_clean_data(data_path)

println("\n Init aproach 1 (MinMax)...")
approach_1 = prepare_data(
    data,          # Clean DataFrame without Nulls
    num_col,       # numerical features
    cat_col,       # caetgorical features
    target_col,    # target feature
    norm_method=:minmax #norm metghod, either :minmax or :zscore
)

println("\n--- Approach 1 ---")
println("To acces data:")
println("approach_1.x_train")
println("approach_1.y_train_cat (for SVM/DT/kNN)")
println("approach_1.y_train_ohe (for ANN)")
println("...")

>>> Loading data from: heart_disease_uci.csv
  Original Size: (920, 14)
 Categorical Null values replaced with ---> 'missingval'.
  Deleted rows in features: [:trestbps, :chol, :thalch, :oldpeak]
  Final shape: (827, 14)
------------------------

 Init aproach 1 (MinMax)...

--- init Preprocess ---
   Normalization: minmax
    Stratigfied HoldOut split: 577 train, 125 val, 125 test
    Normalizing numerical features...
    ...Normalization completed.
    Encoding categorical features (OHE)...
    ...OHE completed.
    Concatenate numerical and categorical matrices...
    Classes stored for the target: [0, 1, 2, 3, 4]
--- PREPROCESS END SUCCESFULLY ---

--- Approach 1 ---
To acces data:
approach_1.x_train
approach_1.y_train_cat (for SVM/DT/kNN)
approach_1.y_train_ohe (for ANN)
...


In [4]:
#acceder a los datos del approach 1:
x_train = approach_1.x_train
x_val   = approach_1.x_val
x_test  = approach_1.x_test
y_train = approach_1.y_train_cat
y_val   = approach_1.y_val_cat
y_test  = approach_1.y_test_cat;

In [5]:
#check_class_distribution(y_train, y_val, y_test)

In [6]:
# -----------------------------------------------------------------
# (Aproach 2) PCA and z-score 
# -----------------------------------------------------------------
using MLJMultivariateStatsInterface
PCA = @load PCA pkg=MultivariateStats

data_path = "heart_disease_uci.csv"

data, num_col, cat_col, target_col = load_and_clean_data(data_path)

approach_2 = prepare_data(
    data,         
    num_col,       
    cat_col,       
    target_col,    
    norm_method=:zscore 
)

println("\n---data preprocessed---")

println("\n---Init PCA transformation---")


#Unpack variables for MLJ
x_train = approach_2.x_train
x_val = approach_2.x_val
x_test = approach_2.x_test

y_train_pca = approach_2.y_train_cat 
y_val_pca = approach_2.y_val_cat     
y_test_pca = approach_2.y_test_cat     

# Combine Train + Val (to adjust PCA) for models != ANN, for ANN take this into account
x_train_val_combined = vcat(x_train, x_val)
y_train_val_combined = vcat(y_train, y_val)

println(" Train set size: ", size(x_train_val_combined))


# Use PCA to select the components that explain 95% of the variance
pca_model = PCA(variance_ratio=0.95)

#1 Adjust the PCA only with the training data
pca_machine = machine(pca_model, MLJ.table(x_train_val_combined))
MLJ.fit!(pca_machine, verbosity=0)

#2 transform data
x_train_val_pca = MLJ.transform(pca_machine, MLJ.table(x_train_val_combined))

x_test_pca = MLJ.transform(pca_machine, MLJ.table(x_test))

#For MLJ is better to pass the data as table
#To see data as matrix use: mat_train_pca = MLJ.matrix(x_train_val_pca)
# Para ver los datos transformados como matriz:
mat_train_pca = MLJ.matrix(x_train_val_pca)
println("Train set size: after PCA: ", size(mat_train_pca))

"""
#Example of use for ANN

println(" Train set size: ", size(x_train))


# Use PCA to select the components that explain 95% of the variance
pca_model = PCA(variance_ratio=0.95)

#1 Adjust the PCA only with the training data
pca_machine = machine(pca_model, MLJ.table(x_train))
MLJ.fit!(pca_machine, verbosity=0)

#2 transform data
x_train_pca = MLJ.transform(pca_machine, MLJ.table(x_train))
x_val_pca = MLJ.transform(pca_machine, MLJ.table(x_val))
x_test_pca = MLJ.transform(pca_machine, MLJ.table(x_test))

#For MLJ is better to pass the data as table
#To see data as matrix use: mat_train_pca = MLJ.matrix(x_train_val_test)
"""

import MLJMultivariateStatsInterface ✔
>>> Loading data from: heart_disease_uci.csv
  Original Size: (920, 14)
 Categorical Null values replaced with ---> 'missingval'.
  Deleted rows in features: [:trestbps, :chol, :thalch, :oldpeak]
  Final shape: (827, 14)
------------------------

--- init Preprocess ---
   Normalization: zscore
    Stratigfied HoldOut split: 577 train, 125 val, 125 test
    Normalizing numerical features...
    ...Normalization completed.
    Encoding categorical features (OHE)...
    ...OHE completed.
    Concatenate numerical and categorical matrices...
    Classes stored for the target: [0, 1, 2, 3, 4]
--- PREPROCESS END SUCCESFULLY ---

---data preprocessed---

---Init PCA transformation---


[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mFor silent loading, specify `verbosity=0`. 


 Train set size: (702, 31)
Train set size: after PCA: (702, 17)


"#Example of use for ANN\n\nprintln(\" Train set size: \", size(x_train))\n\n\n# Use PCA to select the components that explain 95% of the variance\npca_model = PCA(variance_ratio=0.95)\n\n#1 Adjust the PCA only with the training data\npca_machine = machine(pca_model, MLJ.table(x_tra"[93m[1m ⋯ 84 bytes ⋯ [22m[39m"rm(pca_machine, MLJ.table(x_train))\nx_val_pca = MLJ.transform(pca_machine, MLJ.table(x_val))\nx_test_pca = MLJ.transform(pca_machine, MLJ.table(x_test))\n\n#For MLJ is better to pass the data as table\n#To see data as matrix use: mat_train_pca = MLJ.matrix(x_train_val_test)\n"

In [7]:
#check_class_distribution(y_train_pca, y_val_pca, y_test_pca)

In [8]:
# -----------------------------------------------------------------
# (Aproach 3) ICA and z-score (ICA just in numerical features , if we use it with the caegorical ones it wouldn't find a solution) 
# -----------------------------------------------------------------
using MLJMultivariateStatsInterface
ICA = @load ICA pkg=MultivariateStats

data_path = "heart_disease_uci.csv"

data, num_col, cat_col, target_col = load_and_clean_data(data_path)


println("\nInit approach 3, ICA (Numerical features only)...")

# 1. Preprocess as previous aproach ( Z-Score, ideal for ICA
approach_ica = prepare_data(
    data,          
    num_col,       
    cat_col,       
    target_col,    
    norm_method=:zscore 
)

# 2. Unpack results
x_train = approach_ica.x_train
x_val = approach_ica.x_val
x_test = approach_ica.x_test

y_train_ica = approach_ica.y_train_cat 
y_val_ica = approach_ica.y_val_cat     
y_test_ica = approach_ica.y_test_cat;

# Our function  'prepare_data'order first numerical fetures and then categorical ones.

n_num = length(num_col) # Should be (age, trestbps, chol, thalch, oldpeak)

#Split 
x_num_train = x_train[:, 1:n_num]      # Just numerical 
x_cat_train = x_train[:, n_num+1:end]  # Just categorical OHE

x_num_val = x_val[:, 1:n_num]
x_cat_val = x_val[:, n_num+1:end]
x_num_test = x_test[:, 1:n_num]
x_cat_test = x_test[:, n_num+1:end]


# --- ICA just for numerical ---

# k should be less or equal than umber of features (5)
k_components = 2

#Random.seed!(1234)#ICA is a no deterministic method so we fix the seed for reproducibility. But somehow fail 
# Give some tolerance for the solution
ica_model = ICA(outdim=k_components, maxiter=100000, tol=0.2) 

println(" ICA with k=$k_components ...")

# Fit only the numerical data from training set, for ANN
ica_machine = machine(ica_model, MLJ.table(x_num_train))
MLJ.fit!(ica_machine, verbosity=1) # verbosity=1 for debug

"""
#for models != ANN:
x_train_val_num=vcat(x_num_train, x_num_val)
#fit on both, training and validation
ica_machine = machine(ica_model, MLJ.table(x_train_val_num))
MLJ.fit!(ica_machine, verbosity=1) # verbosity=1 for debug
"""

# Transform and return to matrix
x_num_train_ica = MLJ.transform(ica_machine, MLJ.table(x_num_train))
x_num_val_ica  = MLJ.transform(ica_machine, MLJ.table(x_num_val))
x_num_test_ica  = MLJ.transform(ica_machine, MLJ.table(x_num_test))


mat_train_ica = MLJ.matrix(x_num_train_ica)
mat_val_ica  = MLJ.matrix(x_num_val_ica)
mat_test_ica  = MLJ.matrix(x_num_test_ica)

#Add the categorical OHE 
x_train_ica = hcat(mat_train_ica, x_cat_train)
x_val_ica = hcat(mat_val_ica, x_cat_val)
x_test_ica     = hcat(mat_test_ica, x_cat_test)

println("Final shape (ICA Nums + OHE Cats): ", size(x_train_ica))
println("\n Results for approach 3 stored in:")
println("x_train_ica")
println("x_val_ica")
println("x_test_ica")
println("y_train_ica")
println("y_val_ica")
println("y_test_ica")

import MLJMultivariateStatsInterface ✔
>>> Loading data from: heart_disease_uci.csv
  Original Size: (920, 14)
 Categorical Null values replaced with ---> 'missingval'.
  Deleted rows in features: [:trestbps, :chol, :thalch, :oldpeak]
  Final shape: (827, 14)
------------------------

Init approach 3, ICA (Numerical features only)...

--- init Preprocess ---
   Normalization: zscore
    Stratigfied HoldOut split: 577 train, 125 val, 125 test
    Normalizing numerical features...
    ...Normalization completed.
    Encoding categorical features (OHE)...
    ...OHE completed.
    Concatenate numerical and categorical matrices...
    Classes stored for the target: [0, 1, 2, 3, 4]
--- PREPROCESS END SUCCESFULLY ---
 ICA with k=2 ...


[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mFor silent loading, specify `verbosity=0`. 
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(ICA(outdim = 2, …), …).


Final shape (ICA Nums + OHE Cats): (577, 28)

 Results for approach 3 stored in:
x_train_ica
x_val_ica
x_test_ica
y_train_ica
y_val_ica
y_test_ica


In [9]:
# -----------------------------------------------------------------
# EXample of usage with models that don't use val set.
# -----------------------------------------------------------------

models_final = Dict(
    "SVM" => SVC(cost=10.0), 
    "LR"  => LogisticClassifier(),
    "DT"  => DecisionTreeClassifier(max_depth=4), 
)

x_train_val = vcat(x_train_ica, x_val_ica)
y_train_val= vcat(y_train_ica, y_val_ica)
println("\nResultados tras ICA:")
@time begin
    for (name, model) in models_final
        # Entrenamos con la matriz reconstruida
        mach = machine(model, MLJ.table(x_train_val), y_train_val)
        MLJ.fit!(mach, verbosity=0)
        
        # Predecimos
        ŷ = MLJ.predict(mach, MLJ.table(x_test_ica))
        
        if name != "SVM"
            ŷ = mode.(ŷ)
        end

        acc = MLJ.accuracy(ŷ, y_test_ica)
        println("$name: $(round(acc*100, digits=2)) %")
    end
end


Resultados tras ICA:
SVM: 55.2 %
LR: 56.8 %
DT: 56.0 %
 14.752198 seconds (49.01 M allocations: 2.405 GiB, 3.77% gc time, 99.59% compilation time)


In [10]:
# -----------------------------------------------------------------
# (Aproach 4) Same as approach 1 with corssvalidation 
# -----------------------------------------------------------------
data_path = "heart_disease_uci.csv"
data, num_col, cat_col, target_col = load_and_clean_data(data_path)
Random.seed!(1234)
#---split training data and final test data---
Pval = 0.15
Ptest = 0.15
rows, columns = size(data)
N = rows
(train_indices, val_indices, test_indices) = stratified_holdOut(data[!,target_col], Pval, Ptest)
train_data = data[train_indices, :]
val_data = data[val_indices, :]
dev_data = vcat(train_data, val_data)
test_data = data[test_indices, :]
println("  Data split: $(size(dev_data,1)) dev(85%), $(size(test_data,1)) test(15%)")

#---split for crossvalidation---
dev_num = select(dev_data, num_col)
dev_cat = select(dev_data, cat_col)
dev_targets = dev_data[!, target_col];

#---make cv indices---
k_folds=5 #numebr of folds, set to 5 as our dataset is small
cv_indices = crossvalidation(dev_targets, k_folds);
println("Indices generated for $k_folds stratified folds.")

>>> Loading data from: heart_disease_uci.csv
  Original Size: (920, 14)
 Categorical Null values replaced with ---> 'missingval'.
  Deleted rows in features: [:trestbps, :chol, :thalch, :oldpeak]
  Final shape: (827, 14)
------------------------
  Data split: 702 dev(85%), 125 test(15%)
Indices generated for 5 stratified folds.


In [11]:
using DataFrames, Statistics

println("--- Iniciando Grid Search Masivo (Approach 4) ---")

# 1. DEFINICIÓN DE HIPERPARÁMETROS A PROBAR
# (Cumpliendo requisitos de la práctica)

# A. SVM (Mínimo 8 configuraciones: Kernels y C variados)
configs_svm = [
    (:SVC, Dict(:kernel => "rbf", :C => 0.1)),
    (:SVC, Dict(:kernel => "rbf", :C => 1.0)),
    (:SVC, Dict(:kernel => "rbf", :C => 10.0)),
    (:SVC, Dict(:kernel => "rbf", :C => 100.0)),
    (:SVC, Dict(:kernel => "linear", :C => 0.1)),
    (:SVC, Dict(:kernel => "linear", :C => 1.0)),
    (:SVC, Dict(:kernel => "poly", :degree => 2, :C => 1.0)),
    (:SVC, Dict(:kernel => "sigmoid", :C => 1.0))
]

# B. Decision Tree (Mínimo 6 profundidades)
configs_dt = [
    (:DT, Dict(:max_depth => 3)),
    (:DT, Dict(:max_depth => 5)),
    (:DT, Dict(:max_depth => 7)),
    (:DT, Dict(:max_depth => 9)),
    (:DT, Dict(:max_depth => 12)),
    (:DT, Dict(:max_depth => -1)) # Sin límite
]

# C. k-NN (Mínimo 6 valores de k)
configs_knn = [
    (:KNN, Dict(:K => 1)),
    (:KNN, Dict(:K => 3)),
    (:KNN, Dict(:K => 5)),
    (:KNN, Dict(:K => 7)),
    (:KNN, Dict(:K => 15)),
    (:KNN, Dict(:K => 31)) # k alto para ver si suaviza demasiado
]

# D. ANN (Mínimo 8 arquitecturas)
# Variamos capas (1 o 2 ocultas) y neuronas
configs_ann = [
    # 1 Capa Oculta
    (:ANN, Dict(:topology => [5], :learningRate => 0.01)),
    (:ANN, Dict(:topology => [10], :learningRate => 0.01)),
    (:ANN, Dict(:topology => [32], :learningRate => 0.01)),
    (:ANN, Dict(:topology => [64], :learningRate => 0.005)),
    # 2 Capas Ocultas
    (:ANN, Dict(:topology => [10, 5], :learningRate => 0.01)),
    (:ANN, Dict(:topology => [16, 8], :learningRate => 0.01)),
    (:ANN, Dict(:topology => [32, 16], :learningRate => 0.005)),
    (:ANN, Dict(:topology => [64, 32], :learningRate => 0.001))
]

# Unimos todas las configuraciones en una sola lista
all_configs = vcat(configs_svm, configs_dt, configs_knn, configs_ann)

# 2. BUCLE DE EJECUCIÓN
results_grid = DataFrame(
    ModelType = String[], 
    Hyperparams = String[], 
    Mean_Accuracy = Float64[], 
    Std_Accuracy = Float64[]
)

println("Evaluando $(length(all_configs)) configuraciones...")

for (idx, (m_type, params)) in enumerate(all_configs)
    # Convertimos params a string para reporte
    param_str = string(params)
    
    print("[$idx/$(length(all_configs))] Probando $m_type con $param_str ... ")
    
    # LLAMADA A TU FUNCIÓN MAESTRA
    # (Asegúrate de que 'universalCrossValidation' esté cargada)
    mu, sigma = universalCrossValidation1(
        m_type, 
        params, 
        dev_num, 
        dev_cat, 
        dev_targets, 
        cv_indices
    )
    
    push!(results_grid, (string(m_type), param_str, mu * 100, sigma * 100))
    println("-> Acc: $(round(mu*100, digits=2))%")
end

# 3. RESULTADOS Y MEJOR MODELO
println("\n--- TOP 10 MEJORES MODELOS ---")
sort!(results_grid, :Mean_Accuracy, rev=true)
display(first(results_grid, 10))

# Extraer el ganador
best_row = results_grid[1, :]
println("\n GANADOR DEL APPROACH 4:")
println("Modelo: $(best_row.ModelType)")
println("Config: $(best_row.Hyperparams)")
println("Accuracy CV: $(round(best_row.Mean_Accuracy, digits=2))% ± $(round(best_row.Std_Accuracy, digits=2))")

--- Iniciando Grid Search Masivo (Approach 4) ---
Evaluando 28 configuraciones...
[1/28] Probando SVC con Dict{Symbol, Any}(:kernel => "rbf", :C => 0.1) ... -> Acc: 57.84%
[2/28] Probando SVC con Dict{Symbol, Any}(:kernel => "rbf", :C => 1.0) ... -> Acc: 56.56%
[3/28] Probando SVC con Dict{Symbol, Any}(:kernel => "rbf", :C => 10.0) ... -> Acc: 55.99%
[4/28] Probando SVC con Dict{Symbol, Any}(:kernel => "rbf", :C => 100.0) ... -> Acc: 54.56%
[5/28] Probando SVC con Dict{Symbol, Any}(:kernel => "linear", :C => 0.1) ... -> Acc: 57.84%
[6/28] Probando SVC con Dict{Symbol, Any}(:kernel => "linear", :C => 1.0) ... -> Acc: 56.56%
[7/28] Probando SVC con Dict{Symbol, Any}(:degree => 2, :kernel => "poly", :C => 1.0) ... -> Acc: 56.56%
[8/28] Probando SVC con Dict{Symbol, Any}(:kernel => "sigmoid", :C => 1.0) ... -> Acc: 56.56%
[9/28] Probando DT con Dict(:max_depth => 3) ... -> Acc: 55.69%
[10/28] Probando DT con Dict(:max_depth => 5) ... -> Acc: 56.55%
[11/28] Probando DT con Dict(:max_depth =

Row,ModelType,Hyperparams,Mean_Accuracy,Std_Accuracy
Unnamed: 0_level_1,String,String,Float64,Float64
1,SVC,"Dict{Symbol, Any}(:kernel => ""rbf"", :C => 0.1)",57.8413,1.26273
2,SVC,"Dict{Symbol, Any}(:kernel => ""linear"", :C => 0.1)",57.8413,1.26273
3,KNN,Dict(:K => 31),57.8352,1.73292
4,KNN,Dict(:K => 7),57.6943,0.57103
5,ANN,"Dict{Symbol, Any}(:topology => [10, 5], :learningRate => 0.01)",57.5648,4.01697
6,ANN,"Dict{Symbol, Any}(:topology => [32, 16], :learningRate => 0.005)",57.5607,3.9794
7,ANN,"Dict{Symbol, Any}(:topology => [5], :learningRate => 0.01)",57.4208,2.64739
8,ANN,"Dict{Symbol, Any}(:topology => [16, 8], :learningRate => 0.01)",57.2871,4.56485
9,ANN,"Dict{Symbol, Any}(:topology => [32], :learningRate => 0.01)",57.279,3.10975
10,ANN,"Dict{Symbol, Any}(:topology => [10], :learningRate => 0.01)",57.271,3.91323



 GANADOR DEL APPROACH 4:
Modelo: SVC
Config: Dict{Symbol, Any}(:kernel => "rbf", :C => 0.1)
Accuracy CV: 57.84% ± 1.26


**LA FUNCION QUE SE VE ARRIBA PARA EL GRID SEARCH SIRVE COMO REFERENCIA PARA ENCONTRAR EL QEU DEBERÍA SER EL MEJOR MODELO, UNA VEZ ENCONTRADO ESE MODELO HAY QUE REENTRENARLO DE CERO CON EL 85% DE LOS DATOS Y EVALUARLO EN EL TEST SET RESERVADO AL INICIO QUE AÚN NO SE UTILIZÓ, PARA EVITAR ASI FILTRAR INFORMACION DEL TEST SET AL MODELO**

In [12]:
# -----------------------------------------------------------------
# (Aproach 5) PCA with corssvalidation 
# -----------------------------------------------------------------
data_path = "heart_disease_uci.csv"
data, num_col, cat_col, target_col = load_and_clean_data(data_path)
Random.seed!(1234)
#---split training data and final test data---
Pval = 0.15
Ptest = 0.15
rows, columns = size(data)
N = rows
(train_indices, val_indices, test_indices) = stratified_holdOut(data[!,target_col], Pval, Ptest)
train_data = data[train_indices, :]
val_data = data[val_indices, :]
dev_data = vcat(train_data, val_data)
test_data = data[test_indices, :]
println("  Data split: $(size(dev_data,1)) dev(85%), $(size(test_data,1)) test(15%)")

#---split for crossvalidation---
dev_num = select(dev_data, num_col)
dev_cat = select(dev_data, cat_col)
dev_targets = dev_data[!, target_col];

#---make cv indices---
k_folds=5 #numebr of folds, set to 5 as our dataset is small
cv_indices = crossvalidation(dev_targets, k_folds);
println("Indices generated for $k_folds stratified folds.")

>>> Loading data from: heart_disease_uci.csv
  Original Size: (920, 14)
 Categorical Null values replaced with ---> 'missingval'.
  Deleted rows in features: [:trestbps, :chol, :thalch, :oldpeak]
  Final shape: (827, 14)
------------------------
  Data split: 702 dev(85%), 125 test(15%)
Indices generated for 5 stratified folds.


In [13]:
println("\n--- Massive Grid Search (Approach 5: PCA + CV) ---")

# Define Configurations
configs_pca = [
    # SVM (Kernel Linear often works well with PCA)
    (:SVC, Dict(:C => 1.0, :pca_components => 17)),
    (:SVC, Dict(:C => 10.0, :pca_components => 17)),
    (:SVC, Dict(:kernel => "linear", :C => 1.0, :pca_components => 17)),
    
    # Decision Tree
    (:DT,  Dict(:max_depth => 5, :pca_components => 17)),
    (:DT,  Dict(:max_depth => 8, :pca_components => 17)),
    
    # k-NN
    (:KNN, Dict(:K => 5, :pca_components => 17)),
    (:KNN, Dict(:K => 15, :pca_components => 17)),
    
    # ANN (Topologies adjusted for 17 inputs)
    (:ANN, Dict(:topology => [10], :learningRate => 0.01, :pca_components => 17)),
    (:ANN, Dict(:topology => [16, 8], :learningRate => 0.005, :pca_components => 17))
]

results_pca = DataFrame(Model=[], Params=[], Acc_Mean=[], Acc_Std=[])

for (m_type, params) in configs_pca
    p_str = string(params)
    print("Testing $m_type ... ")
    
    mu, sigma = universalCrossValidation_PCA(
        m_type, params, 
        dev_num, dev_cat, dev_targets, 
        cv_indices
    )
    
    push!(results_pca, (string(m_type), p_str, mu*100, sigma*100))
    println("-> Acc: $(round(mu*100, digits=2))%")
end

println("\n--- Ranking Approach 5 ---")
sort!(results_pca, :Acc_Mean, rev=true)
display(results_pca)


--- Massive Grid Search (Approach 5: PCA + CV) ---
Testing SVC ... -> Acc: 58.27%
Testing SVC ... -> Acc: 53.55%
Testing SVC ... -> Acc: 58.27%
Testing DT ... -> Acc: 52.41%
Testing DT ... -> Acc: 50.71%
Testing KNN ... -> Acc: 56.7%
Testing KNN ... -> Acc: 58.69%
Testing ANN ... -> Acc: 58.42%
Testing ANN ... -> Acc: 58.41%

--- Ranking Approach 5 ---


Row,Model,Params,Acc_Mean,Acc_Std
Unnamed: 0_level_1,Any,Any,Any,Any
1,KNN,"Dict(:K => 15, :pca_components => 17)",58.6944,2.06187
2,ANN,"Dict{Symbol, Any}(:topology => [10], :learningRate => 0.01, :pca_components => 17)",58.4167,2.30317
3,ANN,"Dict{Symbol, Any}(:topology => [16, 8], :learningRate => 0.005, :pca_components => 17)",58.4138,2.50284
4,SVC,"Dict{Symbol, Real}(:pca_components => 17, :C => 1.0)",58.2679,2.14098
5,SVC,"Dict{Symbol, Any}(:kernel => ""linear"", :pca_components => 17, :C => 1.0)",58.2679,2.14098
6,KNN,"Dict(:K => 5, :pca_components => 17)",56.7024,4.18508
7,SVC,"Dict{Symbol, Real}(:pca_components => 17, :C => 10.0)",53.5493,2.85826
8,DT,"Dict(:pca_components => 17, :max_depth => 5)",52.4074,3.44528
9,DT,"Dict(:pca_components => 17, :max_depth => 8)",50.7144,3.3973
