In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold

In [2]:
base_classifiers = {
            'rf': RandomForestClassifier(random_state=42, n_jobs=-1),
            'svm': SVC(probability=True, random_state=42),
            # 'xgb': xgb.XGBClassifier(random_state=42, n_jobs=-1),
            # 'gb': GradientBoostingClassifier(random_state=42)
        }

param_grid = {    
    'rf': {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False]
    },    
    'svm': {
        'C': [0.1, 1, 10, 100],
        'gamma': [1, 0.1, 0.01, 0.001, 'scale', 'auto'],
        'kernel': ['rbf', 'poly', 'sigmoid']
    },
    # 'gb' : {
    #     'n_estimators': [100, 200, 300],
    #     'learning_rate': [0.1, 0.05, 0.02, 0.01],
    #     'max_depth': [4, 6, 8],
    #     'min_samples_leaf': [20, 50,100],
    #     'max_features': [1.0, 0.3, 0.1] 
    # }
}

best_models = {}
def train_base_models(X_train, y_train):
    """Entrena y optimiza los modelos base usando GridSearchCV."""
    print("Entrenando modelos base...")
    
    for name, model in base_classifiers.items():
        print(f"\nOptimizando {name}...")
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        
        grid_search = GridSearchCV(
            estimator=model,
            param_grid= param_grid[name],
            cv=cv,
            n_jobs=-1,
            scoring='accuracy',
            verbose=1
        )
        
        grid_search.fit(X_train, y_train)
        best_models[name] = grid_search.best_estimator_
        
        print(f"Mejores parámetros para {name}: {grid_search.best_params_}")
        print(f"Mejor puntuación para {name}: {grid_search.best_score_:.4f}")
    
    return best_models

In [9]:
def main():
    data = pd.read_csv('../../csv/speaker_features.csv')
    X = data.drop('persona', axis=1)
    y = data['persona']
    
        # Dividir los datos
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    
    scaler = StandardScaler()
    
    # Escalar los datos
    X_train_scaled = scaler.fit_transform(X_train)
    
    # Crear y entrenar el modelo
    best_models = train_base_models(X_train_scaled, y_train)
    
if __name__ == "__main__":
    main()

Entrenando modelos base...

Optimizando rf...
Fitting 5 folds for each of 162 candidates, totalling 810 fits
Mejores parámetros para rf: {'bootstrap': False, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Mejor puntuación para rf: 0.9781

Optimizando svm...
Fitting 5 folds for each of 72 candidates, totalling 360 fits


ValueError: 
All the 360 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
360 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\diego\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\diego\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\diego\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\svm\_base.py", line 190, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\diego\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\diego\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\validation.py", line 1273, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "c:\Users\diego\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\validation.py", line 1059, in check_array
    _assert_all_finite(
  File "c:\Users\diego\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\validation.py", line 126, in _assert_all_finite
    _assert_all_finite_element_wise(
  File "c:\Users\diego\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\validation.py", line 175, in _assert_all_finite_element_wise
    raise ValueError(msg_err)
ValueError: Input X contains NaN.
SVC does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
