In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold

In [9]:
# Candidate base learners, keyed by the short name that also indexes
# `param_grid`. Both estimators use a fixed random_state of 42.
base_classifiers = dict(
    rf=RandomForestClassifier(n_jobs=-1, random_state=42),
    # probability=True asks the SVM to also fit probability estimates.
    svm=SVC(random_state=42, probability=True),
    # Disabled candidates (kept for reference):
    # xgb=xgb.XGBClassifier(random_state=42, n_jobs=-1),
    # gb=GradientBoostingClassifier(random_state=42),
)

# Hyper-parameter search spaces handed to GridSearchCV. The top-level keys
# must match the names used in `base_classifiers`.
param_grid = dict(
    # Random forest grid.
    rf=dict(
        n_estimators=[100, 200, 300],
        max_depth=[10, 20, 30],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
        bootstrap=[True, False],
    ),
    # Support vector machine grid.
    svm=dict(
        C=[0.1, 1, 10, 100],
        # Explicit numeric values plus the two named settings.
        gamma=[1, 0.1, 0.01, 0.001, 'scale', 'auto'],
        kernel=['rbf', 'poly', 'sigmoid'],
    ),
    # Disabled grid for the gradient-boosting candidate (kept for reference):
    # gb=dict(
    #     n_estimators=[100, 200, 300],
    #     learning_rate=[0.1, 0.05, 0.02, 0.01],
    #     max_depth=[4, 6, 8],
    #     min_samples_leaf=[20, 50, 100],
    #     max_features=[1.0, 0.3, 0.1],
    # ),
)

# Module-level registry holding the tuned estimators of the most recent
# `train_base_models` run, so other cells can read them by name.
best_models = {}


def train_base_models(X_train, y_train):
    """Tune every base classifier with GridSearchCV and return the winners.

    Each estimator in `base_classifiers` is searched over the grid stored
    under the same key in `param_grid`, using 5-fold stratified
    cross-validation scored by accuracy.

    Parameters
    ----------
    X_train : array-like
        Training features, passed unchanged to ``GridSearchCV.fit``.
    y_train : array-like
        Training labels, passed unchanged to ``GridSearchCV.fit``.

    Returns
    -------
    dict
        A new dict (one per call) mapping classifier name to the
        ``best_estimator_`` found by its grid search. The module-level
        `best_models` registry is updated with the same estimators, but the
        returned dict is a separate object, so results kept from an earlier
        call are not overwritten by a later one.

    Raises
    ------
    KeyError
        If a classifier in `base_classifiers` has no entry in `param_grid`.
        This is checked before any fitting starts so that no grid search is
        run only to fail on a later model.
    """
    # Fail fast: a missing grid would otherwise surface only after the
    # earlier (potentially very long) searches had already completed.
    missing = [name for name in base_classifiers if name not in param_grid]
    if missing:
        raise KeyError(f"No hay rejilla de parámetros para: {missing}")

    print("Entrenando modelos base...")

    # The splitter is identical for every model, so build it once.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Drop entries left over from a previous run so the registry never mixes
    # estimators fitted on different data or for removed classifiers.
    best_models.clear()
    trained = {}

    for name, model in base_classifiers.items():
        print(f"\nOptimizando {name}...")

        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grid[name],
            cv=cv,
            n_jobs=-1,
            scoring='accuracy',
            verbose=1
        )
        grid_search.fit(X_train, y_train)

        # Record the winner both in the per-call result and in the registry.
        trained[name] = grid_search.best_estimator_
        best_models[name] = grid_search.best_estimator_

        print(f"Mejores parámetros para {name}: {grid_search.best_params_}")
        print(f"Mejor puntuación para {name}: {grid_search.best_score_:.4f}")

    return trained

In [10]:
def main():
    """Load the word-feature data, tune the base models and score them.

    Reads ``csv/word_features.csv``, uses the ``palabra`` column as the
    target and every other column as features, and makes a stratified 80/20
    train/test split. The scaler is fitted on the training split only and
    then applied to the test split, so no test statistics leak into the
    preprocessing. After tuning, each model's accuracy on the held-out test
    split is printed.

    Returns
    -------
    tuple
        ``(best_models, scaler)``: the dict of tuned estimators returned by
        `train_base_models`, and the ``StandardScaler`` fitted on the
        training split (needed to transform any new data the same way).
    """
    data = pd.read_csv('csv/word_features.csv')
    X = data.drop(columns='palabra')
    y = data['palabra']

    # Split the data, keeping the class proportions in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Scale the data: fit on the training split only, reuse on the test split.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Build and train the models.
    best_models = train_base_models(X_train_scaled, y_train)

    # Report generalisation performance on data never seen during tuning;
    # for classifiers, `score` returns mean accuracy.
    for name, model in best_models.items():
        test_score = model.score(X_test_scaled, y_test)
        print(f"Puntuación en test para {name}: {test_score:.4f}")

    return best_models, scaler


if __name__ == "__main__":
    main()

Entrenando modelos base...

Optimizando rf...
Fitting 5 folds for each of 162 candidates, totalling 810 fits
Mejores parámetros para rf: {'bootstrap': False, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Mejor puntuación para rf: 0.9575

Optimizando svm...
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Mejores parámetros para svm: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Mejor puntuación para svm: 0.9706
