In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')


In [29]:
# 1. Cargar y preparar datos
def load_and_prepare_data():
    print("Cargando datos...")
    # Cargar datos
    df = pd.read_csv('datos_estandarizados.csv')

    # Separar features y variable objetivo
    X = df.drop(['custcat'], axis=1)
    y = df['custcat']

    # Convertir la variable objetivo a tipo categórico
    le = LabelEncoder()
    y = le.fit_transform(y)

    print("Dimensiones de los datos:")
    print(f"X shape: {X.shape}")
    print(f"y shape: {y.shape}")

    return X, y

In [30]:
# 2. Función para evaluar el desbalanceo de clases
def check_class_balance(y):
    print("\nAnalizando balance de clases...")
    class_distribution = pd.Series(y).value_counts(normalize=True)
    print("Distribución de clases:")
    for clase, proporcion in class_distribution.items():
        print(f"Clase {clase}: {proporcion:.2%}")
    return class_distribution


In [31]:
# 3. Pipeline principal de entrenamiento
def train_model_pipeline(X, y):
    print("\nPreparando y entrenando modelos...")
    # Split de datos
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print("Aplicando SMOTE para balance de clases...")
    # Aplicar SMOTE para balancear clases
    smote = SMOTE(random_state=42)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

    print("Configurando modelos base...")
    # Crear modelos base
    rf = RandomForestClassifier(random_state=42)
    xgb = XGBClassifier(random_state=42)
    svm = SVC(probability=True, random_state=42)

    # Parámetros para optimización
    rf_params = {
        'n_estimators': [100, 200],
        'max_depth': [10, 20],
        'min_samples_split': [2, 5]
    }

    xgb_params = {
        'n_estimators': [100, 200],
        'max_depth': [3, 5],
        'learning_rate': [0.01, 0.1]
    }

    svm_params = {
        'C': [0.1, 1],
        'kernel': ['rbf', 'linear']
    }

    print("Optimizando hiperparámetros...")
    # Optimizar cada modelo
    rf_optimal = GridSearchCV(rf, rf_params, cv=5, n_jobs=-1)
    xgb_optimal = GridSearchCV(xgb, xgb_params, cv=5, n_jobs=-1)
    svm_optimal = GridSearchCV(svm, svm_params, cv=5, n_jobs=-1)

    # Entrenar modelos optimizados
    print("Entrenando Random Forest...")
    rf_optimal.fit(X_train_balanced, y_train_balanced)
    print("Entrenando XGBoost...")
    xgb_optimal.fit(X_train_balanced, y_train_balanced)
    print("Entrenando SVM...")
    svm_optimal.fit(X_train_balanced, y_train_balanced)

    print("Creando ensemble voting...")
    # Crear ensemble voting
    voting_clf = VotingClassifier(
        estimators=[
            ('rf', rf_optimal.best_estimator_),
            ('xgb', xgb_optimal.best_estimator_),
            ('svm', svm_optimal.best_estimator_)
        ],
        voting='soft'
    )

    # Entrenar ensemble final
    print("Entrenando modelo ensemble final...")
    voting_clf.fit(X_train_balanced, y_train_balanced)

    return voting_clf, X_test, y_test, rf_optimal, xgb_optimal, svm_optimal


In [32]:
# 4. Función para evaluar el modelo
def evaluate_model(model, X_test, y_test):
    print("\nEvaluando modelo final...")
    # Predicciones
    y_pred = model.predict(X_test)

    # Métricas de evaluación
    print("\nReporte de Clasificación:")
    print(classification_report(y_test, y_pred))

    # Cross-validation score
    cv_scores = cross_val_score(model, X_test, y_test, cv=5)
    print("\nCross-validation scores:", cv_scores)
    print(f"Media de CV score: {cv_scores.mean():.4f}")
    print(f"Desviación estándar de CV score: {cv_scores.std():.4f}")

    # Matriz de confusión
    print("\nMatriz de Confusión:")
    print(confusion_matrix(y_test, y_pred))

    return y_pred


In [33]:
# 5. Función principal
def main():
    # Cargar y preparar datos
    X, y = load_and_prepare_data()

    # Verificar balance de clases
    class_distribution = check_class_balance(y)

    # Entrenar modelo
    voting_clf, X_test, y_test, rf_optimal, xgb_optimal, svm_optimal = train_model_pipeline(X, y)

    # Evaluar modelo
    y_pred = evaluate_model(voting_clf, X_test, y_test)

    # Mostrar mejores parámetros
    print("\nMejores parámetros encontrados:")
    print("Random Forest:", rf_optimal.best_params_)
    print("XGBoost:", xgb_optimal.best_params_)
    print("SVM:", svm_optimal.best_params_)

    # Mostrar mejores scores
    print("\nMejores scores de validación:")
    print(f"Random Forest: {rf_optimal.best_score_:.4f}")
    print(f"XGBoost: {xgb_optimal.best_score_:.4f}")
    print(f"SVM: {svm_optimal.best_score_:.4f}")

    return voting_clf

if __name__ == "__main__":
    model = main()

Cargando datos...
Dimensiones de los datos:
X shape: (1000, 11)
y shape: (1000,)

Analizando balance de clases...
Distribución de clases:
Clase 2: 28.10%
Clase 0: 26.60%
Clase 3: 23.60%
Clase 1: 21.70%

Preparando y entrenando modelos...
Aplicando SMOTE para balance de clases...
Configurando modelos base...
Optimizando hiperparámetros...
Entrenando Random Forest...
Entrenando XGBoost...
Entrenando SVM...
Creando ensemble voting...
Entrenando modelo ensemble final...

Evaluando modelo final...

Reporte de Clasificación:
              precision    recall  f1-score   support

           0       0.41      0.40      0.41        60
           1       0.29      0.31      0.30        39
           2       0.45      0.47      0.46        55
           3       0.28      0.26      0.27        46

    accuracy                           0.37       200
   macro avg       0.36      0.36      0.36       200
weighted avg       0.37      0.37      0.37       200


Cross-validation scores: [0.325 0.35  0