In [None]:
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.inspection import permutation_importance
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')


In [None]:
# 1. Load and prepare the data
def load_and_prepare_data():
    """Load the dataset, report highly correlated columns and select features.

    Reads ``datos_estandarizados.csv`` from the working directory, prints every
    pair of columns whose absolute correlation exceeds 0.8, label-encodes the
    ``custcat`` target and keeps only the features chosen by a
    ``SelectFromModel`` wrapper around a random forest.

    Returns:
        tuple: ``(X, y, selected_features)`` where ``X`` is the output of
        ``selector.transform`` (selected columns only), ``y`` is the
        label-encoded target and ``selected_features`` is the list of the
        retained column names.

    NOTE(review): the feature selector is fit on every row before any
    train/test split happens, so held-out rows influence which features are
    kept. Moving the selection after the split would change what this
    function returns, so it is only flagged here.
    """
    print("Cargando datos...")
    df = pd.read_csv('datos_estandarizados.csv')

    # Correlation analysis: list each pair once (upper triangle only, which
    # also excludes the diagonal).
    correlation_matrix = df.corr()
    row_pos, col_pos = np.where(np.abs(correlation_matrix) > 0.8)
    high_corr_features = [
        (correlation_matrix.index[r], correlation_matrix.columns[c])
        for r, c in zip(row_pos, col_pos)
        if r < c
    ]

    print("Características altamente correlacionadas:", high_corr_features)

    # Split features and target
    X = df.drop(['custcat'], axis=1)
    y = df['custcat']

    # Encode the target as consecutive integer labels.
    # LabelEncoder must be imported from sklearn.preprocessing at the top of
    # the notebook; without that import this line raises NameError.
    le = LabelEncoder()
    y = le.fit_transform(y)

    # Feature selection driven by random-forest importances
    rf_selector = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_selector.fit(X, y)

    # prefit=True: reuse the forest fitted above instead of refitting it
    selector = SelectFromModel(rf_selector, prefit=True)
    feature_idx = selector.get_support()
    selected_features = X.columns[feature_idx].tolist()

    print("Características seleccionadas:", selected_features)
    X = selector.transform(X)

    return X, y, selected_features

In [None]:
# 2. Report how balanced the target classes are
def check_class_balance(y):
    """Print and return the relative frequency of each class in ``y``.

    Args:
        y: Array-like of class labels.

    Returns:
        pandas.Series: Share of samples per class, as produced by
        ``value_counts(normalize=True)``.
    """
    print("\nAnalizando balance de clases...")
    labels = pd.Series(y)
    shares = labels.value_counts(normalize=True)

    print("Distribución de clases:")
    for label, share in zip(shares.index, shares.values):
        print(f"Clase {label}: {share:.2%}")

    return shares


In [None]:
def create_feature_interactions(X):
    """Append the product of every pair of distinct columns to ``X``.

    Args:
        X: 2-D NumPy array of shape ``(n_samples, n_features)``.

    Returns:
        The original columns followed by one product column per pair
        ``(i, j)`` with ``i < j``, ordered (0,1), (0,2), ..., (1,2), ...
        When there are fewer than two columns, ``X`` is returned as is.
    """
    print("Creando interacciones entre características...")
    # Upper-triangle indices (k=1 skips the diagonal) enumerate the pairs in
    # row-major order, i.e. the same order as a nested i < j loop.
    left_cols, right_cols = np.triu_indices(X.shape[1], k=1)
    if left_cols.size == 0:
        return X
    pairwise_products = X[:, left_cols] * X[:, right_cols]
    return np.hstack((X, pairwise_products))

In [None]:
def _tune_with_smote(estimator, param_grid, cv, X_train, y_train):
    """Grid-search ``estimator`` with SMOTE applied inside each training fold.

    The estimator is wrapped in an imbalanced-learn pipeline whose first step
    is SMOTE, so oversampling is fitted on the training part of every CV split
    only and validation folds contain real samples exclusively.

    Args:
        estimator: Unfitted classifier to tune.
        param_grid: Grid keyed by the estimator's own parameter names.
        cv: Cross-validation splitter passed to ``GridSearchCV``.
        X_train, y_train: Original (not oversampled) training data.

    Returns:
        GridSearchCV: The fitted search; its ``best_estimator_`` is the whole
        SMOTE + classifier pipeline.
    """
    resampling_pipeline = ImbPipeline([
        ('smote', SMOTE(random_state=42, k_neighbors=5)),
        ('clf', estimator),
    ])
    # Route every grid entry to the classifier step of the pipeline.
    step_grid = {f'clf__{name}': values for name, values in param_grid.items()}
    search = GridSearchCV(resampling_pipeline, step_grid, cv=cv, n_jobs=-1, scoring='f1_macro')
    search.fit(X_train, y_train)
    return search


# 3. Main training pipeline
def train_model_pipeline(X, y):
    """Tune four classifiers and combine them into a weighted soft-voting ensemble.

    Steps: add pairwise feature interactions, make a stratified 85/15
    train/test split, grid-search a random forest, XGBoost, an SVM and
    LightGBM (macro-F1, 5-fold stratified CV), then fit a soft
    ``VotingClassifier`` whose weights are the normalised best CV scores.

    SMOTE runs as the first step of each tuned pipeline rather than once
    before the search. Oversampling before cross-validation lets synthetic
    points derived from validation rows into the training folds, which
    inflates ``best_score_`` and therefore the voting weights.

    Args:
        X: 2-D NumPy feature array.
        y: Integer-encoded class labels.

    Returns:
        tuple: ``(voting_clf, X_test, y_test, rf_optimal, xgb_optimal,
        svm_optimal, lgb_optimal)``. The four ``*_optimal`` objects are fitted
        ``GridSearchCV`` instances; because each one tunes a SMOTE +
        classifier pipeline, the keys in ``best_params_`` carry a ``clf__``
        prefix.
    """
    print("\nPreparando y entrenando modelos...")

    # Add pairwise feature interactions
    X = create_feature_interactions(X)

    # Stratified split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.15, random_state=42, stratify=y
    )

    # Base models
    rf = RandomForestClassifier(random_state=42, class_weight='balanced')
    xgb = XGBClassifier(random_state=42)
    svm = SVC(probability=True, random_state=42, class_weight='balanced')
    # verbose=-1 silences the per-fit [Info] lines that otherwise flood the
    # notebook output during the grid search.
    lgb_clf = lgb.LGBMClassifier(random_state=42, verbose=-1)

    # Hyper-parameter grids
    rf_params = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', None]
    }

    xgb_params = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.05, 0.1],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],
        'min_child_weight': [1, 3, 5]
    }

    svm_params = {
        'C': [0.1, 1, 10],
        'kernel': ['rbf', 'poly'],
        'gamma': ['scale', 'auto', 0.1, 0.01],
        'degree': [2, 3, 4]
    }

    lgb_params = {
        'n_estimators': [100, 200, 300],
        'num_leaves': [31, 63, 127],
        'learning_rate': [0.01, 0.05, 0.1],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0]
    }

    # Stratified CV shared by all searches
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    print("Optimizando modelos...")
    rf_optimal = _tune_with_smote(rf, rf_params, cv, X_train, y_train)
    xgb_optimal = _tune_with_smote(xgb, xgb_params, cv, X_train, y_train)
    svm_optimal = _tune_with_smote(svm, svm_params, cv, X_train, y_train)
    lgb_optimal = _tune_with_smote(lgb_clf, lgb_params, cv, X_train, y_train)

    # Voting weights proportional to each model's best CV score
    weights = np.array([
        rf_optimal.best_score_,
        xgb_optimal.best_score_,
        svm_optimal.best_score_,
        lgb_optimal.best_score_
    ])
    weights = weights / weights.sum()

    # Each member is a SMOTE + classifier pipeline, so the ensemble is fitted
    # on the original training data and every member oversamples internally.
    voting_clf = VotingClassifier(
        estimators=[
            ('rf', rf_optimal.best_estimator_),
            ('xgb', xgb_optimal.best_estimator_),
            ('svm', svm_optimal.best_estimator_),
            ('lgb', lgb_optimal.best_estimator_)
        ],
        voting='soft',
        weights=weights
    )

    voting_clf.fit(X_train, y_train)

    return voting_clf, X_test, y_test, rf_optimal, xgb_optimal, svm_optimal, lgb_optimal

In [None]:
# 4. Evaluate the fitted model
def evaluate_model(model, X_test, y_test):
    """Report the fitted model's performance on the held-out test set.

    Prints the classification report, per-fold macro-F1 scores over
    stratified folds of the test set, the confusion matrix and the error rate
    per class.

    The per-fold scores are computed from the predictions of the already
    fitted ``model``. ``cross_val_score`` is deliberately not used here: it
    clones the estimator and refits it on folds of the test data, so its
    scores would describe different, smaller models trained on test rows
    rather than the model being evaluated.

    Args:
        model: Fitted classifier exposing ``predict``.
        X_test: Held-out feature matrix.
        y_test: True labels for ``X_test``.

    Returns:
        The predictions of ``model`` for ``X_test``.
    """
    print("\nEvaluando modelo final...")
    y_true = np.asarray(y_test)
    y_pred = model.predict(X_test)

    print("\nReporte de Clasificación:")
    print(classification_report(y_true, y_pred))

    # Stratified folds of the test set give a spread estimate for macro-F1
    # without refitting anything. The splitter only needs the labels, so a
    # placeholder array of zeros stands in for X.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = np.array([
        f1_score(y_true[fold_idx], y_pred[fold_idx], average='macro')
        for _, fold_idx in cv.split(np.zeros(len(y_true)), y_true)
    ])

    print("\nCross-validation scores:", cv_scores)
    print(f"Media de CV score: {cv_scores.mean():.4f}")
    print(f"Desviación estándar de CV score: {cv_scores.std():.4f}")

    print("\nMatriz de Confusión:")
    print(confusion_matrix(y_true, y_pred))

    # Error analysis: share of misclassified samples within each true class
    error_mask = y_true != y_pred
    print("\nAnálisis de errores por clase:")
    for clase in np.unique(y_true):
        clase_mask = y_true == clase
        error_rate = error_mask[clase_mask].mean()
        print(f"Clase {clase} - Tasa de error: {error_rate:.2%}")

    return y_pred


In [None]:
# 5. Main entry point
def main():
    """Run the whole workflow: load data, train, evaluate and report.

    Returns:
        The fitted voting classifier returned by ``train_model_pipeline``.
    """
    # Load and prepare the data
    X, y, selected_features = load_and_prepare_data()

    # Show the class balance
    check_class_balance(y)

    # Train the models
    fitted = train_model_pipeline(X, y)
    voting_clf, X_test, y_test, rf_optimal, xgb_optimal, svm_optimal, lgb_optimal = fitted

    # Evaluate the ensemble on the held-out split
    evaluate_model(voting_clf, X_test, y_test)

    # Report the best parameters and scores of each tuned model
    tuned_searches = (
        ("Random Forest", rf_optimal),
        ("XGBoost", xgb_optimal),
        ("SVM", svm_optimal),
        ("LightGBM", lgb_optimal),
    )

    print("\nMejores parámetros encontrados:")
    for label, search in tuned_searches:
        print(f"{label}:", search.best_params_)

    print("\nMejores scores de validación:")
    for label, search in tuned_searches:
        print(f"{label}: {search.best_score_:.4f}")

    return voting_clf

# Entry point: run the full workflow and keep the voting classifier that
# main() returns in `model`.
if __name__ == "__main__":
    model = main()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000265 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4153
[LightGBM] [Info] Number of data points in the train set: 956, number of used features: 21
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294

Evaluando modelo final...

Reporte de Clasificación:
              precision    recall  f1-score   support

           0       0.41      0.38      0.39        40
           1       0.33      0.42      0.37        33
           2       0.36      0.33      0.35        42
           3       0.34      0.31      0.33        35

    accuracy                           0.36       150
   macro avg       0.36      0.36      0.36       15