In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Load the raw dataset (semicolon-delimited CSV)
ruta_csv = "manufacturing_defect_dataset_OK .csv"
df = pd.read_csv(ruta_csv, engine='python', sep=';')

In [None]:
# ANÁLISIS EXPLORATORIO INICIAL

In [None]:
# Preview the first rows of the loaded dataset
df.head()

In [None]:
# `pd` and `df` already exist from the first cell, so pandas is not imported and
# the CSV is not read a second time here (the previous cell already shows the
# head); report the dataset dimensions instead.
df.shape

In [None]:
# Report the dtype pandas assigned to each column
print("Tipos de datos por columna:", df.dtypes, sep="\n")


In [None]:
# Count the missing values in each column
nulos_por_columna = df.isna().sum()
print("Valores nulos por columna:")
print(nulos_por_columna)


In [None]:
# Summary statistics of the dataset columns
df.describe()

In [None]:
# Relative frequency of each class of the target column
proporcion_clases = df['DefectStatus'].value_counts(normalize=True)
print("Distribución de la variable 'DefectStatus':")
print(proporcion_clases)

In [None]:
import matplotlib.pyplot as plt

# Histograms (30 bins each) of the DataFrame's columns on one large figure
df.hist(figsize=(15, 12), bins=30)
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Annotated heatmap of the pairwise correlations between all columns
matriz_correlacion = df.corr()
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(matriz_correlacion, annot=True, fmt=".2f", cmap="coolwarm", square=True, ax=ax)
ax.set_title("Matriz de correlación entre variables")
plt.show()

In [None]:
# Column names, non-null counts and dtypes in one summary
df.info()

In [None]:
# Bar chart of the absolute number of rows per target class
conteo_clases = df['DefectStatus'].value_counts()
conteo_clases.plot.bar()

In [None]:
# Check for duplicates: number of rows that repeat an earlier row
df.duplicated().sum()

In [None]:
from sklearn.model_selection import train_test_split

# Target column vs. every remaining feature column
y = df["DefectStatus"]
X = df.drop(columns="DefectStatus")

# Stratified hold-out split: 20% for test, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Optional check of the class proportions in each partition
for encabezado, particion in (
    ("Proporción de clases en el conjunto de entrenamiento:", y_train),
    ("\nProporción de clases en el conjunto de prueba:", y_test),
):
    print(encabezado)
    print(particion.value_counts(normalize=True))


In [None]:
# Dimensions of the training and test feature matrices
print(X_train.shape, X_test.shape)

In [None]:
# Scale the data: the scaler is fitted on the training features only and the
# same fitted transformation is then applied to both partitions
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
#REGRESIÓN LOGÍSTICA

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [None]:
# Tune the regularisation parameter C of a logistic regression with 5-fold
# cross-validation, ranking the candidates by ROC-AUC
logreg = LogisticRegression(solver='liblinear', max_iter=5000, random_state=42)
param_grid = {'C': [0.1, 1, 10]}

grid_search = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)

best_logreg = grid_search.best_estimator_
print("Mejores hiperparámetros encontrados:", grid_search.best_params_)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve

# Class-1 probabilities and predicted labels of the tuned model on the test set
y_proba_RL = best_logreg.predict_proba(X_test_scaled)[:, 1]
y_pred = best_logreg.predict(X_test_scaled)

informe_rl = classification_report(y_test, y_pred)
auc_rl = roc_auc_score(y_test, y_proba_RL)
matriz_rl = confusion_matrix(y_test, y_pred)

print("\nInforme de clasificación:")
print(informe_rl)
print("ROC-AUC:", auc_rl)
print("Matriz de confusión:\n", matriz_rl)

In [None]:
# ROC curve of the logistic regression on the test set
auc_curva = roc_auc_score(y_test, y_proba_RL)
fpr, tpr, _ = roc_curve(y_test, y_proba_RL)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f"ROC-AUC = {auc_curva:.2f}")
ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
ax.set_xlabel("Tasa de falsos positivos")
ax.set_ylabel("Tasa de verdaderos positivos")
ax.set_title("Curva ROC - Regresión Logística")
ax.legend()
ax.grid()
plt.show()

In [None]:

# Compute the matrix from the fitted model's test predictions instead of typing
# the counts by hand, so the figure cannot drift from the model. Predicting here
# (rather than reusing `y_pred`) keeps the cell correct even though later cells
# reassign `y_pred`. `labels=[0, 1]` pins the row/column order to the tick labels.
conf_matrix = confusion_matrix(y_test, best_logreg.predict(X_test_scaled), labels=[0, 1])

df_cm_simple = pd.DataFrame(conf_matrix,
                            index=["0", "1"],
                            columns=["0", "1"])


plt.figure(figsize=(6.5, 6))
sns.heatmap(df_cm_simple, annot=True, fmt='d', cmap='Reds', cbar=True,
            linewidths=0.5, linecolor='gray', annot_kws={"size": 12})
plt.title("Matriz de Confusión - Regresión Logística", fontsize=14, pad=12)
plt.xlabel("Predicted", fontsize=12)
plt.ylabel("Actual", fontsize=12)
plt.xticks(fontsize=11)
plt.yticks(fontsize=11, rotation=0)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve

# Precision and recall of the logistic regression as the threshold moves
precision, recall, thresholds = precision_recall_curve(y_test, y_proba_RL)

# The last precision/recall entry is dropped so both series align with `thresholds`
for valores, etiqueta in ((precision, 'Precision'), (recall, 'Recall')):
    plt.plot(thresholds, valores[:-1], label=etiqueta)
plt.xlabel('Umbral')
plt.ylabel('Métrica')
plt.title('Precisión y recall según el umbral de decisión')
plt.legend()
plt.grid()
plt.show()


In [None]:
#MEJORA REGRESIÓN LOGÍSTICA

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Change the decision threshold: class 1 is predicted only when its probability
# reaches `nuevo_umbral`
nuevo_umbral = 0.6
y_pred_umbral = (y_proba_RL >= nuevo_umbral).astype(int)

# Evaluate with the new threshold. The header is built from `nuevo_umbral` so it
# cannot go stale if the value above is changed.
print(f"Informe con umbral = {nuevo_umbral}")
print(classification_report(y_test, y_pred_umbral))
print("Matriz de confusión:\n", confusion_matrix(y_test, y_pred_umbral))
# ROC-AUC is computed from the probabilities, so it does not depend on the threshold
print("ROC-AUC:", roc_auc_score(y_test, y_proba_RL))

In [None]:
fpr, tpr, _ = roc_curve(y_test, y_proba_RL)

# The ROC curve itself does not change with the decision threshold, so without
# further information this figure would be identical to the previous ROC plot.
# Mark the operating point that the chosen threshold actually produces.
tn_u, fp_u, fn_u, tp_u = confusion_matrix(y_test, y_pred_umbral, labels=[0, 1]).ravel()
fpr_umbral = fp_u / (fp_u + tn_u)
tpr_umbral = tp_u / (tp_u + fn_u)

plt.figure(figsize=(8,6))
plt.plot(fpr, tpr, label=f"ROC-AUC = {roc_auc_score(y_test, y_proba_RL):.2f}")
plt.plot([0, 1], [0, 1], 'k--')
plt.scatter([fpr_umbral], [tpr_umbral], color='red', zorder=3,
            label=f"Umbral = {nuevo_umbral}")
plt.xlabel("Tasa de falsos positivos")
plt.ylabel("Tasa de verdaderos positivos")
plt.title(f"Curva ROC - Regresión Logística (umbral {nuevo_umbral})")
plt.legend()
plt.grid()
plt.show()

In [None]:
# Build the confusion matrix from the thresholded predictions instead of
# hard-coding the counts; `labels=[0, 1]` pins the order to the tick labels.
conf_matrix = confusion_matrix(y_test, y_pred_umbral, labels=[0, 1])


df_cm_simple = pd.DataFrame(conf_matrix,
                            index=["0", "1"],
                            columns=["0", "1"])


plt.figure(figsize=(6.5, 6))
sns.heatmap(df_cm_simple, annot=True, fmt='d', cmap='Reds', cbar=True,
            linewidths=0.5, linecolor='gray', annot_kws={"size": 12})
# The title reads the threshold from `nuevo_umbral` so it always matches the data
plt.title(f"Matriz de Confusión - Regresión Logística (Umbral={nuevo_umbral})", fontsize=14, pad=12)
plt.xlabel("Predicted", fontsize=12)
plt.ylabel("Actual", fontsize=12)
plt.xticks(fontsize=11)
plt.yticks(fontsize=11, rotation=0)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Partition features and labels by class
es_clase_0 = y == 0
es_clase_1 = y == 1
X_class_0, y_class_0 = X[es_clase_0], y[es_clase_0]
X_class_1, y_class_1 = X[es_clase_1], y[es_clase_1]

# Same number of test samples per class (e.g. at most 100 of each)
n_test = min(len(X_class_0), len(X_class_1), 100)

# `test_size` is set to everything except n_test rows, so the FIRST part returned
# by each split holds n_test rows and is the one used as that class's test set
X_0_test, X_0_remain, y_0_test, y_0_remain = train_test_split(
    X_class_0, y_class_0,
    test_size=(len(X_class_0) - n_test),
    random_state=42,
)
X_1_test, X_1_remain, y_1_test, y_1_remain = train_test_split(
    X_class_1, y_class_1,
    test_size=(len(X_class_1) - n_test),
    random_state=42,
)

# Final (balanced) test set: the two per-class test parts stacked
X_test_bal = pd.concat([X_0_test, X_1_test])
y_test_bal = pd.concat([y_0_test, y_1_test])


In [None]:
# Number of class-1 blocks, each roughly the size of the remaining class-0 data.
# Guard against 0 (class 1 smaller than class 0), which np.array_split rejects.
n_iter = max(1, len(X_1_remain) // len(X_0_remain))

# Split row POSITIONS with NumPy and slice with .iloc, instead of passing the
# DataFrame/Series to np.array_split (that path goes through the deprecated
# pandas `swapaxes`). The resulting blocks are the same contiguous row ranges.
posiciones_bloques = np.array_split(np.arange(len(X_1_remain)), n_iter)
bloques_X1 = [X_1_remain.iloc[pos] for pos in posiciones_bloques]
bloques_y1 = [y_1_remain.iloc[pos] for pos in posiciones_bloques]

In [None]:
# Train one logistic regression per class-1 block, each time combined with all
# the remaining class-0 rows, and keep its predictions on the balanced test set
modelos, predicciones = [], []

for bloque_X, bloque_y in zip(bloques_X1[:n_iter], bloques_y1[:n_iter]):
    # Training set for this iteration
    X_train_iter = pd.concat([X_0_remain, bloque_X])
    y_train_iter = pd.concat([y_0_remain, bloque_y])

    model = LogisticRegression(solver='liblinear', max_iter=5000, random_state=42)
    model.fit(X_train_iter, y_train_iter)

    predicciones.append(model.predict(X_test_bal))
    modelos.append(model)

In [None]:
from scipy.stats import mode

# Stack the per-model predictions: one row per iteration, one column per test sample
predicciones_matrix = np.array(predicciones)

# Majority vote: most frequent label in each column
# NOTE(review): with an even number of models a column can tie — confirm how
# scipy's `mode` resolves ties and whether that is acceptable here.
votacion = mode(predicciones_matrix, axis=0)
y_pred_final = votacion.mode.flatten()


In [None]:

# Evaluate the majority-vote predictions against the balanced test labels
informe_votacion = classification_report(y_test_bal, y_pred_final)
matriz_votacion = confusion_matrix(y_test_bal, y_pred_final)
print("Informe de clasificación:\n", informe_votacion)
print("Matriz de confusión:\n", matriz_votacion)


In [None]:

# Compute the matrix from the majority-vote predictions rather than hard-coding
# the counts; `labels=[0, 1]` pins the row/column order to the tick labels.
conf_matrix = confusion_matrix(y_test_bal, y_pred_final, labels=[0, 1])


df_cm_simple = pd.DataFrame(conf_matrix,
                            index=["0", "1"],
                            columns=["0", "1"])


plt.figure(figsize=(6.5, 6))
sns.heatmap(df_cm_simple, annot=True, fmt='d', cmap='Reds', cbar=True,
            linewidths=0.5, linecolor='gray', annot_kws={"size": 12})
plt.title("Matriz de Confusión - Regresión Logística ", fontsize=14, pad=12)
plt.xlabel("Predicted", fontsize=12)
plt.ylabel("Actual", fontsize=12)
plt.xticks(fontsize=11)
plt.yticks(fontsize=11, rotation=0)
plt.tight_layout()
plt.show()

In [None]:
# ÁRBOLES DE DECISIÓN

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Same stratified 80/20 split (same seed) as used for the logistic regression
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Baseline tree with default hyperparameters, trained on the unscaled features
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)


y_pred = tree.predict(X_test)
y_proba_tree = tree.predict_proba(X_test)[:, 1]

# Print the section header BEFORE the report it introduces (it used to be
# printed after the classification report)
print("Evaluación del modelo :")
print(classification_report(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

print("ROC-AUC:", roc_auc_score(y_test, y_proba_tree))

In [None]:
# ROC curve of the baseline decision tree on the test set
auc_curva = roc_auc_score(y_test, y_proba_tree)
fpr, tpr, _ = roc_curve(y_test, y_proba_tree)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f"ROC-AUC = {auc_curva:.2f}")
ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
ax.set_xlabel("Tasa de falsos positivos")
ax.set_ylabel("Tasa de verdaderos positivos")
ax.set_title("Curva ROC - Arbol de decisión")
ax.legend()
ax.grid()
plt.show()

In [None]:
#AJUSTE HIPERPARÁMETROS

In [None]:
# Candidate hyperparameter values for the decision tree search
param_grid = dict(
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)

In [None]:
# Exhaustive search over `param_grid`: 5-fold cross-validation, candidates
# ranked by recall, all processor cores used
buscador_arbol = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search = buscador_arbol
grid_search.fit(X_train, y_train)

# Keep the winning hyperparameter combination
best_params = grid_search.best_params_

print("Mejores hiperparámetros encontrados:")
print(best_params)

In [None]:

# Rebuild the tree with the best hyperparameters found and retrain it
mejor_modelo_arb = DecisionTreeClassifier(random_state=42, **best_params)
mejor_modelo_arb.fit(X_train, y_train)

# Evaluate on the test set
y_proba_best_tree = mejor_modelo_arb.predict_proba(X_test)[:, 1]
y_pred_best = mejor_modelo_arb.predict(X_test)

print("Evaluación del modelo ajustado:")
for resultado in (
    classification_report(y_test, y_pred_best),
    confusion_matrix(y_test, y_pred_best),
):
    print(resultado)
print("ROC-AUC:", roc_auc_score(y_test, y_proba_best_tree))

In [None]:
# ROC curve of the tuned decision tree on the test set
auc_curva = roc_auc_score(y_test, y_proba_best_tree)
fpr, tpr, _ = roc_curve(y_test, y_proba_best_tree)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f"ROC-AUC = {auc_curva:.2f}")
ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
ax.set_xlabel("Tasa de falsos positivos")
ax.set_ylabel("Tasa de verdaderos positivos")
ax.set_title("Curva ROC - Arbol de decisión")
ax.legend()
ax.grid()
plt.show()

In [None]:
from sklearn.tree import plot_tree


# Draw the tuned tree. The fitted model is named `mejor_modelo_arb`; the
# previous reference to `mejor_modelo` was undefined and raised NameError.
# The column Index is converted to a plain list of feature names.
plt.figure(figsize=(25, 15))
plot_tree(mejor_modelo_arb, filled=True, feature_names=list(X.columns), class_names=['No defectuoso', 'Defectuoso'])
plt.show()


In [None]:
# Compute the matrix from the tuned tree's test predictions rather than
# hard-coding the counts; `labels=[0, 1]` pins the order to the tick labels.
conf_matrix = confusion_matrix(y_test, y_pred_best, labels=[0, 1])


df_cm_simple = pd.DataFrame(conf_matrix,
                            index=["0", "1"],
                            columns=["0", "1"])


plt.figure(figsize=(6.5, 6))
sns.heatmap(df_cm_simple, annot=True, fmt='d', cmap='Reds', cbar=True,
            linewidths=0.5, linecolor='gray', annot_kws={"size": 12})
# Title fixed: the stray closing parenthesis has been removed
plt.title("Matriz de Confusión - Árbol de decisión", fontsize=14, pad=12)
plt.xlabel("Predicted", fontsize=12)
plt.ylabel("Actual", fontsize=12)
plt.xticks(fontsize=11)
plt.yticks(fontsize=11, rotation=0)
plt.tight_layout()
plt.show()

In [None]:
# Ten largest feature importances of the tuned tree, as horizontal bars
importancias = pd.Series(mejor_modelo_arb.feature_importances_, index=X.columns)
ax = importancias.nlargest(10).plot.barh()

ax.set_title('Importancia de las variables en el árbol de decisión')
ax.set_xlabel('Importancia')
ax.set_ylabel('Variable')
plt.tight_layout()
plt.show()


In [None]:
#RANDOM FOREST

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Stratified 80/20 split with the same seed as the earlier model sections
particiones = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = particiones

In [None]:
# Train a Random Forest with default hyperparameters (only the seed is fixed)
modelo_rf_base = RandomForestClassifier(random_state=42)
modelo_rf_base.fit(X_train, y_train)

In [None]:

# Class-1 probabilities and predicted labels of the baseline forest
y_proba_rf_base = modelo_rf_base.predict_proba(X_test)[:, 1]
y_pred_rf_base = modelo_rf_base.predict(X_test)

# Evaluation on the test set
informe_rf_base = classification_report(y_test, y_pred_rf_base)
matriz_rf_base = confusion_matrix(y_test, y_pred_rf_base)
print(informe_rf_base)
print(matriz_rf_base)
print("ROC-AUC:", roc_auc_score(y_test, y_proba_rf_base))

In [None]:
#AJUSTE DE HIPERPARÁMETROS

In [None]:
# Candidate values for the Random Forest search
param_grid = dict(
    n_estimators=[100, 200],        # number of trees
    max_depth=[None, 10, 20],       # maximum depth
    min_samples_split=[2, 5],       # minimum samples needed to split a node
    min_samples_leaf=[1, 2],        # minimum samples in a leaf
    criterion=['gini', 'entropy'],  # impurity function
)

# 5-fold grid search ranked by recall, using all cores
grid_search_rf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_rf.fit(X_train, y_train)

mejor_rf = grid_search_rf.best_estimator_

print("Mejores hiperparámetros:", grid_search_rf.best_params_)


In [None]:
# Class-1 probabilities and predicted labels of the tuned forest
y_proba_rf_ajustado = mejor_rf.predict_proba(X_test)[:, 1]
y_pred_rf_ajustado = mejor_rf.predict(X_test)

print("Mejor modelo - Random Forest")
for resultado in (
    classification_report(y_test, y_pred_rf_ajustado),
    confusion_matrix(y_test, y_pred_rf_ajustado),
):
    print(resultado)
print("ROC-AUC:", roc_auc_score(y_test, y_proba_rf_ajustado))

In [None]:
# Compute the matrix from the tuned forest's test predictions rather than
# hard-coding the counts; `labels=[0, 1]` pins the order to the tick labels.
conf_matrix = confusion_matrix(y_test, y_pred_rf_ajustado, labels=[0, 1])


df_cm_simple = pd.DataFrame(conf_matrix,
                            index=["0", "1"],
                            columns=["0", "1"])


plt.figure(figsize=(6.5, 6))
sns.heatmap(df_cm_simple, annot=True, fmt='d', cmap='Reds', cbar=True,
            linewidths=0.5, linecolor='gray', annot_kws={"size": 12})
plt.title("Matriz de Confusión - Random Forest", fontsize=14, pad=12)
plt.xlabel("Predicted", fontsize=12)
plt.ylabel("Actual", fontsize=12)
plt.xticks(fontsize=11)
plt.yticks(fontsize=11, rotation=0)
plt.tight_layout()
plt.show()

In [None]:
# ROC curve of the tuned Random Forest on the test set
auc_curva = roc_auc_score(y_test, y_proba_rf_ajustado)
fpr, tpr, _ = roc_curve(y_test, y_proba_rf_ajustado)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f"ROC-AUC = {auc_curva:.2f}")
ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
ax.set_xlabel("Tasa de falsos positivos")
ax.set_ylabel("Tasa de verdaderos positivos")
ax.set_title("Curva ROC - Random Forest")
ax.legend()
ax.grid()
plt.show()

In [None]:
# Feature importances of the baseline (default-hyperparameter) forest
importancias = modelo_rf_base.feature_importances_
nombres_vars = X.columns

# Positions that sort the importances in ascending order
indices = np.argsort(importancias)
posiciones = range(len(importancias))
etiquetas = [nombres_vars[i] for i in indices]

# Horizontal bar chart with the y axis flipped after drawing
fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(posiciones, importancias[indices], align='center')
ax.set_yticks(posiciones)
ax.set_yticklabels(etiquetas)
ax.set_xlabel("Importancia ")
ax.set_ylabel("Variable")
ax.set_title("Importancia de las variables - Random Forest")
ax.invert_yaxis()
plt.tight_layout()
plt.show()

In [None]:
#MEJORA DEL MODELO RANDOM FOREST


In [None]:
from sklearn.model_selection import train_test_split

# Split the rows by class
df_clase0 = df.loc[df["DefectStatus"] == 0]
df_clase1 = df.loc[df["DefectStatus"] == 1]

# Size of the smaller class
n_min = min(len(df_clase0), len(df_clase1))

# Balanced sample: n_min rows drawn from each class with a fixed seed
df_balanceado = pd.concat(
    [clase.sample(n=n_min, random_state=42) for clase in (df_clase0, df_clase1)]
)

# Features and target of the balanced sample
y_bal = df_balanceado["DefectStatus"]
X_bal = df_balanceado.drop(columns="DefectStatus")

# Stratified 80/20 train-test split of the balanced sample
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(
    X_bal, y_bal, test_size=0.2, stratify=y_bal, random_state=42
)

In [None]:
# Default Random Forest trained and evaluated on the balanced sample
modelo_rf_balanced_manual = RandomForestClassifier(random_state=42)
modelo_rf_balanced_manual.fit(X_train_bal, y_train_bal)

y_pred_manual = modelo_rf_balanced_manual.predict(X_test_bal)

for resultado in (
    classification_report(y_test_bal, y_pred_manual),
    confusion_matrix(y_test_bal, y_pred_manual),
):
    print(resultado)


In [None]:
#XGBoost

In [None]:
pip install xgboost


In [None]:
from xgboost import XGBClassifier

# Baseline XGBoost classifier: 50 trees, log-loss evaluation metric, fixed seed
xgb_default = XGBClassifier(
    n_estimators=50,
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
)
xgb_default.fit(X_train, y_train)

# Class-1 probabilities and predicted labels on the test set
y_proba_xgb = xgb_default.predict_proba(X_test)[:, 1]
y_pred_xgb = xgb_default.predict(X_test)

matriz_xgb = confusion_matrix(y_test, y_pred_xgb)
informe_xgb = classification_report(y_test, y_pred_xgb)

print("Matriz de confusión:")
print(matriz_xgb)
print("\nInforme de clasificación:")
print(informe_xgb)
print("ROC-AUC:", roc_auc_score(y_test, y_proba_xgb))

In [None]:
# ROC curve of the baseline XGBoost model on the test set
auc_curva = roc_auc_score(y_test, y_proba_xgb)
fpr, tpr, _ = roc_curve(y_test, y_proba_xgb)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f"ROC-AUC = {auc_curva:.2f}")
ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
ax.set_xlabel("Tasa de falsos positivos")
ax.set_ylabel("Tasa de verdaderos positivos")
ax.set_title("Curva ROC - XGBoost")
ax.legend()
ax.grid()
plt.show()

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# Sampling distributions for the random search.
# scipy's uniform(loc, scale) samples from [loc, loc + scale].
param_dist = {
    'n_estimators': randint(100, 300),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.2),
    'subsample': uniform(0.7, 0.3),
    'colsample_bytree': uniform(0.5, 0.5)
}

# `random_state` is now set on the search itself: without it the 20 sampled
# candidates (and therefore the "best" parameters) changed on every run, unlike
# the rest of the notebook, which fixes the seed at 42 everywhere.
random_search = RandomizedSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42),
    param_distributions=param_dist,
    n_iter=20,
    scoring='f1',
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train, y_train)
print("Mejores parámetros encontrados:")
print(random_search.best_params_)

In [None]:
# Build the tuned model from the parameters the random search actually found,
# instead of values copied in by hand (which silently go stale whenever the
# search is re-run or its search space changes).
best_random_model_xgb = XGBClassifier(
    **random_search.best_params_,
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42
)

# Train
best_random_model_xgb.fit(X_train, y_train)

# Predict. A model-specific name is used for the labels; `y_pred` is still
# assigned as well so that anything relying on it keeps working.
y_pred_xgb_best = best_random_model_xgb.predict(X_test)
y_pred = y_pred_xgb_best
y_proba_xgb_best = best_random_model_xgb.predict_proba(X_test)[:, 1]

# Evaluate
print("Matriz de confusión:")
print(confusion_matrix(y_test, y_pred_xgb_best))
print("\nInforme de clasificación:")
print(classification_report(y_test, y_pred_xgb_best))
print("ROC-AUC:", roc_auc_score(y_test, y_proba_xgb_best))

In [None]:
# ROC curve of the tuned XGBoost model on the test set
auc_curva = roc_auc_score(y_test, y_proba_xgb_best)
fpr, tpr, _ = roc_curve(y_test, y_proba_xgb_best)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f"ROC-AUC = {auc_curva:.2f}")
ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
ax.set_xlabel("Tasa de falsos positivos")
ax.set_ylabel("Tasa de verdaderos positivos")
ax.set_title("Curva ROC - XGBoost")
ax.legend()
ax.grid()
plt.show()

In [None]:
from xgboost import plot_importance
import matplotlib.pyplot as plt

# Importance chart (gain-based, at most 10 features) of the baseline XGBoost model
ax = plot_importance(xgb_default, max_num_features=10, importance_type='gain')
ax.set_title("Importancia de variables según XGBoost")
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score

# Derive every metric from the models' actual test predictions instead of a
# hand-typed table, so the comparison cannot disagree with the evaluation cells.
# Each entry: model label -> (predicted labels, predicted class-1 probabilities).
resultados_modelos = {
    "Regresión Logística": (y_pred_umbral, y_proba_RL),
    "Árbol de Decisión": (y_pred_best, y_proba_best_tree),
    "Random Forest": (y_pred_rf_ajustado, y_proba_rf_ajustado),
    "XGBoost": (y_pred_xgb, y_proba_xgb),
}

data = []
for nombre_modelo, (pred_modelo, proba_modelo) in resultados_modelos.items():
    # With labels=[0, 1] the flattened matrix is (TN, FP, FN, TP) for positive class 1
    tn, fp, fn, tp = confusion_matrix(y_test, pred_modelo, labels=[0, 1]).ravel()
    data.append({
        "Modelo": nombre_modelo,
        "Accuracy": accuracy_score(y_test, pred_modelo),
        "AUC": roc_auc_score(y_test, proba_modelo),
        "Falsos positivos": fp,
        "Falsos negativos": fn,
    })

df_metrics = pd.DataFrame(data)

# Reshape to long format to plot accuracy and AUC side by side
# (this line used to be un-commented prose, which was a SyntaxError)
df_plot1 = df_metrics.melt(id_vars="Modelo", value_vars=["Accuracy", "AUC"],
                           var_name="Métrica", value_name="Valor")

plt.figure(figsize=(10, 5))
sns.barplot(data=df_plot1, x="Modelo", y="Valor", hue="Métrica")
plt.title("Comparación de Accuracy y AUC entre modelos")
plt.ylim(0, 1.05)
plt.grid(axis="y", linestyle="--", alpha=0.6)
plt.tight_layout()
plt.show()


# Same reshaping for the two error counts
df_plot2 = df_metrics.melt(id_vars="Modelo", value_vars=["Falsos positivos", "Falsos negativos"],
                           var_name="Error", value_name="Cantidad")

plt.figure(figsize=(10, 5))
sns.barplot(data=df_plot2, x="Modelo", y="Cantidad", hue="Error")
plt.title("Comparación de errores tipo entre modelos")
plt.grid(axis="y", linestyle="--", alpha=0.6)
plt.tight_layout()
plt.show()


In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import learning_curve
from sklearn.model_selection import StratifiedKFold

# One untuned estimator per model family. The stochastic models now get the
# same fixed seed used everywhere else in the notebook, so the curves are
# reproducible between runs. The logistic regression is wrapped in a pipeline
# so the scaler is re-fitted inside every cross-validation fold.
models = {
    'Logistic Regression': make_pipeline(StandardScaler(), LogisticRegression()),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

plt.figure(figsize=(14, 10))

# One subplot per model on a 2x2 grid
for i, (name, model) in enumerate(models.items(), 1):
    plt.subplot(2, 2, i)
    # F1 on the training folds and on the held-out folds for 5 training-set
    # sizes between 10% and 100% of each training fold
    train_sizes, train_scores, test_scores = learning_curve(
        model, X, y,
        cv=StratifiedKFold(n_splits=5),
        scoring='f1',
        n_jobs=-1,
        train_sizes=np.linspace(0.1, 1.0, 5),
        random_state=42
    )

    # Average the scores across the folds for each training size
    plt.plot(train_sizes, np.mean(train_scores, axis=1), 'o-', label='Train F1-score')
    plt.plot(train_sizes, np.mean(test_scores, axis=1), 'o-', label='Test F1-score')
    plt.title(f'{name}')
    plt.xlabel('Tamaño del conjunto de entrenamiento')
    plt.ylabel('F1-score')
    plt.legend()
    plt.grid(True)

plt.tight_layout()
plt.suptitle("Curvas de aprendizaje por modelo", fontsize=16, y=1.02)
plt.show()

In [None]:
import numpy as np

modelos = ['Regresión Log.', 'Árbol', 'RF', 'XGB']

# Take the error counts from the models' actual test predictions. The previous
# hand-typed arrays were also swapped: with class 1 as the positive class, a
# false negative is a real 1 predicted as 0 (matrix cell [1, 0]) and a false
# positive is a real 0 predicted as 1 (matrix cell [0, 1]).
predicciones_por_modelo = [y_pred_umbral, y_pred_best, y_pred_rf_ajustado, y_pred_xgb]
cms_modelos = [confusion_matrix(y_test, pred, labels=[0, 1]) for pred in predicciones_por_modelo]
FN = np.array([cm[1, 0] for cm in cms_modelos])
FP = np.array([cm[0, 1] for cm in cms_modelos])

# Stacked bars: false negatives at the bottom, false positives on top
plt.figure(figsize=(8, 5))
plt.bar(modelos, FN, label='Falsos Negativos', color='orange')
plt.bar(modelos, FP, bottom=FN, label='Falsos Positivos', color='lightblue')
plt.ylabel('Errores totales')
plt.title('Total de errores por modelo (FN + FP)')
plt.legend()
plt.grid(axis='y')
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


modelos = ['Regresión Log.', 'Árbol', 'RF', 'XGB']

# Error counts come from the actual test predictions (positive class = 1):
# false negative = real 1 predicted 0 -> cell [1, 0];
# false positive = real 0 predicted 1 -> cell [0, 1].
predicciones_por_modelo = [y_pred_umbral, y_pred_best, y_pred_rf_ajustado, y_pred_xgb]
cms_modelos = [confusion_matrix(y_test, pred, labels=[0, 1]) for pred in predicciones_por_modelo]
FN = np.array([cm[1, 0] for cm in cms_modelos])
FP = np.array([cm[0, 1] for cm in cms_modelos])
errores_totales = FN + FP

# Stored under its own name: assigning this table to `df` overwrote the
# dataset loaded at the top of the notebook.
df_errores = pd.DataFrame({'Modelo': modelos, 'Falsos Negativos': FN, 'Falsos Positivos': FP, 'Total': errores_totales})
df_errores = df_errores.sort_values('Total', ascending=False)


x = np.arange(len(df_errores)) * 0.7
width = 0.35  # thinner bars


color_fn = '#F4A261'
# 'light' is not a valid matplotlib colour and raised ValueError; use the same
# light blue as the previous error chart.
color_fp = 'lightblue'


plt.figure(figsize=(8, 5))
plt.bar(x, df_errores['Falsos Negativos'], width=width, color=color_fn, label='Falsos Negativos')
plt.bar(x, df_errores['Falsos Positivos'], width=width, bottom=df_errores['Falsos Negativos'], color=color_fp, label='Falsos Positivos')


# Write the total number of errors just above each stacked bar
for i in range(len(df_errores)):
    total = df_errores['Total'].iloc[i]
    plt.text(x[i], total + 1, str(total), ha='center', fontsize=9)


plt.xticks(x, df_errores['Modelo'], rotation=10)
plt.ylabel('Errores totales')
plt.title('Errores por modelo: Falsos negativos y positivos')
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.4)
plt.tight_layout()
plt.show()


In [None]:
from sklearn.metrics import auc

# ROC curves must be built from continuous scores. The hard 0/1 predictions
# used before collapse each curve to a single operating point, so every
# model's predicted class-1 probability is used instead.
fpr_log, tpr_log, _ = roc_curve(y_test, y_proba_RL)
fpr_tree, tpr_tree, _ = roc_curve(y_test, y_proba_best_tree)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_proba_rf_ajustado)
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_proba_xgb)

auc_log = auc(fpr_log, tpr_log)
auc_tree = auc(fpr_tree, tpr_tree)
auc_rf = auc(fpr_rf, tpr_rf)
# Fixed: the area was computed as auc(fpr_xgb, fpr_xgb), i.e. FPR against itself
auc_xgb = auc(fpr_xgb, tpr_xgb)

# Plot all four curves plus the diagonal reference line
plt.figure(figsize=(8, 6))
plt.plot(fpr_log, tpr_log, label=f'Regresión logística (AUC = {auc_log:.2f})')
plt.plot(fpr_tree, tpr_tree, label=f'Árbol de decisión (AUC = {auc_tree:.2f})')
plt.plot(fpr_rf, tpr_rf, label=f'Random Forest (AUC = {auc_rf:.2f})')
plt.plot(fpr_xgb, tpr_xgb, label=f'XGBoost (AUC = {auc_xgb:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Aleatorio (AUC = 0.5)')
plt.xlabel('Tasa de falsos positivos')
plt.ylabel('Tasa de verdaderos positivos')
plt.title('Comparación de curvas ROC')
plt.legend(loc='lower right')
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Models whose feature importances are compared
modelo_arbol = mejor_modelo_arb
modelo_rf = modelo_rf_base
modelo_xgb = xgb_default


variables = X_train.columns


df_arbol = pd.DataFrame({
    'Variable': variables,
    'Árbol de Decisión': modelo_arbol.feature_importances_
})

df_rf = pd.DataFrame({
    'Variable': variables,
    'Random Forest': modelo_rf.feature_importances_
})


# XGBoost reports raw gain per feature; rescale it to sum to 1 so it is on the
# same footing as the scikit-learn importances and the axis label
# "Importancia normalizada" holds for every model.
importancia_xgb_dict = modelo_xgb.get_booster().get_score(importance_type='gain')
df_xgb = pd.DataFrame({
    'Variable': list(importancia_xgb_dict.keys()),
    'XGBoost': list(importancia_xgb_dict.values())
})
df_xgb['XGBoost'] = df_xgb['XGBoost'] / df_xgb['XGBoost'].sum()

colores = {
    'Árbol de Decisión': '#8599AA',
    'Random Forest': '#1f4e66',
    'XGBoost': '#d66c30'
}


# Left-join the XGBoost column: a feature missing from the booster's score
# dictionary gets importance 0 instead of being dropped from the comparison.
df_merge = (
    df_arbol
    .merge(df_rf, on='Variable')
    .merge(df_xgb, on='Variable', how='left')
    .fillna({'XGBoost': 0.0})
)


# Four features with the highest mean importance across the three models
top_vars = df_merge.set_index('Variable').mean(axis=1).sort_values(ascending=False).head(4).index
# `df_top` was used below but never defined (NameError); build it from `top_vars`
df_top = df_merge[df_merge['Variable'].isin(top_vars)]


# Single figure with the default palette (a second, untitled duplicate of this
# plot used to be drawn first)
df_top.set_index('Variable').plot(kind='barh', figsize=(10, 6))
plt.title("Comparación de importancia de variables entre modelos")
plt.xlabel("Importancia normalizada")
plt.legend(title="Modelo")
plt.grid(True, axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

In [None]:
# Models whose feature importances are compared
modelo_arbol = mejor_modelo_arb
modelo_rf = modelo_rf_base
modelo_xgb = xgb_default


variables = X_train.columns


df_arbol = pd.DataFrame({
    'Variable': variables,
    'Árbol de Decisión': modelo_arbol.feature_importances_
})

df_rf = pd.DataFrame({
    'Variable': variables,
    'Random Forest': modelo_rf.feature_importances_
})


# XGBoost reports raw gain per feature; rescale it to sum to 1 so it is on the
# same footing as the scikit-learn importances and the axis label
# "Importancia normalizada" holds for every model.
importancia_xgb_dict = modelo_xgb.get_booster().get_score(importance_type='gain')
df_xgb = pd.DataFrame({
    'Variable': list(importancia_xgb_dict.keys()),
    'XGBoost': list(importancia_xgb_dict.values())
})
df_xgb['XGBoost'] = df_xgb['XGBoost'] / df_xgb['XGBoost'].sum()

# One fixed colour per model
colores = {
    'Árbol de Decisión': '#8599AA',
    'Random Forest': '#1f4e66',
    'XGBoost': '#d66c30'
}


# Left-join the XGBoost column: a feature missing from the booster's score
# dictionary gets importance 0 instead of being dropped from the comparison.
df_merge = (
    df_arbol
    .merge(df_rf, on='Variable')
    .merge(df_xgb, on='Variable', how='left')
    .fillna({'XGBoost': 0.0})
)


# Four features with the highest mean importance across the three models
top_vars = df_merge.set_index('Variable').mean(axis=1).sort_values(ascending=False).head(4).index
# `df_top` was used below but never defined (NameError); build it from `top_vars`
df_top = df_merge[df_merge['Variable'].isin(top_vars)]

df_top.set_index('Variable')[list(colores.keys())].plot(
    kind='barh',
    figsize=(10, 6),
    color=[colores[col] for col in colores]
)



plt.title("Comparación de importancia de variables entre modelos")
plt.xlabel("Importancia normalizada")
plt.legend(title="Modelo")
# linestyle='' drew no grid lines at all; use a dashed grid
plt.grid(True, axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

In [None]:
# Compute the four matrices from the models' actual test predictions. The
# hand-typed versions could drift from the models, and the XGBoost matrix was
# a copy of the Random Forest one. `labels=[0, 1]` pins the row/column order.
cm1 = confusion_matrix(y_test, y_pred_umbral, labels=[0, 1])       # logistic regression, custom threshold

cm2 = confusion_matrix(y_test, y_pred_best, labels=[0, 1])         # tuned decision tree

cm3 = confusion_matrix(y_test, y_pred_rf_ajustado, labels=[0, 1])  # tuned random forest

cm4 = confusion_matrix(y_test, y_pred_xgb, labels=[0, 1])          # baseline XGBoost

matrices = [cm1, cm2, cm3, cm4]
titles = ["Regresión Logística", "Árbol de decisión", "Random Forest", "XGBoost"]

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# One heatmap per model on the 2x2 grid
for i, ax in enumerate(axes.flat):
    sns.heatmap(matrices[i], annot=True, fmt="d", cmap="Reds", cbar=False,
                ax=ax)
    ax.set_title(titles[i])
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")

plt.tight_layout()
plt.show()