In [None]:
import pandas as pd
import numpy as np
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
from sklearn.metrics import (confusion_matrix, accuracy_score, precision_score, 
                             recall_score, f1_score, roc_auc_score, roc_curve, auc, 
                             precision_recall_curve)

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

import shap


In [11]:
def plot_decision_regions(X, y, classifier=None, resolution=0.02):
    """Scatter a two-feature dataset and optionally shade a classifier's decision regions.

    Adapted from Sebastian Raschka's "Python Machine Learning". Everything is
    drawn through ``plt`` on the current axes; the function does not call
    ``plt.show()`` and returns nothing.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix; only columns 0 and 1 are plotted. Anything accepted by
        ``np.asarray`` works (ndarray, DataFrame, nested list).
    y : array-like of shape (n_samples,)
        Class label for each row of ``X``.
    classifier : object with a ``predict`` method, optional
        When given, ``predict`` is called on a two-column grid covering the
        data range and the predictions are drawn as filled contours.
    resolution : float, default 0.02
        Step between grid points along both axes.

    Notes
    -----
    Only five marker/color pairs are defined, so more than five distinct
    labels in ``y`` raises ``IndexError``.
    """
    # Coerce once so the positional slicing below also works for DataFrames,
    # Series and plain lists (``X[:, 0]`` is ndarray-only syntax).
    X = np.asarray(X)
    y = np.asarray(y).ravel()

    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    # Unfilled markers have no face to outline; matplotlib warns and ignores
    # an explicit edgecolor for them, so it is only passed for filled markers.
    unfilled_markers = {'x'}
    classes = np.unique(y)
    cmap = ListedColormap(colors[:len(classes)])

    # grid spanning the data range with a 0.3 margin on every side
    x1_min, x1_max = X[:, 0].min() - 0.3, X[:, 0].max() + 0.3
    x2_min, x2_max = X[:, 1].min() - 0.3, X[:, 1].max() + 0.3
    xx1, xx2 = np.meshgrid(
        np.arange(x1_min, x1_max, resolution),
        np.arange(x2_min, x2_max, resolution)
    )

    # plot the decision surface
    if classifier is not None:
        grid_points = np.array([xx1.ravel(), xx2.ravel()]).T
        Z = classifier.predict(grid_points).reshape(xx1.shape)
        plt.contourf(xx1, xx2, Z, alpha=0.3, cmap=cmap)

    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # plot class samples, one scatter call per class so each gets a legend entry
    for idx, cl in enumerate(classes):
        mask = y == cl
        style = dict(alpha=0.8, c=colors[idx], marker=markers[idx], label=cl)
        if markers[idx] not in unfilled_markers:
            style['edgecolor'] = 'black'
        plt.scatter(x=X[mask, 0], y=X[mask, 1], **style)

In [12]:
def modelos(X_test, y_test, models):
    metricas = dict()

    for name, model in models.items():

        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:, 1]

        #### Calcular metricas

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        pr_auc = roc_auc_score(y_test, y_proba)

        #### Guardar metricas
        metricas[name] = {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "pr_auc": pr_auc
        }

        #### Confusion matrix

        conf_matrix = confusion_matrix(y_test, y_pred)
        print(f"Matriz de Confusion para {name}")
        print(conf_matrix)

        #### Curva ROC+
        fpr, tpr, _ = roc_curve(y_test, y_proba)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.2f})')

    # Configuración de la gráfica ROC
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Tasa de Falsos Positivos')
    plt.ylabel('Tasa de Verdaderos Positivos')
    plt.title('Curva ROC Comparativa')
    plt.legend(loc="lower right")
    plt.show()

    # Mostrar las métricas
    metrics_df = pd.DataFrame(metricas).T
    print("\nMétricas de Rendimiento para cada Modelo:")
    print(metrics_df)

In [13]:
def models_training(X_train, y_train) -> dict:
    """Fit the five baseline classifiers on the same training data.

    All estimators are constructed first and then fitted in dictionary order.
    Every estimator except ``GaussianNB`` is created with ``random_state=42``;
    all other hyperparameters are left at their library defaults.

    Parameters
    ----------
    X_train : array-like
        Training features, passed unchanged to each estimator's ``fit``.
    y_train : array-like
        Training labels, passed unchanged to each estimator's ``fit``.

    Returns
    -------
    dict
        Fitted estimators keyed by ``"random_forest"``,
        ``"logistic_regression"``, ``"Decision_tree"``, ``"Naive_Bayes"`` and
        ``"xgboost"``.
    """
    fitted = {
        "random_forest": RandomForestClassifier(random_state=42),
        "logistic_regression": LogisticRegression(random_state=42),
        "Decision_tree": DecisionTreeClassifier(random_state=42),
        "Naive_Bayes": GaussianNB(),
        "xgboost": XGBClassifier(random_state=42),
    }

    # fit() trains each estimator in place, so the dict values end up fitted.
    for estimator in fitted.values():
        estimator.fit(X_train, y_train)

    return fitted