In [1]:
# Load libraries
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn import datasets
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn import svm
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report, precision_score, f1_score,
    precision_recall_curve, average_precision_score,
    roc_auc_score, roc_curve, auc, ConfusionMatrixDisplay, RocCurveDisplay
   )

import xgboost as xgb
from xgboost import XGBClassifier

import tensorflow as tf

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, BatchNormalization
from keras.optimizers import Adam
from keras.metrics import AUC

import warnings

import holoviews as hv

from datetime import datetime

from xgboost import XGBRFClassifier

import os

import pickle

# Notebook-wide pandas display settings.
# The full option key is spelled out: pandas resolves abbreviated keys by
# regex matching, which can start failing if a similarly named option is
# ever added.
pd.set_option('display.float_format', '{:.2f}'.format)  # floats shown with 2 decimals
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 10)

In [2]:
# Read the modelling dataset from a local CSV into `df`.
# low_memory=False makes pandas parse the file in a single pass instead of
# in internal chunks, which avoids mixed-dtype inference per column.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine unless the file exists at this exact location.
df = pd.read_csv(
    'D:/PERSONAL-DAS/BOOTCAMP/Carpeta-VISUAL/CURSO/Alumno-DAS/3-Machine_Learning/Entregas/ML_project/Preddicion Prestamos Fallidos/data/lending-club-loan_ML_stand_balnc.csv',
    low_memory=False,
)

In [3]:
def save_score(true, pred, model_name, train=True, auc_roc=None):
    """Compute evaluation metrics and store them in a CSV under ``./Pruebas``.

    The output file is ``<model_name>_TestResults_<YYYYMM>.csv`` inside a
    ``Pruebas`` folder created in the current working directory. The file
    holds one row per evaluated set ('Train' / 'Test'): saving a set replaces
    that set's previous row and keeps the row of the other set, so a Train
    call followed by a Test call no longer loses the Train metrics.

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    pred : array-like
        Predicted (hard) labels.
    model_name : str
        Used as the prefix of the output file name.
    train : bool, default True
        Whether the metrics belong to the training set (``'Train'``) or the
        test set (``'Test'``).
    auc_roc : float, optional
        Pre-computed ROC AUC to store instead of the one derived from the
        hard labels in ``pred``.
        NOTE(review): assumed to be on the 0-1 scale returned by
        ``roc_auc_score``; it is converted to a percentage like the other
        metrics. Confirm against callers that pass this argument.
    """
    set_label = 'Train' if train else 'Test'
    timestamp = datetime.now().strftime('%Y%m')
    file_name = f"{model_name}_TestResults_{timestamp}.csv"

    clf_report = classification_report(true, pred, output_dict=True)
    accuracy = accuracy_score(true, pred) * 100
    # Without an explicit value, the AUC is computed from hard labels, which
    # only evaluates a single operating point of the classifier.
    if auc_roc is None:
        roc_auc = roc_auc_score(true, pred) * 100
    else:
        roc_auc = auc_roc * 100
    confusion = confusion_matrix(true, pred)

    result_df = pd.DataFrame({
        'Set': [set_label],
        'Accuracy': [accuracy],
        'AUC-ROC': [roc_auc],
        'Classification Report': [clf_report],
        'Confusion Matrix': [confusion.tolist()]
    })

    pruebas_dir = os.path.join(os.getcwd(), 'Pruebas')
    os.makedirs(pruebas_dir, exist_ok=True)

    # Join the 'Pruebas' directory path with the CSV file name.
    file_path = os.path.join(pruebas_dir, file_name)

    # Train and Test share the same file name, so merge with what is already
    # stored instead of overwriting it. Rows are read back as plain strings
    # so they are rewritten exactly as they were.
    if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
        previous = pd.read_csv(file_path, dtype=str)
        if 'Set' in previous.columns:
            previous = previous[previous['Set'] != set_label]
            if not previous.empty:
                result_df = pd.concat([previous, result_df], ignore_index=True)

    result_df.to_csv(file_path, index=False)

In [4]:
def print_score(true, pred, model_name=None, train=True):
    """Print accuracy, classification report and confusion matrix.

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    pred : array-like
        Predicted (hard) labels.
    model_name : str, optional
        Not used in the printed output. It is optional so that calls passing
        only ``true``, ``pred`` and ``train=...`` work as well.
    train : bool, default True
        Selects the header: "Train Result" when true, "Test Result" otherwise.
    """
    header = "Train Result" if train else "Test Result"
    clf_report = pd.DataFrame(classification_report(true, pred, output_dict=True))

    print(f"{header}:\n================================================")
    print(f"Accuracy Score: {accuracy_score(true, pred) * 100:.2f}%")
    print("_______________________________________________")
    print(f"CLASSIFICATION REPORT:\n{clf_report}")
    print("_______________________________________________")
    print(f"Confusion Matrix: \n {confusion_matrix(true, pred)}\n")

In [5]:
def plot_confusion_matrix(model, X_test, y_test, model_name):
    """Plot the binary confusion matrix of ``model`` and print its breakdown.

    Draws a heatmap of the confusion matrix for ``model.predict(X_test)``
    against ``y_test`` and prints the counts and percentages of correct
    predictions (TP, TN) and errors (FP, FN).

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_test : feature matrix passed to ``model.predict``
    y_test : array-like of true labels
    model_name : str
        Appended to the plot title.

    Raises
    ------
    ValueError
        If the confusion matrix is not 2x2 (the breakdown is binary-only).
    """
    # Model predictions
    preds = model.predict(X_test)

    # Confusion matrix. When the estimator exposes the classes it was trained
    # on, use them as labels so the matrix keeps its full shape even if one
    # class is absent from this particular evaluation set.
    labels = getattr(model, 'classes_', None)
    cm = confusion_matrix(y_test, preds, labels=labels)
    if cm.shape != (2, 2):
        raise ValueError(
            f"plot_confusion_matrix expects a binary problem (2x2 matrix), got shape {cm.shape}"
        )
    tn, fp, fn, tp = cm.ravel()

    # Set up the plot
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', xticklabels=['Pred Neg', 'Pred Pos'], yticklabels=['True Neg', 'True Pos'])
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.show()

    # Totals of correct and wrong predictions with their percentages
    total = tn + fp + fn + tp
    total_correct = tp + tn
    total_errors = fp + fn
    percent_correct = total_correct / total * 100
    percent_errors = total_errors / total * 100

    # Show results
    print(f"\nTotal Correct: {total_correct} ({percent_correct:.2f}%)")
    print('--'*30)
    print(f"True Positives (TP): {tp} ({tp/total*100:.2f}%)")
    print(f"True Negatives (TN): {tn} ({tn/total*100:.2f}%)")

    print(f"\nTotal Errors: {total_errors} ({percent_errors:.2f}%)")
    print('--'*30)
    print(f"False Positives (FP): {fp} ({fp/total*100:.2f}%)")
    print(f"False Negatives (FN): {fn} ({fn/total*100:.2f}%)")

In [6]:
def plot_roc_pr_curves(model, X_test, y_test, model_name):
    """Plot the ROC curve and the precision-recall curve of a binary model.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``
    X_test : feature matrix passed to ``model.predict_proba``
    y_test : array-like of true binary labels
    model_name : str
        Appended to both plot titles.
    """
    # Scores must be the probability of the POSITIVE class, which is the
    # second column of predict_proba. Using column 0 (the negative class)
    # mirrors both curves and reports 1 - AUC.
    y_test_probs = model.predict_proba(X_test)[:, 1]

    # ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_test_probs)
    roc_auc = roc_auc_score(y_test, y_test_probs)

    plt.figure(figsize=(8, 8))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {model_name}')
    plt.legend(loc='lower right')
    plt.show()

    # Precision-recall curve
    precision, recall, _ = precision_recall_curve(y_test, y_test_probs)
    pr_auc = auc(recall, precision)

    plt.figure(figsize=(8, 8))
    # The area is reported in the legend, mirroring the ROC plot above.
    plt.step(recall, precision, color='b', alpha=0.2, where='post',
             label=f'PR curve (area = {pr_auc:.2f})')
    plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title(f'Precision-Recall Curve - {model_name}')
    plt.legend(loc='lower left')
    plt.show()

In [7]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, recall_score 
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline

# `df` is the DataFrame loaded above; 'loan_status' is the target column.
target = 'loan_status'
X = df.drop(columns=[target])
y = df[target]

# Train/test split: 20% held out, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ANN pipeline: scaling -> univariate feature selection -> MLP classifier.
# pca_ann = PCA(n_components=None) # None/0.95
ann = Pipeline(steps=[
    ('scaler', StandardScaler()),
    # ('pca', pca_ann),
    ('selectkbest', SelectKBest()),
    # Seeded so weight initialisation and batch shuffling are reproducible
    # across runs (same seed as the train/test split).
    ('ann', MLPClassifier(random_state=42))
])

# Hyper-parameter grid for the ANN pipeline.
# NOTE(review): per the MLPClassifier docs, `learning_rate` is only used when
# solver='sgd', so the 'adam' candidates are fitted three times with
# identical settings. Splitting the grid per solver would avoid that.
ann_params = {
    'selectkbest__k': [2, 3, 4],
    'ann__hidden_layer_sizes': [(100,), (50, 50), (30, 20, 10)],
    'ann__activation': ['relu', 'tanh', 'logistic'],
    'ann__solver': ['sgd', 'adam'],
    'ann__alpha': [0.0001, 0.001, 0.01],
    'ann__learning_rate': ['constant', 'invscaling', 'adaptive'],
}

# Exhaustive grid search over the ANN pipeline (10-fold CV, accuracy).
gs_ann = GridSearchCV(ann, ann_params, cv=10, scoring='accuracy', verbose=1, n_jobs=-1)

# Fit the grid search
gs_ann.fit(X_train, y_train)

# Grid-search results. best_score_ is the mean cross-validated accuracy of
# the best candidate, not a training-set score, so it is labelled as such.
print(f'Accuracy media en validación cruzada: {gs_ann.best_score_:.4f}')
print(f'Mejores parámetros: {gs_ann.best_params_}')
print('Mejor estimador:')
print(gs_ann.best_estimator_)

# Accuracy and recall on the held-out test set
preds_ann = gs_ann.best_estimator_.predict(X_test)
accuracy = accuracy_score(y_test, preds_ann)
recall = recall_score(y_test, preds_ann)

print(f'Accuracy en el conjunto de prueba: {accuracy:.4f}')
print(f'Recall en el conjunto de prueba: {recall:.4f}')


Fitting 10 folds for each of 486 candidates, totalling 4860 fits


In [None]:
# Hard-label predictions of the best pipeline found by the grid search.
train_preds_ann = gs_ann.best_estimator_.predict(X_train)
test_preds_ann = gs_ann.best_estimator_.predict(X_test)

model_name = 'ANN'
# Report, plot and persist the metrics once the model is trained.
# print_score is defined as (true, pred, model_name, train=True), so
# model_name must be passed explicitly.
print_score(y_train, train_preds_ann, model_name, train=True)
print_score(y_test, test_preds_ann, model_name, train=False)
plot_confusion_matrix(gs_ann.best_estimator_, X_test, y_test, model_name)
plot_roc_pr_curves(gs_ann.best_estimator_, X_test, y_test, model_name)

save_score(y_train, train_preds_ann, model_name, train=True)
save_score(y_test, test_preds_ann, model_name, train=False)

In [None]:
# Save the model
import pickle

# Serialise the best pipeline found by the ANN grid search (`gs_ann`).
# The previous lookup went through `models_gridsearch`, a name this notebook
# never defines, so the cell failed on a fresh kernel.
with open('finished_model.model_ANN', "wb") as archivo_salida:
    pickle.dump(gs_ann.best_estimator_, archivo_salida)