# Incumplimiento de prestamos
* Carlo Crivelli Hernández

En este proyecto se analizó y limpió la base de datos de un banco con el propósito de saber qué características tienen las personas que no cumplen con los pagos de los préstamos bancarios.

In [None]:
import os
import warnings
import numpy  as np
import pandas as pd
import seaborn as sns
from pandas import DataFrame
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from matplotlib import colors as mcolors
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, auc
from sklearn.metrics import accuracy_score, classification_report
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier

warnings.filterwarnings('ignore')

In [None]:
class Analisis_Predictivo:
    """Fit and evaluate a scikit-learn style classifier on a tabular data set.

    The object stores the full data set, splits it into training and testing
    partitions when a model is supplied, and offers helpers to fit the model,
    summarise prediction quality and plot how variables are distributed with
    respect to the target variable.
    """

    def __init__(self, datos: DataFrame, predecir: str, predictoras: list = None,
                 modelo=None, train_size: float = 80, random_state: int = None):
        """Store the configuration and, if a model is given, split the data.

        Parameters
        ----------
        datos:
            Complete data set, ready for modelling.
        predecir:
            Name of the target column.
        predictoras:
            Names of the predictor columns. ``None`` or an empty list means
            "every column except the target".
        modelo:
            Classifier instance exposing ``fit`` and ``predict``. Without a
            model no train/test split is made and ``fit_predict`` /
            ``fit_predict_resultados`` return ``None``.
        train_size:
            Forwarded unchanged to ``train_test_split``. There a float in
            (0, 1) is the training proportion and an int is an absolute
            number of training rows.
            NOTE(review): the default of 80 therefore means 80 rows, not
            80 % -- pass ``0.8`` for a proportion.
        random_state:
            Seed for the train/test split.
        """
        self.datos = datos
        self.predecir = predecir
        # Copy into a fresh list: a shared ``[]`` default would leak state
        # between instances.
        self.predictoras = [] if predictoras is None else list(predictoras)
        self.modelo = modelo
        self.random_state = random_state
        self.train_size = train_size
        if modelo is not None:
            self._training_testing()

    def _training_testing(self):
        """Create ``X_train``, ``X_test``, ``y_train`` and ``y_test``."""
        if self.predictoras:
            X = self.datos[self.predictoras]
        else:
            X = self.datos.drop(columns=[self.predecir])

        y = self.datos[self.predecir].values

        train_test = train_test_split(X, y, train_size=self.train_size,
                                      random_state=self.random_state)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test

    def fit_predict(self):
        """Fit the model on the training split and predict the testing split.

        Returns the predictions, or ``None`` when no model was supplied.
        """
        if self.modelo is None:
            return None
        self.modelo.fit(self.X_train, self.y_train)
        return self.modelo.predict(self.X_test)

    def fit_predict_resultados(self, imprimir=True):
        """Fit, predict and return the confusion matrix of the testing split.

        When ``imprimir`` is truthy the quality indices computed by
        ``indices_general`` are printed. Returns ``None`` when no model was
        supplied.
        """
        if self.modelo is None:
            return None
        prediccion = self.fit_predict()
        MC = confusion_matrix(self.y_test, prediccion)
        # Without an explicit ``labels`` argument the confusion matrix is
        # ordered by the sorted labels present in y_test or in the
        # predictions, so the category names are built from that same set.
        etiquetas = np.unique(np.concatenate([np.asarray(self.y_test),
                                              np.asarray(prediccion)]))
        indices = self.indices_general(MC, list(etiquetas))
        if imprimir:
            for k in indices:
                print("\n%s:\n%s" % (k, str(indices[k])))

        return MC

    def indices_general(self, MC, nombres=None):
        """Compute quality indices from a confusion matrix.

        Parameters
        ----------
        MC:
            Square confusion matrix (rows: true classes, columns: predicted).
        nombres:
            Optional sequence of class names used as column labels of the
            per-category precision table.

        Returns
        -------
        dict with the confusion matrix, global precision, global error and a
        one-row DataFrame with the precision of each category.
        """
        precision_global = np.sum(MC.diagonal()) / np.sum(MC)
        error_global = 1 - precision_global
        precision_categoria = pd.DataFrame(MC.diagonal() / np.sum(MC, axis=1)).T
        # ``is not None`` also works when ``nombres`` is a NumPy array.
        if nombres is not None:
            precision_categoria.columns = list(nombres)
        return {"Matriz de Confusión": MC,
                "Precisión Global": precision_global,
                "Error Global": error_global,
                "Precisión por categoría": precision_categoria}

    def distribucion_variable_predecir(self):
        """Plot the relative frequency of each target class as one stacked bar."""
        variable_predict = self.predecir
        data = self.datos
        colors = list(mcolors.CSS4_COLORS)
        df = pd.crosstab(index=data[variable_predict], columns="valor") / data[variable_predict].count()
        proporciones = df.iloc[:, 0]
        fig = plt.figure(figsize=(10, 2))
        g = fig.add_subplot(111)
        titulo = "Distribución de la variable %s" % variable_predict
        countv = 0
        for i, (categoria, proporcion) in enumerate(proporciones.items()):
            # Wrap around the colour table so many categories cannot overflow it.
            g.barh(1, proporcion, left=countv, align='center',
                   color=colors[(11 + i) % len(colors)], label=categoria)
            countv = countv + proporcion
        # Fix the limits first, then pin the ticks so labels and positions agree.
        g.set_xlim(0, 1)
        vals = g.get_xticks()
        g.set_xticks(vals)
        g.set_xticklabels(['{:.0%}'.format(x) for x in vals])
        g.set_yticklabels([])
        g.set_title(titulo)
        g.set_ylabel(variable_predict)
        countv = 0
        for v in proporciones:
            # Write the percentage roughly at the centre of each segment.
            g.text(countv + v / 2 - 0.03, 1, '{:.1%}'.format(v), color='black', fontweight='bold')
            countv = countv + v
        g.legend(loc='upper center', bbox_to_anchor=(1.08, 1), shadow=True, ncol=1)

    def poder_predictivo_categorica(self, var: str):
        """Plot, per category of ``var``, the share of each target class."""
        data = self.datos
        variable_predict = self.predecir
        df = pd.crosstab(index=data[var], columns=data[variable_predict])
        df = df.div(df.sum(axis=1), axis=0)
        titulo = "Distribución de la variable %s según la variable %s" % (var, variable_predict)
        # Bar thickness is requested here (pandas' ``width``). The previous
        # ``plt.setp(bars, width=.9)`` acted on the patches, where ``width``
        # is the horizontal extent of a barh segment, not its thickness.
        g = df.plot(kind='barh', stacked=True, legend=True, figsize=(10, 9),
                    xlim=(0, 1), title=titulo, width=0.9)
        vals = g.get_xticks()
        g.set_xticks(vals)
        g.set_xticklabels(['{:.0%}'.format(x) for x in vals])
        g.legend(loc='upper center', bbox_to_anchor=(1.08, 1), shadow=True, ncol=1)
        for i in range(df.shape[0]):
            countv = 0
            for v in df.iloc[i]:
                g.text(countv + v / 2 - 0.03, i, '{:.1%}'.format(v), color='black', fontweight='bold')
                countv = countv + v

    def poder_predictivo_numerica(self, var: str):
        """Plot the density of numeric ``var`` separately for each target class."""
        # ``fill`` replaces the deprecated ``shade`` keyword of ``kdeplot``.
        sns.FacetGrid(self.datos, hue=self.predecir, height=6).map(sns.kdeplot, var, fill=True).add_legend()

    def predictNew(self, df):
        """Evaluate the model on a new frame shaped like the training data.

        Side effects: ``X_test`` and ``y_test`` are replaced by the contents
        of ``df``, and the model is fitted again on the stored training split
        (through ``fit_predict_resultados``) before predicting.

        Returns the confusion matrix for ``df``.
        """
        # Use the same predictor columns the model was trained on.
        if self.predictoras:
            self.X_test = df[self.predictoras]
        else:
            self.X_test = df.drop(self.predecir, axis=1)
        self.y_test = df[self.predecir]
        return self.fit_predict_resultados()

In [None]:
# Load the loan data set; the first CSV column is used as the index.
bd_prestamos = pd.read_csv('Loan_Default.csv', index_col=0)
bd_prestamos.head(10)
# Rows without a target value cannot be used for training or evaluation.
bd_prestamos = bd_prestamos.dropna(axis=0, subset=['Status'])

# Drop numeric columns that have more than 20 000 missing values.
count_nan_all = bd_prestamos.isna().sum()
print('Se quitaron las siguientes columnas por datos insuficientes:\n'+'-'*60)
tipos_numericos = ('float64', 'int', 'float')
for val in bd_prestamos.columns:
    faltantes = count_nan_all[val]
    if not (faltantes > 20_000 and bd_prestamos[val].dtype in tipos_numericos):
        continue
    bd_prestamos = bd_prestamos.drop(columns=val)
    print(f'{val:38}',f'{faltantes} datos faltantes')

In [None]:
# Collect the numeric columns that still contain missing values.
print('\nAnalizaremos las siguientes variables:\n'+'-'*63)
count_nan_all = bd_prestamos.isna().sum()
pending = []
for val in bd_prestamos.columns:
    es_numerica = bd_prestamos[val].dtype in ('float64', 'int', 'float')
    if count_nan_all[val] and es_numerica:
        print(f'{val:38}',f'{count_nan_all[val]:8} datos faltantes')
        pending.append(val)

En este caso, quitaremos las filas con valores faltantes en la columna 'term', ya que son muy pocas como para proporcionarnos información.


In [None]:
# Remove the rows without a 'term' value; the column then needs no further
# missing-value treatment, so it leaves the pending list.
bd_prestamos = bd_prestamos.dropna(axis=0, subset=['term'])
if 'term' in pending:
    pending.remove('term')

Quitamos los outliers de la muestra


In [None]:
def remove_outliers_inplace(df, columns, z_score_threshold=3):
    """
    Drop, in place, the rows whose value is a z-score outlier in any listed column.

    Columns are processed one after another, so the mean and standard
    deviation of each column are computed on the rows that survived the
    previous columns. Missing values are ignored when computing z-scores and
    are never dropped. Column names not present in ``df`` are skipped.

    Parameters:
        - df: The DataFrame to modify in place.
        - columns: Iterable of column names to inspect.
        - z_score_threshold: Rows with |z| above this value are removed.

    Returns:
        None (the DataFrame is modified in place).
    """
    for name in columns:
        if name not in df.columns:
            continue

        # Work only with the observed (non-NaN) values of the column.
        observed = df[name].dropna()
        distance = (observed - observed.mean()).abs() / observed.std()

        # Labels of the rows that lie beyond the threshold.
        to_drop = observed.index[(distance > z_score_threshold).to_numpy()]
        df.drop(index=to_drop, inplace=True)

# Apply the outlier filter to every numeric column that still has missing
# values (the `pending` list). `bd_prestamos` itself is modified in place.
columns_to_remove_outliers = pending
remove_outliers_inplace(df=bd_prestamos, columns=columns_to_remove_outliers)

Dividimos el resto de valores en intervalos optimos


In [None]:
def split_numeric_values_to_intervals_inplace(df, columns):
    """
    Replace the numeric values of the given columns by interval labels, in place.

    The number of equal-width intervals follows the Freedman-Diaconis rule
    (bin width = 2 * IQR / n^(1/3)). When that rule is undefined because the
    inter-quartile range or the value range is zero, Sturges' rule
    (ceil(log2(n) + 1)) is used instead. Missing values are left untouched.
    Columns that are absent, not numeric (for example already converted by a
    previous call) or without any observed value are skipped.

    Parameters:
        - df: The DataFrame to modify in place.
        - columns: A list of column names whose values are binned.

    Returns:
        None (modifies the original DataFrame in place).
    """
    for col in columns:
        if col not in df.columns:
            continue
        # Skip columns that were already turned into labels.
        if not pd.api.types.is_numeric_dtype(df[col]):
            continue

        non_nan_values = df[col].notna()
        values = df.loc[non_nan_values, col]
        if values.empty:
            continue

        iqr = np.percentile(values, 75) - np.percentile(values, 25)
        spread = values.max() - values.min()
        if iqr > 0 and spread > 0:
            bin_width = 2.0 * iqr / np.power(len(values), 1/3)
            num_intervals = int(np.ceil(spread / bin_width))
        else:
            # A zero IQR would give a zero bin width and an infinite count.
            num_intervals = int(np.ceil(np.log2(len(values)) + 1))
        num_intervals = max(1, num_intervals)

        intervals = pd.cut(values, bins=num_intervals, precision=2)

        # Writing strings into a float column is an incompatible-dtype
        # assignment; convert the column to object first.
        df[col] = df[col].astype(object)
        df.loc[non_nan_values, col] = intervals.astype(str)
# Bin every pending numeric column of `bd_prestamos` in place.
columns_to_modify = pending
split_numeric_values_to_intervals_inplace(df=bd_prestamos, columns=columns_to_modify)

Convertimos los valores faltantes en una nueva clase llamada "No disponible"


In [None]:
# `fillna` returns a new frame; the result has to be assigned, otherwise the
# missing values stay in `bd_prestamos` and the statement has no effect.
bd_prestamos = bd_prestamos.fillna('No disponible')
print('Valores unicos por columna:\n'+'-'*30)
for col in pending:
    print(f'{col:25}',bd_prestamos[col].nunique())

Mapeamos los valores unicos de las variables categoricas, incluyendo las que acabamos de cambiar


In [None]:
# Replace each distinct value by an integer code (order of first appearance)
# in every column except the last one; columns with more than 160 distinct
# values are left as they are.
# NOTE(review): if the target column is not the last column it is recoded
# too, and its labels are renumbered by order of appearance -- confirm.
for col in bd_prestamos.columns[:-1]:
    if bd_prestamos[col].nunique() > 160:
        continue
    unique_vals_map = {
        valor: codigo for codigo, valor in enumerate(bd_prestamos[col].unique())
    }
    if unique_vals_map:
        bd_prestamos[col] = bd_prestamos[col].map(unique_vals_map)

# Report how many missing values remain per column.
count_nan_all = bd_prestamos.isna().sum()
nada = ''
print(f'Variable{nada:20}Valores perdidos\n'+'-'*45)
for val in bd_prestamos.columns:
    perdidos = count_nan_all[val]
    print(f'{val:35}{perdidos}')

Ahora calcularemos las correlaciones


In [None]:
# Heat map of the pairwise correlations between all columns.
paleta_correlacion = sns.cubehelix_palette(as_cmap=True)
dataplot = sns.heatmap(bd_prestamos.corr(), cmap=paleta_correlacion, annot=False)

Podemos ver que hay variables con correlación casi perfecta. Veremos cuáles son (listamos los pares con |correlación| ≥ 0.9).


In [None]:
# Calculate the correlation matrix
# Pairwise correlation between all columns.
correlation_matrix = bd_prestamos.corr()

# Every ordered pair of distinct columns whose absolute correlation is at
# least 0.9 (each pair therefore appears twice, once per order).
highly_correlated_cols = [
    (col1, col2)
    for col1 in correlation_matrix.columns
    for col2 in correlation_matrix.columns
    if col1 != col2 and abs(correlation_matrix.at[col2, col1]) >= 0.9
]

# Print the pairs together with their correlation.
nada = ''
print(f'Variable 1{nada:20}Variable 2{nada:20}Correlacion')
for col1, col2 in highly_correlated_cols:

    print(f"{col1:30}{col2:32} {correlation_matrix.at[col2, col1]:.2f}")

# Remove these two columns from the data set.
bd_prestamos = bd_prestamos.drop(columns=['construction_type', 'Security_Type'])

Ahora cuantificaremos la información que cada variable nos proporciona para una predicción y quitaremos las que menos nos sirvan.


In [None]:
from sklearn.metrics import mutual_info_score

info_by_var = []
vars_out = []
# Columns that are kept regardless of their score.
quedan = ['Gender', 'Region', 'age', 'income']


def entropy_based_predictive_power(df: pd.DataFrame, target: str, predictor: str) -> float:
    """Return the mutual information between columns ``target`` and ``predictor`` of ``df``."""
    return mutual_info_score(df[target], df[predictor])


# Score every column except the last one against 'Status'. Columns scoring
# below 0.0009 are queued for removal unless they are listed in `quedan`.
# NOTE(review): this assumes 'Status' is the last column; otherwise it is
# scored against itself -- confirm.
nada = ''
print(f'Variable{nada:20}Medida de poder predictivo (entropia){nada:20}Suficiente')
for col in bd_prestamos.columns[:-1]:
    par = bd_prestamos[[col, 'Status']]
    mi = entropy_based_predictive_power(par, 'Status', col)
    if mi > .0009:
        txt = 'Si'
    else:
        txt = 'No'
    if mi < .0009 and col not in quedan:
        vars_out.append(col)
    print(f'{col:45}{mi:.5f}{nada:37}{txt}')

Definimos la nueva base de datos con nuestros predictores


In [None]:
# quitar columnas que no sirven
# Drop the columns in `vars_out`, then remove duplicated rows.
bd_predictiva = bd_prestamos.drop(columns=vars_out).drop_duplicates()

bd_predictiva


Aqui hacemos analisis de factores y agarramos los mas importantes


In [None]:
from factor_analyzer import FactorAnalyzer
def factor_analysis(df: pd.DataFrame, n_factors: int = 15, rotation: str = 'varimax', method: str = 'minres') -> None:
    '''
    Fit a factor analysis and print loadings, communalities and explained variance.

    - df: A dataframe with correlated variables
    - n_factors: Number of factors extracted from the variables
    - rotation: Rotation passed to FactorAnalyzer
    - method: one of ['ml', 'mle', 'uls', 'minres', 'principal'].
      The previous default 'lda' is not in this list, so the default is now
      'minres'. NOTE(review): 'minres' is presumably FactorAnalyzer's own
      default -- confirm against the installed version.

    Prints two tables and returns None.
    '''
    fa = FactorAnalyzer(n_factors=n_factors, rotation=rotation, method=method)
    fa.fit(df)

    # One row of loadings per variable, one column per factor.
    loadings = fa.loadings_

    # Indexed below as [0] variance, [1] proportion, [2] cumulative proportion.
    variance = fa.get_factor_variance()

    # One communality per variable.
    communalities = fa.get_communalities()

    # Format the loadings table.
    loadings_table = 'Cargas Factoriales:\n'
    loadings_table += 'Variable\t' + '\t'.join([f'Factor {i+1}' for i in range(n_factors)]) + '\tFactor mas repr.'+'\tComunalidad\n'
    for variable, loading, communality in zip(df.columns, loadings, communalities):
        # The most representative factor is the one with the largest loading
        # in absolute value; a strong negative loading counts as well.
        strongest = int(np.argmax(np.abs(loading)))
        most_representative_factor = strongest + 1
        loadings_table += f'{variable:<15}\t' + '\t'.join([f'{loading[i]:<10.2f}' for i in range(n_factors)]) + f'\tF{most_representative_factor} : {loading[strongest]:.2f}'+f'\t{communality:>23.2f}\n'

    # Format the variance table.
    a = ''
    variance_table = 'Varianza explicada por cada factor:\n'
    variance_table += f'Varianza{a:>13}\t' + '\t'.join([f'{variance[0][i]:<10.2f}' for i in range(n_factors)]) + f'Tot: {sum([variance[0][i] for i in range(n_factors)]):.3f}\n'
    variance_table += f'%Var{a:>18}\t' + '\t'.join([f'{variance[1][i]:<10.2f}' for i in range(n_factors)]) + f'Tot: {sum([variance[1][i] for i in range(n_factors)]):.3f}\n'
    variance_table += f'%Var. cumulativa\t' + '\t'.join([f'{variance[2][i]:<10.3f}' for i in range(n_factors)]) + '\n'

    print(loadings_table)
    print(variance_table)

In [None]:
# Factor analysis of the predictors only (the target column is excluded).
predictores_fa = bd_predictiva.drop(columns='Status')
factor_analysis(predictores_fa, n_factors=12, rotation='varimax', method='principal')


Reducimos las dimensiones del dataset para eliminar ruido en nuestras variables predictoras


In [None]:
from sklearn.decomposition import PCA

def reduce_dimensions(dataframe:pd.DataFrame, target_variable:str, method:str='PCA', num_components=None, num_clusters=None):
    """Project the predictors onto fewer dimensions and re-attach the target.

    Parameters
    ----------
    dataframe:
        Data containing the predictors and the target column.
    target_variable:
        Name of the target column; it is excluded from the reduction.
    method:
        'PCA', 'KMeans' (uses the output of ``KMeans.fit_transform``) or
        'FactorAnalysis' (FactorAnalyzer with varimax rotation).
    num_components:
        Number of components/factors for 'PCA' and 'FactorAnalysis'.
    num_clusters:
        Number of clusters for 'KMeans'.

    Returns
    -------
    DataFrame with columns ``Dimension_1 .. Dimension_k`` plus the target,
    indexed 0..n-1 in the row order of ``dataframe``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    # Separate predictor variables and target variable.
    X = dataframe.drop(target_variable, axis=1)
    y = dataframe[target_variable]

    if method == 'PCA':
        pca = PCA(n_components=num_components)
        reduced_data = pca.fit_transform(X)

    elif method == 'KMeans':
        kmeans = KMeans(n_clusters=num_clusters)
        reduced_data = kmeans.fit_transform(X)

    elif method == 'FactorAnalysis':
        fa = FactorAnalyzer(n_factors=num_components, rotation='varimax')
        fa.fit(X)
        reduced_data = fa.transform(X)

    else:
        raise ValueError("Invalid method. Choose from 'PCA', 'KMeans', or 'FactorAnalysis'.")

    # Create a DataFrame with reduced dimensions.
    reduced_df = pd.DataFrame(reduced_data, columns=[f'Dimension_{i + 1}' for i in range(reduced_data.shape[1])])

    # Attach the target by position. Assigning the Series itself would align
    # on index labels and misplace or blank out targets whenever the input
    # index is not exactly 0..n-1 (e.g. after dropping rows).
    reduced_df[target_variable] = y.to_numpy()

    return reduced_df

Para determinar los mejores parametros del modelo


# Linear Discriminant Analysis


In [None]:
# Linear Discriminant Analysis evaluated on the held-out part of the split.
lda = LinearDiscriminantAnalysis()
scoringLDA = Analisis_Predictivo(bd_predictiva, predecir="Status", modelo=lda,
                                 train_size=0.8, random_state=0)
scoringLDA.distribucion_variable_predecir()
plt.show()
resLDA = scoringLDA.fit_predict_resultados()

# Rows of the confusion matrix are true classes and columns are predicted
# classes, so both axes are labelled with the class values rather than with
# names of individual cells.
clases_lda = np.unique(bd_predictiva["Status"])
sns.heatmap(resLDA, annot=True, fmt='3.0f', xticklabels=clases_lda,
            yticklabels=clases_lda, cmap=sns.cubehelix_palette(as_cmap=True))

plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Matriz de Confusion (LDA)', y=1.05, size=15)

# Quadratic Discriminant Analysis


In [None]:
# Quadratic Discriminant Analysis evaluated on the held-out part of the split.
qda = QuadraticDiscriminantAnalysis()
scoringQDA = Analisis_Predictivo(bd_predictiva, predecir="Status", modelo=qda,
                                 train_size=0.8, random_state=0)
scoringQDA.distribucion_variable_predecir()
plt.show()
resQDA = scoringQDA.fit_predict_resultados()

# Rows of the confusion matrix are true classes and columns are predicted
# classes, so both axes are labelled with the class values rather than with
# names of individual cells.
clases_qda = np.unique(bd_predictiva["Status"])
sns.heatmap(resQDA, annot=True, fmt='3.0f', xticklabels=clases_qda,
            yticklabels=clases_qda, cmap=sns.cubehelix_palette(as_cmap=True))

plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Matriz de Confusion (QDA)', y=1.05, size=15)

# Gaussian Naive Bayes


In [None]:
# Gaussian Naive Bayes evaluated on the held-out part of the split.
bys = GaussianNB()
scoringBYS = Analisis_Predictivo(bd_predictiva, predecir="Status", modelo=bys,
                                 train_size=0.8, random_state=0)
scoringBYS.distribucion_variable_predecir()
plt.show()
resBYS = scoringBYS.fit_predict_resultados()

# Rows of the confusion matrix are true classes and columns are predicted
# classes, so both axes are labelled with the class values rather than with
# names of individual cells.
clases_bys = np.unique(bd_predictiva["Status"])
sns.heatmap(resBYS, annot=True, fmt='3.0f', xticklabels=clases_bys,
            yticklabels=clases_bys, cmap=sns.cubehelix_palette(as_cmap=True))

plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Matriz de Confusion (Naive Bayes)', y=1.05, size=15)

In [None]:
# Train the model.
bys.fit(scoringBYS.X_train, scoringBYS.y_train)

# Baseline: a constant score of 0 for every test row.
r_probs = [0 for _ in range(len(scoringBYS.y_test))]
# Second column of predict_proba: probability of the second class in
# `bys.classes_`.
dt_probs = bys.predict_proba(scoringBYS.X_test)
dt_probs = dt_probs[:, 1]

r_auc = roc_auc_score(scoringBYS.y_test, r_probs)
dt_auc = roc_auc_score(scoringBYS.y_test, dt_probs)

r_fpr, r_tpr, _ = roc_curve(scoringBYS.y_test, r_probs)
dt_fpr, dt_tpr, _ = roc_curve(scoringBYS.y_test, dt_probs)

# The labels need a placeholder: `'text' % value` without one raises
# "TypeError: not all arguments converted during string formatting".
plt.plot(r_fpr, r_tpr, linestyle='--', label='Predicción aleatoria (AUROC = %0.3f)' % r_auc)
plt.plot(dt_fpr, dt_tpr, marker='.', label='Naive Bayes (AUROC = %0.3f)' % dt_auc)

plt.title('Gráfica ROC')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

# LogitBoost

In [None]:
from logitboost import LogitBoost as lgbt

# LogitBoost with 100 estimators evaluated on the held-out part of the split.
lboost = lgbt(n_estimators=100, random_state=0)
scoringLboost = Analisis_Predictivo(bd_predictiva, predecir="Status", modelo=lboost,
                                    train_size=0.8, random_state=0)
scoringLboost.distribucion_variable_predecir()
plt.show()
resboost = scoringLboost.fit_predict_resultados()

# Rows of the confusion matrix are true classes and columns are predicted
# classes, so both axes are labelled with the class values rather than with
# names of individual cells.
clases_lboost = np.unique(bd_predictiva["Status"])
sns.heatmap(resboost, annot=True, fmt='3.0f', xticklabels=clases_lboost,
            yticklabels=clases_lboost, cmap=sns.cubehelix_palette(as_cmap=True))

plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Matriz de Confusion (LogitBoost)', y=1.05, size=15)

# Random Forest


In [None]:
# Create the model: a forest of 100 trees.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=1)

scoringrf = Analisis_Predictivo(bd_predictiva, predecir="Status", modelo=rf,
                                train_size=0.8, random_state=0)
scoringrf.distribucion_variable_predecir()
plt.show()
resrf = scoringrf.fit_predict_resultados()

# Rows of the confusion matrix are true classes and columns are predicted
# classes, so both axes are labelled with the class values rather than with
# names of individual cells.
clases_rf = np.unique(bd_predictiva["Status"])
sns.heatmap(resrf, annot=True, fmt='3.0f', xticklabels=clases_rf,
            yticklabels=clases_rf, cmap=sns.cubehelix_palette(as_cmap=True))

plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Matriz de Confusion (Random Forest)', y=1.05, size=15)

In [None]:
# Train the model.
rf.fit(scoringrf.X_train, scoringrf.y_train)

# Baseline: a constant score of 0 for every test row.
r_probs = [0 for _ in range(len(scoringrf.y_test))]
# Second column of predict_proba: probability of the second class in
# `rf.classes_`.
dt_probs = rf.predict_proba(scoringrf.X_test)
dt_probs = dt_probs[:, 1]

r_auc = roc_auc_score(scoringrf.y_test, r_probs)
dt_auc = roc_auc_score(scoringrf.y_test, dt_probs)

r_fpr, r_tpr, _ = roc_curve(scoringrf.y_test, r_probs)
dt_fpr, dt_tpr, _ = roc_curve(scoringrf.y_test, dt_probs)

# The labels need a placeholder: `'text' % value` without one raises
# "TypeError: not all arguments converted during string formatting".
plt.plot(r_fpr, r_tpr, linestyle='--', label='Predicción aleatoria (AUROC = %0.3f)' % r_auc)
plt.plot(dt_fpr, dt_tpr, marker='.', label='Random Forest (AUROC = %0.3f)' % dt_auc)

plt.title('Gráfica ROC')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Histogram (with density estimate) of the scores currently in `dt_probs`.
ax = sns.histplot(dt_probs, bins=20, kde=True, color='blue')
ax.set_ylabel('Frecuencia')
ax.set_title('Histograma de Predicciones del Modelo Random Forest')
plt.show()

Que pasa si cambiamos la proporcion de datos


In [None]:
# 80 % of the rows for training, the rest for validation. The split is
# seeded, like every other split in this notebook, so that results can be
# reproduced.
dftrain = bd_predictiva.sample(frac=.8, random_state=0)
dftest = bd_predictiva.drop(dftrain.index)

# Oversample Status == 1: each concat appends a copy of every Status == 1
# row currently in the frame, so after two rounds each appears four times.
dftrain = pd.concat((dftrain, dftrain[dftrain['Status'] == 1]))
dftrain = pd.concat((dftrain, dftrain[dftrain['Status'] == 1]))
plt.hist(dftrain['Status'], bins=3)
plt.title('Proporcion de Pagadores y no pagadores conjunto de entrenamiento')

In [None]:
# Class distribution of the validation set.
ax = plt.gca()
ax.hist(dftest['Status'], bins=3)
ax.set_title('Proporcion de Pagadores y no pagadores en el conjunto de validacion')

# LogitBoost 2


In [None]:
from logitboost import LogitBoost as lgbt

# LogitBoost trained on the oversampled training frame; 70 % of that frame
# is used for fitting and the remainder for a first evaluation.
lboost = lgbt(n_estimators=100, random_state=0)
scoringLboost = Analisis_Predictivo(
    datos=dftrain,
    predecir="Status",
    modelo=lboost,
    train_size=0.7,
    random_state=1,
)
scoringLboost.distribucion_variable_predecir()
plt.show()
resboost = scoringLboost.fit_predict_resultados()

In [None]:
# Evaluate on the validation frame that was kept out of `dftrain`.
resboost = scoringLboost.predictNew(dftest)

# Rows of the confusion matrix are true classes and columns are predicted
# classes, so both axes are labelled with the class values rather than with
# names of individual cells.
clases_lboost2 = np.unique(bd_predictiva["Status"])
sns.heatmap(resboost, annot=True, fmt='3.0f', xticklabels=clases_lboost2,
            yticklabels=clases_lboost2, cmap=sns.cubehelix_palette(as_cmap=True))

plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Matriz de Confusion LogitBoost (Datos duplicados)', y=1.05, size=15)

# RandomForest 2

In [None]:
# Create the model: a forest of 120 trees, Gini criterion, depth capped at 12.
from sklearn.ensemble import RandomForestClassifier
rf2 = RandomForestClassifier(criterion='gini', max_depth=12, n_estimators=120)

# Only 30 % of the oversampled frame is used for fitting here.
scoringrf2 = Analisis_Predictivo(
    datos=dftrain,
    predecir="Status",
    modelo=rf2,
    train_size=0.3,
    random_state=42,
)
scoringrf2.distribucion_variable_predecir()
plt.show()
resrf2 = scoringrf2.fit_predict_resultados()

In [None]:
# Evaluate on the validation frame that was kept out of `dftrain`.
resrf2 = scoringrf2.predictNew(dftest)

# Plot the matrix just computed (`resrf2`), not the one of the first forest
# (`resrf`). Rows are true classes and columns are predicted classes, so
# both axes are labelled with the class values.
clases_rf2 = np.unique(bd_predictiva["Status"])
sns.heatmap(resrf2, annot=True, fmt='3.0f', xticklabels=clases_rf2,
            yticklabels=clases_rf2, cmap=sns.cubehelix_palette(as_cmap=True))

plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Matriz de Confusion Random Forest (Datos duplicados)', y=1.05, size=15)

In [None]:
# Train the model.
rf2.fit(scoringrf2.X_train, scoringrf2.y_train)

# Baseline: a constant score of 0 for every test row.
# NOTE(review): if the `predictNew(dftest)` cell ran before this one,
# `scoringrf2.X_test` / `y_test` hold the validation frame -- confirm the
# intended evaluation set.
r_probs = [0 for _ in range(len(scoringrf2.y_test))]
# Score with the model under evaluation (`rf2`), not the first forest (`rf`).
dt_probs = rf2.predict_proba(scoringrf2.X_test)
dt_probs = dt_probs[:, 1]

r_auc = roc_auc_score(scoringrf2.y_test, r_probs)
dt_auc = roc_auc_score(scoringrf2.y_test, dt_probs)

r_fpr, r_tpr, _ = roc_curve(scoringrf2.y_test, r_probs)
dt_fpr, dt_tpr, _ = roc_curve(scoringrf2.y_test, dt_probs)

# The labels need a placeholder: `'text' % value` without one raises
# "TypeError: not all arguments converted during string formatting".
plt.plot(r_fpr, r_tpr, linestyle='--', label='Predicción aleatoria (AUROC = %0.3f)' % r_auc)
plt.plot(dt_fpr, dt_tpr, marker='.', label='Random Forest 2 (AUROC = %0.3f)' % dt_auc)

plt.title('Gráfica ROC')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Histogram (with density estimate) of the scores currently in `dt_probs`.
ax = sns.histplot(dt_probs, bins=20, kde=True, color='blue')
ax.set_ylabel('Frecuencia')
ax.set_title('Histograma de Predicciones del Modelo Random Forest')
plt.show()

In [None]:
from collections import OrderedDict

# Feature importance by feature name. The mapping is deliberately not called
# `dict`: rebinding that global name hides the builtin for every later cell
# and for functions defined in this notebook that call `dict(...)`.
importancias = {}
for importancia, nombre in zip(rf2.feature_importances_, rf2.feature_names_in_):
    print(f'{importancia:.4f}', nombre)
    importancias[nombre] = importancia

# Ordered alphabetically by feature name.
od = OrderedDict(sorted(importancias.items()))

In [None]:
# For every column of the training frame draw two stacked histograms:
# rows with Status == 1 on top and rows with Status == 0 below.
paneles = (('Pagadores', 1), ('No Pagadores', 0))
for col in dftrain.columns:
    plt.suptitle(col)
    for posicion, (titulo, estado) in enumerate(paneles, start=1):
        plt.subplot(2, 1, posicion)
        plt.title(titulo)
        plt.hist(dftrain[dftrain['Status'] == estado][col])
    plt.tight_layout()
    plt.show()