# Árboles de Decisión Random Forest (Clasificación)

## 1. Librerías y configuraciones previas


In [1]:
# Tratamiento de datos
# ==============================================================================
import pandas as pd
import numpy as np


# Almacenar en caché los resultados de funciones en el disco
# ==============================================================================
import joblib


# Gestión de librerías
# ==============================================================================
from importlib import reload


# Matemáticas y estadísticas
# ==============================================================================
import math


# Preprocesado y modelado
# ==============================================================================
import math

#Separar los datos entrenamiento y prueba
from sklearn.model_selection import train_test_split


#Escalar Variables
from sklearn.preprocessing import MinMaxScaler


#Evaluación del modelo
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve


#Creación de modelo
from sklearn.ensemble import RandomForestClassifier


# Configuración de hiperparámetros
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns


# Configuración warnings
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')

## 2. Funciones

In [2]:
#reload(utils.funciones)

# Funciones externas
# ==============================================================================
from utils.funciones import multiple_plot, plot_roc_curve

## 3. Carga del dataset

In [3]:
# Read the input CSV file into the working DataFrame `d`
d = pd.read_csv(
    './datasets/02_GermanCredit_Prep.csv'
)

In [4]:
## Cargar datos con colab
## =============================================================================

#from google.colab import drive
#import os

#drive.mount('/gdrive')

In [5]:
#os.chdir("/gdrive/MyDrive/ModelosCuantitativosPython/Notebooks")
#!ls

In [6]:
# Print a summary of the DataFrame: column names, non-null counts and dtypes
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1138 entries, 0 to 1137
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   checking_account_status   1138 non-null   object
 1   loan_duration_mo          1138 non-null   int64 
 2   credit_history            1138 non-null   object
 3   purpose                   1138 non-null   object
 4   loan_amount               1138 non-null   int64 
 5   savings_account_balance   1138 non-null   object
 6   time_employed_yrs         1138 non-null   object
 7   payment_pcnt_income       1138 non-null   int64 
 8   gender_status             1138 non-null   object
 9   other_signators           1138 non-null   object
 10  time_in_residence         1138 non-null   int64 
 11  property                  1138 non-null   object
 12  age_yrs                   1138 non-null   int64 
 13  other_credit_outstanding  1138 non-null   object
 14  home_ownership          

## 4. Visualización de datos

### Variables de entrada

In [7]:
# Names of the categorical columns (object or category dtype)
catCols = list(
    d.select_dtypes(include=["object", 'category']).columns
)

# Preview the first two rows of the categorical columns
d[catCols].head(n=2)

Unnamed: 0,checking_account_status,credit_history,purpose,savings_account_balance,time_employed_yrs,gender_status,other_signators,property,other_credit_outstanding,home_ownership,job_category,telephone,foreign_worker
0,< 0 DM,critical account - other non-bank loans,car,< 100 DM,1 - 4 years,female-divorced/separated/married,co-applicant,real estate,none,own,skilled,none,yes
1,< 0 DM,current loans paid,car,< 100 DM,1 - 4 years,male-married/widowed,none,real estate,none,own,unskilled-resident,none,yes


In [8]:
# Names of the numeric columns (32/64-bit floats and integers).
# 'float32' is listed explicitly so single-precision columns are not skipped;
# the dtype list used to name 'float64' twice instead.
numCols = d.select_dtypes(
    include=['float32', 'float64', 'int32', 'int64']
).columns.tolist()

# Preview the first two rows of the numeric columns
d[numCols].head(2)

Unnamed: 0,loan_duration_mo,loan_amount,payment_pcnt_income,time_in_residence,age_yrs,number_loans,dependents,bad_credit
0,12,3499,3,2,29,2,1,1
1,12,1168,4,3,27,1,1,0


In [9]:
##Visualización de frecuencia de instancias para variables categóricas
#multiple_plot(3, d , catCols, None, 'countplot', 'Frecuencia de instancias para variables categóricas',30)

In [10]:
##Visualización de variables numéricas
#multiple_plot(1, d , numCols, None, 'scatterplot', 'Relación entre las variables numéricas',30)

In [11]:
# Drop the output variable from the list of numeric columns.
# Filtering (instead of list.remove) keeps the cell safe to re-run: a second
# execution no longer raises ValueError once 'bad_credit' is already gone.
numCols = [col for col in numCols if col != 'bad_credit']

### Variable de salida

In [12]:
# Distribution of the output variable: number of rows per class, largest first
conteo_por_clase = d.groupby('bad_credit')['bad_credit'].count()
conteo_por_clase.sort_values(ascending=False)

bad_credit
0    569
1    569
Name: bad_credit, dtype: int64

In [13]:
##Visualización de la variable de salida
#multiple_plot(1, d , None, 'bad_credit', 'countplot', 'Gráfica de frecuencia de bad Credit',0)

## 5. Transformación de datos

### Creación de variables Dummies

In [14]:
# One-hot encode the categorical columns, dropping the first level of each
d = pd.get_dummies(d, drop_first=True)

# Summary of the encoded DataFrame
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1138 entries, 0 to 1137
Data columns (total 46 columns):
 #   Column                                                  Non-Null Count  Dtype
---  ------                                                  --------------  -----
 0   loan_duration_mo                                        1138 non-null   int64
 1   loan_amount                                             1138 non-null   int64
 2   payment_pcnt_income                                     1138 non-null   int64
 3   time_in_residence                                       1138 non-null   int64
 4   age_yrs                                                 1138 non-null   int64
 5   number_loans                                            1138 non-null   int64
 6   dependents                                              1138 non-null   int64
 7   bad_credit                                              1138 non-null   int64
 8   checking_account_status_< 0 DM                          11

## 6. Creación del modelo

### Dividir el conjunto de datos

In [15]:
# Input variables 'X' and output variable 'y'
X = d.drop(columns='bad_credit')
y = d['bad_credit']

# Cross-validation is run on the whole dataset.
# Independent copies are taken so that later in-place changes to X_Completo
# (such as scaling its numeric columns) do not silently modify X as well.
X_Completo = X.copy()
y_Completo = y.copy()

### Escalar Variables

In [16]:
# Numeric columns to be scaled later; num_vars refers to the same list as numCols
num_vars = numCols

# Show which columns will be scaled
print(num_vars)

['loan_duration_mo', 'loan_amount', 'payment_pcnt_income', 'time_in_residence', 'age_yrs', 'number_loans', 'dependents']


In [17]:
# Numeric columns to scale
num_vars = numCols

# Fit a MinMaxScaler on the numeric columns of the full dataset and overwrite
# those columns with their scaled values.
# NOTE(review): the scaler is fitted on every row before cross-validation, so
# each validation fold contributes to the min/max — confirm this is intended.
scaler = MinMaxScaler()
columnas_numericas = X_Completo[num_vars]
X_Completo[num_vars] = scaler.fit(columnas_numericas).transform(columnas_numericas)

# Preview the scaled columns
X_Completo[num_vars].head()

Unnamed: 0,loan_duration_mo,loan_amount,payment_pcnt_income,time_in_residence,age_yrs,number_loans,dependents
0,0.117647,0.235579,0.666667,0.333333,0.166667,0.333333,0.0
1,0.117647,0.061857,1.0,0.666667,0.12963,0.0,0.0
2,0.294118,0.148681,1.0,0.333333,0.166667,0.0,0.0
3,0.073529,0.078253,1.0,0.333333,0.111111,0.0,0.0
4,0.338235,0.616709,0.333333,0.333333,0.12963,0.333333,0.0


In [18]:
# Persist the fitted scaler to disk
joblib.dump(
    value=scaler,
    filename='./modelos/scaler/minmaxFull_GermanCredits.pkl',
)

['./modelos/scaler/minmaxFull_GermanCredits.pkl']

### Creación del modelo

#### 6.1 Grid Search basado en out-of-bag score

In [19]:
# Hyperparameter grid to evaluate
# ==============================================================================
param_grid = ParameterGrid(
    {
        'n_estimators': [150, 200, 250],
        'max_features': [5, 7, 9],
        'max_depth': [None, 3, 10, 20],
        'criterion': ['gini', 'entropy'],
    }
)

# Fit one model per hyperparameter combination and record its OOB accuracy
# ==============================================================================
registros = []

for params in param_grid:

    modelo = RandomForestClassifier(
        oob_score=True,      # required so that modelo.oob_score_ is available
        n_jobs=-1,
        random_state=123,
        **params,
    )
    modelo.fit(X_Completo, y_Completo)

    # One flat record per combination: the score plus the parameters themselves
    registros.append({'oob_accuracy': modelo.oob_score_, **params})
    print(f"Modelo: {params} \u2713")

# Results
# ==============================================================================
# Building the frame directly from the records replaces the previous
# concat / apply(pd.Series) / drop chain; columns are oob_accuracy followed by
# one column per hyperparameter, sorted with the best OOB accuracy first.
resultados = pd.DataFrame(registros).sort_values('oob_accuracy', ascending=False)

Modelo: {'criterion': 'gini', 'max_depth': None, 'max_features': 5, 'n_estimators': 150} ✓
Modelo: {'criterion': 'gini', 'max_depth': None, 'max_features': 5, 'n_estimators': 200} ✓
Modelo: {'criterion': 'gini', 'max_depth': None, 'max_features': 5, 'n_estimators': 250} ✓
Modelo: {'criterion': 'gini', 'max_depth': None, 'max_features': 7, 'n_estimators': 150} ✓
Modelo: {'criterion': 'gini', 'max_depth': None, 'max_features': 7, 'n_estimators': 200} ✓
Modelo: {'criterion': 'gini', 'max_depth': None, 'max_features': 7, 'n_estimators': 250} ✓
Modelo: {'criterion': 'gini', 'max_depth': None, 'max_features': 9, 'n_estimators': 150} ✓
Modelo: {'criterion': 'gini', 'max_depth': None, 'max_features': 9, 'n_estimators': 200} ✓
Modelo: {'criterion': 'gini', 'max_depth': None, 'max_features': 9, 'n_estimators': 250} ✓
Modelo: {'criterion': 'gini', 'max_depth': 3, 'max_features': 5, 'n_estimators': 150} ✓
Modelo: {'criterion': 'gini', 'max_depth': 3, 'max_features': 5, 'n_estimators': 200} ✓
Model

In [20]:
# Show the first five rows of the OOB results table
resultados.head()

Unnamed: 0,oob_accuracy,criterion,max_depth,max_features,n_estimators
1,0.907733,gini,,5,200
2,0.905975,gini,,5,250
0,0.904218,gini,,5,150
28,0.903339,gini,20.0,5,200
29,0.90246,gini,20.0,5,250


#### 6.2 Grid Search basado en validación cruzada

In [21]:
# Base estimator
modelrf = RandomForestClassifier(random_state=123)

# Hyperparameter grid to evaluate
grid_param = dict(
    n_estimators=[100, 120],
    max_features=[5, 7, 9, 11],
    max_depth=[3, 5, 10, 15, 20],
    criterion=['gini', 'entropy'],
)

# Number of cross-validation folds
CV = 10

# Metric used to score each candidate
scoring = 'f1'

# Grid search with cross-validation; the best candidate is refitted at the end
grid_rf = GridSearchCV(
    modelrf,
    grid_param,
    scoring=scoring,
    cv=CV,
    n_jobs=-1,
    refit=True,
    verbose=4,
    return_train_score=True,
)

grid_rf.fit(X_Completo, y_Completo)

Fitting 10 folds for each of 80 candidates, totalling 800 fits


In [22]:
# Collect the grid-search results in a DataFrame
results_grid_rf = pd.DataFrame(grid_rf.cv_results_)

# Columns to keep: the four searched parameters, the mean/std test score and
# the test score of every fold
columns_grid_rf = [
    'param_criterion',
    'param_max_depth',
    'param_max_features',
    'param_n_estimators',
    'mean_test_score',
    'std_test_score',
    *(f'split{i}_test_score' for i in range(CV)),
]

# .copy() yields an independent frame, so adding a column below does not
# assign into a slice of results_grid_rf (SettingWithCopyWarning)
results_grid_rf_filtered = results_grid_rf[columns_grid_rf].copy()

# scoreWithStd = mean test score / std of the test score (0 when the std is 0),
# computed column-wise instead of with a row-by-row apply
media_test = results_grid_rf_filtered['mean_test_score']
std_test = results_grid_rf_filtered['std_test_score']
results_grid_rf_filtered['scoreWithStd'] = (media_test / std_test).where(std_test != 0, 0)

# Index label of the row with the highest scoreWithStd
indice_max_scoreWithStd = results_grid_rf_filtered['scoreWithStd'].idxmax()

# Show the mean scores for every parameter combination
results_grid_rf_filtered[['param_criterion','param_max_depth','param_max_features','param_n_estimators', 'mean_test_score', 'std_test_score', 'scoreWithStd']]

Unnamed: 0,param_criterion,param_max_depth,param_max_features,param_n_estimators,mean_test_score,std_test_score,scoreWithStd
0,gini,3,5,100,0.738405,0.034794,21.222040
1,gini,3,5,120,0.729042,0.033176,21.975230
2,gini,3,7,100,0.723982,0.038458,18.825274
3,gini,3,7,120,0.723212,0.037957,19.053465
4,gini,3,9,100,0.721448,0.041492,17.387712
...,...,...,...,...,...,...,...
75,entropy,20,7,120,0.892182,0.038537,23.151362
76,entropy,20,9,100,0.886337,0.048912,18.120932
77,entropy,20,9,120,0.888632,0.048210,18.432459
78,entropy,20,11,100,0.883357,0.045022,19.620406


In [23]:
# Top 10 candidates by scoreWithStd (highest first)
columnas_resumen = [
    'param_criterion', 'param_max_depth', 'param_max_features', 'param_n_estimators',
    'mean_test_score', 'std_test_score', 'scoreWithStd',
]
results_grid_rf_filtered[columnas_resumen].sort_values(by='scoreWithStd', ascending=False).head(10)

Unnamed: 0,param_criterion,param_max_depth,param_max_features,param_n_estimators,mean_test_score,std_test_score,scoreWithStd
10,gini,5,7,100,0.766622,0.023385,32.782192
16,gini,10,5,100,0.870902,0.027096,32.141505
22,gini,10,11,100,0.875496,0.027658,31.653767
57,entropy,10,5,120,0.863299,0.027609,31.268287
17,gini,10,5,120,0.868546,0.028156,30.847265
50,entropy,5,7,100,0.756558,0.025055,30.19531
11,gini,5,7,120,0.762666,0.025269,30.181863
59,entropy,10,7,120,0.868757,0.029436,29.513018
12,gini,5,9,100,0.754629,0.026122,28.888528
52,entropy,5,9,100,0.757183,0.026269,28.823873


In [24]:
# Row of the results table with the highest scoreWithStd
registro_max_scoreWithStd = results_grid_rf_filtered.loc[indice_max_scoreWithStd]

# Display it (a Series is unchanged by transposition, so it is shown directly)
registro_max_scoreWithStd

param_criterion            gini
param_max_depth               5
param_max_features            7
param_n_estimators          100
mean_test_score        0.766622
std_test_score         0.023385
split0_test_score      0.771654
split1_test_score      0.775862
split2_test_score      0.754386
split3_test_score      0.727273
split4_test_score      0.777778
split5_test_score      0.737705
split6_test_score      0.783333
split7_test_score      0.747826
split8_test_score      0.809917
split9_test_score      0.780488
scoreWithStd          32.782192
Name: 10, dtype: object

In [25]:
# Mean test score of the row with the highest scoreWithStd
fila_seleccionada = results_grid_rf_filtered.loc[indice_max_scoreWithStd]
fila_seleccionada['mean_test_score']

0.766622158080798

In [26]:
# Configure the model with the hyperparameters of the row that has the
# highest scoreWithStd, then fit it on the full dataset
fila_mejor = results_grid_rf_filtered.loc[indice_max_scoreWithStd]
modelrf.set_params(
    **{
        nombre: fila_mejor[f'param_{nombre}']
        for nombre in ('criterion', 'max_depth', 'max_features', 'n_estimators')
    }
)

modelrf.fit(X_Completo, y_Completo)

### Guardar modelo

In [27]:
# Persist the fitted Random Forest model to disk
joblib.dump(
    value=modelrf,
    filename='./modelos/clasificacion/RForest_CV.pkl',
)

['./modelos/clasificacion/RForest_CV.pkl']

### Importancia de predictores

#### Importancia por pureza de nodos

In [28]:
# Pair every predictor name with the importance reported by the fitted model
importancia_predictores = pd.DataFrame({
    'predictor': X_Completo.columns,
    'importancia': modelrf.feature_importances_,
})

print(
    "Importancia de los predictores en el modelo",
    "-------------------------------------------",
    sep="\n",
)

# Ten predictors with the highest importance
importancia_predictores.sort_values(by='importancia', ascending=False).head(10)

Importancia de los predictores en el modelo
-------------------------------------------


Unnamed: 0,predictor,importancia
9,checking_account_status_none,0.16236
4,age_yrs,0.106789
1,loan_amount,0.09388
0,loan_duration_mo,0.081387
7,checking_account_status_< 0 DM,0.077346
38,home_ownership_own,0.048158
2,payment_pcnt_income,0.029406
10,credit_history_critical account - other non-ba...,0.027715
30,gender_status_male-single,0.021565
24,time_employed_yrs_4 - 7 years,0.021427


### Predicción de probabilidades

In [29]:
# Predicted class probabilities for every row of X_Completo
# ==============================================================================
predicciones = modelrf.predict_proba(X_Completo)

# First five rows of the probability array
predicciones[:5]

array([[0.42509071, 0.57490929],
       [0.33021667, 0.66978333],
       [0.5235206 , 0.4764794 ],
       [0.62105549, 0.37894451],
       [0.59885666, 0.40114334]])

In [30]:
# Classification using the class with the highest probability
# ==============================================================================
# NOTE(review): the column labels assume predict_proba orders the classes
# as [0, 1] — confirm against modelrf.classes_.
df_predicciones = pd.DataFrame(predicciones, columns=['0', '1'])
prob_0, prob_1 = df_predicciones['0'], df_predicciones['1']
df_predicciones['clasificacion_default_0.5'] = np.where(prob_0 > prob_1, 0, 1)
df_predicciones.head()

Unnamed: 0,0,1,clasificacion_default_0.5
0,0.425091,0.574909,1
1,0.330217,0.669783,1
2,0.523521,0.476479,0
3,0.621055,0.378945,0
4,0.598857,0.401143,0


In [31]:
# Final classification: class 1 only when its probability exceeds 0.8
# ==============================================================================
supera_umbral = df_predicciones['1'] > 0.8
df_predicciones['clasificacion_custom_0.8'] = np.where(supera_umbral, 1, 0)

# Rows 4 to 9, to compare both classification rules
df_predicciones.iloc[4:10]

Unnamed: 0,0,1,clasificacion_default_0.5,clasificacion_custom_0.8
4,0.598857,0.401143,0,0
5,0.769866,0.230134,0,0
6,0.455945,0.544055,1,0
7,0.42566,0.57434,1,0
8,0.610525,0.389475,0,0
9,0.759278,0.240722,0,0


#### Referencias


- Random Forest
    - https://www.cienciadedatos.net/documentos/py08_random_forest_python.html
    - https://fhernanb.github.io/libro_mod_pred/rand-forests.html
    - https://quantdare.com/decision-trees-gini-vs-entropy/
    