# Caso práctico: _Random Forest_


## IMPORTS


In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import f1_score, classification_report
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from scipy.stats import randint
import warnings
warnings.filterwarnings('ignore')

## Funciones auxiliares

In [20]:
def train_val_test_split(df, rstate=42, shuffle=True, stratify=None):
    strat = df[stratify] if stratify else None
    train_set, test_set = train_test_split(
        df, test_size=0.4, random_state=rstate, shuffle=shuffle, stratify=strat)
    strat = test_set[stratify] if stratify else None
    val_set, test_set = train_test_split(
        test_set, test_size=0.5, random_state=rstate, shuffle=shuffle, stratify=strat)
    return (train_set, val_set, test_set)


In [21]:
def remove_labels(df, label_name):
    X = df.drop([label_name, 'CLIENTNUM'], axis=1)  # Eliminar columna no relevante
    y = df[label_name].copy()
    return (X, y)

In [22]:
class MultiColumnLabelEncoder:
    def __init__(self, columns=None):
        self.columns = columns
    
    def fit_transform(self, X, y=None):
        return self.transform(X)
    
    def transform(self, X):
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = output[col].astype('category').cat.codes
        else:
            for col in output.columns:
                if output[col].dtype == 'object':
                    output[col] = output[col].astype('category').cat.codes
        return output

## 1. Lectura del conjunto de datos

In [23]:
# Carga de datos
df = pd.read_csv('BankChurners.csv')

## 2. Columnas a evaluar

In [24]:
# Análisis de columnas

print("\n=== COLUMNAS ===")
print("Numéricas:", list(df.select_dtypes(include=np.number).columns))
print("Categóricas:", list(df.select_dtypes(exclude=np.number).columns))



=== COLUMNAS ===
Numéricas: ['CLIENTNUM', 'Customer_Age', 'Dependent_count', 'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt', 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']
Categóricas: ['Attrition_Flag', 'Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']


In [25]:
# Valores nulos e infinitos
print("\n=== VALORES NULOS ===")
print(df.isnull().sum())
print("\n=== VALORES INFINITOS ===")
print(np.isinf(df.select_dtypes(include=np.number)).sum())



=== VALORES NULOS ===
CLIENTNUM                   0
Attrition_Flag              0
Customer_Age                0
Gender                      0
Dependent_count             0
Education_Level             0
Marital_Status              0
Income_Category             0
Card_Category               0
Months_on_book              0
Total_Relationship_Count    0
Months_Inactive_12_mon      0
Contacts_Count_12_mon       0
Credit_Limit                0
Total_Revolving_Bal         0
Avg_Open_To_Buy             0
Total_Amt_Chng_Q4_Q1        0
Total_Trans_Amt             0
Total_Trans_Ct              0
Total_Ct_Chng_Q4_Q1         0
Avg_Utilization_Ratio       0
dtype: int64

=== VALORES INFINITOS ===
CLIENTNUM                   0
Customer_Age                0
Dependent_count             0
Months_on_book              0
Total_Relationship_Count    0
Months_Inactive_12_mon      0
Contacts_Count_12_mon       0
Credit_Limit                0
Total_Revolving_Bal         0
Avg_Open_To_Buy             0
Total_A

## 3. Visualización del conjunto de datos

In [26]:
df.head(10)

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0
5,713061558,Existing Customer,44,M,2,Graduate,Married,$40K - $60K,Blue,36,...,1,2,4010.0,1247,2763.0,1.376,1088,24,0.846,0.311
6,810347208,Existing Customer,51,M,4,Unknown,Married,$120K +,Gold,46,...,1,3,34516.0,2264,32252.0,1.975,1330,31,0.722,0.066
7,818906208,Existing Customer,32,M,0,High School,Unknown,$60K - $80K,Silver,27,...,2,2,29081.0,1396,27685.0,2.204,1538,36,0.714,0.048
8,710930508,Existing Customer,37,M,3,Uneducated,Single,$60K - $80K,Blue,36,...,2,0,22352.0,2517,19835.0,3.355,1350,24,1.182,0.113
9,719661558,Existing Customer,48,M,2,Graduate,Single,$80K - $120K,Blue,36,...,3,3,11656.0,1677,9979.0,1.524,1441,32,0.882,0.144


In [27]:
df.describe()

Unnamed: 0,CLIENTNUM,Customer_Age,Dependent_count,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
count,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0
mean,739177600.0,46.32596,2.346203,35.928409,3.81258,2.341167,2.455317,8631.953698,1162.814061,7469.139637,0.759941,4404.086304,64.858695,0.712222,0.274894
std,36903780.0,8.016814,1.298908,7.986416,1.554408,1.010622,1.106225,9088.77665,814.987335,9090.685324,0.219207,3397.129254,23.47257,0.238086,0.275691
min,708082100.0,26.0,0.0,13.0,1.0,0.0,0.0,1438.3,0.0,3.0,0.0,510.0,10.0,0.0,0.0
25%,713036800.0,41.0,1.0,31.0,3.0,2.0,2.0,2555.0,359.0,1324.5,0.631,2155.5,45.0,0.582,0.023
50%,717926400.0,46.0,2.0,36.0,4.0,2.0,2.0,4549.0,1276.0,3474.0,0.736,3899.0,67.0,0.702,0.176
75%,773143500.0,52.0,3.0,40.0,5.0,3.0,3.0,11067.5,1784.0,9859.0,0.859,4741.0,81.0,0.818,0.503
max,828343100.0,73.0,5.0,56.0,6.0,6.0,6.0,34516.0,2517.0,34516.0,3.397,18484.0,139.0,3.714,0.999


In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CLIENTNUM                 10127 non-null  int64  
 1   Attrition_Flag            10127 non-null  object 
 2   Customer_Age              10127 non-null  int64  
 3   Gender                    10127 non-null  object 
 4   Dependent_count           10127 non-null  int64  
 5   Education_Level           10127 non-null  object 
 6   Marital_Status            10127 non-null  object 
 7   Income_Category           10127 non-null  object 
 8   Card_Category             10127 non-null  object 
 9   Months_on_book            10127 non-null  int64  
 10  Total_Relationship_Count  10127 non-null  int64  
 11  Months_Inactive_12_mon    10127 non-null  int64  
 12  Contacts_Count_12_mon     10127 non-null  int64  
 13  Credit_Limit              10127 non-null  float64
 14  Total_

In [29]:
print("Longitud del conjunto de datos:", len(df))
print("Número de características del conjunto de datos:", len(df.columns))

Longitud del conjunto de datos: 10127
Número de características del conjunto de datos: 21


In [30]:
df["Attrition_Flag"].value_counts()

Attrition_Flag
Existing Customer    8500
Attrited Customer    1627
Name: count, dtype: int64

In [31]:
# Instanciar la clase MultiColumnLabelEncoder
multi_encoder = MultiColumnLabelEncoder(columns=['Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category'])

# Transformamos el dataframe
df_encoded = multi_encoder.fit_transform(df)

# Mostramos el dataframe con las columnas categóricas convertidas a valores numéricos
df_encoded


Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,768805383,Existing Customer,45,1,3,3,1,2,0,39,...,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,818770008,Existing Customer,49,0,5,2,2,4,0,44,...,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,713982108,Existing Customer,51,1,3,2,1,3,0,36,...,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.000
3,769911858,Existing Customer,40,0,4,3,3,4,0,34,...,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.760
4,709106358,Existing Customer,40,1,3,5,1,2,0,21,...,1,0,4716.0,0,4716.0,2.175,816,28,2.500,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10122,772366833,Existing Customer,50,1,2,2,2,1,0,40,...,2,3,4003.0,1851,2152.0,0.703,15476,117,0.857,0.462
10123,710638233,Attrited Customer,41,1,2,6,0,1,0,25,...,2,3,4277.0,2186,2091.0,0.804,8764,69,0.683,0.511
10124,716506083,Attrited Customer,44,0,1,3,1,4,0,36,...,3,4,5409.0,0,5409.0,0.819,10291,60,0.818,0.000
10125,717406983,Attrited Customer,30,1,2,2,3,1,0,36,...,3,3,5281.0,0,5281.0,0.535,8395,62,0.722,0.000


### Buscando correlaciones

## 4. División del conjunto de datos

In [32]:
# Dividimos el conjunto de datos
train_set, val_set, test_set = train_val_test_split(df_encoded)

In [33]:
X_train, y_train = remove_labels(train_set, 'Attrition_Flag')
X_val, y_val = remove_labels(val_set, 'Attrition_Flag')
X_test, y_test = remove_labels(test_set, 'Attrition_Flag')

## 5. Escalado robusto

In [34]:
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

## 6. EXPERIMENTOS CON PCA

In [35]:
print("\n=== EXPERIMENTOS PCA ===")
best_pca_score = 0
best_n = 2

for n in [2, 3, 4]:
    pca = PCA(n_components=n)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_val_pca = pca.transform(X_val_scaled)
    
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_pca, y_train)
    
    val_pred = model.predict(X_val_pca)
    score = f1_score(y_val, val_pred, average='weighted')
    
    print(f"Componentes: {n} | F1 Score: {score:.4f}")
    
    if score > best_pca_score:
        best_pca_score = score
        best_n = n

print(f"\nMejor configuración PCA: {best_n} componentes (F1: {best_pca_score:.4f})")



=== EXPERIMENTOS PCA ===
Componentes: 2 | F1 Score: 0.7753
Componentes: 3 | F1 Score: 0.8041
Componentes: 4 | F1 Score: 0.8284

Mejor configuración PCA: 4 componentes (F1: 0.8284)


## 7. GRIDSEARCHCV

In [36]:
print("\n=== GRIDSEARCHCV (15 COMBINACIONES) ===")
param_grid = {
    'n_estimators': [50, 100, 150],       # 3 opciones
    'max_depth': [None, 10, 20, 30],      # 4 opciones
    'min_samples_split': [2, 5],          # 2 opciones
    'max_features': ['sqrt']              # 1 opción
    # Total combinaciones: 3 * 4 * 2 * 1 = 24 (Ajustado a 15)
}

# Ajuste manual para obtener exactamente 15 combinaciones
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'max_features': ['sqrt', 'log2']
    # Total combinaciones: 3 * 2 * 3 * 2 = 36 → Seleccionamos primeras 15
}

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=6,
    scoring='f1_weighted',
    n_jobs=-1,
    refit=True
)

grid_search.fit(X_train_scaled, y_train)

# Mostrar todos los resultados
print("\nPuntajes de todos los modelos:")
results = grid_search.cv_results_
for i in range(len(results['params'])):
    print(f"Modelo {i+1}:")
    print(f"Parámetros: {results['params'][i]}")
    print(f"F1 promedio (6-fold): {results['mean_test_score'][i]:.4f}")
    print("---------------------------------------------------")

# Mejor modelo
print("\nMejores parámetros GridSearch:")
print(grid_search.best_params_)
print(f"Mejor F1 promedio validación: {grid_search.best_score_:.4f}")

# Testeo final
best_model = grid_search.best_estimator_
y_test_pred = best_model.predict(X_test_scaled)
print("\nResultado en Test (Mejor Modelo):")
print(classification_report(y_test, y_test_pred))
print(f"F1 Score Test: {f1_score(y_test, y_test_pred, average='weighted'):.4f}")


=== GRIDSEARCHCV (15 COMBINACIONES) ===

Puntajes de todos los modelos:
Modelo 1:
Parámetros: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 50}
F1 promedio (6-fold): 0.9573
---------------------------------------------------
Modelo 2:
Parámetros: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 100}
F1 promedio (6-fold): 0.9593
---------------------------------------------------
Modelo 3:
Parámetros: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 150}
F1 promedio (6-fold): 0.9589
---------------------------------------------------
Modelo 4:
Parámetros: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 50}
F1 promedio (6-fold): 0.9554
---------------------------------------------------
Modelo 5:
Parámetros: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 100}
F1 promedio (6-fold): 0.9581
---------------------

## 8. RANDOMIZEDSEARCHCV

In [37]:
print("\n=== RANDOMIZEDSEARCHCV ===")
param_dist = {
    'n_estimators': randint(50, 300),
    'max_depth': randint(5, 30),
    'min_samples_split': randint(2, 11),
    'max_features': ['sqrt', 'log2']
}

random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_dist,
    n_iter=10,
    cv=6,
    scoring='f1_weighted',
    n_jobs=-1
)

random_search.fit(X_train_scaled, y_train)

print("Mejores parámetros RandomizedSearch:")
print(random_search.best_params_)
print(f"Mejor F1 (validación): {random_search.best_score_:.4f}")

# Evaluación en test
best_model_random = random_search.best_estimator_
test_pred_random = best_model_random.predict(X_test_scaled)
print("\nResultados en Test (RandomizedSearch):")
print(classification_report(y_test, test_pred_random))



=== RANDOMIZEDSEARCHCV ===
Mejores parámetros RandomizedSearch:
{'max_depth': 28, 'max_features': 'sqrt', 'min_samples_split': 7, 'n_estimators': 279}
Mejor F1 (validación): 0.9598

Resultados en Test (RandomizedSearch):
                   precision    recall  f1-score   support

Attrited Customer       0.96      0.82      0.89       333
Existing Customer       0.97      0.99      0.98      1693

         accuracy                           0.97      2026
        macro avg       0.96      0.91      0.93      2026
     weighted avg       0.97      0.97      0.96      2026



## 9. SELECCIÓN DE CARACTERÍSTICAS

In [38]:
print("\n=== SELECCIÓN DE CARACTERÍSTICAS ===")
# Entrenar modelo para importancia de características
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train_scaled, y_train)

# Obtener importancia
importances = pd.Series(rf.feature_importances_, index=X_train.columns)
sorted_importances = importances.sort_values(ascending=False)

print("\n10 características más importantes:")
print(sorted_importances.head(10))

# Modelo con top 5 características
top_5 = sorted_importances.head(5).index
model_top = RandomForestClassifier(n_estimators=100)
model_top.fit(X_train_scaled[:, X_train.columns.isin(top_5)], y_train)
val_pred_top = model_top.predict(X_val_scaled[:, X_train.columns.isin(top_5)])
print(f"\nF1 con top 5 características: {f1_score(y_val, val_pred_top, average='weighted'):.4f}")

# Modelo con peores 5 características
bottom_5 = sorted_importances.tail(5).index
model_bottom = RandomForestClassifier(n_estimators=100)
model_bottom.fit(X_train_scaled[:, X_train.columns.isin(bottom_5)], y_train)
val_pred_bottom = model_bottom.predict(X_val_scaled[:, X_train.columns.isin(bottom_5)])
print(f"F1 con peores 5 características: {f1_score(y_val, val_pred_bottom, average='weighted'):.4f}")


=== SELECCIÓN DE CARACTERÍSTICAS ===

10 características más importantes:
Total_Trans_Amt             0.185937
Total_Trans_Ct              0.176205
Total_Revolving_Bal         0.112299
Total_Ct_Chng_Q4_Q1         0.104726
Avg_Utilization_Ratio       0.067813
Total_Relationship_Count    0.062258
Total_Amt_Chng_Q4_Q1        0.056591
Credit_Limit                0.034963
Customer_Age                0.034580
Avg_Open_To_Buy             0.031376
dtype: float64

F1 con top 5 características: 0.9271
F1 con peores 5 características: 0.7619
