## RANDOM FOREST

In [2]:
# Tratamiento de datos
# ------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from tqdm import tqdm

# Gráficos
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Modelado y evaluación
# ------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score , cohen_kappa_score, roc_curve,roc_auc_score
from sklearn.model_selection import GridSearchCV

# Configuración warnings
# ------------------------------------------------------------------------------
import warnings
warnings.filterwarnings('ignore')

## RANDOM FOREST

Probamos con los datos estandarizados y balanceados

In [3]:
# Load the standardized and balanced dataset, using the first CSV column as
# the row index, and preview the first rows.
df_esta_balan = pd.read_csv("../ficheros/04-Churm_balan_estan.csv", index_col = 0)
df_esta_balan.head()

Unnamed: 0,NumOfProducts,HasCrCard,IsActiveMember,CreditScore,Age,Tenure,Balance,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male,Exited
0,1,1,0,-2.260159,-0.088299,1.378686,0.689342,0.447917,0,1,0,0,1,0
1,1,1,0,0.375385,-0.435342,-0.004426,0.829959,1.58244,1,0,0,0,1,0
2,3,1,1,-2.020565,0.374424,1.032908,-0.044648,1.489538,1,0,0,0,1,1
3,1,0,0,0.239962,0.143063,0.341352,0.700505,0.000708,0,0,1,0,1,0
4,1,1,1,0.823323,0.605786,-1.387538,0.440337,1.626914,0,0,1,1,0,0


In [4]:
# Split the frame into the target vector y ("Exited") and the feature matrix X
# (every remaining column).
y = df_esta_balan["Exited"]
X = df_esta_balan.drop(columns="Exited")

In [5]:
# Hold out 20% of the rows as the test set; random_state is fixed so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [6]:
# Hyperparameter grid explored by the first grid search: every combination of
# the values listed below is evaluated.
param = dict(
    max_depth=[14, 16],
    max_features=[3, 4],
    min_samples_split=[15, 20],
    min_samples_leaf=[8, 10, 20],
)

In [7]:
# Grid search over the `param` grid for a seeded random forest classifier.
gs_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),  # model to tune; seed fixed for reproducibility
    param_grid=param,  # hyperparameter combinations to evaluate
    cv=10,  # 10-fold cross-validation for every combination
    # `verbose` must be a non-negative int (or bool): scikit-learn >= 1.2
    # validates it and rejects -1 when fitting. 0 is the silent level.
    verbose=0,
)
        

In [8]:
# Fit the grid search defined in `gs_rf` on the training split.

gs_rf.fit(x_train, y_train)

In [9]:
# Keep the estimator the grid search selected as best (`best_estimator_`)
# and display it to inspect its hyperparameters.
bosque = gs_rf.best_estimator_
bosque

In [10]:
# Predict on both splits so that test and train metrics can be compared.
y_pred_test_rf = bosque.predict(x_test)
y_pred_train_rf = bosque.predict(x_train)

In [11]:
def metricas(clases_reales_test, clases_predichas_test, clases_reales_train, clases_predichas_train, modelo):
    """Build a two-row summary of classification metrics for test and train.

    Parameters
    ----------
    clases_reales_test, clases_predichas_test
        True and predicted labels for the test split.
    clases_reales_train, clases_predichas_train
        True and predicted labels for the train split.
    modelo
        Value written to the "modelo" column of every row (used by callers
        as the model's display name).

    Returns
    -------
    pandas.DataFrame
        One row for "test" and one for "train" (in that order) with the
        columns accuracy, precision, recall, f1, kappa, set and modelo.
        Each scikit-learn metric is called with its default arguments.
    """
    # Column name -> scoring function; insertion order fixes the column order.
    medidas = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
        "kappa": cohen_kappa_score,
    }

    # (label, true classes, predicted classes) for each split; test goes first.
    particiones = [
        ("test", clases_reales_test, clases_predichas_test),
        ("train", clases_reales_train, clases_predichas_train),
    ]

    columnas = {
        nombre: [medida(reales, predichas) for _, reales, predichas in particiones]
        for nombre, medida in medidas.items()
    }
    columnas["set"] = [etiqueta for etiqueta, _, _ in particiones]

    resumen = pd.DataFrame(columnas)
    resumen["modelo"] = modelo
    return resumen

In [12]:
# Compute the metrics on both splits to check for overfitting or underfitting.

dt_results = metricas(y_test, y_pred_test_rf,y_train,  y_pred_train_rf, "Random Forest Balan I")
dt_results

Unnamed: 0,accuracy,precision,recall,f1,kappa,set,modelo
0,0.85059,0.85727,0.850442,0.853843,0.701038,test,Random Forest Balan I
1,0.905157,0.908734,0.899383,0.904034,0.810292,train,Random Forest Balan I


Tenemos algo de overfitting, aunque no es muy grande, y un buen valor de kappa

Hacemos otra prueba modificando algunos de los parámetros

In [13]:
# Second hyperparameter grid, tightened to try to reduce overfitting.
param1 = dict(
    max_depth=[12, 14],  # slightly shallower trees to see whether overfitting drops
    max_features=[1, 2, 3],  # also lower the number of predictor variables
    min_samples_split=[20, 40],
    min_samples_leaf=[20, 30],
)

In [14]:
# Grid search over the `param1` grid for a seeded random forest classifier.
gs_rf1 = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),  # model to tune; seed fixed for reproducibility
    param_grid=param1,  # hyperparameter combinations to evaluate
    cv=10,  # 10-fold cross-validation for every combination
    # `verbose` must be a non-negative int (or bool): scikit-learn >= 1.2
    # validates it and rejects -1 when fitting. 0 is the silent level.
    verbose=0,
)

In [15]:
# Fit the second grid search on the same training split used for the first one.
gs_rf1.fit(x_train, y_train)

In [16]:
# Keep the best estimator found by the second grid search and display it.
bosque1 = gs_rf1.best_estimator_
bosque1

In [17]:
# Predict on both splits with the second forest so that test and train metrics
# can be compared.
y_pred_test_rf1 = bosque1.predict(x_test)
y_pred_train_rf1 = bosque1.predict(x_train)

In [18]:
# Test/train metrics for the second forest, labelled "Random Forest Balan II".
dt_results1 = metricas(y_test, y_pred_test_rf1,y_train,  y_pred_train_rf1, "Random Forest Balan II")
dt_results1

Unnamed: 0,accuracy,precision,recall,f1,kappa,set,modelo
0,0.822434,0.83139,0.820354,0.825835,0.644749,test,Random Forest Balan II
1,0.855747,0.86006,0.847473,0.85372,0.711455,train,Random Forest Balan II


Seguimos teniendo algo de overfitting, pero además nos ha bajado el kappa

Probamos ahora con los datos balanceados sin estandarizar

In [19]:
# Load the balanced, non-standardized dataset, using the first CSV column as
# the row index, and preview the first two rows.
df_balan_sinestan = pd.read_csv("../ficheros/05-Churm_balan_sinest.csv", index_col= 0)
df_balan_sinestan.head(2)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male,Exited
0,775,29,10,0.0,2,1,1,68143.93,0,0,1,0,1,0
1,647,26,8,109958.15,1,1,1,136592.24,0,0,1,1,0,1


In [20]:
# Split the non-standardized frame into the target vector y2 ("Exited") and
# the feature matrix X2 (every remaining column).
y2 = df_balan_sinestan["Exited"]
X2 = df_balan_sinestan.drop(columns="Exited")

In [21]:
# Hold out 20% of the rows as the test set; random_state is fixed so the
# split is the same on every run.
x_train2, x_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size = 0.2, random_state = 42)

In [22]:
# Grid search for the non-standardized data, reusing the `param1` grid.
gs_rf2 = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),  # model to tune; seed fixed for reproducibility
    param_grid=param1,  # reuse the grid already defined earlier in the notebook
    cv=10,  # 10-fold cross-validation for every combination
    # `verbose` must be a non-negative int (or bool): scikit-learn >= 1.2
    # validates it and rejects -1 when fitting. 0 is the silent level.
    verbose=0,
)

In [23]:
# Fit the third grid search on the training split of the non-standardized data.
gs_rf2.fit(x_train2, y_train2)

In [24]:
# Keep the best estimator found by the third grid search and display it.
bosque2 = gs_rf2.best_estimator_
bosque2

In [25]:
# Predict on both splits of the non-standardized data so that test and train
# metrics can be compared.
y_pred_test_rf2 = bosque2.predict(x_test2)
y_pred_train_rf2 = bosque2.predict(x_train2)

In [26]:
# Test/train metrics for the third forest, labelled "Random Forest Balan Sinestand".
dt_results2 = metricas(y_test2, y_pred_test_rf2,y_train2,  y_pred_train_rf2, "Random Forest Balan Sinestand")
dt_results2

Unnamed: 0,accuracy,precision,recall,f1,kappa,set,modelo
0,0.867994,0.878953,0.851109,0.864807,0.735906,test,Random Forest Balan Sinestand
1,0.879602,0.900469,0.854079,0.876661,0.759227,train,Random Forest Balan Sinestand


Tenemos poco overfitting y un buen valor de kappa. 

In [27]:
# Stack the three per-model metric tables row-wise into one comparison table.
# Each table keeps its own 0/1 index, so index labels repeat in the result.
resultados = pd.concat([dt_results, dt_results1, dt_results2], axis= 0)
resultados

Unnamed: 0,accuracy,precision,recall,f1,kappa,set,modelo
0,0.85059,0.85727,0.850442,0.853843,0.701038,test,Random Forest Balan I
1,0.905157,0.908734,0.899383,0.904034,0.810292,train,Random Forest Balan I
0,0.822434,0.83139,0.820354,0.825835,0.644749,test,Random Forest Balan II
1,0.855747,0.86006,0.847473,0.85372,0.711455,train,Random Forest Balan II
0,0.867994,0.878953,0.851109,0.864807,0.735906,test,Random Forest Balan Sinestand
1,0.879602,0.900469,0.854079,0.876661,0.759227,train,Random Forest Balan Sinestand


Los mejores resultados los tenemos con los **datos balanceados sin estandarizar**, ya que, aunque todavía tenemos algo de overfitting, no es muy grande y todas las métricas son bastante buenas.

- Tenemos un **kappa** de 0.74, que aunque podría ser mejor, es un buen valor.
- El **accuracy** o exactitud es de 0.868, lo que quiere decir que hemos acertado en el 87% de las predicciones.
- La **precision** es de 0.879, por lo que el 88% de las predicciones positivas fueron correctas.
- El **recall** o sensibilidad tiene un valor de 0.85, por lo que hemos capturado el 85% de los positivos.
- El **F1**, por tanto, también está en esos valores del 86%.