# Pair Programming Random Forest

Hasta ahora hemos ajustado el modelo usando una Regresión Logística, pero, como hemos aprendido, también podemos usar Random Forest en este tipo de problemas. Los objetivos de este pair programming son:
- Ajustad un modelo de Random Forest a nuestros datos.
- Calculad las métricas a nuestro nuevo modelo.
- Comparad las métricas con los modelos hechos hasta ahora. ¿Cuál es mejor?

In [1]:
# Tratamiento de datos
# ------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from tqdm import tqdm

# Gráficos
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Modelado y evaluación
# ------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score , cohen_kappa_score, roc_curve,roc_auc_score
from sklearn.model_selection import GridSearchCV

# Configuración warnings
# ------------------------------------------------------------------------------
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the airline-satisfaction dataset from the project's pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
df = pd.read_pickle('../datos/Invistico_Airline_v3.pkl')
# Preview the first three rows.
df.head(3)

Unnamed: 0,satisfaction,gender,customer_type,type_of_travel,class,seat_comfort,departure/arrival_time_convenient,gate_location,inflight_wifi_service,inflight_entertainment,online_support,on-board_service,leg_room_service,baggage_handling,checkin_service,cleanliness,online_boarding,arrival_delay_in_minutes,age,flight_distance
0,1,1,2,1,1,0,0,2,2,4,2,3,0,3,5,3,2,0.0,1.041667,-1.400844
1,1,0,2,1,2,0,0,3,0,2,2,4,4,4,2,3,2,305.0,0.291667,0.454852
2,1,1,2,1,1,0,0,3,2,0,2,3,3,4,4,4,2,0.0,-1.041667,0.179747


In [4]:
# Response variable: the `satisfaction` column.
y1 = df["satisfaction"]
# Predictor matrix: every column except the response.
X1 = df.drop(columns=["satisfaction"])

In [7]:
# Remove `arrival_delay_in_minutes` from the predictors.
# The predictor matrix is rebuilt from `df` rather than mutated in place, so the
# cell is idempotent: running it a second time no longer raises KeyError because
# the column was already dropped from `X1`.
X1 = df.drop(columns=["satisfaction", "arrival_delay_in_minutes"])

In [8]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X1,
    y1,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Hyperparameter grid explored by the grid search.
param = dict(
    # Authors' note: the earlier model overfitted clearly with a depth of 17,
    # so only much shallower trees are tried here.
    max_depth=[2, 4, 6, 10, 12, 14],
    # Authors' note: try 1 to 4 features per split; 4 is the upper bound because
    # it is the square root of the number of predictors (computed in earlier cells).
    max_features=[1, 2, 3, 4],
    # These two are harder to choose; the values below are the ones usually tried.
    min_samples_split=[5, 10, 20],
    min_samples_leaf=[5, 10, 20],
)

In [10]:
# Exhaustive search over every combination in `param`, scored by cross-validation.
gs_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),  # model to tune; fixed seed for reproducible forests
    param_grid=param,  # hyperparameter combinations to evaluate
    cv=10,             # 10-fold cross-validation
    verbose=0,         # silent; must be a non-negative int (scikit-learn >= 1.2 rejects -1 at fit time)
)
        

In [11]:
# Run the grid search on the training partition only; the test partition stays
# untouched so it can be used for the final evaluation.
# NOTE(review): this cell is expensive (every grid combination is fitted once per fold).
gs_rf.fit(x_train, y_train)

In [12]:
# Keep the best-scoring forest found by the grid search and display its configuration.
bosque = gs_rf.best_estimator_
bosque

Calculamos las métricas para saber cómo de bueno es el modelo.

In [13]:
# Predict on both partitions so that train and test metrics can be compared
# (a large gap between them would point to overfitting).
y_pred_train_rf = bosque.predict(x_train)
y_pred_test_rf = bosque.predict(x_test)

In [14]:
def metricas(clases_reales_test, clases_predichas_test, clases_reales_train, clases_predichas_train, modelo):
    """Build a two-row table of classification metrics (test and train) for a model.

    Parameters
    ----------
    clases_reales_test, clases_predichas_test
        True and predicted labels for the test partition.
    clases_reales_train, clases_predichas_train
        True and predicted labels for the train partition.
    modelo
        Label stored in the ``modelo`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``accuracy``, ``precision``, ``recall``, ``f1``, ``kapppa``,
        ``set`` and ``modelo``; row 0 is the test partition, row 1 the train one.
    """
    # Column name -> scoring function. The "kapppa" spelling is kept on purpose:
    # previously saved result tables use the same column name.
    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
        "kapppa": cohen_kappa_score,
    }

    # (partition label, true labels, predicted labels), in output row order.
    particiones = (
        ("test", clases_reales_test, clases_predichas_test),
        ("train", clases_reales_train, clases_predichas_train),
    )

    columnas = {
        nombre: [scorer(reales, predichas) for _, reales, predichas in particiones]
        for nombre, scorer in scorers.items()
    }
    columnas["set"] = [etiqueta for etiqueta, _, _ in particiones]

    resultado = pd.DataFrame(columnas)
    resultado["modelo"] = modelo
    return resultado

In [15]:
# Summarise the tuned forest's performance on test and train in a single table.
dt_results = metricas(
    y_test,
    y_pred_test_rf,
    y_train,
    y_pred_train_rf,
    "Random Forest",
)
dt_results

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.949299,0.954622,0.95322,0.95392,0.897569,test,Random Forest
1,0.959366,0.964166,0.961381,0.962772,0.918047,train,Random Forest


In [17]:
# Persist the Random Forest metrics so they can be reloaded without refitting the model.
dt_results.to_pickle('../datos/resultados_metricas2.pkl')

In [5]:
# Load the metrics table saved for the previously fitted models.
df_resul = pd.read_pickle('../datos/resultados_metricas.pkl')
df_resul

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.834463,0.848152,0.851829,0.849986,0.665344,test,Regresión logistica
1,0.832826,0.845113,0.849875,0.847487,0.662538,train,Regresión logistica
0,0.935864,0.938441,0.945528,0.941971,0.870293,test,Decission Tree Esta I
1,1.0,1.0,1.0,1.0,1.0,train,Decission Tree Esta I
0,0.922582,0.932807,0.926089,0.929436,0.843693,test,Decision tree Esta II
1,0.929993,0.938843,0.932659,0.935741,0.858859,train,Decision tree Esta II


In [3]:
# Reload the Random Forest metrics written earlier in this notebook, which avoids
# rerunning the grid search just to rebuild the table.
dt_results = pd.read_pickle('../datos/resultados_metricas2.pkl')
dt_results

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.949299,0.954622,0.95322,0.95392,0.897569,test,Random Forest
1,0.959366,0.964166,0.961381,0.962772,0.918047,train,Random Forest


Aquí deberíamos concatenar las dos tablas de métricas para poder comparar todos los modelos.

In [7]:
# Stack the Random Forest metrics on top of the earlier models' metrics (row-wise).
tablas_metricas = [dt_results, df_resul]
df_todos_resultados = pd.concat(tablas_metricas, axis="index")
df_todos_resultados

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.949299,0.954622,0.95322,0.95392,0.897569,test,Random Forest
1,0.959366,0.964166,0.961381,0.962772,0.918047,train,Random Forest
0,0.834463,0.848152,0.851829,0.849986,0.665344,test,Regresión logistica
1,0.832826,0.845113,0.849875,0.847487,0.662538,train,Regresión logistica
0,0.935864,0.938441,0.945528,0.941971,0.870293,test,Decission Tree Esta I
1,1.0,1.0,1.0,1.0,1.0,train,Decission Tree Esta I
0,0.922582,0.932807,0.926089,0.929436,0.843693,test,Decision tree Esta II
1,0.929993,0.938843,0.932659,0.935741,0.858859,train,Decision tree Esta II


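# Comentamos las métricas:

El Random Forest obtiene las mejores métricas en test de todos los modelos comparados (accuracy 0.949, kappa 0.898), y la diferencia con train (accuracy 0.959, kappa 0.918) es pequeña, por lo que no se aprecian señales claras de overfitting.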

In [None]:
# vamos a crearnos un dataframe 
# Feature importances of the tuned Random Forest, one row per predictor,
# sorted from most to least important.
# Uses `x_train` and `bosque`, the names this notebook defines; the former
# `x_train1` / `mejor_modelo` do not exist here and raised NameError.
importancia_predictores_esta = pd.DataFrame(
    {
        "predictor": x_train.columns,
        "importancia": bosque.feature_importances_,
    }
).sort_values(by="importancia", ascending=False)

# Print a header, then display the table as the cell output.
print("Importancia de los predictores en el modelo")
print("-------------------------------------------")
importancia_predictores_esta

In [None]:

# Horizontal bar chart of the feature importances.
# Plots `importancia_predictores_esta`, the frame this notebook creates; the
# former name `importancia_predictores` was never defined and raised NameError.
plt.figure(figsize=(10,6))
sns.barplot(x = "importancia", y = "predictor", data = importancia_predictores_esta, palette="viridis");
plt.show()