### Pair Programming V: Decision Tree

Los objetivos de este pair programming son:
- Ajustad un modelo de Decision Tree a nuestros datos.
- Calculad las métricas a nuestro nuevo modelo.
- Comparad las métricas con el modelo hecho hasta ahora. ¿Cuál es mejor?

In [3]:
import numpy as np
import pandas as pd

# Gráficos
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Modelado y evaluación
# ------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score , cohen_kappa_score, roc_curve,roc_auc_score
from sklearn.model_selection import GridSearchCV

# Configuración warnings
# ------------------------------------------------------------------------------
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Load the airline dataset from a local pickle file and preview its first two rows.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files produced by a trusted source.
df = pd.read_pickle('data/airline_estand_encod.pkl')
df.head(2)

Unnamed: 0,satisfaction,gender,customer_type,age,type_of_travel,class,flight_distance,seat_comfort,food_and_drink,gate_location,...,checkin_service,cleanliness,online_boarding,departure_delay_in_minutes,dep_conv_0,dep_conv_1,dep_conv_2,dep_conv_3,dep_conv_4,dep_conv_5
0,1,1,1,0.25,1,0,-0.122137,4,4,3,...,2,4,3,0.0,0,0,0,0,0,1
1,0,1,1,0.583333,1,0,-0.715013,0,0,2,...,0,0,0,0.0,0,0,1,0,0,0


In [6]:
# Split the dataset into predictors (X1) and the target column (y1)
X1, y1 = df.drop(columns="satisfaction"), df["satisfaction"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    X1,
    y1,
    test_size=0.2,
    random_state=42,
)

# Build a decision tree with default hyperparameters and fit it on the training partition
arbol = DecisionTreeClassifier(random_state=0).fit(x_train1, y_train1)

# Predict on both partitions so train and test metrics can be compared
y_pred_train_esta = arbol.predict(x_train1)
y_pred_test_esta = arbol.predict(x_test1)

In [7]:
# Helper that collects the classification metrics for the train and test partitions.


def metricas(clases_reales_test, clases_predichas_test, clases_reales_train, clases_predichas_train, modelo):
    """Build a two-row summary of classification metrics for one model.

    Parameters
    ----------
    clases_reales_test, clases_predichas_test
        True and predicted labels for the test partition.
    clases_reales_train, clases_predichas_train
        True and predicted labels for the train partition.
    modelo
        Value written to the "modelo" column of both rows.

    Returns
    -------
    pandas.DataFrame
        Columns "accuracy", "precision", "recall", "f1", "kappa", "set" and
        "modelo"; the first row is the test partition, the second is train.
    """
    # Metric column name -> scoring function, in output column order
    calculadoras = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
        "kappa": cohen_kappa_score,
    }

    # Row order of the resulting table: test first, then train
    particiones = (
        ("test", clases_reales_test, clases_predichas_test),
        ("train", clases_reales_train, clases_predichas_train),
    )

    columnas = {
        nombre: [calcular(reales, predichas) for _, reales, predichas in particiones]
        for nombre, calcular in calculadoras.items()
    }
    columnas["set"] = [etiqueta for etiqueta, _, _ in particiones]

    resumen = pd.DataFrame(columnas)
    resumen["modelo"] = modelo
    return resumen

In [8]:
# Compute train/test metrics to check for overfitting or underfitting, so the tree depth can be adjusted based on these results

dt_results1 = metricas(y_test1, y_pred_test_esta, y_train1, y_pred_train_esta, "Decission Tree Esta I")
dt_results1

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.9265,0.932759,0.932588,0.932674,0.851753,test,Decission Tree Esta I
1,1.0,1.0,1.0,1.0,1.0,train,Decission Tree Esta I


Como era de esperar, tenemos un gran overfitting. Ajustamos hiperparámetros:

In [11]:
# Reference values for choosing the hyperparameter grid: the square root of the number
# of predictor columns, and the depth reached by the tree fitted with default settings.
max_features = np.sqrt(len(x_train1.columns))
print(f'Max_features: {max_features}')
print(f'Max_depth: {arbol.tree_.max_depth}')

Max_features: 5.0990195135927845
Max_depth: 32


In [12]:
# Hyperparameter grid for the search: each key is a DecisionTreeClassifier
# hyperparameter and each list holds the candidate values to try.
param = dict(
    max_depth=[2, 4, 6, 10, 12, 14],
    max_features=[1, 2, 3, 4, 5],
    min_samples_split=[50, 100, 200, 350],
    min_samples_leaf=[50, 100, 200],
)

In [13]:
# Grid search over every combination in `param`, evaluated with 10-fold cross-validation.
# No `scoring` is passed, so candidates are ranked with the estimator's default score
# (accuracy for classifiers, per the scikit-learn documentation).
gs = GridSearchCV(
            estimator=DecisionTreeClassifier(random_state= 42),
            param_grid= param,
            cv=10,
            verbose=0)

In [14]:
# Run the grid search on the training partition and keep the best estimator it found.
gs.fit(x_train1, y_train1)

mejor_modelo = gs.best_estimator_
mejor_modelo

In [15]:
# Predict with the tuned tree on both partitions and compute its train/test metrics.
y_pred_test_esta2 = mejor_modelo.predict(x_test1)
y_pred_train_esta2 = mejor_modelo.predict(x_train1)

dt_results2 = metricas(y_test1, y_pred_test_esta2, y_train1,  y_pred_train_esta2, "Decision tree Esta II")
dt_results2

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.8949,0.905744,0.901264,0.903498,0.788119,test,Decision tree Esta II
1,0.904475,0.914193,0.910807,0.912497,0.807332,train,Decision tree Esta II


In [9]:
# Load the metrics already stored for the previously fitted models:
# NOTE(review): the saved outputs in this notebook show a "kapppa" column, while metricas() emits "kappa" — confirm the CSV header matches, otherwise concatenating new results yields two separate columns.

resultados = pd.read_csv('data/metricas.csv')

In [16]:
# Append the tuned decision-tree metrics to the stored results.
# ignore_index=True renumbers the rows 0..n-1; without it the appended rows keep
# their own 0/1 labels and the combined table ends up with duplicate index labels.
resultados = pd.concat([resultados, dt_results2], axis=0, ignore_index=True)
resultados

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.874159,0.871902,0.877096,0.874491,0.748319,test,Regresión logistica Bal
1,0.875948,0.872365,0.880783,0.876554,0.751895,train,Regresión logistica Bal
2,0.8714,0.88054,0.884411,0.882471,0.740499,test,Regresión logistica Esta
3,0.878975,0.883505,0.896955,0.890179,0.755421,train,Regresión logistica Esta
4,0.8701,0.873966,0.890456,0.882134,0.737491,test,Regresión logistica
5,0.875725,0.875239,0.901207,0.888033,0.748477,train,Regresión logistica
0,0.8949,0.905744,0.901264,0.903498,0.788119,test,Decision tree Esta II
1,0.904475,0.914193,0.910807,0.912497,0.807332,train,Decision tree Esta II


Vemos que los resultados de todos nuestros modelos son buenos, pero sin duda, el que mejores resultados ha aportado, de momento, es el Decision Tree (en test: kappa ≈ 0,79, accuracy ≈ 89% y precision ≈ 91%).

Vamos a ver cuáles son nuestros mejores predictores:

In [33]:
# Pair every predictor with its importance in the tuned tree, rank them from
# most to least important and renumber the rows 0..n-1
importancia_predictores_esta = (
    pd.DataFrame(
        {
            'predictor': x_train1.columns,
            'importancia': mejor_modelo.feature_importances_,
        }
    )
    .sort_values(by="importancia", ascending=False)
    .reset_index(drop=True)
)

# Header printed above the table
print("Importancia de los predictores en el modelo")
print("-------------------------------------------")

importancia_predictores_esta

Importancia de los predictores en el modelo
-------------------------------------------


Unnamed: 0,predictor,importancia
0,inflight_entertainment,0.333922
1,ease_of_online_booking,0.278383
2,seat_comfort,0.086407
3,onboard_service,0.07227
4,leg_room_service,0.052297
5,class,0.033468
6,food_and_drink,0.020079
7,type_of_travel,0.018343
8,gender,0.016797
9,customer_type,0.015944


Únicamente teníamos una variable categórica spliteada en columnas (por considerar que no tenía orden), y aunque los valores no son altos, vamos a unificarla en una única variable para poder comprobar bien el orden de importancia de todas las variables:

In [34]:
# Collect the rows of every dep_conv_* column (the categorical variable that was
# split into one column per category). Rows are selected by predictor name rather
# than by hard-coded positions, because the positions depend on how the fitted
# model happens to rank the features.
time_convenient = importancia_predictores_esta[
    importancia_predictores_esta["predictor"].str.startswith("dep_conv_")
]
time_convenient

Unnamed: 0,predictor,importancia
12,dep_conv_5,0.006859
15,dep_conv_4,0.0052
22,dep_conv_1,0.000737
23,dep_conv_2,2e-05
24,dep_conv_0,0.0
25,dep_conv_3,0.0


In [35]:
# Remove the individual dep_conv_* rows from the ranking; their importance is re-added afterwards as a single aggregated row.
# NOTE: the drop happens in place, so re-running this cell raises KeyError because the labels are already gone.
importancia_predictores_esta.drop(time_convenient.index, inplace = True)

importancia_predictores_esta

Unnamed: 0,predictor,importancia
0,inflight_entertainment,0.333922
1,ease_of_online_booking,0.278383
2,seat_comfort,0.086407
3,onboard_service,0.07227
4,leg_room_service,0.052297
5,class,0.033468
6,food_and_drink,0.020079
7,type_of_travel,0.018343
8,gender,0.016797
9,customer_type,0.015944


In [36]:
# Add one "time_convenient" row holding the summed importance of the dep_conv_*
# columns. The row is appended under a label that is guaranteed to be unused
# (current max label + 1) instead of a hard-coded 12, which would silently
# overwrite a real predictor whenever the fitted ranking changes.
nueva_etiqueta = importancia_predictores_esta.index.max() + 1
importancia_predictores_esta.loc[nueva_etiqueta] = ["time_convenient", time_convenient["importancia"].sum()]

# Re-rank from most to least important and renumber the rows 0..n-1
importancia_predictores_esta = (
    importancia_predictores_esta
    .sort_values(by="importancia", ascending=False)
    .reset_index(drop=True)
)

importancia_predictores_esta

Unnamed: 0,predictor,importancia
0,inflight_entertainment,0.333922
1,ease_of_online_booking,0.278383
2,seat_comfort,0.086407
3,onboard_service,0.07227
4,leg_room_service,0.052297
5,class,0.033468
6,food_and_drink,0.020079
7,type_of_travel,0.018343
8,gender,0.016797
9,customer_type,0.015944


Con respecto a los datos, vemos que:
- Las dos variables que más peso tienen en la predicción (juntas acumulan en torno al 61% de la importancia total del modelo) son "inflight_entertainment" (casi el 34% de la importancia) y "ease_of_online_booking" (prácticamente el 28%). Estos porcentajes miden cuánto contribuye cada variable a las divisiones del árbol, no qué porcentaje de la satisfacción de los clientes explican.

___
#### Vamos a ver ahora cómo quedaría el modelo con los datos balanceados:

In [37]:
# Load the balanced version of the dataset from a local pickle file and preview its first two rows.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files produced by a trusted source.
df_bal = pd.read_pickle('data/airline_estand_encod_bal.pkl')
df_bal.head(2)

Unnamed: 0,gender,customer_type,age,type_of_travel,class,flight_distance,seat_comfort,food_and_drink,gate_location,inflight_wifi_service,...,cleanliness,online_boarding,departure_delay_in_minutes,dep_conv_0,dep_conv_1,dep_conv_2,dep_conv_3,dep_conv_4,dep_conv_5,satisfaction
1,1,1,0.583333,1,0,-0.715013,0,0,2,2,...,0,0,0.0,0,0,1,0,0,0,0
3,0,0,-0.625,0,1,0.145038,0,0,0,4,...,3,5,0.0,0,0,0,0,0,1,0


In [38]:
# Split the balanced dataset into predictors (X2) and the target column (y2)

X2 = df_bal.drop("satisfaction", axis = 1)
y2 = df_bal["satisfaction"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
x_train2, x_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size = 0.2, random_state = 42)

# Decision tree with default hyperparameters for the balanced data
arbol2 = DecisionTreeClassifier(random_state =0)

# Fit it on the balanced training partition
arbol2.fit(x_train2, y_train2)

# Predict with the tree fitted on the balanced data (arbol2). The previous
# version called `arbol`, i.e. the tree trained on the unbalanced dataset,
# so these predictions did not belong to the model fitted in this cell.
y_pred_test_bal = arbol2.predict(x_test2)
y_pred_train_bal = arbol2.predict(x_train2)

In [39]:
# Reuse the same grid-search object (same hyperparameter grid and 10-fold CV) on the balanced training data:

gs.fit(x_train2, y_train2)

mejor_modelo_bal = gs.best_estimator_
mejor_modelo_bal

In [41]:
# Predict with the tuned tree for the balanced data on both partitions (this overwrites the
# y_pred_*_bal values assigned in the earlier fitting cell) and compute its train/test metrics.
y_pred_test_bal = mejor_modelo_bal.predict(x_test2)
y_pred_train_bal = mejor_modelo_bal.predict(x_train2)

dt_results3 = metricas(y_test2, y_pred_test_bal, y_train2,  y_pred_train_bal, "Decision tree Bal")
dt_results3

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.900187,0.901661,0.898279,0.899967,0.800375,test,Decision tree Bal
1,0.905038,0.907517,0.902013,0.904757,0.810075,train,Decision tree Bal


In [42]:
# Append the balanced decision-tree metrics to the accumulated results.
# ignore_index=True renumbers the rows 0..n-1; without it the appended rows keep
# their own 0/1 labels and the combined table ends up with duplicate index labels.
resultados = pd.concat([resultados, dt_results3], axis=0, ignore_index=True)
resultados

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.874159,0.871902,0.877096,0.874491,0.748319,test,Regresión logistica Bal
1,0.875948,0.872365,0.880783,0.876554,0.751895,train,Regresión logistica Bal
2,0.8714,0.88054,0.884411,0.882471,0.740499,test,Regresión logistica Esta
3,0.878975,0.883505,0.896955,0.890179,0.755421,train,Regresión logistica Esta
4,0.8701,0.873966,0.890456,0.882134,0.737491,test,Regresión logistica
5,0.875725,0.875239,0.901207,0.888033,0.748477,train,Regresión logistica
0,0.8949,0.905744,0.901264,0.903498,0.788119,test,Decision tree Esta II
1,0.904475,0.914193,0.910807,0.912497,0.807332,train,Decision tree Esta II
0,0.900187,0.901661,0.898279,0.899967,0.800375,test,Decision tree Bal
1,0.905038,0.907517,0.902013,0.904757,0.810075,train,Decision tree Bal


In [49]:
# Write the combined metrics table to CSV without the index column.
# NOTE(review): this overwrites the same file that `resultados` was read from earlier in the notebook, so each full re-run appends the decision-tree rows again — confirm this is intended.
resultados.to_csv('data/metricas.csv', index=False)

Sorprendentemente, de momento, el modelo que mejor funciona es el Decision Tree con los datos de df balanceado. Vamos a comprobar si el orden de importancia se mantiene:

In [44]:
# Feature importances of the tuned tree fitted on the balanced data: pair every
# predictor with its importance, rank from most to least important and renumber
# the rows 0..n-1
importancia_predictores_bal = (
    pd.DataFrame(
        {
            'predictor': x_train2.columns,
            'importancia': mejor_modelo_bal.feature_importances_,
        }
    )
    .sort_values(by="importancia", ascending=False)
    .reset_index(drop=True)
)

# Header printed above the table
print("Importancia de los predictores en el modelo")
print("-------------------------------------------")

importancia_predictores_bal

Importancia de los predictores en el modelo
-------------------------------------------


Unnamed: 0,predictor,importancia
0,ease_of_online_booking,0.279576
1,inflight_entertainment,0.253534
2,customer_type,0.116599
3,seat_comfort,0.082395
4,food_and_drink,0.04906
5,class,0.030672
6,online_boarding,0.026244
7,type_of_travel,0.024008
8,baggage_handling,0.020261
9,gender,0.018217


In [46]:
# Collect the rows of every dep_conv_* column (the categorical variable that was
# split into one column per category). Rows are selected by predictor name rather
# than by hard-coded positions, because the positions depend on how the fitted
# model happens to rank the features.
time_convenient_bal = importancia_predictores_bal[
    importancia_predictores_bal["predictor"].str.startswith("dep_conv_")
]
time_convenient_bal

Unnamed: 0,predictor,importancia
14,dep_conv_5,0.009645
19,dep_conv_0,0.003004
22,dep_conv_4,0.001358
23,dep_conv_1,0.000257
24,dep_conv_3,5.2e-05
25,dep_conv_2,1.4e-05


In [47]:
# Remove the individual dep_conv_* rows from the ranking; their importance is re-added afterwards as a single aggregated row.
# NOTE: the drop happens in place, so re-running this cell raises KeyError because the labels are already gone.
importancia_predictores_bal.drop(time_convenient_bal.index, inplace = True)

importancia_predictores_bal

Unnamed: 0,predictor,importancia
0,ease_of_online_booking,0.279576
1,inflight_entertainment,0.253534
2,customer_type,0.116599
3,seat_comfort,0.082395
4,food_and_drink,0.04906
5,class,0.030672
6,online_boarding,0.026244
7,type_of_travel,0.024008
8,baggage_handling,0.020261
9,gender,0.018217


In [48]:
# Add one "time_convenient" row holding the summed importance of the dep_conv_*
# columns. The row is appended under a label that is guaranteed to be unused
# (current max label + 1) instead of a hard-coded 19, which would silently
# overwrite a real predictor whenever the fitted ranking changes.
nueva_etiqueta_bal = importancia_predictores_bal.index.max() + 1
importancia_predictores_bal.loc[nueva_etiqueta_bal] = ["time_convenient", time_convenient_bal["importancia"].sum()]

# Re-rank from most to least important and renumber the rows 0..n-1
importancia_predictores_bal = (
    importancia_predictores_bal
    .sort_values(by="importancia", ascending=False)
    .reset_index(drop=True)
)

importancia_predictores_bal

Unnamed: 0,predictor,importancia
0,ease_of_online_booking,0.279576
1,inflight_entertainment,0.253534
2,customer_type,0.116599
3,seat_comfort,0.082395
4,food_and_drink,0.04906
5,class,0.030672
6,online_boarding,0.026244
7,type_of_travel,0.024008
8,baggage_handling,0.020261
9,gender,0.018217
