# Lección 5: Decision Tree

### Ejercicios de pair programming 31 de enero.

In [1]:
# Importamos las librerías necesarias
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score , cohen_kappa_score, roc_curve,roc_auc_score
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

In [33]:
# Load the standardized and encoded dataset; the first CSV column is used as the row index.
df_est = pd.read_csv(
    "datos/est_enc_airlines.csv",
    index_col=0,
)
df_est.head()

Unnamed: 0,satisfaction,gender,customer_type,age,type_of_travel,class,seat_comfort,departure/arrival_time_convenient,food_and_drink,gate_location,...,ease_of_online_booking,on-board_service,leg_room_service,baggage_handling,checkin_service,cleanliness,online_boarding,departure_delay_in_minutes,arrival_delay_in_minutes,distance
0,1,1,1,1.041667,1,0,-1.5,-1.5,-1.5,-0.5,...,-0.333333,-1.0,-1.333333,-0.5,2.0,-0.5,-1.0,0.0,0.0,0.0
1,1,0,1,0.291667,1,2,-1.5,-1.5,-1.5,0.0,...,-0.333333,0.0,0.0,0.0,-1.0,-0.5,-1.0,25.833333,23.461538,0.0
2,1,1,1,-1.041667,1,0,-1.5,-1.5,-1.5,0.0,...,-0.666667,-1.0,-0.333333,0.0,1.0,0.0,-1.0,0.0,0.0,0.0
3,1,1,1,0.833333,1,0,-1.5,-1.5,-1.5,0.0,...,-1.0,-3.0,-1.333333,-1.5,1.0,-1.5,-0.5,0.0,0.0,0.0
4,1,1,1,1.25,1,0,-1.5,-1.5,-1.5,0.0,...,-0.666667,-2.0,-1.333333,-1.0,1.0,-1.0,0.5,0.0,0.0,0.0


In [3]:
# Split the data into the response variable (y1) and the predictors (X1).
y1 = df_est["satisfaction"]
X1 = df_est.drop(columns="satisfaction")

In [4]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    X1,
    y1,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Instantiate the decision tree with default hyperparameters and a fixed seed.
arbol = DecisionTreeClassifier(random_state=0)

In [6]:
# Fit the tree on the training partition.
arbol.fit(X=x_train1, y=y_train1)

In [7]:
#fig = plt.figure(figsize = (10,6))
#tree.plot_tree(arbol, feature_names = x_train1.columns, filled = True)
#plt.show()

In [8]:
# Square root of the number of predictor columns. This is a reference for the
# largest max_features value worth trying (about 4 here), not for the tree depth.
max_features = np.sqrt(x_train1.shape[1])
max_features

4.69041575982343

In [9]:
# Depth reached by the fitted, unconstrained tree.
print(arbol.get_depth())

34


In [10]:
# Predict on both partitions (train and test) so their metrics can be compared.
y_pred_train_esta = arbol.predict(x_train1)
y_pred_test_esta = arbol.predict(x_test1)

In [11]:
def metricas(clases_reales_test, clases_predichas_test, clases_reales_train, clases_predichas_train, modelo):
    """Build a two-row summary of classification metrics for one model.

    Parameters
    ----------
    clases_reales_test, clases_predichas_test
        True and predicted labels for the test partition.
    clases_reales_train, clases_predichas_train
        True and predicted labels for the train partition.
    modelo
        Value written to the ``modelo`` column of every row to identify the model.

    Returns
    -------
    pandas.DataFrame
        One row for the test set followed by one row for the train set, with
        columns ``accuracy``, ``precision``, ``recall``, ``f1``, ``kapppa``,
        ``set`` and ``modelo``.
    """
    # Column label -> scoring function. The "kapppa" spelling is kept as-is
    # because the results tables this frame is concatenated with use that name.
    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
        "kapppa": cohen_kappa_score,
    }

    # (true labels, predicted labels) pairs, in the row order of the output.
    particiones = (
        (clases_reales_test, clases_predichas_test),
        (clases_reales_train, clases_predichas_train),
    )

    columnas = {
        etiqueta: [scorer(reales, predichas) for reales, predichas in particiones]
        for etiqueta, scorer in scorers.items()
    }
    columnas["set"] = ["test", "train"]

    resumen = pd.DataFrame(columnas)
    resumen["modelo"] = modelo
    return resumen

In [12]:
# Compute the metrics on both partitions to check for overfitting or underfitting;
# the tree depth is adjusted based on these results.
dt_results1 = metricas(
    clases_reales_test=y_test1,
    clases_predichas_test=y_pred_test_esta,
    clases_reales_train=y_train1,
    clases_predichas_train=y_pred_train_esta,
    modelo="Decission Tree Esta I",
)
dt_results1

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.940945,0.94475,0.948185,0.946465,0.880623,test,Decission Tree Esta I
1,0.99999,1.0,0.999982,0.999991,0.999981,train,Decission Tree Esta I


In [13]:
# Hyperparameter grid explored by the grid search.
param = dict(
    max_depth=[2, 4, 6],
    max_features=[1, 2, 3, 4],
    min_samples_split=[10, 50, 100],
    min_samples_leaf=[10, 50, 100],
)

In [14]:
# 10-fold cross-validated grid search over `param`, using a decision tree with a
# fixed seed as the base estimator.
# verbose=0 keeps the search silent. The previous value (-1) is outside the
# documented non-negative range and is rejected by scikit-learn releases that
# validate constructor parameters.
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param,
    cv=10,
    verbose=0,
)

In [15]:
# Run the grid search on the training partition.
gs.fit(X=x_train1, y=y_train1)

In [16]:
# Estimator refitted by the grid search with the best-scoring parameter combination.
mejor_modelo = gs.best_estimator_
mejor_modelo

In [17]:
#fig = plt.figure(figsize=(40, 20))
#tree.plot_tree(mejor_modelo, feature_names=x_train1.columns, filled=True);

In [18]:
# Predict with the tuned tree on both partitions (train and test).
y_pred_train_esta2 = mejor_modelo.predict(x_train1)
y_pred_test_esta2 = mejor_modelo.predict(x_test1)

In [19]:
# Metrics of the tuned tree on both partitions.
dt_results2 = metricas(
    clases_reales_test=y_test1,
    clases_predichas_test=y_pred_test_esta2,
    clases_reales_train=y_train1,
    clases_predichas_train=y_pred_train_esta2,
    modelo="Decision tree Esta II",
)
dt_results2

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.870034,0.861253,0.910636,0.885256,0.735679,test,Decision tree Esta II
1,0.869379,0.859231,0.910101,0.883935,0.734867,train,Decision tree Esta II


In [20]:
# Stack the results of both decision trees row-wise so they are easier to compare.
df_decision_results = pd.concat((dt_results1, dt_results2))
df_decision_results

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.940945,0.94475,0.948185,0.946465,0.880623,test,Decission Tree Esta I
1,0.99999,1.0,0.999982,0.999991,0.999981,train,Decission Tree Esta I
0,0.870034,0.861253,0.910636,0.885256,0.735679,test,Decision tree Esta II
1,0.869379,0.859231,0.910101,0.883935,0.734867,train,Decision tree Esta II


In [21]:
# Load the previously saved logistic regression results; the first CSV column is the row index.
df_resultado = pd.read_csv(
    "datos/resultados_airlines_logistica.csv",
    index_col=0,
)
df_resultado


Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.581267,0.569088,0.986085,0.721681,0.077901,test,Regresión logistica
1,0.835319,0.849344,0.849329,0.849337,0.667762,train,Regresión logistica
0,0.837157,0.852453,0.851619,0.852036,0.670988,test,Regresión logistica Esta
1,0.835329,0.849347,0.849347,0.849347,0.667781,train,Regresión logistica Esta


In [27]:
# Combine the logistic regression and decision tree results into one table with a fresh 0..n-1 index.
df_DT_LR_results = pd.concat(
    [df_resultado, df_decision_results],
    ignore_index=True,
)
df_DT_LR_results

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.581267,0.569088,0.986085,0.721681,0.077901,test,Regresión logistica
1,0.835319,0.849344,0.849329,0.849337,0.667762,train,Regresión logistica
2,0.837157,0.852453,0.851619,0.852036,0.670988,test,Regresión logistica Esta
3,0.835329,0.849347,0.849347,0.849347,0.667781,train,Regresión logistica Esta
4,0.940945,0.94475,0.948185,0.946465,0.880623,test,Decission Tree Esta I
5,0.99999,1.0,0.999982,0.999991,0.999981,train,Decission Tree Esta I
6,0.870034,0.861253,0.910636,0.885256,0.735679,test,Decision tree Esta II
7,0.869379,0.859231,0.910101,0.883935,0.734867,train,Decision tree Esta II


In [28]:
# Keep only the two models being compared: drop the "Regresión logistica" rows
# (labels 0-1) and the "Decission Tree Esta I" rows (labels 4-5).
# Rebinding with errors="ignore" (instead of inplace=True) makes this cell safe
# to re-run: on a second run the labels are already gone and nothing is raised.
df_DT_LR_results = df_DT_LR_results.drop(index=[0, 1, 4, 5], errors="ignore")
df_DT_LR_results

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
2,0.837157,0.852453,0.851619,0.852036,0.670988,test,Regresión logistica Esta
3,0.835329,0.849347,0.849347,0.849347,0.667781,train,Regresión logistica Esta
6,0.870034,0.861253,0.910636,0.885256,0.735679,test,Decision tree Esta II
7,0.869379,0.859231,0.910101,0.883935,0.734867,train,Decision tree Esta II


In [31]:
# Render the comparison table with a colour gradient on the numeric columns.
df_DT_LR_results.style.background_gradient(cmap='Spectral')

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
2,0.837157,0.852453,0.851619,0.852036,0.670988,test,Regresión logistica Esta
3,0.835329,0.849347,0.849347,0.849347,0.667781,train,Regresión logistica Esta
6,0.870034,0.861253,0.910636,0.885256,0.735679,test,Decision tree Esta II
7,0.869379,0.859231,0.910101,0.883935,0.734867,train,Decision tree Esta II


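Después de realizar los modelos de regresión logística y de Decision Tree, por el momento nuestro mejor resultado es el de este último (Decision Tree II, ajustado con GridSearchCV), realizado con nuestras variables estandarizadas y codificadas.
Descartamos el primer árbol porque presenta overfitting: sus métricas son prácticamente perfectas en train (0.99) y bajan en test (0.94).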

- **Accuracy:** Nuestro modelo acierta el 87% de las veces.
  
- **Precision:** Nuestro valor es de 0.86, eso significa que nuestro modelo se equivocará un 14% de las veces que prediga que un cliente no estará satisfecho.
  
- **Recall:** Tenemos un valor de 0.91, es decir que nuestro modelo es capaz de identificar un 91% de los pasajeros insatisfechos
  
- **Kappa:** Nuestro valor es de 0.74; lo que implica que la concordancia entre las predicciones y los valores reales es buena y nuestro modelo no está acertando sus predicciones al azar.
  
- **F1:** Nuestro valor es de 0.89, cercano a 1; al ser la media armónica de precision y recall, indica un buen equilibrio entre ambas métricas.

In [32]:
# Save the comparison table to a CSV file.
df_DT_LR_results.to_csv("datos/resultados_airlines_LR_DT.csv")