In [2]:
# Tratamiento de datos
# ------------------------------------------------------------------------------
import numpy as np
import pandas as pd

# Gráficos
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Modelado y evaluación
# ------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score , cohen_kappa_score, roc_curve,roc_auc_score
from sklearn.model_selection import GridSearchCV

# Configuración warnings
# ------------------------------------------------------------------------------
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_pickle("../datos/setas/setas_balance.pkl")

In [4]:
df.head()

Unnamed: 0,has-ring,cap-diameter_e,stem-height_e,stem-width_e,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_o,cap-shape_p,cap-shape_s,...,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w,season_a,season_s,season_u,season_w,class
0,1,-0.724215,-0.195515,-0.322348,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,2.094163,2.136781,0.553065,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,-0.551846,-0.167804,-0.780044,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,-0.648615,0.243234,-0.220331,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
4,0,0.65171,-0.634264,0.787428,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,1


In [5]:
df.isnull().sum()

has-ring          0
cap-diameter_e    0
stem-height_e     0
stem-width_e      0
cap-shape_b       0
                 ..
season_a          0
season_s          0
season_u          0
season_w          0
class             0
Length: 89, dtype: int64

In [6]:
# split the data into the predictors (X1) and the target column "class" (y1)

X1 = df.drop("class", axis = 1)
y1 = df["class"]

In [7]:
# split into train and test sets (20% held out for test, fixed seed for reproducibility)
x_train1, x_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size = 0.2, random_state = 42)

In [8]:
# create the model object, just as we did for linear regression
arbol = DecisionTreeClassifier(random_state =0)

# fit the model on the training split, just as in linear regression
arbol.fit(x_train1, y_train1)

In [None]:
"""

fig = plt.figure(figsize = (10,6))
tree.plot_tree(arbol, feature_names = x_train1.columns, filled = True)
plt.show()

"""

In [9]:
# max features: the square root of the number of predictors (~9.38) is used as the reference
# for the max_features hyperparameter (this value is about features, not about tree depth)

max_features = np.sqrt(len(x_train1.columns))
max_features

9.38083151964686

In [10]:
# max depth

print(arbol.tree_.max_depth)

28


In [11]:
# predict on both splits with the unrestricted tree: the test set and the train set
y_pred_test_esta = arbol.predict(x_test1)
y_pred_train_esta = arbol.predict(x_train1)

In [26]:
def metricas(clases_reales_test, clases_predichas_test, clases_reales_train, clases_predichas_train, modelo):
    """Summarise classification metrics for the test and train splits of one model.

    Parameters
    ----------
    clases_reales_test, clases_predichas_test
        True and predicted labels for the test split.
    clases_reales_train, clases_predichas_train
        True and predicted labels for the train split.
    modelo
        Value written to the "modelo" column of every row (the model's label).

    Returns
    -------
    pandas.DataFrame
        Two rows (index 0 = test, index 1 = train) with the columns
        accuracy, precision, recall, f1, kappa, set and modelo.
    """
    # column name -> scoring function, in the order the columns must appear
    puntuadores = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
        "kappa": cohen_kappa_score,
    }

    # test goes first so that it ends up in row 0
    particiones = (
        ("test", clases_reales_test, clases_predichas_test),
        ("train", clases_reales_train, clases_predichas_train),
    )

    filas = []
    for nombre_set, reales, predichas in particiones:
        fila = {columna: puntuar(reales, predichas) for columna, puntuar in puntuadores.items()}
        fila["set"] = nombre_set
        filas.append(fila)

    resultados = pd.DataFrame(filas)
    resultados["modelo"] = modelo
    return resultados

In [27]:
# compute the metrics on both splits to check for overfitting or underfitting, so the depth can be adjusted based on these results

dt_results1 = metricas(y_test1, y_pred_test_esta, y_train1, y_pred_train_esta, "Decission Tree Esta I")
dt_results1

Unnamed: 0,accuracy,precision,recall,f1,kappa,set,modelo
0,0.994217,0.993426,0.995061,0.994243,0.988434,test,Decission Tree Esta I
1,1.0,1.0,1.0,1.0,1.0,train,Decission Tree Esta I


In [28]:
# first, define a dictionary with the hyperparameters to tune and the candidate values for each of them

param = {"max_depth": [16, 18, 20, 22, 24, 26, 28], # the unrestricted tree reached depth 28 with perfect train metrics (overfitting), so candidate depths go from 16 up to that value
        "max_features": [4,5,6,7,8,9],# upper limit is 9 because the square root of the number of predictors, computed in an earlier cell, is ~9.38
        # these two hyperparameters are harder to choose; according to the original author these are commonly used values
        "min_samples_split": [50, 100, 200],
        "min_samples_leaf": [50,100, 200]} 

In [16]:
# once the grid is defined, set up the exhaustive search with GridSearchCV

gs = GridSearchCV(
            estimator=DecisionTreeClassifier(random_state= 42), # model whose hyperparameters are tuned
            param_grid= param, # hyperparameter combinations to evaluate
            cv=10, # 10-fold cross-validation for every combination
            verbose=0) # 0 keeps the search silent; negative values are rejected by scikit-learn's parameter validation

In [18]:
# ajustamos el modelo que acabamos de definir en el GridSearch

gs.fit(x_train1, y_train1)

In [19]:
# best_estimator_ is the estimator selected by the grid search; for this run it reports
# max_depth=26, max_features=9, min_samples_leaf=50 and min_samples_split=50
mejor_modelo = gs.best_estimator_
mejor_modelo

Max_depth = 26
max_features = 9
min_samples_leaf = 50
min_samples_split = 50
random_state = 42

In [None]:
"""

# veamos ahora que pinta tiene nuestro árbol

fig = plt.figure(figsize=(40, 20))
tree.plot_tree(mejor_modelo, feature_names=x_train1.columns, filled=True);

"""

In [20]:
y_pred_test_esta2 = mejor_modelo.predict(x_test1)
y_pred_train_esta2 = mejor_modelo.predict(x_train1)

In [29]:
dt_results2 = metricas(y_test1, y_pred_test_esta2, y_train1,  y_pred_train_esta2, "Decision tree Esta II")
dt_results2

Unnamed: 0,accuracy,precision,recall,f1,kappa,set,modelo
0,0.897321,0.875277,0.927563,0.900662,0.794594,test,Decision tree Esta II
1,0.903066,0.88048,0.932542,0.905763,0.806142,train,Decision tree Esta II


In [30]:
# stack the result dataframes of both decision trees so they can be compared more easily

df_decision_results = pd.concat([dt_results1, dt_results2], axis = 0)
df_decision_results

Unnamed: 0,accuracy,precision,recall,f1,kappa,set,modelo
0,0.994217,0.993426,0.995061,0.994243,0.988434,test,Decission Tree Esta I
1,1.0,1.0,1.0,1.0,1.0,train,Decission Tree Esta I
0,0.897321,0.875277,0.927563,0.900662,0.794594,test,Decision tree Esta II
1,0.903066,0.88048,0.932542,0.905763,0.806142,train,Decision tree Esta II


In [31]:
# in the metrics lesson the logistic-regression metrics were saved to a CSV file;
# load that CSV so all the models built so far can be compared to find the best one

df_logistic_results = pd.read_csv("../datos/setas/metricas_RLogistica.csv", index_col = 0)
df_logistic_results

Unnamed: 0,accuracy,precision,recall,f1,kappa,set,modelo
0,0.782486,0.767488,0.812794,0.789492,0.564871,test,Regresión logistica
1,0.780696,0.765383,0.808967,0.786572,0.561414,train,Regresión logistica


In [32]:
# concatenate all the results (logistic regression first, then both decision trees) with a fresh 0..n-1 index

df_DT_LR_results = pd.concat([df_logistic_results, df_decision_results], axis = 0).reset_index(drop=True)
df_DT_LR_results

Unnamed: 0,accuracy,precision,recall,f1,kappa,set,modelo
0,0.782486,0.767488,0.812794,0.789492,0.564871,test,Regresión logistica
1,0.780696,0.765383,0.808967,0.786572,0.561414,train,Regresión logistica
2,0.994217,0.993426,0.995061,0.994243,0.988434,test,Decission Tree Esta I
3,1.0,1.0,1.0,1.0,1.0,train,Decission Tree Esta I
4,0.897321,0.875277,0.927563,0.900662,0.794594,test,Decision tree Esta II
5,0.903066,0.88048,0.932542,0.905763,0.806142,train,Decision tree Esta II


In [33]:
# add a colour gradient to the dataframe so the comparison between models is easier to read
df_DT_LR_results.style.background_gradient(cmap='seismic')

Unnamed: 0,accuracy,precision,recall,f1,kappa,set,modelo
0,0.782486,0.767488,0.812794,0.789492,0.564871,test,Regresión logistica
1,0.780696,0.765383,0.808967,0.786572,0.561414,train,Regresión logistica
2,0.994217,0.993426,0.995061,0.994243,0.988434,test,Decission Tree Esta I
3,1.0,1.0,1.0,1.0,1.0,train,Decission Tree Esta I
4,0.897321,0.875277,0.927563,0.900662,0.794594,test,Decision tree Esta II
5,0.903066,0.88048,0.932542,0.905763,0.806142,train,Decision tree Esta II


In [34]:
# also save the combined metrics dataframe to a CSV file

df_DT_LR_results.to_csv("../datos/setas/metricas_modelos.csv")

In [37]:
# pair every predictor name with the importance the tuned tree assigned to it,
# ordered from most to least important
importancia_predictores = (
    pd.DataFrame(
        {"predictor": x_train1.columns,
         "importancia": mejor_modelo.feature_importances_}
    )
    .sort_values("importancia", ascending=False)
)

# print a header above the table
print("Importancia de los predictores en el modelo")
print("-------------------------------------------")
importancia_predictores

Importancia de los predictores en el modelo
-------------------------------------------


Unnamed: 0,predictor,importancia
1,cap-diameter_e,0.112858
3,stem-width_e,0.097080
2,stem-height_e,0.080843
66,stem-color_w,0.056497
18,cap-surface_s,0.047723
...,...,...
63,stem-color_p,0.000000
74,ring-type_r,0.000000
16,cap-surface_k,0.000000
72,ring-type_m,0.000000


In [38]:
df.head(2)

Unnamed: 0,has-ring,cap-diameter_e,stem-height_e,stem-width_e,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_o,cap-shape_p,cap-shape_s,...,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w,season_a,season_s,season_u,season_w,class
0,1,-0.724215,-0.195515,-0.322348,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,2.094163,2.136781,0.553065,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [53]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [83]:
df_predictores = importancia_predictores.reset_index(drop=True)

In [84]:
# show the importance table after its index was reset to 0..n-1
# (a stray tuple of row labels used to be the last expression, which hid the table)
df_predictores

Unnamed: 0,predictor,importancia
0,cap-diameter_e,0.1128579
1,stem-width_e,0.09707992
2,stem-height_e,0.08084313
3,stem-color_w,0.05649693
4,cap-surface_s,0.04772294
5,cap-surface_y,0.03368866
6,gill-attachment_x,0.03312576
7,cap-shape_b,0.03243806
8,cap-surface_g,0.02920571
9,gill-color_w,0.02568867


In [None]:
# keep only the one-hot columns that come from stem-color; matching on the name prefix
# replaces the hard-coded positions, which were shifted by one and ended at 88, out of
# bounds for .iloc on a frame with 88 predictors (valid positions are 0-87)
stem_color = importancia_predictores[importancia_predictores["predictor"].str.startswith("stem-color_")]
stem_color

In [87]:
# build a dataframe with only the stem-color rows, selected by their position in the importance-sorted frame
# NOTE(review): the positions are hard-coded and only valid for this exact ordering of importances
stem_color= importancia_predictores.iloc[[3,34,63,67,69,71,74,78,80,81,82,83,87]]
stem_color

Unnamed: 0,predictor,importancia
66,stem-color_w,0.05649693
56,stem-color_e,0.006468066
67,stem-color_y,0.0002635588
61,stem-color_n,0.0001180127
58,stem-color_g,8.569263e-06
64,stem-color_r,9.195086e-08
55,stem-color_b,0.0
57,stem-color_f,0.0
65,stem-color_u,0.0
60,stem-color_l,0.0


In [86]:
# same for the cap-surface columns, selected by row label in df_predictores
# NOTE(review): the labels are hard-coded against the current importance ordering
cap_surface = df_predictores.loc[[4,5,8,16,22,26,38,43,58,79,85]]
cap_surface

Unnamed: 0,predictor,importancia
4,cap-surface_s,0.047723
5,cap-surface_y,0.033689
8,cap-surface_g,0.029206
16,cap-surface_t,0.015991
22,cap-surface_e,0.014343
26,cap-surface_h,0.011506
38,cap-surface_d,0.006145
43,cap-surface_w,0.005597
58,cap-surface_i,0.000713
79,cap-surface_l,0.0


In [88]:
# and for the gill-attachment columns (hard-coded row labels of df_predictores)
gill_attachment = df_predictores.loc[[6,17,19,23,24,25,36]]
gill_attachment

Unnamed: 0,predictor,importancia
6,gill-attachment_x,0.033126
17,gill-attachment_f,0.015594
19,gill-attachment_p,0.015314
23,gill-attachment_s,0.014124
24,gill-attachment_d,0.012759
25,gill-attachment_a,0.012452
36,gill-attachment_e,0.006429


In [89]:
# same for the cap-shape columns (hard-coded row labels of df_predictores)
cap_shape = df_predictores.loc[[7,15,32,35,52,59,66]]
cap_shape

Unnamed: 0,predictor,importancia
7,cap-shape_b,0.032438
15,cap-shape_x,0.01602
32,cap-shape_f,0.006959
35,cap-shape_c,0.006456
52,cap-shape_o,0.003302
59,cap-shape_s,0.000561
66,cap-shape_p,0.000133


In [90]:
# same for the gill-color columns (hard-coded row labels of df_predictores)
gill_color = df_predictores.loc[[9,10,13,14,18,29,31,37,44,49,55,70]]
gill_color

Unnamed: 0,predictor,importancia
9,gill-color_w,0.025689
10,gill-color_n,0.024244
13,gill-color_p,0.020259
14,gill-color_y,0.018873
18,gill-color_o,0.015586
29,gill-color_r,0.008083
31,gill-color_u,0.007029
37,gill-color_g,0.006381
44,gill-color_e,0.005482
49,gill-color_k,0.003896


In [91]:
# same for the does-bruise-or-bleed columns (hard-coded row labels of df_predictores)
does_bruise_or_bleed= df_predictores.loc[[11,33]]
does_bruise_or_bleed

Unnamed: 0,predictor,importancia
11,does-bruise-or-bleed_f,0.023566
33,does-bruise-or-bleed_t,0.006669


In [92]:
# same for the ring-type columns (hard-coded row labels of df_predictores)
ring_type= df_predictores.loc[[12,30,40,47,56,65,84,86]]
ring_type

Unnamed: 0,predictor,importancia
12,ring-type_f,0.020599
30,ring-type_z,0.007469
40,ring-type_p,0.005778
47,ring-type_l,0.004195
56,ring-type_g,0.00144
65,ring-type_e,0.000162
84,ring-type_r,0.0
86,ring-type_m,0.0


In [95]:
# same for the cap-color columns (hard-coded row labels of df_predictores)
cap_color= df_predictores.loc[[20,28,39,46,48,51,53,60,64,68,72,73]]
cap_color

Unnamed: 0,predictor,importancia
20,cap-color_n,0.014896
28,cap-color_w,0.010841
39,cap-color_p,0.006077
46,cap-color_e,0.00536
48,cap-color_r,0.004008
51,cap-color_y,0.003499
53,cap-color_u,0.002811
60,cap-color_k,0.000473
64,cap-color_o,0.000194
68,cap-color_b,3.6e-05


In [96]:
# same for the habitat columns (hard-coded row labels of df_predictores)
habitat= df_predictores.loc[[27,42,50,57,61,75,76,77]]
habitat

Unnamed: 0,predictor,importancia
27,habitat_d,0.01099
42,habitat_h,0.005684
50,habitat_m,0.003812
57,habitat_g,0.000898
61,habitat_l,0.000427
75,habitat_w,0.0
76,habitat_u,0.0
77,habitat_p,0.0


In [98]:
# same for the season columns (hard-coded row labels of df_predictores)
season= df_predictores.loc[[41,45,54,62]]
season

Unnamed: 0,predictor,importancia
41,season_s,0.005729
45,season_u,0.005395
54,season_a,0.002075
62,season_w,0.000317


In [None]:
# Collapse the one-hot columns back into their original feature by summing their importances.
# The previous version of these cells used names that are never defined in this notebook
# (importancia_predictores_esta, adulto, madurez, embarque, solos) and failed with NameError.

# the original feature name is everything before the last "_" (e.g. "cap-shape_b" -> "cap-shape");
# names without "_" such as "has-ring" are kept unchanged
# NOTE(review): assumes no category suffix contains "_" — confirm against the full column list
variable_original = importancia_predictores["predictor"].str.rsplit("_", n=1).str[0]

# one row per original feature, ordered from most to least important
importancia_predictores_esta = (
    importancia_predictores
    .groupby(variable_original)["importancia"]
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)

# finally, plot the aggregated importances so they are easier to read

plt.figure(figsize=(10,6))
sns.barplot(x = "importancia", y = "predictor", data = importancia_predictores_esta, palette="viridis");
plt.show()