# Decision Tree

In [1]:
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier

import sys
sys.path.append("../../my-staff")
from utils import pipline_evaluation
from database import load_ds, load_params
from globals import SEED

# Load dataset

In [2]:
X, y, features_names, class_names = load_ds("all")
X.shape

(293, 25)

In [3]:
model = DecisionTreeClassifier(random_state=SEED)

## ALL

In [4]:
params = load_params("DecisionTreeClassifier")

In [5]:
params

{'max_depth': [None, 4, 8, 10],
 'min_samples_split': [2, 8, 16, 32],
 'min_samples_leaf': [2, 8, 16, 32, 48],
 'max_features': [None, 'sqrt', 'log2'],
 'criterion': ['gini', 'entropy', 'log_loss']}

In [6]:
results_all = pipline_evaluation(X.values, y, model, params)

----------------------------------------
GridSearchCV f1 0.7085250296226947 {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 32}
Train classes: [134 100]
Test classes: [34 25]
----------------------------------------
Métricas de evaluación:
Precision: 0.7500
Recall: 0.8400
F1-score: 0.7925
Brier Score: 0.1451
----------------------------------------
GridSearchCV f1 0.7307855469231718 {'criterion': 'entropy', 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 16}
Train classes: [134 100]
Test classes: [34 25]
----------------------------------------
Métricas de evaluación:
Precision: 0.7000
Recall: 0.5600
F1-score: 0.6222
Brier Score: 0.2023
----------------------------------------
GridSearchCV f1 0.6729166666666667 {'criterion': 'entropy', 'max_depth': 4, 'max_features': 'log2', 'min_samples_leaf': 8, 'min_samples_split': 2}
Train classes: [134 100]
Test classes: [34 25]
----------------------

In [7]:
print(results_all)

$0.6881 \pm 0.06$ & $0.1908 \pm 0.03$ &


## Univar

In [8]:
X_uni, y_uni, features_names_uni, class_names_uni = load_ds("univariant_15")


In [9]:
results_uni = pipline_evaluation(X_uni.values, y_uni, model, params)

----------------------------------------
GridSearchCV f1 0.664824148282795 {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 32}
Train classes: [134 100]
Test classes: [34 25]
----------------------------------------
Métricas de evaluación:
Precision: 0.8000
Recall: 0.8000
F1-score: 0.8000
Brier Score: 0.1425
----------------------------------------
GridSearchCV f1 0.6995070195026495 {'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 48, 'min_samples_split': 2}
Train classes: [134 100]
Test classes: [34 25]
----------------------------------------
Métricas de evaluación:
Precision: 0.7500
Recall: 0.6000
F1-score: 0.6667
Brier Score: 0.1886
----------------------------------------
GridSearchCV f1 0.6942512465979298 {'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 32, 'min_samples_split': 2}
Train classes: [134 100]
Test classes: [34 25]
----------------------

In [10]:
print(results_uni)

$0.6877 \pm 0.06$ & $0.1951 \pm 0.04$ &


# Multi

In [11]:
X_mult, y_mult, features_names_mult, class_names_mult = load_ds("multivariant_15")

X_mult.shape

(293, 15)

In [12]:
results_multi = pipline_evaluation(X_mult.values, y_mult, model, params)

----------------------------------------
GridSearchCV f1 0.6807114423393494 {'criterion': 'entropy', 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 16}
Train classes: [134 100]
Test classes: [34 25]
----------------------------------------
Métricas de evaluación:
Precision: 0.7692
Recall: 0.8000
F1-score: 0.7843
Brier Score: 0.1656
----------------------------------------
GridSearchCV f1 0.6810218147052355 {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 32}
Train classes: [134 100]
Test classes: [34 25]
----------------------------------------
Métricas de evaluación:
Precision: 0.7778
Recall: 0.5600
F1-score: 0.6512
Brier Score: 0.1875
----------------------------------------
GridSearchCV f1 0.7080820173019421 {'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 8, 'min_samples_split': 2}
Train classes: [134 100]
Test classes: [34 25]
----------------------

In [13]:
print(results_multi)

$0.6988 \pm 0.05$ & $0.1895 \pm 0.02$ &


## Wrapper


In [14]:
from utils import wrapper_ds

In [15]:
x_wrapper = wrapper_ds(X, y, model, 15)

Index(['N_Days', 'Age', 'Albumin', 'Alk_Phos', 'SGOT', 'Tryglicerides',
       'Platelets', 'Prothrombin', 'Drug_Placebo', 'Spiders_N', 'Spiders_Y',
       'Hepatomegaly_Y', 'Ascites_Y', 'Edema_S', 'Edema_Y'],
      dtype='object')


In [16]:
results_wrapper = pipline_evaluation(x_wrapper, y, model, params)

----------------------------------------
GridSearchCV f1 0.666106250276463 {'criterion': 'gini', 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 16}
Train classes: [134 100]
Test classes: [34 25]
----------------------------------------
Métricas de evaluación:
Precision: 0.6429
Recall: 0.7200
F1-score: 0.6792
Brier Score: 0.2107
----------------------------------------
GridSearchCV f1 0.6879009805839074 {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 32}
Train classes: [134 100]
Test classes: [34 25]
----------------------------------------
Métricas de evaluación:
Precision: 0.7727
Recall: 0.6800
F1-score: 0.7234
Brier Score: 0.1689
----------------------------------------
GridSearchCV f1 0.704298682762305 {'criterion': 'gini', 'max_depth': 4, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 2}
Train classes: [134 100]
Test classes: [34 25]
--------------------------------

In [17]:
print(results_wrapper)

$0.6644 \pm 0.05$ & $0.2162 \pm 0.03$ &


# Visualización del modelo

In [18]:
from sklearn.tree import plot_tree

In [19]:
# Pintamos el árbol para sacar conclusiones
plt.figure(figsize=(40, 15))
plot_tree(model, filled=True, feature_names=features_names, class_names=class_names, rounded=True)
plt.show()

NotFittedError: This DecisionTreeClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

<Figure size 4000x1500 with 0 Axes>

In [None]:
print("Número de nodos en el árbol de decisión:", model.tree_.node_count)
print("Número de hojas", model.get_n_leaves())

In [None]:
print("Training score: ", model.score(X_train[features], y_train))
print("Test score: ", model.score(X_test[features], y_test))

# Visualización de características principales

In [None]:
def main_features(clf, feature_names):
    importance = clf.feature_importances_
    # summarize feature importance

    #for i, v in enumerate(importance):
    #    print('Feature: %s, Score: %.5f' % (feature_names[i], v))

    # plot feature importance
    #plt.figure(figsize=(20, 10))

    plt.title('Caracteristicas principales')  # Agregar título al gráfico
    plt.barh([x for x in range(len(importance))], importance, tick_label=feature_names)
    #plt.xticks(rotation=90)
    plt.show()

In [None]:
main_features(model, features_names)