# Sesión 14: Árboles de decisión (II)

### Carga de módulos necesarios

In [None]:
#Importando nuestras librerías

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc

In [None]:
# Load the credit dataset from the project's data folder.
csv_path = 'data/data_creditos.csv'
df = pd.read_csv(csv_path)

### Análisis exploratorio de datos

In [None]:
# Dataset overview: print its dimensions (rows, columns) followed by the
# dtype of every column.

print('Dimensión de la bd:', df.shape, '\n')

print(df.dtypes)

In [None]:
# Show the last rows of the raw data.
df.tail()

In [None]:
# Build the binary Target column from the Credito labels:
# "Malo" is the positive class (1), "Bueno" the negative class (0).
target_codes = {"Bueno": 0, "Malo": 1}
df['Target'] = df['Credito'].map(target_codes)


In [None]:
# Preview the first rows to check the newly added Target column.
df.head()

In [None]:
# Cross-tabulate the encoded Target against the original Credito labels
# to verify the mapping.
pd.crosstab(index=df['Target'], columns=df['Credito'])

In [None]:
# Same preview as before; df has not been modified since Target was added.
df.head()

In [None]:
# Class balance of the target: number of rows per Target value.
# `x` must be passed by keyword: in current seaborn releases the first
# positional parameter of catplot is `data`, so a positional 'Target'
# would collide with `data=df`.
sns.catplot(data=df, x='Target', kind='count')
plt.show()

In [None]:
# Summary statistics for the numeric variables.

df.describe()

In [None]:
# Histogram of the Edad column.
df['Edad'].hist()
plt.show()

In [None]:
# Distribution of Edad for each Target class.
sns.boxplot(x=df['Target'], y=df['Edad'])
plt.show()

In [None]:
# Re-check the column dtypes now that Target has been added.
df.dtypes

In [None]:
# Drop the columns that must not reach the model:
#   - 'Unnamed: 0': presumably the row index saved with the CSV (TODO confirm)
#   - 'Credito': the original label, already encoded in Target
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run (the second run is a no-op, not a KeyError).

df = df.drop(columns=['Unnamed: 0', 'Credito'], errors='ignore')

In [None]:
# Preview the data after removing 'Unnamed: 0' and 'Credito'.
df.head()

In [None]:
# Summary of the categorical (object-dtype) variables.

df.describe(include= 'O')

In [None]:
# Frequency of each Ingresos category.
df['Ingresos'].value_counts()

In [None]:
# Count plot of every categorical predictor, split by Target class.
# `x` is passed by keyword because catplot's first positional parameter is
# `data` in current seaborn releases.

for x in ['Ingresos','Tarjetas','Educacion','Auto_creditos']:
    sns.catplot(data=df, x=x, hue='Target', kind='count')
    # Render each figure as soon as it is built, like the other plot cells.
    plt.show()

In [None]:
# Count the missing values in each column.

df.isnull().sum()

In [None]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each variable, so k categories become k-1 indicator columns.

df = pd.get_dummies(df, drop_first= True)

In [None]:
# Preview the encoded frame.
df.head()

### Obtenemos las variables para el modelo

In [None]:
# Edad rounded to zero decimals, stored as a separate column.
df['Edad_round'] = df['Edad'].round()

In [None]:
# Preview the frame with the new Edad_round column.
df.head()

In [None]:
# Separate the response from the predictors. The raw Edad column is left
# out of X; its rounded version Edad_round stays in.
Y = df['Target']
X = df.drop(columns=['Target', 'Edad'])

In [None]:
# First rows of the predictor matrix.
X.head()

In [None]:
# First values of the target vector.
Y.head()

In [None]:
# Hold out 20% of the rows as the test set; the seed fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Decision tree classifier with default hyperparameters (no depth limit).
# random_state is fixed because scikit-learn permutes the features at each
# split, so equally good splits can be chosen differently between runs;
# 42 matches the seed used for the train/test split.

tree = DecisionTreeClassifier(random_state=42)
tree

In [None]:
# Train the tree on the training split. fit() returns the estimator itself,
# so tree_model and tree refer to the same fitted object.

tree_model = tree.fit(X_train, Y_train)

In [None]:
# Feature importances of the fitted tree, labelled with the predictor names.

pd.Series(data=tree_model.feature_importances_, index=X.columns)

### Obtenemos las predicciones

In [None]:
# Predicted class labels for the test split.

Y_pred = tree_model.predict(X_test)

In [None]:
# First five predicted labels.
Y_pred[:5]

In [None]:
# First five true labels as a plain array, for comparison with Y_pred[:5].
Y_test.head(5).to_numpy()

### Obtenemos las métricas

In [None]:
# Accuracy on the test split.

accuracy_score(Y_test, Y_pred)

In [None]:
# Confusion matrix on the test split (rows: true class, columns: predicted).

confusion_matrix(Y_test, Y_pred)

In [None]:
# AUC on the test split.
# The ROC curve needs a continuous score, so use the predicted probability
# of class 1 ("Malo"). Hard 0/1 predictions collapse the curve to a single
# operating point and misreport the AUC.

Y_score = tree_model.predict_proba(X_test)[:, 1]
fpr, tpr, threshold = roc_curve(Y_test, Y_score)
auc(fpr, tpr)

### Graficamos el árbol obtenido

In [None]:
#Gráfico del árbol de decisión
from graphviz import Source
from sklearn.tree import export_graphviz
from IPython.display import Image
from pydotplus import graph_from_dot_data

In [None]:
# Export the fitted tree to DOT, render it to PNG and display it inline.
# feature_names is passed as a plain list, which every scikit-learn release
# accepts for this parameter.
dot_data = export_graphviz(tree_model, filled = True,
                           feature_names=list(X.columns),
                           special_characters = True)

graph = graph_from_dot_data(dot_data)
# Render once and reuse the bytes for both the saved file and the inline
# image, instead of invoking Graphviz twice.
tree_png = graph.create_png()
with open('tree.png', 'wb') as png_file:
    png_file.write(tree_png)
Image(tree_png)

## Realizamos un nuevo modelo más acotado:

In [None]:
# Second model: same classifier limited to three levels of depth.
# random_state is fixed so the fitted tree is identical on every run.
tree2 = DecisionTreeClassifier(max_depth = 3, random_state=42)
tree_model2 = tree2.fit(X_train, Y_train)

In [None]:
# Test-set predictions and accuracy of the depth-limited tree.
# NOTE: this overwrites Y_pred from the first model; the following cells
# read the new values.
Y_pred = tree_model2.predict(X_test)
accuracy_score(Y_test, Y_pred)

In [None]:
# Confusion matrix on the test split for whichever model last set Y_pred.
confusion_matrix(Y_test, Y_pred)

In [None]:
# AUC of the depth-limited tree on the test split, computed from the
# predicted probability of class 1 rather than from hard 0/1 labels, so the
# ROC curve has one point per distinct leaf probability.
Y_score2 = tree_model2.predict_proba(X_test)[:, 1]
fpr, tpr, threshold = roc_curve(Y_test, Y_score2)
auc(fpr, tpr)

### Graficamos el nuevo árbol obtenido

In [None]:
# Export the depth-limited tree to DOT, render it to PNG and display it.
# feature_names is passed as a plain list, which every scikit-learn release
# accepts for this parameter.
dot_data = export_graphviz(tree_model2, filled = True,
                           feature_names=list(X.columns),
                           special_characters = True)

graph = graph_from_dot_data(dot_data)
# Render once and reuse the bytes for both the saved file and the inline
# image, instead of invoking Graphviz twice.
tree2_png = graph.create_png()
with open('tree2.png', 'wb') as png_file:
    png_file.write(tree2_png)
Image(tree2_png)

### Evaluación de varios niveles

In [None]:
# Compare split criteria and depth limits. Each configuration is scored on
# both the training and the test split: training scores alone always favour
# the deepest tree, so the test scores are what show when extra depth stops
# helping.
from sklearn import metrics

for crit in ['entropy','gini']:
    for mDepth in [3,6,9,12,15]:
        # random_state keeps every configuration reproducible across runs.
        depth_tree = DecisionTreeClassifier(criterion=crit, max_depth=mDepth,
                                            random_state=42)
        depth_tree.fit(X_train, Y_train)

        for split_name, X_split, Y_split in [('train', X_train, Y_train),
                                             ('test', X_test, Y_test)]:
            # Hard labels feed the threshold metrics; class-1 probabilities
            # feed the ROC curve.
            split_pred = depth_tree.predict(X_split)
            split_score = depth_tree.predict_proba(X_split)[:, 1]
            fpr, tpr, threshold = roc_curve(Y_split, split_score)
            print("Criterion: "+crit+" | Depth: "+str(mDepth)+" | Split: "+split_name +
                  "\nAccuracy: "+str(metrics.accuracy_score(Y_split, split_pred)) +
                  "\nAUC: "+str(metrics.auc(fpr, tpr)) +
                  "\nPrecision Score: "+str(metrics.precision_score(Y_split, split_pred)) +
                  "\nRecall Score: "+str(metrics.recall_score(Y_split, split_pred)) +
                  "\nF1 Score: "+str(metrics.f1_score(Y_split, split_pred)) + "\n")

In [None]:
conda install graphviz

In [None]:
!pip install graphviz