In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import tree
import graphviz
import pydotplus
from PIL import Image
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation notices are hidden too.
warnings.filterwarnings("ignore")

## Ejemplo Fraudes

In [3]:
# Load the fraud example data and preview its first five rows.
fraudes = pd.read_csv(filepath_or_buffer='Fraude.csv')
fraudes.head(n=5)

Unnamed: 0,ID,Reembolso,EdoCivil,Ingresos,Fraude
0,1,Sí,Soltero,125000,No
1,2,No,Casado,100000,No
2,3,No,Soltero,70000,No
3,4,Sí,Casado,120000,No
4,5,No,Divorciado,95000,Sí


In [13]:
# Predictor columns for the fraud model.
caracteristica = ['Reembolso', 'EdoCivil', 'Ingresos']

# Feature matrix as a NumPy array; its columns follow `caracteristica`.
x = fraudes.loc[:, caracteristica].values

# Dependent variable: the Fraude column.
y = fraudes['Fraude']

In [15]:
# Encode the two text predictors as integers.
# The matrix is rebuilt from `fraudes` first so this cell can be re-run safely:
# encoding in place a second time would feed numbers to `transform`, which
# raises ValueError for labels the encoder has never seen.
x = fraudes[caracteristica].values

# Column 0 (Reembolso). LabelEncoder sorts its classes: 'No' -> 0, 'Sí' -> 1.
Cod_Reembolso = preprocessing.LabelEncoder()
Cod_Reembolso.fit(['Sí', 'No'])
x[:, 0] = Cod_Reembolso.transform(x[:, 0])

# Column 1 (EdoCivil): 'Casado' -> 0, 'Divorciado' -> 1, 'Soltero' -> 2.
Cod_EdoCivil = preprocessing.LabelEncoder()
Cod_EdoCivil.fit(['Soltero', 'Casado', 'Divorciado'])
x[:, 1] = Cod_EdoCivil.transform(x[:, 1])

In [None]:
# Check the encoded matrix: Reembolso and EdoCivil should now hold numbers
# instead of text.
x[:5]

In [22]:
# Build the training and test sets: 30% of the rows are held out and the
# shuffle is seeded.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

In [23]:
# Create the decision tree classifier.
# random_state is fixed (same seed as the train/test split) because the tree
# permutes features at each split; without a seed, ties between equally good
# splits can produce a different tree on every run.
clf = DecisionTreeClassifier(random_state=1)

# Fit the tree on the training set.
clf.fit(x_train, y_train)

# Predict the labels of the test set.
y_pred = clf.predict(x_test)

In [None]:
# Inspect the held-out feature rows produced by the train/test split.
x_test

In [None]:
# Inspect the labels the tree predicted for the test rows.
y_pred

In [None]:
# Inspect the true labels of the test rows, for comparison with y_pred.
y_test

In [None]:
# Confusion matrix: rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

In [None]:
# Model statistics: precision, recall and F1 for each class on the test set.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [32]:
# Export the fitted tree (its decision rules) as Graphviz DOT text.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=caracteristica,
    class_names=['No Fraude', 'Fraude'],
    filled=True,
    rounded=True,
    special_characters=True,
)

# Parse the DOT text into a pydotplus graph that can be rendered to a file.
graph = pydotplus.graph_from_dot_data(dot_data)


In [33]:
# Save the tree as a PDF and as a PNG.
graph.write_pdf("Fraude.pdf")
graph.write_png("Fraude.png")

# Open the PNG that was just written and show it.
image = Image.open(fp="Fraude.png")
image.show()

## Ejemplo II Pima Indians Diabetes

In [4]:
# Load the Pima Indians diabetes data with the file's own column headers
# and preview the first five rows.
pima = pd.read_csv(filepath_or_buffer='diabetes.csv')
pima.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# Short column names for the diabetes table, in file column order.
col_names = [
    'pregnant',
    'glucose',
    'bp',
    'skin',
    'insulin',
    'bmi',
    'pedigree',
    'age',
    'label',
]

In [6]:
# Re-read the file using the short column names.
# With header=None the file's own header line is kept as the first data row
# (row 0 in the preview).
pima = pd.read_csv('diabetes.csv', names=col_names, header=None)
pima.head(n=5)

Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,label
0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
1,6,148,72,35,0,33.6,0.627,50,1
2,1,85,66,29,0,26.6,0.351,31,0
3,8,183,64,0,0,23.3,0.672,32,1
4,1,89,66,23,94,28.1,0.167,21,0


In [7]:
# Drop the first row, which holds the file's original column names, not data.
# Because that text row was read as data, the columns were parsed as text
# rather than numbers; convert each one back to a numeric dtype so the
# features and the label are real numbers. A non-numeric value raises here
# instead of slipping through.
df = pima.iloc[1:, :].apply(pd.to_numeric)
df

Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,label
1,6,148,72,35,0,33.6,0.627,50,1
2,1,85,66,29,0,26.6,0.351,31,0
3,8,183,64,0,0,23.3,0.672,32,1
4,1,89,66,23,94,28.1,0.167,21,0
5,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
764,10,101,76,48,180,32.9,0.171,63,0
765,2,122,70,27,0,36.8,0.34,27,0
766,5,121,72,23,112,26.2,0.245,30,0
767,1,126,60,0,0,30.1,0.349,47,1


In [9]:
# Predictor columns for the diabetes model ('skin' is not included).
feature_cols = ['pregnant', 'insulin', 'bmi', 'age', 'glucose', 'bp', 'pedigree']

# Feature table and target column.
X = df.loc[:, feature_cols]
Y = df['label']

In [10]:
# Hold out 20% of the rows for testing; the shuffle is seeded.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

## Realizando el árbol con Gini

In [12]:
# Create the decision tree classifier using the Gini impurity criterion.
# random_state is fixed (same seed as the train/test split) because the tree
# permutes features at each split; without a seed, ties between equally good
# splits can produce a different tree, and different metrics, on every run.
clf = DecisionTreeClassifier(criterion='gini', random_state=1)

# Fit the tree on the training set.
clf.fit(X_train, Y_train)

# Predict the labels of the test set.
y_pred = clf.predict(X_test)

In [13]:
# Confusion matrix: rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_true=Y_test, y_pred=y_pred)
cm

array([[83, 16],
       [25, 30]])

In [14]:
# Precision, recall and F1 for each class on the test set (Gini tree).
print(classification_report(y_true=Y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.77      0.84      0.80        99
           1       0.65      0.55      0.59        55

    accuracy                           0.73       154
   macro avg       0.71      0.69      0.70       154
weighted avg       0.73      0.73      0.73       154



In [15]:
# Export the fitted tree as Graphviz DOT text, coloured by majority class.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=feature_cols,
    class_names=['Normal', 'Diabetes'],
    filled=True,
    rounded=True,
    special_characters=True,
)

# Parse the DOT text into a pydotplus graph that can be rendered to a file.
graph = pydotplus.graph_from_dot_data(dot_data)

In [16]:
# Save the tree as a PDF and as a PNG.
graph.write_pdf("Diabetes.pdf")
graph.write_png("Diabetes.png")

# Open the PNG that was just written and show it.
image = Image.open(fp="Diabetes.png")
image.show()

## Realizando el árbol con Entropy y profundidad limitada

In [33]:
# Create the decision tree classifier using entropy, limited to depth 3.
# random_state is fixed (same seed as the train/test split) because the tree
# permutes features at each split; without a seed, ties between equally good
# splits can produce a different tree, and different metrics, on every run.
clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=1)

# Fit the tree on the training set.
clf.fit(X_train, Y_train)

# Predict the labels of the test set.
y_pred = clf.predict(X_test)

In [34]:
# Precision, recall and F1 for each class on the test set (entropy tree).
print(classification_report(y_true=Y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.81      0.89      0.85        99
           1       0.76      0.64      0.69        55

    accuracy                           0.80       154
   macro avg       0.79      0.76      0.77       154
weighted avg       0.80      0.80      0.79       154



In [35]:
# Export the depth-limited tree as Graphviz DOT text.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=feature_cols,
    class_names=['Normal', 'Diabetes'],
    filled=True,
    rounded=True,
    special_characters=True,
)

# Parse the DOT text into a pydotplus graph that can be rendered to a file.
graph = pydotplus.graph_from_dot_data(dot_data)

In [36]:
# Imports for the coloured tree export.

# StringIO comes from the standard library; on Python 3 `six.StringIO` is this
# same class, so the extra third-party dependency is not needed.
from io import StringIO
# NOTE: this rebinds `Image`, hiding PIL's Image imported at the top of the
# notebook; the cell that opens Diabetes1.png re-imports PIL's Image first.
from IPython.display import Image
from sklearn.tree import export_graphviz

In [37]:
# Write the tree's DOT description into an in-memory text buffer.
dot_data = StringIO()
export_graphviz(
    clf,
    out_file=dot_data,
    feature_names=feature_cols,
    class_names=['Normal', 'Diabetes'],
    filled=True,
    rounded=True,
    special_characters=True,
)

# Build the pydotplus graph from the text collected in the buffer.
dot_text = dot_data.getvalue()
graph = pydotplus.graph_from_dot_data(dot_text)

In [39]:
# Re-import PIL's Image so that `Image.open` below refers to Pillow.
from PIL import Image

# Save the tree as a PDF and as a PNG.
graph.write_pdf("Diabetes1.pdf")
graph.write_png("Diabetes1.png")

# Open the PNG that was just written and show it.
image = Image.open(fp="Diabetes1.png")
image.show()