In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
import warnings 
warnings.filterwarnings("ignore")

In [2]:
# Load the drug-prescription dataset from the CSV file next to the notebook
drugs = pd.read_csv(filepath_or_buffer="Drugs Data.csv")
drugs

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


In [3]:
# Predictor columns, in the order they appear as columns of the feature matrix
feature_cols = ['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']

# Feature matrix as a NumPy array and target labels as a pandas Series
x = drugs.loc[:, feature_cols].values
y = drugs["Drug"]

In [4]:
from sklearn import preprocessing

# Start from a fresh copy of the raw feature matrix. The encoding below overwrites
# columns of `x` in place, so without this reset a second run of the cell would feed
# already-encoded integers to the encoders and fail with an unseen-label error.
x = drugs[feature_cols].to_numpy(copy=True)

# NOTE: LabelEncoder assigns integer codes in sorted label order, not in the order
# the labels are passed to fit().

# Sex (column 1): F -> 0, M -> 1
cod_Sex = preprocessing.LabelEncoder()
cod_Sex.fit(['F', 'M'])
x[:, 1] = cod_Sex.transform(x[:, 1])

# BP (column 2): HIGH -> 0, LOW -> 1, NORMAL -> 2
cod_Bp = preprocessing.LabelEncoder()
cod_Bp.fit(['HIGH', 'NORMAL', 'LOW'])
x[:, 2] = cod_Bp.transform(x[:, 2])

# Cholesterol (column 3): HIGH -> 0, NORMAL -> 1
cod_Cholesterol = preprocessing.LabelEncoder()
cod_Cholesterol.fit(['NORMAL', 'HIGH'])
x[:, 3] = cod_Cholesterol.transform(x[:, 3])

In [5]:
x[:6]

array([[23, 0, 0, 0, 25.355],
       [47, 1, 1, 0, 13.093],
       [47, 1, 1, 0, 10.114],
       [28, 0, 2, 0, 7.798],
       [61, 0, 1, 0, 18.043],
       [22, 0, 2, 0, 8.607]], dtype=object)

In [6]:
y[:6]

0    drugY
1    drugC
2    drugC
3    drugX
4    drugY
5    drugX
Name: Drug, dtype: object

In [7]:
# Training and test sets: hold out 20% of the rows, with a fixed seed for the split
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=1,
)

# Modelo Decision Tree con criterio "gini"

In [9]:
# Decision tree model with the "gini" criterion (also tried with max_depth = 4).
# random_state is fixed because the tree builder permutes features at every split and
# breaks ties between equally good splits at random; unseeded fits are not reproducible.
clf = DecisionTreeClassifier(criterion="gini", random_state=1)
clf = clf.fit(x_train, y_train)

In [10]:
# Confusion matrix on the test set for the classifier currently bound to `clf`
from sklearn.metrics import confusion_matrix

y_pred = clf.predict(x_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[ 4,  0,  0,  0,  0],
       [ 0,  2,  0,  0,  0],
       [ 0,  0,  4,  0,  0],
       [ 0,  0,  0, 13,  0],
       [ 0,  0,  0,  0, 17]])

In [11]:
# Performance statistics (per-class precision, recall, f1-score and support)
from sklearn.metrics import classification_report

print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

       drugA       1.00      1.00      1.00         4
       drugB       1.00      1.00      1.00         2
       drugC       1.00      1.00      1.00         4
       drugX       1.00      1.00      1.00        13
       drugY       1.00      1.00      1.00        17

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40



# Modelo Decision Tree con criterio "entropy"

In [13]:
# Decision tree model with the "entropy" criterion.
# Tried max_depth = 6: the tree does not grow any deeper than it does by default.
# random_state is fixed because the tree builder permutes features at every split and
# breaks ties between equally good splits at random; unseeded fits are not reproducible.
clf = DecisionTreeClassifier(criterion="entropy", random_state=1)
clf = clf.fit(x_train, y_train)

In [14]:
# Confusion matrix on the test set for the classifier currently bound to `clf`
y_pred = clf.predict(x_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[ 4,  0,  0,  0,  0],
       [ 0,  2,  0,  0,  0],
       [ 0,  0,  4,  0,  0],
       [ 0,  0,  0, 13,  0],
       [ 0,  0,  0,  0, 17]])

In [15]:
# Performance statistics (per-class precision, recall, f1-score and support)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

       drugA       1.00      1.00      1.00         4
       drugB       1.00      1.00      1.00         2
       drugC       1.00      1.00      1.00         4
       drugX       1.00      1.00      1.00        13
       drugY       1.00      1.00      1.00        17

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40



In [16]:
from sklearn import tree
import graphviz
import pydotplus
from PIL import Image

In [17]:
# Export the fitted tree's decision rules as Graphviz DOT source text
dot_data = tree.export_graphviz(
    decision_tree=clf,
    out_file=None,
    feature_names=feature_cols,
)

# Build the pydotplus graph of the tree from that DOT source
graph = pydotplus.graph_from_dot_data(dot_data)

In [18]:
# Write the tree as a PNG file and open it (currently disabled: the calls are commented out)
#graph.write_png("droges1.png")

#image = Image.open("droges1.png")
#image.show()

# Predicción para el nuevo paciente

In [20]:
import numpy as np

# New patient: age 50, sex F, blood pressure HIGH, cholesterol NORMAL, Na_to_K 15.302.
# The categorical values are encoded with the same fitted encoders used for the
# training matrix instead of hand-typed integer codes, so the vector stays consistent
# with the encoders' label-to-code mapping. The result is [[50, 0, 0, 1, 15.302]].
new_px = [[
    50,
    int(cod_Sex.transform(['F'])[0]),
    int(cod_Bp.transform(['HIGH'])[0]),
    int(cod_Cholesterol.transform(['NORMAL'])[0]),
    15.302,
]]
x_new = np.asarray(new_px)
x_new

array([[50.   ,  0.   ,  0.   ,  1.   , 15.302]])

In [40]:
# Predict the drug for the new patient with the classifier currently bound to `clf`
pred = clf.predict(X=x_new)
pred

array(['drugY'], dtype=object)