In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [2]:
file_path = "drug200.csv"
df = pd.read_csv(file_path)
df.head(10)

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY
5,22,F,NORMAL,HIGH,8.607,drugX
6,49,F,NORMAL,HIGH,16.275,drugY
7,41,M,LOW,HIGH,11.037,drugC
8,60,M,NORMAL,HIGH,15.171,drugY
9,43,M,LOW,NORMAL,19.368,drugY


In [5]:
df.shape

(200, 6)

In [7]:
X = df[['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']].values
X[0:5]

array([[23, 'F', 'HIGH', 'HIGH', 25.355],
       [47, 'M', 'LOW', 'HIGH', 13.093],
       [47, 'M', 'LOW', 'HIGH', 10.114],
       [28, 'F', 'NORMAL', 'HIGH', 7.798],
       [61, 'F', 'LOW', 'HIGH', 18.043]], dtype=object)

In [9]:
y = df[['Drug']]
y[0:5]

Unnamed: 0,Drug
0,drugY
1,drugC
2,drugC
3,drugX
4,drugY


In [11]:
from sklearn import preprocessing
le_Sex = preprocessing.LabelEncoder()
le_Sex.fit(['F','M', 'UNKNOWN'])
X[:,1] = le_Sex.transform(X[:,1])

le_BP = preprocessing.LabelEncoder()
le_BP.fit(['LOW', 'NORMAL', 'HIGH'])
X[:,2] = le_BP.transform(X[:,2])

le_Ch = preprocessing.LabelEncoder()
le_Ch.fit(['NORMAL', 'HIGH'])
X[:,3] = le_Ch.transform(X[:,3])

X[0:10]

array([[23, 0, 0, 0, 25.355],
       [47, 1, 1, 0, 13.093],
       [47, 1, 1, 0, 10.114],
       [28, 0, 2, 0, 7.798],
       [61, 0, 1, 0, 18.043],
       [22, 0, 2, 0, 8.607],
       [49, 0, 2, 0, 16.275],
       [41, 1, 1, 0, 11.037],
       [60, 1, 2, 0, 15.171],
       [43, 1, 1, 1, 19.368]], dtype=object)

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3 , random_state = 3)

In [15]:
X_train.shape , X_test.shape

((140, 5), (60, 5))

## Modeling

In [18]:
drugTree = DecisionTreeClassifier(criterion = "entropy", max_depth = 5)
drugTree

In [20]:
drugTree.fit(X_train, y_train)

### Predict

In [23]:
y_pred = drugTree.predict(X_test)

In [25]:
print(y_pred [0:8])
print(y_test [0:8])

['drugY' 'drugX' 'drugX' 'drugX' 'drugX' 'drugC' 'drugY' 'drugA']
      Drug
40   drugY
51   drugX
139  drugX
197  drugX
170  drugX
82   drugC
183  drugY
46   drugA


### Evaluation

In [28]:
from sklearn import metrics
print("DecisionTrees Accuracy:", metrics.accuracy_score(y_test, y_pred))

DecisionTrees Accuracy: 0.9833333333333333


In [30]:
!pip install pydotplus



In [32]:
from io import StringIO
import pydotplus
import matplotlib.image as mpimg
from sklearn import tree

In [34]:
pip install graphViz

Note: you may need to restart the kernel to use updated packages.


In [36]:
dot_data = StringIO()
filename = "drugtree.png"
featureNames = df.columns[0:5]
out=tree.export_graphviz(drugTree,feature_names=featureNames, out_file=dot_data, class_names= np.unique(y_train), filled=True,  special_characters=True,rotate=False)  
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png(filename)
img = mpimg.imread(filename)
plt.figure(figsize=(100, 200))
plt.imshow(img,interpolation='nearest')

InvocationException: GraphViz's executables not found