In [1]:
def warn(*args,**kwargs):
    pass
import warnings
warnings.warn = warn

In [3]:
import sys
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import sklearn.tree as tree

In [5]:
df=pd.read_csv("drug200.csv")

The table shows records from the drug200 dataset which has the following columns:
1. Age: Age of the patient.
2. Sex: gender (M=male, F= female)
3. BP: Blood Pressure Level (e.g., High, Low, Normal)
4. Cholesterol: Cholesterol level (e.g., High, Low, Normal)
5. Na_to_K: Sodium_to_potassium ratio in the blood
6. Drug: Te prescribed drug for the patient

In [7]:
df.head(100)

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY
...,...,...,...,...,...,...
95,36,M,LOW,NORMAL,11.424,drugX
96,58,F,LOW,HIGH,38.247,drugY
97,56,F,HIGH,HIGH,25.395,drugY
98,20,M,HIGH,NORMAL,35.639,drugY


In [9]:
df.shape

(200, 6)

In [11]:
df.columns

Index(['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K', 'Drug'], dtype='object')

### Data separation
In a typical machine learning problem, the dataset is devidevc in to two parts:
1. **X(features)**: These are the input variables which are used to predict. in our datasset (drug200), the features include;
   *'Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K'*
2. **y(target)**: This is the output variable also called dependent variable. Its the variable that the model is trying to predict. In our dataset, the    target variale is the **Drug*

In [13]:
X=df[['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']]

y=df[['Drug']]


As you may figure out , some of the features in this dataset are categorical, such as Sex or BP. unfortunately, Sklearn Decision Trees does not handle categorical variables. We can still convert these features to numerical values using the LabelEncoder()method to convert the categorical variable into dummy/indicator variables.

In [16]:
#(from sklearn import preprocessing

#le_sex = preprocessing.LabelEncoder()
#le_sex.fit(['F','M'])
#X[:,1]=le_sex.transform(X['Sex'])




from sklearn import preprocessing

le_sex = preprocessing.LabelEncoder()
le_sex.fit(['F','M'])
X['Sex']=le_sex.transform(X['Sex'])

le_BP=preprocessing.LabelEncoder()
le_BP.fit(['LOW','NORMAL','HIGH'])
X['BP']=le_BP.transform(X['BP'])

le_chol=preprocessing.LabelEncoder()
le_chol.fit(['NORMAL','HIGH'])
X['Cholesterol']=le_chol.transform(X['Cholesterol'])

X[0:5]

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K
0,23,0,0,0,25.355
1,47,1,1,0,13.093
2,47,1,1,0,10.114
3,28,0,2,0,7.798
4,61,0,1,0,18.043


In [18]:
X

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K
0,23,0,0,0,25.355
1,47,1,1,0,13.093
2,47,1,1,0,10.114
3,28,0,2,0,7.798
4,61,0,1,0,18.043
...,...,...,...,...,...
195,56,0,1,0,11.567
196,16,1,1,0,12.006
197,52,1,2,0,9.894
198,23,1,2,1,14.020


### Train test split in python
When building a machine learning model, it is important to divide your dataset in to two main parts:
-**Training set**: Used to train the model
-**Test set**: Used to evaluate the model's perfomance on unseen data
in python we use train test split function from the sklearn.model_selection module to achieve this

In [20]:
from sklearn.model_selection import train_test_split

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=3)

In [24]:
X_train.shape

(140, 5)

In [26]:
X_test.shape

(60, 5)

In [28]:
y_test.shape

(60, 1)

In [30]:
drugtree=DecisionTreeClassifier(criterion='entropy',max_depth=4)
drugtree

In [32]:
drugtree.fit(X_train,y_train)

In [34]:
predtree=drugtree.predict(X_test)

In [36]:
print(predtree[0:20])
print(y_test[0:20].T)

['drugY' 'drugX' 'drugX' 'drugX' 'drugX' 'drugC' 'drugY' 'drugA' 'drugB'
 'drugA' 'drugY' 'drugA' 'drugY' 'drugY' 'drugX' 'drugY' 'drugX' 'drugX'
 'drugB' 'drugX']
        40     51     139    197    170    82     183    46     70     100  \
Drug  drugY  drugX  drugX  drugX  drugX  drugC  drugY  drugA  drugB  drugA   

        179    83     25     190    159    173    95     3      41     58   
Drug  drugY  drugA  drugY  drugY  drugX  drugY  drugX  drugX  drugB  drugX  


## Evaluation
Lets check the accuracy of our model

In [39]:
from sklearn import metrics
import matplotlib.pyplot as plt
print("DecisionTree's Accuracy: ", metrics.accuracy_score(y_test,predtree))


DecisionTree's Accuracy:  0.9833333333333333


In [40]:
from sklearn.metrics import classification_report,precision_score
print('Confussion matric:', classification_report(predtree,y_test))


Confussion matric:               precision    recall  f1-score   support

       drugA       1.00      1.00      1.00         7
       drugB       1.00      1.00      1.00         5
       drugC       1.00      1.00      1.00         5
       drugX       0.95      1.00      0.98        20
       drugY       1.00      0.96      0.98        23

    accuracy                           0.98        60
   macro avg       0.99      0.99      0.99        60
weighted avg       0.98      0.98      0.98        60



## Predicting unknown data using the model we created

In [56]:
new_data={'Age':[30,46,24,19,29,47,50,17,61,54,23,57,46],'Sex':[0,0,1,0,1,1,1,0,0,1,0,1,1],'BP':[0,0,2,0,1,1,2,1,0,2,2,2,1],
          'Cholesterol':[0,0,0,1,0,1,1,1,0,0,0,1,1],'Na_to_K':[6.118,8.900,28.774,16.007,14.001,20.889,13.223,14.661,19.467,13.887,32.004,17.890,26.768]
         }

In [58]:
new_df=pd.DataFrame(new_data)

In [60]:
new_df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K
0,30,0,0,0,6.118
1,46,0,0,0,8.9
2,24,1,2,0,28.774
3,19,0,0,1,16.007
4,29,1,1,0,14.001
5,47,1,1,1,20.889
6,50,1,2,1,13.223
7,17,0,1,1,14.661
8,61,0,0,0,19.467
9,54,1,2,0,13.887


In [62]:
new_pred=drugtree.predict(new_df)

In [64]:
new_pred

array(['drugA', 'drugA', 'drugY', 'drugY', 'drugC', 'drugY', 'drugX',
       'drugY', 'drugY', 'drugX', 'drugY', 'drugY', 'drugY'], dtype=object)

In [66]:
new_data["Predicted Drug"]=new_pred

In [70]:
pd.DataFrame(new_data)

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Predicted Drug
0,30,0,0,0,6.118,drugA
1,46,0,0,0,8.9,drugA
2,24,1,2,0,28.774,drugY
3,19,0,0,1,16.007,drugY
4,29,1,1,0,14.001,drugC
5,47,1,1,1,20.889,drugY
6,50,1,2,1,13.223,drugX
7,17,0,1,1,14.661,drugY
8,61,0,0,0,19.467,drugY
9,54,1,2,0,13.887,drugX
