In [10]:
from io import StringIO
from sklearn import tree
from sklearn.model_selection import train_test_split
import pandas as pd
import pydotplus
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import preprocessing
from sklearn import utils
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix, accuracy_score

In [75]:
# Read the drug records from 'drug200.csv' (resolved relative to the working
# directory) into a DataFrame, then show it as the cell output.
dataset = pd.read_csv(filepath_or_buffer='drug200.csv')
dataset

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


In [76]:
# Ordinal-encode the level-valued columns. BP and Cholesterol share the same
# label -> code table, so it is defined once. Series.map turns any label that
# is not in the table into NaN.
# NOTE: this cell is not idempotent -- running it a second time maps the
# already-encoded integers to NaN and the join below raises on the existing
# indicator columns. Reload `dataset` before re-running.
level_codes = {"HIGH": 2, "LOW": 0, "NORMAL": 1}
dataset["BP"] = dataset.BP.map(level_codes)
dataset['Cholesterol'] = dataset.Cholesterol.map(level_codes)

# One-hot encode Sex: one indicator column per distinct category found by the
# encoder, named after the category itself.
sexs = dataset.Sex.values.reshape(-1, 1)
ohc = OneHotEncoder(handle_unknown='ignore')
dummies_sex = ohc.fit_transform(sexs).toarray()
# DataFrame.join aligns on the index, so the indicator frame must carry the
# same index as `dataset`; a default RangeIndex would only line up when
# `dataset` itself is indexed 0..n-1.
dataset = dataset.join(
    pd.DataFrame(dummies_sex, columns=ohc.categories_[0], index=dataset.index)
)

# Integer class code for each drug label (used later to build the target).
drug_type = {
    "DrugY": 0,
    "drugX": 1,
    'drugA': 2,
    'drugC': 3,
    'drugB': 4
}

dataset

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug,F,M
0,23,F,2,2,25.355,DrugY,1.0,0.0
1,47,M,0,2,13.093,drugC,0.0,1.0
2,47,M,0,2,10.114,drugC,0.0,1.0
3,28,F,1,2,7.798,drugX,1.0,0.0
4,61,F,0,2,18.043,DrugY,1.0,0.0
...,...,...,...,...,...,...,...,...
195,56,F,0,2,11.567,drugC,1.0,0.0
196,16,M,0,2,12.006,drugC,0.0,1.0
197,52,M,1,2,9.894,drugX,0.0,1.0
198,23,M,1,1,14.020,drugX,0.0,1.0


In [78]:
# Remove the original 'Sex' column in place (same DataFrame object is mutated).
dataset.drop(columns=['Sex'], inplace=True)

In [79]:
# Display the frame after 'Sex' was dropped by the preceding cell.
# NOTE(review): the saved output below already contains a `result` column,
# which in file order is only created by a later cell (In [77]); the saved
# execution order differs from the cell order.
dataset

Unnamed: 0,Age,BP,Cholesterol,Na_to_K,F,M,result
0,23,2,2,25.355,1.0,0.0,0
1,47,0,2,13.093,0.0,1.0,3
2,47,0,2,10.114,0.0,1.0,3
3,28,1,2,7.798,1.0,0.0,1
4,61,0,2,18.043,1.0,0.0,0
...,...,...,...,...,...,...,...
195,56,0,2,11.567,1.0,0.0,3
196,16,0,2,12.006,0.0,1.0,3
197,52,1,2,9.894,0.0,1.0,1
198,23,1,1,14.020,0.0,1.0,1


In [77]:
# Build the integer target column `result` from the Drug labels using the
# `drug_type` lookup, then remove the original label column.
# Series.map silently produces NaN for any label missing from the lookup, so
# fail fast with an explicit message instead of passing NaN targets downstream.
unknown_drugs = set(dataset.Drug.unique()) - set(drug_type)
if unknown_drugs:
    raise ValueError(
        f"Drug labels without a code in drug_type: {sorted(map(str, unknown_drugs))}"
    )
dataset['result'] = dataset.Drug.map(drug_type)
dataset.drop('Drug', inplace=True, axis=1)
dataset

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,F,M,result
0,23,F,2,2,25.355,1.0,0.0,0
1,47,M,0,2,13.093,0.0,1.0,3
2,47,M,0,2,10.114,0.0,1.0,3
3,28,F,1,2,7.798,1.0,0.0,1
4,61,F,0,2,18.043,1.0,0.0,0
...,...,...,...,...,...,...,...,...
195,56,F,0,2,11.567,1.0,0.0,3
196,16,M,0,2,12.006,0.0,1.0,3
197,52,M,1,2,9.894,0.0,1.0,1
198,23,M,1,1,14.020,0.0,1.0,1


In [69]:
# Duplicate of the Sex one-hot encoding from an earlier execution (In [69]).
# When the notebook is run top to bottom, 'Sex' has already been encoded and
# dropped, so the unguarded version raises AttributeError on `dataset.Sex`.
# The guard makes this cell a no-op in that case while keeping the original
# behaviour when 'Sex' is still present.
if 'Sex' in dataset.columns:
    sexs = dataset.Sex.values.reshape(-1, 1)
    ohc = OneHotEncoder(handle_unknown='ignore')
    dummies_sex = ohc.fit_transform(sexs).toarray()
    # Carry dataset's index so join() aligns rows correctly.
    sex_indicators = pd.DataFrame(
        dummies_sex, columns=ohc.categories_[0], index=dataset.index
    )
    # join() raises on overlapping column names; only add indicator columns
    # that a previous run has not already attached.
    new_columns = [col for col in sex_indicators.columns if col not in dataset.columns]
    dataset = dataset.join(sex_indicators[new_columns])

In [71]:
# Remove 'Sex' in place if it is still present. An earlier cell already drops
# this column, so without errors='ignore' a top-to-bottom run raises KeyError.
dataset.drop(columns='Sex', errors='ignore', inplace=True)
dataset

Unnamed: 0,Age,Cholesterol,Na_to_K,BPD,result,F,M
0,23,HIGH,25.355,2,0,1.0,0.0
1,47,HIGH,13.093,0,3,0.0,1.0
2,47,HIGH,10.114,0,3,0.0,1.0
3,28,HIGH,7.798,1,1,1.0,0.0
4,61,HIGH,18.043,0,0,1.0,0.0
...,...,...,...,...,...,...,...
195,56,HIGH,11.567,0,3,1.0,0.0
196,16,HIGH,12.006,0,3,0.0,1.0
197,52,HIGH,9.894,1,1,0.0,1.0
198,23,NORMAL,14.020,1,1,0.0,1.0


In [80]:
# Separate features and target by column NAME rather than by position.
# Positional slicing (iloc[:, :-1] / iloc[:, -1]) is only correct while
# `result` is the last column; if the indicator columns are joined after the
# target, the last column becomes an indicator and `result` leaks into X.
X = dataset.drop(columns='result')
y = dataset['result']

# Hold out one third of the rows for evaluation; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=0)
X

Unnamed: 0,Age,BP,Cholesterol,Na_to_K,F,M
0,23,2,2,25.355,1.0,0.0
1,47,0,2,13.093,0.0,1.0
2,47,0,2,10.114,0.0,1.0
3,28,1,2,7.798,1.0,0.0
4,61,0,2,18.043,1.0,0.0
...,...,...,...,...,...,...
195,56,0,2,11.567,1.0,0.0
196,16,0,2,12.006,0.0,1.0
197,52,1,2,9.894,0.0,1.0
198,23,1,1,14.020,0.0,1.0


In [81]:
# Same split parameters as the preceding cell (one third held out, seed 0),
# bound to lower-case feature names; y_train / y_test are reassigned here.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    random_state=0,
)

In [61]:
# Remove 'Sex' from the feature frame if it is still there. In file order X is
# built after 'Sex' was dropped from `dataset`, so the unguarded drop raises
# KeyError; errors='ignore' makes the cell safe to run in that state.
X.drop(columns='Sex', errors='ignore', inplace=True)

In [62]:
# Display the feature frame.
# NOTE(review): the saved output below comes from an earlier execution
# (In [62]) and shows different columns (a `BPD` column and un-encoded
# Cholesterol) than the X built by the split cell above.
X

Unnamed: 0,Age,Cholesterol,Na_to_K,BPD,F,M
0,23,HIGH,25.355,2,1.0,0.0
1,47,HIGH,13.093,0,0.0,1.0
2,47,HIGH,10.114,0,0.0,1.0
3,28,HIGH,7.798,1,1.0,0.0
4,61,HIGH,18.043,0,1.0,0.0
...,...,...,...,...,...,...
195,56,HIGH,11.567,0,1.0,0.0
196,16,HIGH,12.006,0,0.0,1.0
197,52,HIGH,9.894,1,0.0,1.0
198,23,NORMAL,14.020,1,0.0,1.0


In [82]:
# Fit a random forest on the training split and predict the held-out rows.
# The forest is a randomized estimator; fixing random_state makes the fitted
# model and the metrics reproducible across kernel restarts. The seed matches
# the one used for the train/test split.
# NOTE: the saved outputs in this notebook were produced without a seed, so
# re-running may give slightly different numbers than those shown.
classifier = RandomForestClassifier(random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [83]:
# Display the classifier's predictions for X_test (computed in the previous cell).
y_pred


array([3, 1, 0, 0, 0, 1, 1, 1, 0, 1, 2, 0, 0, 0, 4, 3, 0, 0, 1, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 3, 2, 1, 0, 0, 1, 0, 2, 1, 0, 2, 0, 1,
       0, 1, 0, 0, 2, 0, 0, 4, 0, 0, 3, 0, 1, 1, 0, 4, 0, 0, 1, 0, 0, 0,
       4], dtype=int64)

In [85]:
# Evaluate on the held-out split. Both metrics take (y_true, y_pred) in that
# order; accuracy is symmetric so the value is unchanged, but the consistent
# order avoids mistakes if an asymmetric metric is added later.
cm = confusion_matrix(y_test, y_pred)
print(accuracy_score(y_test, y_pred))
# Confusion matrix is the cell's displayed result.
cm

0.9850746268656716


array([[34,  0,  0,  0,  0],
       [ 1, 19,  0,  0,  0],
       [ 0,  0,  5,  0,  0],
       [ 0,  0,  0,  4,  0],
       [ 0,  0,  0,  0,  4]], dtype=int64)