In [1]:
import numpy as np 
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.metrics import accuracy_score 

In [2]:
df=pd.read_csv("./prescription.csv")

In [3]:
df.drop_duplicates(inplace=True)
df

Unnamed: 0,disease,age,gender,severity,drug
0,diarrhea,4,male,LOW,promegranate drink
1,diarrhea,4,male,NORMAL,lime juice
2,diarrhea,5,male,LOW,promegranate drink
3,diarrhea,5,male,NORMAL,lime juice
4,diarrhea,6,male,LOW,promegranate drink
...,...,...,...,...,...
1283,diarrhea,60,male,NORMAL,kutajarishta
1284,diarrhea,60,male,LOW,ajamodarka
1285,diarrhea,60,female,HIGH,sitopaladi churna
1286,diarrhea,60,female,NORMAL,kutajarishta


In [4]:
df.columns

Index(['disease', 'age', 'gender', 'severity', 'drug'], dtype='object')

In [5]:
df.isnull().values.any()

False

In [6]:
X = df[["disease","age","gender","severity"]].values
y = df["drug"].values
print(X.shape)
print(y.shape)

(1288, 4)
(1288,)


In [7]:
from sklearn import preprocessing

label_encoders = {
    'disease': preprocessing.LabelEncoder().fit(['diarrhea', 'gastritis', 'arthritis', 'migraine']),
    'gender': preprocessing.LabelEncoder().fit(['female', 'male']),
    # 'age': preprocessing.LabelEncoder().fit(['4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31']),
    'severity': preprocessing.LabelEncoder().fit(['LOW', 'NORMAL', 'HIGH'])
}

X[:, 0] = label_encoders['disease'].transform(X[:, 0])
X[:, 2] = label_encoders['gender'].transform(X[:, 2])
# X[:, 1] = label_encoders['age'].transform(X[:, 1])
X[:, 3] = label_encoders['severity'].transform(X[:, 3])


In [8]:
X

array([[1, 4, 1, 1],
       [1, 4, 1, 2],
       [1, 5, 1, 1],
       ...,
       [1, 60, 0, 0],
       [1, 60, 0, 2],
       [1, 60, 0, 1]], dtype=object)

In [9]:
from sklearn.preprocessing import StandardScaler


scaler = StandardScaler()
X[:, 1] = scaler.fit_transform(X[:, 1].reshape(-1, 1)).flatten()

In [10]:
X

array([[1, -1.9093735317522067, 1, 1],
       [1, -1.9093735317522067, 1, 2],
       [1, -1.8449676782822495, 1, 1],
       ...,
       [1, 1.6973542625653917, 0, 0],
       [1, 1.6973542625653917, 0, 2],
       [1, 1.6973542625653917, 0, 1]], dtype=object)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [24]:
print("train data :", X_train.shape)
print("\ntest data :", X_test.shape)
print("\ntrain target :", y_train.shape)
print("\ntest target :",y_test.shape)

train data : (1030, 4)

test data : (258, 4)

train target : (1030,)

test target : (258,)


In [25]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
rf_classifier = RandomForestClassifier(n_estimators=500, random_state=1)
rf_classifier.fit(X_train, y_train)

In [27]:
y_pred = rf_classifier.predict(X_test)

In [28]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9302325581395349
