In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
# Single seed reused by the train/test split and by every model in this notebook
# so that reruns give the same results.
RANDOM_STATE = 47

In [2]:
from sklearn.metrics import accuracy_score

In [3]:
# Load the raw dataset; the relative path is resolved against the working directory.
df = pd.read_csv('drug200.csv')

In [4]:
# Show the raw frame as the cell output.
df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


In [5]:
# Categorical columns that get one-hot encoded before modelling.
cat_variables = ['Sex','BP','Cholesterol']

In [6]:
# One-hot encode each categorical column. Each column's own name is used as the
# prefix, so the generated indicator columns are named <column>_<level>.
df = pd.get_dummies(df, columns=cat_variables, prefix=cat_variables)

In [7]:
# Show the frame again to check the result of the one-hot encoding.
df

Unnamed: 0,Age,Na_to_K,Drug,Sex_F,Sex_M,BP_HIGH,BP_LOW,BP_NORMAL,Cholesterol_HIGH,Cholesterol_NORMAL
0,23,25.355,drugY,True,False,True,False,False,True,False
1,47,13.093,drugC,False,True,False,True,False,True,False
2,47,10.114,drugC,False,True,False,True,False,True,False
3,28,7.798,drugX,True,False,False,False,True,True,False
4,61,18.043,drugY,True,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...
195,56,11.567,drugC,True,False,False,True,False,True,False
196,16,12.006,drugC,False,True,False,True,False,True,False
197,52,9.894,drugX,False,True,False,False,True,True,False
198,23,14.020,drugX,False,True,False,False,True,False,True


In [8]:
# Feature columns: every column except the target.
# Compare with `!=`: the previous `x not in 'Drug'` was a substring test on the
# string 'Drug', so it would also have dropped a column named e.g. 'D' or 'rug'.
var = [x for x in df.columns if x != 'Drug']

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Keep 80% of the rows for training and hold out the rest for testing;
# the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df[var],
    df['Drug'],
    train_size=0.8,
    random_state=RANDOM_STATE,
)

In [11]:
from sklearn.metrics import accuracy_score

In [12]:
# Baseline random forest with hand-picked hyperparameters.
rf_params = {
    'n_estimators': 75,
    'max_depth': 4,
    'min_samples_split': 5,
    'random_state': RANDOM_STATE,
}
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

In [13]:
# Predict labels for the held-out test rows.
yhat = model.predict(X_test)

In [14]:
# Test-set accuracy. accuracy_score's signature is (y_true, y_pred): pass the
# ground truth first so the call stays correct if an asymmetric metric
# (precision, recall, F1) is substituted later.
print('Accuracy score: ',accuracy_score(y_test, yhat))

Accuracy score:  0.975


In [15]:
# Predict on the training rows (this overwrites the test-set predictions held
# in yhat) so training accuracy can be compared with test accuracy.
yhat = model.predict(X_train)

In [16]:
# Training-set accuracy. accuracy_score's signature is (y_true, y_pred): pass
# the ground truth first so the call stays correct if an asymmetric metric
# (precision, recall, F1) is substituted later.
print('Accuracy score: ',accuracy_score(y_train, yhat))

Accuracy score:  1.0


This is just for practice. We could also train a Decision Tree classifier and an XGBoost classifier on the same data.

In [17]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the random forest: 3 * 4 * 4 = 48 combinations.
param_grid = dict(
    n_estimators=[50, 75, 100],
    max_depth=[3, 4, 5, 6],
    min_samples_split=[2, 4, 6, 8],
)

rf = RandomForestClassifier(random_state=RANDOM_STATE)

# Exhaustive search over the grid: 5-fold cross-validation per candidate,
# scored by accuracy, run on all CPU cores, with progress printed.
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Search on the training split only; the test split is not passed in here.
grid_search.fit(X_train, y_train)

# Report the winning combination and its cross-validated accuracy.
print("Best Parameters:", grid_search.best_params_)
print("Best CV Accuracy:", grid_search.best_score_)


Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 50}
Best CV Accuracy: 1.0


In [18]:
# Grid size: 3 n_estimators x 4 max_depth x 4 min_samples_split values = 48 candidates.

In [19]:
# Refit a random forest with the best hyperparameters from the random-forest
# grid search above. They are read from best_params_ instead of being copied by
# hand, so they cannot drift from the search result if the grid or data change.
model = RandomForestClassifier(random_state=RANDOM_STATE, **grid_search.best_params_)
model.fit(X_train,y_train)

In [20]:
# Predict labels for the held-out test rows with the tuned random forest.
yhat = model.predict(X_test)

In [21]:
# Test-set accuracy. accuracy_score's signature is (y_true, y_pred): pass the
# ground truth first so the call stays correct if an asymmetric metric
# (precision, recall, F1) is substituted later.
print('Accuracy score: ',accuracy_score(y_test, yhat))

Accuracy score:  0.975


In [22]:
# Predict on the training rows (this overwrites the test-set predictions held
# in yhat) so training accuracy can be compared with test accuracy.
yhat = model.predict(X_train)

In [23]:
# Training-set accuracy. accuracy_score's signature is (y_true, y_pred): pass
# the ground truth first so the call stays correct if an asymmetric metric
# (precision, recall, F1) is substituted later.
print('Accuracy score: ',accuracy_score(y_train, yhat))

Accuracy score:  1.0


Thus, using GridSearchCV, we found the best hyperparameters (within the searched grid) for the Random Forest classifier.

In [24]:
from sklearn.preprocessing import LabelEncoder

# Encode the drug names as integers for the XGBoost cells that follow.
le = LabelEncoder()
# Learn the class -> integer mapping from the training labels only.
y_train = le.fit_transform(y_train)
# Reuse that same mapping for the test labels. Refitting on y_test (the previous
# fit_transform) could assign different integers to the same drug whenever the
# test split does not contain every class, silently corrupting the evaluation.
y_test = le.transform(y_test)

In [25]:
from xgboost import XGBClassifier
# Hyperparameter grid for XGBoost: 2 * 3 * 3 * 2 = 36 combinations.
param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 1.0],
)

xgb = XGBClassifier(random_state=RANDOM_STATE, eval_metric='mlogloss')

# Exhaustive search over the grid: 5-fold cross-validation per candidate,
# scored by accuracy, run on all CPU cores, with progress printed.
grid_search = GridSearchCV(
    xgb,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Search on the training split only; the test split is not passed in here.
grid_search.fit(X_train, y_train)

# Report the winning combination and its cross-validated accuracy.
print("Best Parameters:", grid_search.best_params_)
print("Best CV Accuracy:", grid_search.best_score_)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Parameters: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 50, 'subsample': 0.8}
Best CV Accuracy: 0.99375


In [26]:
# Refit XGBoost with the best hyperparameters from the XGBoost grid search
# above. They are read from best_params_ instead of being copied by hand, so
# they cannot drift from the search result if the grid or data change.
model = XGBClassifier(eval_metric='mlogloss', random_state=RANDOM_STATE, **grid_search.best_params_)
model.fit(X_train,y_train)

In [27]:
# Predict (integer-encoded) labels for the held-out test rows with the tuned XGBoost model.
yhat = model.predict(X_test)

In [28]:
# Test-set accuracy. accuracy_score's signature is (y_true, y_pred): pass the
# ground truth first so the call stays correct if an asymmetric metric
# (precision, recall, F1) is substituted later.
print('Accuracy score: ',accuracy_score(y_test, yhat))

Accuracy score:  0.95


In [29]:
# Predict on the training rows (this overwrites the test-set predictions held
# in yhat) so training accuracy can be compared with test accuracy.
yhat = model.predict(X_train)

In [30]:
# Training-set accuracy. accuracy_score's signature is (y_true, y_pred): pass
# the ground truth first so the call stays correct if an asymmetric metric
# (precision, recall, F1) is substituted later.
print('Accuracy score: ',accuracy_score(y_train, yhat))

Accuracy score:  1.0
