In [5]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score

In [6]:
# Read the drug dataset from the CSV file and preview its first rows
csv_path = '../Data/drug200.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


In [9]:
# Drop rows containing missing values, if there are any
if df.isna().any().any():
    df = df.dropna()

# Strip surrounding whitespace from every text column. 'Sex' is included so
# that all categorical columns are cleaned the same way before encoding.
# Columns that are no longer string-typed (e.g. this cell is re-run after the
# label-encoding cell has replaced them with integers) are skipped, which
# keeps the cell safe to execute more than once.
for column in ['Sex', 'BP', 'Cholesterol', 'Drug']:
    if pd.api.types.is_string_dtype(df[column]):
        df[column] = df[column].str.strip()

# Summarise column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Sex          200 non-null    object 
 2   BP           200 non-null    object 
 3   Cholesterol  200 non-null    object 
 4   Na_to_K      200 non-null    float64
 5   Drug         200 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.5+ KB


In [17]:
# Create list of unique drugs
uniq_products= df['Drug'].unique()
print(f"The dataset has {len(uniq_products)} unique drugs")

The dataset has 5 unique drugs


In [11]:
# Display the distinct drug labels held in uniq_products
print(uniq_products)

['DrugY' 'drugC' 'drugX' 'drugA' 'drugB']


In [14]:
# Encode object types
encoder = LabelEncoder()

df['Sex'] = encoder.fit_transform(df['Sex'])
df['BP'] = encoder.fit_transform(df['BP'])
df['Cholesterol'] = encoder.fit_transform(df['Cholesterol'])

In [15]:
# Create train and target dataset
X = df.drop(['Drug'], axis = 1)
y = df['Drug']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Define the hyperparameter grid for GridSearchCV
param_grid = {
    "n_estimators": [10, 20, 30, 40, 50],
    "max_depth": [None, 5, 10, 15, 20],
    "min_samples_split": [2, 4, 6, 8, 10],
    "min_samples_leaf": [1, 2, 4, 6, 8]
}

# Initialize the RandomForestClassifier
clf = RandomForestClassifier()

# Perform grid search with cross-validation
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring="accuracy")
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)

# Train the best model on the entire data set
best_clf = grid_search.best_estimator_
best_clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = best_clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Best hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 10}
Accuracy: 0.95
