In [8]:
import numpy as np
import pandas as pd
import optuna
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [12]:
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
           'DiabetesPedigreeFunction', 'Age', 'Outcome']
df = pd.read_csv(url, names=columns)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [34]:
cols_with_missing =  ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_missing] = df[cols_with_missing].replace(0, np.nan)

In [42]:
df.fillna(df.mean(), inplace=True)
print(df.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [50]:
y = df["Outcome"]
X = df.drop(columns=["Outcome"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [52]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [60]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 5, 18)
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    score = cross_val_score(model, X_train, y_train, cv=4, scoring='accuracy').mean()
    return score

In [62]:
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=40)

[I 2025-05-14 17:01:51,443] A new study created in memory with name: no-name-f196d113-52e8-40c8-ae9f-755d619ae9e9
[I 2025-05-14 17:01:52,831] Trial 0 finished with value: 0.7670825906120023 and parameters: {'n_estimators': 92, 'max_depth': 13}. Best is trial 0 with value: 0.7670825906120023.
[I 2025-05-14 17:01:55,670] Trial 1 finished with value: 0.7654698242933538 and parameters: {'n_estimators': 182, 'max_depth': 10}. Best is trial 0 with value: 0.7670825906120023.
[I 2025-05-14 17:01:57,736] Trial 2 finished with value: 0.7605996944232238 and parameters: {'n_estimators': 131, 'max_depth': 15}. Best is trial 0 with value: 0.7670825906120023.
[I 2025-05-14 17:02:00,560] Trial 3 finished with value: 0.776844070961718 and parameters: {'n_estimators': 179, 'max_depth': 13}. Best is trial 3 with value: 0.776844070961718.
[I 2025-05-14 17:02:03,183] Trial 4 finished with value: 0.7736291486291486 and parameters: {'n_estimators': 177, 'max_depth': 9}. Best is trial 3 with value: 0.77684407

In [64]:
print(f"Best Trial Accuracy: ", study.best_trial.value)
print(f"Best Hyperparameters: ", study.best_trial.params)

Best Trial Accuracy:  0.7817566420507597
Best Hyperparameters:  {'n_estimators': 93, 'max_depth': 7}


In [66]:
new_model = RandomForestClassifier(n_estimators=93, max_depth=7)
new_model.fit(X_train, y_train)
pred = new_model.predict(X_test)
print(accuracy_score(y_test, pred))

0.7467532467532467
