In [4]:
import numpy as np 
import pandas as pd
import optuna as opt
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [5]:
# Column names for the Pima Indians Diabetes CSV. The file is read with
# header=None, so the names have to be supplied explicitly.
columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"

# Download the CSV straight into a DataFrame and preview the first rows.
data = pd.read_csv(url, names=columns, header=None)
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
# Columns in which a stored 0 is treated as "missing" rather than as a real reading.
cols_with_missing_values = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
]

# Step 1: turn the placeholder zeros in those columns into NaN.
zeros_as_nan = data[cols_with_missing_values].replace(0, np.nan)
data[cols_with_missing_values] = zeros_as_nan

# Step 2: fill every NaN with the mean of its column (means are taken after
# the zeros were removed, so the placeholders do not drag them down).
# NOTE(review): these means are computed over the full dataset, i.e. before the
# train/test split performed later in the notebook.
column_means = data.mean()
data.fillna(column_means, inplace=True)

# Sanity check: remaining NaN count per column.
print(data.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [7]:
# The label is the 'Outcome' column; the features are every other column.
y = data['Outcome']
x = data.drop(columns='Outcome')

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply that same transform
# to both the training and the test rows.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

print(f"Training set shape: {x_train.shape}, Test set shape: {x_test.shape}")

Training set shape: (614, 8), Test set shape: (154, 8)


In [15]:
# Candidate values for each random-forest hyperparameter; this mapping is
# passed to the grid sampler when the study is created.
search_space = dict(
    n_estimators=(50, 100, 150),
    max_depth=(5, 10, 15, 20),
)

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def objective(trial):
    """Score one random-forest configuration for an Optuna trial.

    Asks ``trial`` for ``n_estimators`` (integer in 50..200) and ``max_depth``
    (integer in 3..20), builds a ``RandomForestClassifier`` from those values
    with ``random_state=42``, and evaluates it with 3-fold cross-validation on
    the module-level ``x_train`` / ``y_train``.

    Args:
        trial: The Optuna trial object used to request hyperparameter values.

    Returns:
        The mean cross-validated accuracy over the folds.
    """
    # Dict entries are evaluated top to bottom, so 'n_estimators' is requested
    # from the trial before 'max_depth'.
    hyperparams = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
    }
    forest = RandomForestClassifier(random_state=42, **hyperparams)

    fold_accuracies = cross_val_score(forest, x_train, y_train, scoring='accuracy', cv=3)
    return fold_accuracies.mean()

In [17]:
# Exhaustive grid search over `search_space`, keeping the configuration with
# the highest objective value.
# The sampler is seeded so the order in which grid points are visited is the
# same on every Restart & Run All.
grid_sampler = opt.samplers.GridSampler(search_space, seed=42)
study = opt.create_study(direction='maximize', sampler=grid_sampler)

# Run exactly one trial per grid combination instead of a hard-coded 50:
# GridSampler stops the study once every combination has been evaluated, so a
# larger n_trials never actually runs and only misleads the reader.
n_grid_points = 1
for candidate_values in search_space.values():
    n_grid_points *= len(candidate_values)

study.optimize(objective, n_trials=n_grid_points)

[I 2025-11-20 15:35:38,138] A new study created in memory with name: no-name-0d35dfbd-1b8f-4642-81e3-4180bf8a8737
[I 2025-11-20 15:35:38,270] Trial 0 finished with value: 0.7687151283277539 and parameters: {'n_estimators': 50, 'max_depth': 15}. Best is trial 0 with value: 0.7687151283277539.
[I 2025-11-20 15:35:38,630] Trial 1 finished with value: 0.7719591901801371 and parameters: {'n_estimators': 150, 'max_depth': 20}. Best is trial 1 with value: 0.7719591901801371.
[I 2025-11-20 15:35:38,865] Trial 2 finished with value: 0.7752351347042882 and parameters: {'n_estimators': 100, 'max_depth': 10}. Best is trial 2 with value: 0.7752351347042882.
[I 2025-11-20 15:35:39,116] Trial 3 finished with value: 0.7703491152558585 and parameters: {'n_estimators': 100, 'max_depth': 20}. Best is trial 2 with value: 0.7752351347042882.
[I 2025-11-20 15:35:39,450] Trial 4 finished with value: 0.7719591901801371 and parameters: {'n_estimators': 150, 'max_depth': 5}. Best is trial 2 with value: 0.775235