In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0


In [4]:
import pandas as pd
import optuna
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
           'DiabetesPedigreeFunction', 'Age', 'Outcome']

df = pd.read_csv(url,names = columns)

df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [12]:
for cols in df.columns:
  print(f"{cols} has value counts",df[df[cols].values==0][cols].value_counts())

Pregnancies has value counts Pregnancies
0    111
Name: count, dtype: int64
Glucose has value counts Glucose
0    5
Name: count, dtype: int64
BloodPressure has value counts BloodPressure
0    35
Name: count, dtype: int64
SkinThickness has value counts SkinThickness
0    227
Name: count, dtype: int64
Insulin has value counts Insulin
0    374
Name: count, dtype: int64
BMI has value counts BMI
0.0    11
Name: count, dtype: int64
DiabetesPedigreeFunction has value counts Series([], Name: count, dtype: int64)
Age has value counts Series([], Name: count, dtype: int64)
Outcome has value counts Outcome
0    500
Name: count, dtype: int64


In [13]:
import numpy as np

In [14]:
cols_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_zeros] = df[cols_with_zeros].replace(0, np.nan)

In [15]:
df.isnull().sum()

Unnamed: 0,0
Pregnancies,0
Glucose,5
BloodPressure,35
SkinThickness,227
Insulin,374
BMI,11
DiabetesPedigreeFunction,0
Age,0
Outcome,0


In [16]:
df.fillna(df.mean(),inplace=True)

In [17]:
df.isnull().sum()

Unnamed: 0,0
Pregnancies,0
Glucose,0
BloodPressure,0
SkinThickness,0
Insulin,0
BMI,0
DiabetesPedigreeFunction,0
Age,0
Outcome,0


In [18]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,155.548223,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.548223,26.6,0.351,31,0
2,8,183.0,64.0,29.15342,155.548223,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [20]:
X=df.drop("Outcome",axis=1)
y=df["Outcome"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [25]:
X_test.shape

(231, 8)

In [54]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def objective(trial):
  n_estimators = trial.suggest_int('n_estimators',50,200)
  max_depth = trial.suggest_int('max_depth',1,20)

  model = RandomForestClassifier(
      n_estimators=n_estimators,
      max_depth=max_depth,
      random_state=42
  )

  score = cross_val_score(model,X_train,y_train,cv=5).mean()

  return score

In [55]:
study = optuna.create_study(direction = "maximize", sampler = optuna.samplers.TPESampler())
study.optimize(objective,n_trials = 50)

[I 2025-05-11 11:18:32,565] A new study created in memory with name: no-name-def5e08b-ff87-4831-9fab-ba6ef9a75e1a
[I 2025-05-11 11:18:34,159] Trial 0 finished with value: 0.7652994115610937 and parameters: {'n_estimators': 70, 'max_depth': 16}. Best is trial 0 with value: 0.7652994115610937.
[I 2025-05-11 11:18:37,282] Trial 1 finished with value: 0.7671339563862929 and parameters: {'n_estimators': 174, 'max_depth': 11}. Best is trial 1 with value: 0.7671339563862929.
[I 2025-05-11 11:18:38,539] Trial 2 finished with value: 0.7634129456559363 and parameters: {'n_estimators': 147, 'max_depth': 6}. Best is trial 1 with value: 0.7671339563862929.
[I 2025-05-11 11:18:39,841] Trial 3 finished with value: 0.7634302526825891 and parameters: {'n_estimators': 143, 'max_depth': 20}. Best is trial 1 with value: 0.7671339563862929.
[I 2025-05-11 11:18:41,111] Trial 4 finished with value: 0.7634302526825891 and parameters: {'n_estimators': 138, 'max_depth': 19}. Best is trial 1 with value: 0.767133

In [56]:
print(study.best_trial)
print(study.best_trial.value)
print(study.best_trial.params)

FrozenTrial(number=44, state=1, values=[0.7727414330218069], datetime_start=datetime.datetime(2025, 5, 11, 11, 19, 31, 781525), datetime_complete=datetime.datetime(2025, 5, 11, 11, 19, 34, 256013), params={'n_estimators': 174, 'max_depth': 16}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=200, log=False, low=50, step=1), 'max_depth': IntDistribution(high=20, log=False, low=1, step=1)}, trial_id=44, value=None)
0.7727414330218069
{'n_estimators': 174, 'max_depth': 16}


In [35]:
from sklearn.metrics import accuracy_score

In [57]:
best_model = RandomForestClassifier(n_estimators=150,max_depth=9,random_state=42)
best_model.fit(X_train,y_train)
y_pred=best_model.predict(X_test)

print(f"AccuracyScore is :{accuracy_score(y_pred,y_test)}")

AccuracyScore is :0.7619047619047619


Different Sampler

In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def objective(trial):
  n_estimators = trial.suggest_int('n_estimators',50,200)
  max_depth = trial.suggest_int('max_depth',1,20)

  model = RandomForestClassifier(
      n_estimators=n_estimators,
      max_depth=max_depth,
      random_state=42
  )

  score = cross_val_score(model,X_train,y_train,cv=5).mean()

  return score

In [38]:
study = optuna.create_study(direction = "maximize", sampler=optuna.samplers.RandomSampler())
study.optimize(objective,n_trials = 60)

[I 2025-05-11 11:03:08,237] A new study created in memory with name: no-name-a2c9f407-09ba-4618-a32c-214786fe68c3
[I 2025-05-11 11:03:10,337] Trial 0 finished with value: 0.7577881619937694 and parameters: {'n_estimators': 117, 'max_depth': 14}. Best is trial 0 with value: 0.7577881619937694.
[I 2025-05-11 11:03:12,011] Trial 1 finished with value: 0.7522672204915196 and parameters: {'n_estimators': 54, 'max_depth': 3}. Best is trial 0 with value: 0.7577881619937694.
[I 2025-05-11 11:03:13,584] Trial 2 finished with value: 0.7485808238144687 and parameters: {'n_estimators': 175, 'max_depth': 2}. Best is trial 0 with value: 0.7577881619937694.
[I 2025-05-11 11:03:15,337] Trial 3 finished with value: 0.7689685012114919 and parameters: {'n_estimators': 191, 'max_depth': 11}. Best is trial 3 with value: 0.7689685012114919.
[I 2025-05-11 11:03:16,124] Trial 4 finished with value: 0.757822776047075 and parameters: {'n_estimators': 86, 'max_depth': 13}. Best is trial 3 with value: 0.768968501

In [39]:
print(study.best_trial)
print(study.best_trial.value)
print(study.best_trial.params)

FrozenTrial(number=47, state=1, values=[0.7746105919003116], datetime_start=datetime.datetime(2025, 5, 11, 11, 4, 23, 420877), datetime_complete=datetime.datetime(2025, 5, 11, 11, 4, 24, 431220), params={'n_estimators': 112, 'max_depth': 9}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=200, log=False, low=50, step=1), 'max_depth': IntDistribution(high=20, log=False, low=1, step=1)}, trial_id=47, value=None)
0.7746105919003116
{'n_estimators': 112, 'max_depth': 9}


In [40]:
best_ransample_model = RandomForestClassifier(**study.best_trial.params,random_state=42)
best_ransample_model.fit(X_train,y_train)
y_pred=best_ransample_model.predict(X_test)

print(f"AccuracyScore is :{accuracy_score(y_pred,y_test)}")

AccuracyScore is :0.7489177489177489


# Grid Search

In [45]:
search_space = {
    'n_estimators': [50, 100, 150, 200,250,300,350,400],
    'max_depth': [5, 10, 15, 20,25,30]
}

In [46]:
study = optuna.create_study(direction = "maximize", sampler=optuna.samplers.GridSampler(search_space))
study.optimize(objective,n_trials = 70)

[I 2025-05-11 11:08:52,428] A new study created in memory with name: no-name-097237da-49c0-4328-92f4-7d5d95e14220
[I 2025-05-11 11:08:55,257] Trial 0 finished with value: 0.7690377293181031 and parameters: {'n_estimators': 300, 'max_depth': 20}. Best is trial 0 with value: 0.7690377293181031.
[I 2025-05-11 11:08:57,350] Trial 1 finished with value: 0.7541017653167186 and parameters: {'n_estimators': 250, 'max_depth': 5}. Best is trial 0 with value: 0.7690377293181031.
[I 2025-05-11 11:08:58,740] Trial 2 finished with value: 0.7652821045344409 and parameters: {'n_estimators': 150, 'max_depth': 20}. Best is trial 0 with value: 0.7690377293181031.
[I 2025-05-11 11:09:02,835] Trial 3 finished with value: 0.770889581169955 and parameters: {'n_estimators': 350, 'max_depth': 20}. Best is trial 3 with value: 0.770889581169955.
[I 2025-05-11 11:09:03,322] Trial 4 finished with value: 0.7671685704395985 and parameters: {'n_estimators': 50, 'max_depth': 25}. Best is trial 3 with value: 0.77088958

In [47]:
print(study.best_trial)
print(study.best_trial.value)
print(study.best_trial.params)

FrozenTrial(number=3, state=1, values=[0.770889581169955], datetime_start=datetime.datetime(2025, 5, 11, 11, 8, 58, 741269), datetime_complete=datetime.datetime(2025, 5, 11, 11, 9, 2, 834832), params={'n_estimators': 350, 'max_depth': 20}, user_attrs={}, system_attrs={'search_space': {'max_depth': [5, 10, 15, 20, 25, 30], 'n_estimators': [50, 100, 150, 200, 250, 300, 350, 400]}, 'grid_id': 3}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=200, log=False, low=50, step=1), 'max_depth': IntDistribution(high=20, log=False, low=1, step=1)}, trial_id=3, value=None)
0.770889581169955
{'n_estimators': 350, 'max_depth': 20}


In [48]:
best_gridsample_model = RandomForestClassifier(**study.best_trial.params,random_state=42)
best_gridsample_model.fit(X_train,y_train)
y_pred=best_gridsample_model.predict(X_test)

print(f"AccuracyScore is :{accuracy_score(y_pred,y_test)}")

AccuracyScore is :0.7402597402597403


In [49]:
from optuna.visualization import plot_optimization_history, plot_parallel_coordinate,plot_param_importances,plot_timeline,plot_slice

# Grid Search Plots

In [53]:
# prompt: use all thse plots and show in subplots

import matplotlib.pyplot as plt


plot_optimization_history(study).show()
plot_parallel_coordinate(study).show()
plot_slice(study).show()

plt.tight_layout()
plt.show()


<Figure size 640x480 with 0 Axes>

# TPE Plots

In [58]:
# prompt: use all thse plots and show in subplots

import matplotlib.pyplot as plt


plot_optimization_history(study).show()
plot_parallel_coordinate(study).show()
plot_slice(study).show()

plt.tight_layout()
plt.show()


<Figure size 640x480 with 0 Axes>