In [2]:
import optuna
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Source CSV for the Pima Indians diabetes data; column labels are supplied by hand below.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]

# Passing names= already implies header=None; it is spelled out so the first
# line of the file is visibly treated as data rather than as a header row.
df = pd.read_csv(url, header=None, names=columns)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Zeros in these columns are treated as missing measurements and turned into NaN.
cols_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
zero_mask = df[cols_with_zeros] == 0
df[cols_with_zeros] = df[cols_with_zeros].mask(zero_mask)

# Fill every NaN with the mean of its column.
# NOTE(review): the means are taken over the whole frame, before the train/test
# split done later in the notebook, so held-out rows influence the imputed
# values. Consider imputing with training-set means after the split.
df = df.fillna(df.mean())

# Sanity check: every column should now report zero missing values.
print(df.isna().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [5]:
# The label is the 'Outcome' column; every other column is a feature.
y = df['Outcome']
X = df.drop(columns='Outcome')

# Hold out 30% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Scaling statistics are learned from the training rows only and then
# applied unchanged to the test rows.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(537, 8) (231, 8) (537,) (231,)


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Define the objective function for Optuna
def objective(trial):
    """Score one Optuna trial of a random forest.

    Samples ``n_estimators`` (50 to 200) and ``max_depth`` (5 to 20) from
    ``trial``, builds a ``RandomForestClassifier`` with ``random_state=42``
    and returns its mean 3-fold cross-validated accuracy on the
    notebook-level ``X_train`` / ``y_train``.
    """
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),  # number of trees in the forest
        'max_depth': trial.suggest_int('max_depth', 5, 20),
    }
    forest = RandomForestClassifier(random_state=42, **sampled)

    # 3-fold cross-validation; the study maximises the mean fold accuracy.
    fold_scores = cross_val_score(forest, X_train, y_train, cv=3, scoring='accuracy')
    return fold_scores.mean()

In [8]:
# Create the study and run the search over `objective`.
# TPE is Optuna's default sampler; it is constructed explicitly so it can be
# seeded, which makes the sequence of suggested hyperparameters (and therefore
# the reported best trial) repeatable when the notebook is re-run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)  # Number of trials to run

[I 2025-07-28 20:38:13,502] A new study created in memory with name: no-name-756b9923-821d-46d0-8f39-599d70d43661
[I 2025-07-28 20:38:13,884] Trial 0 finished with value: 0.7709497206703911 and parameters: {'n_estimators': 149, 'max_depth': 7}. Best is trial 0 with value: 0.7709497206703911.
[I 2025-07-28 20:38:14,120] Trial 1 finished with value: 0.7635009310986964 and parameters: {'n_estimators': 94, 'max_depth': 11}. Best is trial 0 with value: 0.7709497206703911.
[I 2025-07-28 20:38:14,355] Trial 2 finished with value: 0.7709497206703911 and parameters: {'n_estimators': 93, 'max_depth': 17}. Best is trial 0 with value: 0.7709497206703911.
[I 2025-07-28 20:38:14,702] Trial 3 finished with value: 0.7728119180633147 and parameters: {'n_estimators': 143, 'max_depth': 7}. Best is trial 3 with value: 0.7728119180633147.
[I 2025-07-28 20:38:14,982] Trial 4 finished with value: 0.7709497206703911 and parameters: {'n_estimators': 123, 'max_depth': 5}. Best is trial 3 with value: 0.772811918

In [9]:
# Report the best trial: its objective value and the hyperparameters it sampled
best_trial = study.best_trial
print("Best Trial Accuracy:", best_trial.value)
print("Best Hyperparameters:", best_trial.params)

Best Trial Accuracy: 0.7858472998137803
Best Hyperparameters: {'n_estimators': 121, 'max_depth': 15}


In [10]:
from sklearn.metrics import accuracy_score

# Refit a forest on the whole training split using the best trial's hyperparameters
best_params = study.best_trial.params
best_model = RandomForestClassifier(random_state=42, **best_params)
best_model.fit(X_train, y_train)

# Evaluate once on the held-out test split
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy of the Best Model:", test_accuracy)

Test Accuracy of the Best Model: 0.7575757575757576


Optuna supports other samplers as well. The workflow stays the same; only the `create_study` line changes:

**Random Sampler** – `study = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler())`

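**Grid Search Sampler** – `study = optuna.create_study(direction='maximize', sampler=optuna.samplers.GridSampler(search_space))`, where `search_space` is a dict mapping each hyperparameter name to the list of values to try, e.g. `{'n_estimators': [50, 100, 200], 'max_depth': [5, 10, 20]}`.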

#### **Visualizations in Optuna -**

In [11]:
from optuna.visualization import plot_optimization_history, plot_parallel_coordinate, plot_slice, plot_contour, plot_param_importances

In [None]:
# 1) Optimization History - how the objective value changes from trial to trial
history_fig = plot_optimization_history(study)
history_fig.show()

In [13]:
# 2) Parallel Coordinate Plot - relates the sampled hyperparameters to the objective value
parallel_fig = plot_parallel_coordinate(study)
parallel_fig.show()

In [14]:
# 3) Slice Plot - how the sampled values of each hyperparameter are spread across the trials
slice_fig = plot_slice(study)
slice_fig.show()

In [16]:
# 4) Contour Plot - objective value as a function of two hyperparameters at a time
contour_fig = plot_contour(study)
contour_fig.show()

In [17]:
# 5) Parameter Importances - relative importance of each hyperparameter
importances_fig = plot_param_importances(study)
importances_fig.show()

#### **Optimizing Multiple ML Models at Once to Find the Best Model**

We can use Optuna to compare several ML models and tune their hyperparameters in a single study.

This is one of its most useful features for deciding which model and which hyperparameters to use.

In [19]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

In [23]:
# Defining the objective function for multiple models

def _suggest_svm(trial):
    """Sample SVC hyperparameters from ``trial`` and return the unfitted classifier."""
    c = trial.suggest_float('C', 0.1, 100, log=True)  # regularization parameter
    kernel = trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly', 'sigmoid'])
    # Kernel coefficient setting for the 'rbf', 'poly' and 'sigmoid' kernels
    gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])
    return SVC(C=c, kernel=kernel, gamma=gamma, random_state=42)


def _suggest_random_forest(trial):
    """Sample random-forest hyperparameters from ``trial`` and return the unfitted classifier."""
    n_estimators = trial.suggest_int('n_estimators', 50, 300)  # number of trees
    max_depth = trial.suggest_int('max_depth', 3, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)  # min samples to split an internal node
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)  # min samples required at a leaf
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])  # use bootstrap samples per tree?
    return RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        bootstrap=bootstrap,
        random_state=42,
    )


def _suggest_gradient_boosting(trial):
    """Sample gradient-boosting hyperparameters from ``trial`` and return the unfitted classifier."""
    n_estimators = trial.suggest_int('n_estimators', 50, 300)  # number of boosting stages
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)  # shrinkage step size
    max_depth = trial.suggest_int('max_depth', 3, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    return GradientBoostingClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42,
    )


# Maps the value of the 'classifier' parameter to the helper that builds that model.
_MODEL_BUILDERS = {
    'SVM': _suggest_svm,
    'RandomForest': _suggest_random_forest,
    'GradientBoosting': _suggest_gradient_boosting,
}


def Objective(trial):
    """Score one Optuna trial that first picks a model family, then its hyperparameters.

    The trial chooses ``'classifier'`` from SVM / RandomForest /
    GradientBoosting, the matching helper samples that model's
    hyperparameters, and the model is scored by mean 3-fold cross-validated
    accuracy on the notebook-level ``X_train`` / ``y_train``.

    Returns:
        The mean accuracy over the three folds.

    Raises:
        ValueError: if the trial yields a classifier name with no registered
            builder (previously this surfaced as an unbound ``model`` variable).
    """
    # Choose the model to tune
    classifier_name = trial.suggest_categorical(
        'classifier', ['SVM', 'RandomForest', 'GradientBoosting']
    )

    build_model = _MODEL_BUILDERS.get(classifier_name)
    if build_model is None:
        raise ValueError(f"Unsupported classifier: {classifier_name!r}")
    model = build_model(trial)

    # Perform 3-fold cross-validation
    score = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy').mean()
    return score

In [24]:
# TPE is the default sampler; it is constructed explicitly so it can be seeded,
# which makes the sequence of sampled models and hyperparameters repeatable
# when the notebook is re-run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(Objective, n_trials=100)

[I 2025-07-29 03:30:09,485] A new study created in memory with name: no-name-c96f4e8d-3997-479e-be6f-c5e14e79eece


[I 2025-07-29 03:30:10,007] Trial 0 finished with value: 0.7746741154562384 and parameters: {'classifier': 'RandomForest', 'n_estimators': 233, 'max_depth': 11, 'min_samples_split': 9, 'min_samples_leaf': 8, 'bootstrap': False}. Best is trial 0 with value: 0.7746741154562384.
[I 2025-07-29 03:30:10,275] Trial 1 finished with value: 0.7597765363128491 and parameters: {'classifier': 'RandomForest', 'n_estimators': 107, 'max_depth': 15, 'min_samples_split': 4, 'min_samples_leaf': 4, 'bootstrap': True}. Best is trial 0 with value: 0.7746741154562384.
[I 2025-07-29 03:30:10,722] Trial 2 finished with value: 0.7709497206703911 and parameters: {'classifier': 'RandomForest', 'n_estimators': 228, 'max_depth': 11, 'min_samples_split': 4, 'min_samples_leaf': 9, 'bootstrap': False}. Best is trial 0 with value: 0.7746741154562384.
[I 2025-07-29 03:30:12,260] Trial 3 finished with value: 0.7597765363128491 and parameters: {'classifier': 'GradientBoosting', 'n_estimators': 267, 'learning_rate': 0.017

In [26]:
# Best Results
# The study has a single objective, so report the scalar `.value` instead of
# the one-element `.values` list (which printed the accuracy wrapped in brackets).
best_trial = study.best_trial
print("Best Trial Accuracy:", best_trial.value)
print("Best Hyperparameters:", best_trial.params)

Best Trial Accuracy: [0.7895716945996275]
Best Hyperparameters: {'classifier': 'SVM', 'C': 0.12016359922855274, 'kernel': 'linear', 'gamma': 'auto'}


**Analysis on the Current Training**

In [28]:
# Tabulate the study as a DataFrame with one row per trial: objective value,
# start/complete timestamps, duration, the sampled params_* columns and the state.
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_bootstrap,params_classifier,params_gamma,params_kernel,params_learning_rate,params_max_depth,params_min_samples_leaf,params_min_samples_split,params_n_estimators,state
0,0,0.774674,2025-07-29 03:30:09.486868,2025-07-29 03:30:10.007949,0 days 00:00:00.521081,,False,RandomForest,,,,11.0,8.0,9.0,233.0,COMPLETE
1,1,0.759777,2025-07-29 03:30:10.015208,2025-07-29 03:30:10.275508,0 days 00:00:00.260300,,True,RandomForest,,,,15.0,4.0,4.0,107.0,COMPLETE
2,2,0.770950,2025-07-29 03:30:10.277590,2025-07-29 03:30:10.722182,0 days 00:00:00.444592,,False,RandomForest,,,,11.0,9.0,4.0,228.0,COMPLETE
3,3,0.759777,2025-07-29 03:30:10.722182,2025-07-29 03:30:12.260175,0 days 00:00:01.537993,,,GradientBoosting,,,0.017873,14.0,7.0,2.0,267.0,COMPLETE
4,4,0.752328,2025-07-29 03:30:12.261178,2025-07-29 03:30:12.286609,0 days 00:00:00.025431,1.032669,,SVM,scale,sigmoid,,,,,,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,0.789572,2025-07-29 03:30:26.737351,2025-07-29 03:30:26.751789,0 days 00:00:00.014438,0.154576,,SVM,auto,linear,,,,,,COMPLETE
96,96,0.789572,2025-07-29 03:30:26.752311,2025-07-29 03:30:26.766627,0 days 00:00:00.014316,0.120978,,SVM,auto,linear,,,,,,COMPLETE
97,97,0.785847,2025-07-29 03:30:26.767634,2025-07-29 03:30:26.780743,0 days 00:00:00.013109,0.194718,,SVM,auto,linear,,,,,,COMPLETE
98,98,0.785847,2025-07-29 03:30:26.781778,2025-07-29 03:30:26.795391,0 days 00:00:00.013613,0.230037,,SVM,auto,linear,,,,,,COMPLETE


In [None]:
# Number of trials in which each classifier was sampled
classifier_column = study.trials_dataframe()['params_classifier']
classifier_column.value_counts()

params_classifier
SVM                 79
RandomForest        11
GradientBoosting    10
Name: count, dtype: int64

In [None]:
# Mean objective value (cross-validated accuracy) per classifier across the trials
trials_by_classifier = study.trials_dataframe().groupby('params_classifier')
trials_by_classifier['value'].mean()

params_classifier
GradientBoosting    0.745624
RandomForest        0.765532
SVM                 0.774462
Name: value, dtype: float64

In [31]:
# Hyperparameter importances for the current `study`
multi_importances_fig = plot_param_importances(study)
multi_importances_fig.show()