In [29]:
# Import necessary libraries
import optuna
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the Pima Indian Diabetes dataset from sklearn
# Note: Scikit-learn's built-in 'load_diabetes' is a regression dataset.
# We will load the actual diabetes dataset from an external source
import pandas as pd

# Load the Pima Indian Diabetes dataset (from UCI repository)
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
           'DiabetesPedigreeFunction', 'Age', 'Outcome']

# Load the dataset
df = pd.read_csv(url, names=columns)

df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
import numpy as np

# Replace zero values with NaN in columns where zero is not a valid value
cols_with_missing_vals = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_missing_vals] = df[cols_with_missing_vals].replace(0, np.nan)

# Impute the missing values with the mean of the respective column
df.fillna(df.mean(), inplace=True)

# Check if there are any remaining missing values
print(df.isnull().sum())


Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [3]:
# Split into features (X) and target (y)
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Split data into training and test sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Optional: Scale the data for better model performance
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Check the shape of the data
print(f'Training set shape: {X_train.shape}')
print(f'Test set shape: {X_test.shape}')


Training set shape: (537, 8)
Test set shape: (231, 8)


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

#Define the objective Function
def objective(trial):
    n_estimators=trial.suggest_int('n_estimators',50,200)
    max_depth=trial.suggest_int('max_depth',3,20)

    #Create the randomForestClassifier with suggested hyperparameter
    model=RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth,random_state=42)

    #perform 3-fold cross-validation and calculate accuracy
    score=cross_val_score(model,X_train,y_train,cv=3,scoring='accuracy').mean()

    return score #Return the accuracy score for optuna to maximize





In [5]:
#Create a study and optimize the objective function
study=optuna.create_study(direction='maximize',sampler=optuna.samplers.TPESampler())  #We aim to maximize accuracy

study.optimize(objective,n_trials=50) #Run 50 trials to find the best hyperparameters

[I 2025-06-11 15:49:16,209] A new study created in memory with name: no-name-1ed9c4bc-2bdd-4f1f-9e8b-91f10496bf7d
[I 2025-06-11 15:49:16,871] Trial 0 finished with value: 0.7616387337057727 and parameters: {'n_estimators': 198, 'max_depth': 9}. Best is trial 0 with value: 0.7616387337057727.
[I 2025-06-11 15:49:17,492] Trial 1 finished with value: 0.7597765363128491 and parameters: {'n_estimators': 198, 'max_depth': 6}. Best is trial 0 with value: 0.7616387337057727.
[I 2025-06-11 15:49:17,870] Trial 2 finished with value: 0.7690875232774674 and parameters: {'n_estimators': 103, 'max_depth': 7}. Best is trial 2 with value: 0.7690875232774674.
[I 2025-06-11 15:49:18,510] Trial 3 finished with value: 0.7709497206703911 and parameters: {'n_estimators': 185, 'max_depth': 12}. Best is trial 3 with value: 0.7709497206703911.
[I 2025-06-11 15:49:19,160] Trial 4 finished with value: 0.7672253258845437 and parameters: {'n_estimators': 185, 'max_depth': 15}. Best is trial 3 with value: 0.7709497

In [6]:
#Print Best Result
print(f'Best trial accuracy:{study.best_trial.value}')
print(f'Best hyperparameter:{study.best_trial.params}')

Best trial accuracy:0.7821229050279331
Best hyperparameter:{'n_estimators': 73, 'max_depth': 20}


In [7]:
from sklearn.metrics import accuracy_score

# Train a RandomForest Classifier using the best hyperparameters from optuna
best_model=RandomForestClassifier(**study.best_trial.params,random_state=42)

#Fit the model to the training data
best_model.fit(X_train,y_train)

#Make predictions on the test set
y_pred=best_model.predict(X_test)

#Calculate the accuracy on the test set
test_accuracy=accuracy_score(y_test,y_pred)

#print the test accuracy
print(f'Test accuracy with best hyperparameters:{test_accuracy:.2f}')

Test accuracy with best hyperparameters:0.75


In [None]:
#Random Sampler


#Create a study and optimize the objective function
study=optuna.create_study(direction='maximize',sampler=optuna.samplers.RandomSampler())  #We aim to maximize accuracy

study.optimize(objective,n_trials=50) #Run 50 trials to find the best hyperparameters

[I 2025-06-11 15:37:36,605] A new study created in memory with name: no-name-3c04e073-9816-4f2c-98d8-dcd8fe6d529b
[I 2025-06-11 15:37:37,286] Trial 0 finished with value: 0.7765363128491621 and parameters: {'n_estimators': 183, 'max_depth': 16}. Best is trial 0 with value: 0.7765363128491621.
[I 2025-06-11 15:37:37,835] Trial 1 finished with value: 0.7690875232774674 and parameters: {'n_estimators': 155, 'max_depth': 14}. Best is trial 0 with value: 0.7765363128491621.
[I 2025-06-11 15:37:38,430] Trial 2 finished with value: 0.7765363128491621 and parameters: {'n_estimators': 168, 'max_depth': 16}. Best is trial 0 with value: 0.7765363128491621.
[I 2025-06-11 15:37:38,675] Trial 3 finished with value: 0.7783985102420857 and parameters: {'n_estimators': 65, 'max_depth': 19}. Best is trial 3 with value: 0.7783985102420857.
[I 2025-06-11 15:37:39,108] Trial 4 finished with value: 0.7709497206703911 and parameters: {'n_estimators': 104, 'max_depth': 18}. Best is trial 3 with value: 0.77839

In [11]:
from sklearn.metrics import accuracy_score

# Train a RandomForest Classifier using the best hyperparameters from optuna
best_model=RandomForestClassifier(**study.best_trial.params,random_state=42)

#Fit the model to the training data
best_model.fit(X_train,y_train)

#Make predictions on the test set
y_pred=best_model.predict(X_test)

#Calculate the accuracy on the test set
test_accuracy=accuracy_score(y_test,y_pred)

#print the test accuracy
print(f'Test accuracy with best hyperparameters:{test_accuracy:.2f}')

Test accuracy with best hyperparameters:0.76


In [14]:
#Grid Search Sampler

search_space={'n_estimators':[50,100,150,200],
             'max_depth':[5,10,15,20]}

study=optuna.create_study(direction='maximize',sampler=optuna.samplers.GridSampler(search_space)) 

study.optimize(objective)

[I 2025-06-11 15:42:00,326] A new study created in memory with name: no-name-69e83283-6bfd-4b49-bcd5-48c075f3febc
[I 2025-06-11 15:42:00,659] Trial 0 finished with value: 0.7690875232774674 and parameters: {'n_estimators': 100, 'max_depth': 5}. Best is trial 0 with value: 0.7690875232774674.
[I 2025-06-11 15:42:01,182] Trial 1 finished with value: 0.7672253258845437 and parameters: {'n_estimators': 150, 'max_depth': 10}. Best is trial 0 with value: 0.7690875232774674.
[I 2025-06-11 15:42:01,357] Trial 2 finished with value: 0.7728119180633147 and parameters: {'n_estimators': 50, 'max_depth': 15}. Best is trial 2 with value: 0.7728119180633147.
[I 2025-06-11 15:42:01,745] Trial 3 finished with value: 0.7653631284916201 and parameters: {'n_estimators': 100, 'max_depth': 15}. Best is trial 2 with value: 0.7728119180633147.
[I 2025-06-11 15:42:02,115] Trial 4 finished with value: 0.7690875232774674 and parameters: {'n_estimators': 100, 'max_depth': 20}. Best is trial 2 with value: 0.772811

In [15]:
from sklearn.metrics import accuracy_score

# Train a RandomForest Classifier using the best hyperparameters from optuna
best_model=RandomForestClassifier(**study.best_trial.params,random_state=42)

#Fit the model to the training data
best_model.fit(X_train,y_train)

#Make predictions on the test set
y_pred=best_model.predict(X_test)

#Calculate the accuracy on the test set
test_accuracy=accuracy_score(y_test,y_pred)

#print the test accuracy
print(f'Test accuracy with best hyperparameters:{test_accuracy:.2f}')

Test accuracy with best hyperparameters:0.74


In [9]:
from optuna.visualization import plot_optimization_history,plot_parallel_coordinate,plot_slice,plot_contour,plot_param_importances

In [10]:
plot_optimization_history(study).show()

In [11]:
plot_parallel_coordinate(study).show()

In [12]:
plot_slice(study).show()

In [13]:
plot_contour(study).show()

In [14]:
plot_param_importances(study).show()

## Optimizing Multiple ML Models

In [15]:
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.svm import SVC

In [23]:
#Define the objective function for optuna

def objective(trial):
    #Choose the algorithm to tune
    classifier_name= trial.suggest_categorical('classifier', ['RandomForest', 'GradientBoosting', 'SVM'])
    if classifier_name == 'SVM':
        # SVM hyperparameters
        c=trial.suggest_float('C',0.1,100,log=True)
        kernel=trial.suggest_categorical('kernel',['linear','rbf','poly','sigmoid']
                                         )
        gamma=trial.suggest_categorical('gamma',['scale','auto']
                                        )
        model=SVC(C=c,kernel=kernel,gamma=gamma,random_state=42)

    elif classifier_name=='RandomForest':
        #Random Forest hyperparameter
        n_estimators=trial.suggest_int('n_estimators',50,300)
        max_depth=trial.suggest_int('max_depth',3,20)
        min_samples_split=trial.suggest_int('min_samples_split',2,10)
        min_samples_leaf=trial.suggest_int('min_samples_leaf',1,10)
        bootstrap=trial.suggest_categorical('bootstrap',[True,False])   

        model=RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth,min_samples_leaf=min_samples_leaf,min_samples_split=min_samples_split)

    elif classifier_name=='GradientBoosting':
         #GradientBoosting hyperparameter
        n_estimators=trial.suggest_int('n_estimators',50,300)
        max_depth=trial.suggest_int('max_depth',3,20)
        min_samples_split=trial.suggest_int('min_samples_split',2,10)
        min_samples_leaf=trial.suggest_int('min_samples_leaf',1,10)
        bootstrap=trial.suggest_categorical('bootstrap',[True,False])

        model=GradientBoostingClassifier(n_estimators=n_estimators,max_depth=max_depth,min_samples_leaf=min_samples_leaf,min_samples_split=min_samples_split)
    #Perform cross-validation and return the mean accuracy

    score=cross_val_score(model,X_train,y_train,cv=3,scoring='accuracy').mean()
    return score


In [24]:
#Create a study and optimize it using CmaEsSampler
study=optuna.create_study(direction='maximize')
study.optimize(objective,n_trials=100)

[I 2025-06-11 16:53:21,374] A new study created in memory with name: no-name-22e91017-6908-4b73-9499-225200c6141d
[I 2025-06-11 16:53:21,399] Trial 0 finished with value: 0.7467411545623835 and parameters: {'classifier': 'SVM', 'C': 8.293476728871376, 'kernel': 'rbf', 'gamma': 'auto'}. Best is trial 0 with value: 0.7467411545623835.
[I 2025-06-11 16:53:21,719] Trial 1 finished with value: 0.7541899441340781 and parameters: {'classifier': 'GradientBoosting', 'n_estimators': 68, 'max_depth': 4, 'min_samples_split': 2, 'min_samples_leaf': 2, 'bootstrap': True}. Best is trial 1 with value: 0.7541899441340781.
[I 2025-06-11 16:53:22,583] Trial 2 finished with value: 0.7709497206703911 and parameters: {'classifier': 'RandomForest', 'n_estimators': 262, 'max_depth': 16, 'min_samples_split': 7, 'min_samples_leaf': 3, 'bootstrap': True}. Best is trial 2 with value: 0.7709497206703911.
[I 2025-06-11 16:53:24,223] Trial 3 finished with value: 0.7355679702048418 and parameters: {'classifier': 'Gra

In [25]:
#Print Best Result
print(f'Best trial accuracy:{study.best_trial.value}')
print(f'Best hyperparameter:{study.best_trial.params}')

Best trial accuracy:0.7895716945996275
Best hyperparameter:{'classifier': 'SVM', 'C': 0.13774702528130273, 'kernel': 'linear', 'gamma': 'auto'}


In [26]:
#Print the study in a dataframe
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_bootstrap,params_classifier,params_gamma,params_kernel,params_max_depth,params_min_samples_leaf,params_min_samples_split,params_n_estimators,state
0,0,0.746741,2025-06-11 16:53:21.375747,2025-06-11 16:53:21.399669,0 days 00:00:00.023922,8.293477,,SVM,auto,rbf,,,,,COMPLETE
1,1,0.754190,2025-06-11 16:53:21.400419,2025-06-11 16:53:21.719557,0 days 00:00:00.319138,,True,GradientBoosting,,,4.0,2.0,2.0,68.0,COMPLETE
2,2,0.770950,2025-06-11 16:53:21.720768,2025-06-11 16:53:22.583754,0 days 00:00:00.862986,,True,RandomForest,,,16.0,3.0,7.0,262.0,COMPLETE
3,3,0.735568,2025-06-11 16:53:22.584531,2025-06-11 16:53:24.223205,0 days 00:00:01.638674,,True,GradientBoosting,,,16.0,8.0,5.0,193.0,COMPLETE
4,4,0.765363,2025-06-11 16:53:24.224273,2025-06-11 16:53:24.966293,0 days 00:00:00.742020,,True,RandomForest,,,15.0,3.0,3.0,225.0,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,0.787709,2025-06-11 16:53:42.237837,2025-06-11 16:53:42.261908,0 days 00:00:00.024071,0.178593,,SVM,auto,linear,,,,,COMPLETE
96,96,0.763501,2025-06-11 16:53:42.266828,2025-06-11 16:53:43.094434,0 days 00:00:00.827606,,True,RandomForest,,,3.0,8.0,6.0,276.0,COMPLETE
97,97,0.787709,2025-06-11 16:53:43.095716,2025-06-11 16:53:43.114918,0 days 00:00:00.019202,0.176149,,SVM,auto,linear,,,,,COMPLETE
98,98,0.785847,2025-06-11 16:53:43.115855,2025-06-11 16:53:43.135451,0 days 00:00:00.019596,0.205876,,SVM,auto,linear,,,,,COMPLETE


In [28]:
study.trials_dataframe()['params_classifier'].value_counts()

params_classifier
SVM                 77
RandomForest        12
GradientBoosting    11
Name: count, dtype: int64