<a href="https://colab.research.google.com/github/Bisma-Shafiq/Deep-Learning_Pytorch/blob/main/Optuna_Basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.2.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.0-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.4/383.4 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.1-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [2]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Source CSV and the labels to attach to its columns; because `names=` is
# passed, the first line of the file is read as data rather than as a header.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]

# Read the remote file into a DataFrame.
df = pd.read_csv(url, names=columns)

# Preview the first five rows.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# A zero is not a valid reading for these columns, so treat it as "missing"
# by swapping it for NaN before imputation.
cols_with_missing_vals = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
zeros_as_nan = df[cols_with_missing_vals].replace(to_replace=0, value=np.nan)
df[cols_with_missing_vals] = zeros_as_nan


In [5]:
# Fill each NaN with the mean of its own column.
# NOTE(review): the means are computed over the whole frame, i.e. before the
# train/test split performed later, so held-out rows contribute to the fill
# values -- consider imputing after the split.
column_means = df.mean()
df = df.fillna(column_means)

# Per-column count of values that are still missing.
print(df.isnull().sum())


Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [7]:
# Target vector is the 'Outcome' column; the feature matrix is everything else.
y = df['Outcome']
x = df.drop(columns=['Outcome'])

In [8]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# partition the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Scaling: learn the standardisation statistics from the training split only,
# then apply that same fitted transformation to both splits.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [10]:
# (rows, columns) of the scaled train and test matrices.
(x_train.shape, x_test.shape)

((614, 8), (154, 8))

In [15]:
# Random Forest
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Objective Function
def objective(trial):
  """Evaluate one Optuna trial and return the value to optimise.

  Asks `trial` for an integer `n_estimators` in [50, 200] and an integer
  `max_depth` in [3, 20], builds a RandomForestClassifier with those values
  and `random_state=42`, and returns the mean of its 5-fold cross-validation
  scores on the notebook-level `x_train` / `y_train`.
  """
  # Dict entries are evaluated top to bottom, so n_estimators is still
  # suggested before max_depth.
  params = {
      'n_estimators': trial.suggest_int('n_estimators', 50, 200),
      'max_depth': trial.suggest_int('max_depth', 3, 20),
  }
  forest = RandomForestClassifier(random_state=42, **params)

  # One score per fold; their average is this trial's objective value.
  fold_scores = cross_val_score(forest, x_train, y_train, cv=5)
  return fold_scores.mean()


In [17]:
# Create the study and run the search.
# The TPE sampler is given a fixed seed: without one, every fresh run draws a
# different sequence of hyperparameters, so the best trial reported below
# could not be reproduced with Restart & Run All.
study = optuna.create_study(
    direction='maximize',  # the objective returns a score, so higher is better
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

[I 2025-01-23 09:27:53,815] A new study created in memory with name: no-name-5ba9c286-5f1f-47f0-9ebe-15f25fef770b
[I 2025-01-23 09:28:00,080] Trial 0 finished with value: 0.765480474476876 and parameters: {'n_estimators': 185, 'max_depth': 18}. Best is trial 0 with value: 0.765480474476876.
[I 2025-01-23 09:28:02,205] Trial 1 finished with value: 0.7638677862188459 and parameters: {'n_estimators': 107, 'max_depth': 11}. Best is trial 0 with value: 0.765480474476876.
[I 2025-01-23 09:28:03,418] Trial 2 finished with value: 0.7671598027455685 and parameters: {'n_estimators': 64, 'max_depth': 4}. Best is trial 2 with value: 0.7671598027455685.
[I 2025-01-23 09:28:05,295] Trial 3 finished with value: 0.7671064907370384 and parameters: {'n_estimators': 182, 'max_depth': 16}. Best is trial 2 with value: 0.7671598027455685.
[I 2025-01-23 09:28:07,524] Trial 4 finished with value: 0.767119818739171 and parameters: {'n_estimators': 199, 'max_depth': 9}. Best is trial 2 with value: 0.76715980274

In [20]:
# Report the best objective value found by the study (the mean cross-validation
# score returned by `objective`) and the hyperparameters that produced it.
print(
    f"best accuracy:{study.best_value}",
    f'best hyperparameters: {study.best_params}',
    sep="\n",
)

best accuracy:0.776969212315074
best hyperparameters: {'n_estimators': 77, 'max_depth': 6}


In [21]:
from sklearn.metrics import accuracy_score

# Refit a forest on the full training split using the best trial's
# hyperparameters and the same seed used inside the objective.
best_model = RandomForestClassifier(random_state=42, **study.best_trial.params)
best_model.fit(x_train, y_train)

# Accuracy on the held-out test split.
y_pred = best_model.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f'test accuracy: {test_accuracy}')

test accuracy: 0.7337662337662337


In [26]:
from optuna.visualization import (
    plot_contour,
    plot_optimization_history,
    plot_parallel_coordinate,
    plot_param_importances,
    plot_slice,
)

# Optimization history of the study.
optimization_history_fig = plot_optimization_history(study)
optimization_history_fig.show()

In [23]:
# Parallel-coordinate view of the study's trials.
parallel_coordinate_fig = plot_parallel_coordinate(study)
parallel_coordinate_fig.show()

In [24]:
# Slice plot of the study's trials.
slice_fig = plot_slice(study)
slice_fig.show()

In [27]:
# Contour plot of the study's trials.
contour_fig = plot_contour(study)
contour_fig.show()