# Hyperparameter Tuning with Optuna

In [37]:
import pandas as pd
import seaborn as sns

from sklearn import metrics
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor, VotingRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import SVR

import optuna

In [38]:
df = sns.load_dataset('healthexp')

## Data Exploration

In [39]:
df.head()

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
0,1970,Germany,252.311,70.6
1,1970,France,192.143,72.2
2,1970,Great Britain,123.993,71.9
3,1970,Japan,150.437,72.0
4,1970,USA,326.961,70.9


In [40]:
df.shape

(274, 4)

In [41]:
df.describe()

Unnamed: 0,Year,Spending_USD,Life_Expectancy
count,274.0,274.0,274.0
mean,1996.992701,2789.338905,77.909489
std,14.180933,2194.939785,3.276263
min,1970.0,123.993,70.6
25%,1985.25,1038.357,75.525
50%,1998.0,2295.578,78.1
75%,2009.0,4055.61,80.575
max,2020.0,11859.179,84.7


## Model Preparation

### Create Binary Numbers for Catagorical Column

In [42]:
df = pd.get_dummies(df)

In [43]:
df

Unnamed: 0,Year,Spending_USD,Life_Expectancy,Country_Canada,Country_France,Country_Germany,Country_Great Britain,Country_Japan,Country_USA
0,1970,252.311,70.6,0,0,1,0,0,0
1,1970,192.143,72.2,0,1,0,0,0,0
2,1970,123.993,71.9,0,0,0,1,0,0
3,1970,150.437,72.0,0,0,0,0,1,0
4,1970,326.961,70.9,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
269,2020,6938.983,81.1,0,0,1,0,0,0
270,2020,5468.418,82.3,0,1,0,0,0,0
271,2020,5018.700,80.4,0,0,0,1,0,0
272,2020,4665.641,84.7,0,0,0,0,1,0


### Assign the Features into A Set 'X' and Assign the Target Value to 'Y'

In [44]:
X = df.drop(['Life_Expectancy'], axis =1)

In [45]:
y = df['Life_Expectancy']

### Split the Data Between Training & Set

In [80]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2 , random_state = 54)

### Model Results Predection

In [81]:
def modelresults(predictions):
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    
    print('Mean absolute error on model is {:.4f}'.format(mae))
    print('')
    print('Mean squared error on model is {:.4f}'.format(mse))
    print('')
    print('The r2 score on model is {:.4f}'.format(r2))

### Random Forest Regressor Model

In [82]:
rfr = RandomForestRegressor(random_state = 34)

In [83]:
rfr.fit(X_train, y_train)

RandomForestRegressor(random_state=34)

In [84]:
y_pred = rfr.predict(X_test)

In [85]:
modelresults(y_pred)

Mean absolute error on model is 0.3114

Mean squared error on model is 0.1553

The r2 score on model is 0.9836


The results for this model is great. but for practice purposes, we will adapt the **Optuna model** in order to imporve the results by modifying the hyper parameters.

## Optuna Model

In [133]:
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 10, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 32)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 32)
    max_features = trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2'])
    criterion = trial.suggest_categorical('criterion', ['mse', 'mae'])
    
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        criterion=criterion
    )

    score = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error', n_jobs= -1)
    score = score.mean()  # Return the mean of the neg_mean_squared_error values as a single float
    
    return score
    
    

In [134]:
study = optuna.create_study(direction = 'maximize',
                           sampler = optuna.samplers.RandomSampler(seed=42)
)

[I 2023-11-06 20:03:14,547] A new study created in memory with name: no-name-24e0009f-9442-4135-99ac-ecf66fcba9dd


In [135]:
%%time
study.optimize(objective, n_trials = 200)

[I 2023-11-06 20:03:16,347] Trial 0 finished with value: -2.3082287521521803 and parameters: {'n_estimators': 437, 'max_depth': 48, 'min_samples_split': 24, 'min_samples_leaf': 20, 'max_features': 'auto', 'criterion': 'mse'}. Best is trial 0 with value: -2.3082287521521803.
[I 2023-11-06 20:03:17,987] Trial 1 finished with value: -2.758021048664968 and parameters: {'n_estimators': 737, 'max_depth': 10, 'min_samples_split': 32, 'min_samples_leaf': 27, 'max_features': 'auto', 'criterion': 'mae'}. Best is trial 0 with value: -2.3082287521521803.
[I 2023-11-06 20:03:18,955] Trial 2 finished with value: -0.9776709996975494 and parameters: {'n_estimators': 489, 'max_depth': 21, 'min_samples_split': 20, 'min_samples_leaf': 5, 'max_features': 'log2', 'criterion': 'mse'}. Best is trial 2 with value: -0.9776709996975494.
[I 2023-11-06 20:03:20,033] Trial 3 finished with value: -2.992133019586524 and parameters: {'n_estimators': 563, 'max_depth': 34, 'min_samples_split': 3, 'min_samples_leaf': 20

[I 2023-11-06 20:03:49,384] Trial 30 finished with value: -5.025141198101897 and parameters: {'n_estimators': 829, 'max_depth': 43, 'min_samples_split': 28, 'min_samples_leaf': 30, 'max_features': 'log2', 'criterion': 'mae'}. Best is trial 26 with value: -0.3896128260330344.
[I 2023-11-06 20:03:51,223] Trial 31 finished with value: -2.5731346367242622 and parameters: {'n_estimators': 817, 'max_depth': 46, 'min_samples_split': 12, 'min_samples_leaf': 13, 'max_features': 'sqrt', 'criterion': 'mae'}. Best is trial 26 with value: -0.3896128260330344.
[I 2023-11-06 20:03:52,667] Trial 32 finished with value: -0.2813504827848248 and parameters: {'n_estimators': 358, 'max_depth': 34, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'auto', 'criterion': 'mae'}. Best is trial 32 with value: -0.2813504827848248.
[I 2023-11-06 20:03:53,554] Trial 33 finished with value: -0.29798421198328606 and parameters: {'n_estimators': 294, 'max_depth': 35, 'min_samples_split': 4, 'min_samples_l

[I 2023-11-06 20:04:27,454] Trial 60 finished with value: -2.4000627397102177 and parameters: {'n_estimators': 959, 'max_depth': 34, 'min_samples_split': 9, 'min_samples_leaf': 22, 'max_features': 'auto', 'criterion': 'mse'}. Best is trial 32 with value: -0.2813504827848248.
[I 2023-11-06 20:04:28,957] Trial 61 finished with value: -3.504419819343667 and parameters: {'n_estimators': 795, 'max_depth': 31, 'min_samples_split': 28, 'min_samples_leaf': 18, 'max_features': 'sqrt', 'criterion': 'mse'}. Best is trial 32 with value: -0.2813504827848248.
[I 2023-11-06 20:04:30,474] Trial 62 finished with value: -1.2248642139307016 and parameters: {'n_estimators': 780, 'max_depth': 35, 'min_samples_split': 23, 'min_samples_leaf': 7, 'max_features': 'log2', 'criterion': 'mse'}. Best is trial 32 with value: -0.2813504827848248.
[I 2023-11-06 20:04:31,725] Trial 63 finished with value: -2.1692761147622597 and parameters: {'n_estimators': 494, 'max_depth': 47, 'min_samples_split': 12, 'min_samples_l

[I 2023-11-06 20:05:05,609] Trial 90 finished with value: -3.1347505361927697 and parameters: {'n_estimators': 987, 'max_depth': 44, 'min_samples_split': 5, 'min_samples_leaf': 30, 'max_features': 'auto', 'criterion': 'mse'}. Best is trial 32 with value: -0.2813504827848248.
[I 2023-11-06 20:05:06,572] Trial 91 finished with value: -1.6753567897803023 and parameters: {'n_estimators': 402, 'max_depth': 42, 'min_samples_split': 2, 'min_samples_leaf': 11, 'max_features': 'log2', 'criterion': 'mae'}. Best is trial 32 with value: -0.2813504827848248.
[I 2023-11-06 20:05:08,321] Trial 92 finished with value: -2.208102453584775 and parameters: {'n_estimators': 764, 'max_depth': 28, 'min_samples_split': 8, 'min_samples_leaf': 15, 'max_features': 'log2', 'criterion': 'mae'}. Best is trial 32 with value: -0.2813504827848248.
[I 2023-11-06 20:05:09,429] Trial 93 finished with value: -0.9105755589650741 and parameters: {'n_estimators': 426, 'max_depth': 33, 'min_samples_split': 21, 'min_samples_le

[I 2023-11-06 20:05:41,371] Trial 120 finished with value: -3.008119554680834 and parameters: {'n_estimators': 808, 'max_depth': 13, 'min_samples_split': 14, 'min_samples_leaf': 29, 'max_features': 'auto', 'criterion': 'mae'}. Best is trial 32 with value: -0.2813504827848248.
[I 2023-11-06 20:05:41,974] Trial 121 finished with value: -2.2961196247872016 and parameters: {'n_estimators': 308, 'max_depth': 48, 'min_samples_split': 22, 'min_samples_leaf': 20, 'max_features': 'auto', 'criterion': 'mse'}. Best is trial 32 with value: -0.2813504827848248.
[I 2023-11-06 20:05:43,590] Trial 122 finished with value: -5.209303801844797 and parameters: {'n_estimators': 802, 'max_depth': 24, 'min_samples_split': 3, 'min_samples_leaf': 32, 'max_features': 'log2', 'criterion': 'mae'}. Best is trial 32 with value: -0.2813504827848248.
[I 2023-11-06 20:05:45,066] Trial 123 finished with value: -4.6138590452807815 and parameters: {'n_estimators': 783, 'max_depth': 38, 'min_samples_split': 6, 'min_sample

[I 2023-11-06 20:06:13,944] Trial 150 finished with value: -3.9432081242224384 and parameters: {'n_estimators': 881, 'max_depth': 33, 'min_samples_split': 15, 'min_samples_leaf': 24, 'max_features': 'log2', 'criterion': 'mse'}. Best is trial 32 with value: -0.2813504827848248.
[I 2023-11-06 20:06:15,268] Trial 151 finished with value: -4.221370772304575 and parameters: {'n_estimators': 633, 'max_depth': 47, 'min_samples_split': 8, 'min_samples_leaf': 20, 'max_features': 'sqrt', 'criterion': 'mae'}. Best is trial 32 with value: -0.2813504827848248.
[I 2023-11-06 20:06:15,789] Trial 152 finished with value: -1.4983678913745255 and parameters: {'n_estimators': 261, 'max_depth': 19, 'min_samples_split': 32, 'min_samples_leaf': 6, 'max_features': 'auto', 'criterion': 'mse'}. Best is trial 32 with value: -0.2813504827848248.
[I 2023-11-06 20:06:16,885] Trial 153 finished with value: -1.3860204767687865 and parameters: {'n_estimators': 563, 'max_depth': 24, 'min_samples_split': 20, 'min_sampl

[I 2023-11-06 20:06:48,986] Trial 180 finished with value: -6.3438495916334166 and parameters: {'n_estimators': 519, 'max_depth': 12, 'min_samples_split': 19, 'min_samples_leaf': 31, 'max_features': 'sqrt', 'criterion': 'mse'}. Best is trial 32 with value: -0.2813504827848248.
[I 2023-11-06 20:06:49,937] Trial 181 finished with value: -4.88323290634343 and parameters: {'n_estimators': 505, 'max_depth': 41, 'min_samples_split': 12, 'min_samples_leaf': 22, 'max_features': 'sqrt', 'criterion': 'mse'}. Best is trial 32 with value: -0.2813504827848248.
[I 2023-11-06 20:06:51,810] Trial 182 finished with value: -5.902734708567559 and parameters: {'n_estimators': 994, 'max_depth': 48, 'min_samples_split': 18, 'min_samples_leaf': 27, 'max_features': 'sqrt', 'criterion': 'mse'}. Best is trial 32 with value: -0.2813504827848248.
[I 2023-11-06 20:06:53,960] Trial 183 finished with value: -1.0056676442958232 and parameters: {'n_estimators': 844, 'max_depth': 42, 'min_samples_split': 23, 'min_sampl

Wall time: 3min 58s


In [136]:
study.best_params

{'n_estimators': 358,
 'max_depth': 34,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'criterion': 'mae'}

In [137]:
best_params = study.best_params

## Optuna Visualizations

### Optimization History Plot

In [138]:
optuna.visualization.plot_optimization_history(study)


The Optimization History Plot can observe how the search for **hyperparameters evolved.** The plot helps in understanding whether the optimization process improved the objective function over time or if it converged quickly to a good solution. It can be useful for making decisions about the effectiveness of your hyperparameter optimization approach, such as whether you need to conduct more trials or if you've already reached a satisfactory result.

Accoroding to our graph, our model reached its best value by the 33rd trial, where after that trail the objective values stayed the same.

In summary, the plot illustrates the **progress of our hyperparameter optimization** study by showing how the objective function value changes as you conduct more trials. This can provide valuable insights into the **efficiency and effectiveness of your optimization process.**







### Parallel Coordinate Plot

Parallel Coordinate Plot is used to generate a parallel coordinates plot, which is a type of visualization that helps to understand the **relationships between hyperparameters and their corresponding objective function values** in a hyperparameter optimization study.

this is a useful tool for gaining insights into **the relationships between hyperparameters and their impact on the objective function during an optimization study.** It helps us to **identify promising hyperparameter configurations** and understand **the trade-offs and interactions between different hyperparameters.** This can aid in making decisions on which hyperparameter settings to prioritize for further optimization or model tuning.

In [139]:
optuna.visualization.plot_parallel_coordinate(study)

The plot illustrates serval important aspects:

1) **Hyperparameter Relationships:** Each vertical axis represents a hyperparameter, and the horizontal lines connecting different axes illustrate how the values of these hyperparameters relate to each other during the optimization process. You can see how certain hyperparameter values are chosen together or in relation to one another.

2. **Objective Function Values:** The color and thickness of each line segment in the plot represent the objective function value for a specific trial or combination of hyperparameters. Darker, thicker lines often indicate better objective function values, while lighter, thinner lines correspond to poorer results.

3. **Optimal Configurations:** By visually inspecting the plot, we can identify regions of the plot where the lines converge, indicating successful configurations of hyperparameters that led to good objective function values. This helps you find optimal or promising combinations of hyperparameters.

4. **Divergence and Exploration:** The spread of lines across the plot can also indicate how widely you explored the hyperparameter search space. Tight clusters suggest that the search was focused, while scattered lines may indicate that the search space was thoroughly explored.

### Slice Plot

These Plots provide insights into **the relationship between specific hyperparameters and the objective function value.** It visualizes how a particular hyperparameter or a combination of hyperparameters affects the optimization process.

In [140]:
optuna.visualization.plot_slice(study, params = ['n_estimators',
                                                 'max_depth',
                                                 'min_samples_split',
                                                 'min_samples_leaf',
                                                 'max_features',
                                                 'criterion',])

### Hyperparameter Importances

This plot can provide valuable insights into **which hyperparameters had the most significant impact on the objective function** and the overall model performance. 

In [141]:
optuna.visualization.plot_param_importances(study)

### Assigning the Best Hyperparameters

In [142]:
best_n_estimators = best_params['n_estimators']
best_n_estimators

358

In [143]:
best_max_depth = best_params['max_depth']
best_max_depth


34

In [144]:
best_min_samples_split = best_params['min_samples_split']
best_min_samples_split

2

In [145]:
best_min_samples_leaf = best_params['min_samples_leaf']
best_min_samples_leaf

2

In [146]:
best_max_features = best_params['max_features']
best_max_features

'auto'

In [147]:
best_criterion = best_params['criterion']
best_criterion

'mae'

## Ramdom Forest Model with The Best HyperParatmeters

In [148]:
best_model = RandomForestRegressor( n_estimators = best_n_estimators,
                                  max_depth = best_max_depth,
                                  min_samples_split = best_min_samples_split,
                                  min_samples_leaf =  best_min_samples_leaf,
                                  )

In [149]:
%%time
best_model.fit(X_train, y_train)

Wall time: 377 ms


RandomForestRegressor(max_depth=34, min_samples_leaf=2, n_estimators=358)

In [150]:
y_pred = best_model.predict(X_test)

In [151]:
modelresults(y_pred)

Mean absolute error on model is 0.3761

Mean squared error on model is 0.2073

The r2 score on model is 0.9781


We achieved a better MAE & MSE scores by tuning our hyperparameters with the Optuna model. However, the r2 score is not as high as the previous models we used for this dataset.