In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split,cross_val_score,KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_squared_error

import sys
import optuna
#!{sys.executable} -m pip install optuna

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df=sns.load_dataset('healthexp')
df.shape

(274, 4)

In [3]:
df.head()

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
0,1970,Germany,252.311,70.6
1,1970,France,192.143,72.2
2,1970,Great Britain,123.993,71.9
3,1970,Japan,150.437,72.0
4,1970,USA,326.961,70.9


In [4]:
df=pd.get_dummies(df,columns=['Country'],drop_first=True,dtype=int)
df

Unnamed: 0,Year,Spending_USD,Life_Expectancy,Country_France,Country_Germany,Country_Great Britain,Country_Japan,Country_USA
0,1970,252.311,70.6,0,1,0,0,0
1,1970,192.143,72.2,1,0,0,0,0
2,1970,123.993,71.9,0,0,1,0,0
3,1970,150.437,72.0,0,0,0,1,0
4,1970,326.961,70.9,0,0,0,0,1
...,...,...,...,...,...,...,...,...
269,2020,6938.983,81.1,0,1,0,0,0
270,2020,5468.418,82.3,1,0,0,0,0
271,2020,5018.700,80.4,0,0,1,0,0
272,2020,4665.641,84.7,0,0,0,1,0


In [5]:
X=df.drop(columns=['Life_Expectancy'])
Y=df['Life_Expectancy']

In [6]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=10)

In [7]:
model=RandomForestRegressor()

In [8]:
model.fit(X_train,Y_train)

In [9]:
y_pred=model.predict(X_test)

In [10]:
print("Hold out approach r2 score:",r2_score(Y_test,y_pred))
print("Hold out approach mse:",mean_squared_error(Y_test,y_pred))

Hold out approach r2 score: 0.982408609686565
Hold out approach mse: 0.17648869090908742


In [11]:
#kfold:
kfold=KFold(n_splits=5,shuffle=True,random_state=10)
score=cross_val_score(model,X_train,Y_train,cv=kfold,scoring="r2")
print(score.mean())

0.9752864946010051


In [12]:
def objective(trail):
    n_estimators=trail.suggest_int('n_estimators',50,1000)
    max_depth=trail.suggest_int('max_depth',10,50)
    min_samples_split=trail.suggest_int('min_samples_split',2,32)
    min_samples_leaf=trail.suggest_int('min_samples_leaf',1,32)

    model=RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf
    )

    scores=cross_val_score(model,X_train,Y_train,cv=5,scoring="r2")

    return scores.mean()

In [13]:
study=optuna.create_study(direction='maximize')
study.optimize(objective,n_trials=50)

[I 2025-08-12 18:42:10,811] A new study created in memory with name: no-name-0d4409e6-1032-4432-b509-761d02f94b48
[I 2025-08-12 18:42:12,430] Trial 0 finished with value: 0.8222551672220055 and parameters: {'n_estimators': 483, 'max_depth': 31, 'min_samples_split': 15, 'min_samples_leaf': 13}. Best is trial 0 with value: 0.8222551672220055.
[I 2025-08-12 18:42:15,406] Trial 1 finished with value: 0.8505543683424849 and parameters: {'n_estimators': 859, 'max_depth': 13, 'min_samples_split': 17, 'min_samples_leaf': 10}. Best is trial 1 with value: 0.8505543683424849.
[I 2025-08-12 18:42:16,339] Trial 2 finished with value: 0.8531248471673187 and parameters: {'n_estimators': 273, 'max_depth': 12, 'min_samples_split': 25, 'min_samples_leaf': 9}. Best is trial 2 with value: 0.8531248471673187.
[I 2025-08-12 18:42:17,137] Trial 3 finished with value: 0.6943996051281287 and parameters: {'n_estimators': 253, 'max_depth': 49, 'min_samples_split': 26, 'min_samples_leaf': 30}. Best is trial 2 wit

In [14]:
study.best_params

{'n_estimators': 77,
 'max_depth': 19,
 'min_samples_split': 2,
 'min_samples_leaf': 1}