In [2]:
import seaborn as sns
import pandas as pd


In [3]:
healthexp = sns.load_dataset('healthexp')

In [5]:
healthexp.head()

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
0,1970,Germany,252.311,70.6
1,1970,France,192.143,72.2
2,1970,Great Britain,123.993,71.9
3,1970,Japan,150.437,72.0
4,1970,USA,326.961,70.9


In [7]:
healthexp = pd.get_dummies(healthexp)

In [8]:
healthexp.head()

Unnamed: 0,Year,Spending_USD,Life_Expectancy,Country_Canada,Country_France,Country_Germany,Country_Great Britain,Country_Japan,Country_USA
0,1970,252.311,70.6,False,False,True,False,False,False
1,1970,192.143,72.2,False,True,False,False,False,False
2,1970,123.993,71.9,False,False,False,True,False,False
3,1970,150.437,72.0,False,False,False,False,True,False
4,1970,326.961,70.9,False,False,False,False,False,True


In [10]:
x = healthexp.drop(columns=['Life_Expectancy'], axis=1)

In [13]:
y = healthexp['Life_Expectancy']

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.20,random_state=42)

In [17]:
from sklearn.ensemble import RandomForestRegressor

In [18]:
rfr = RandomForestRegressor(random_state=34)

In [19]:
rfr.fit(x_train,y_train)

In [20]:
y_pred = rfr.predict(x_test)

In [21]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [22]:
mean_absolute_error(y_test,y_pred)

0.27123636363635395

In [23]:
mean_squared_error(y_test,y_pred)

0.11941941818180976

In [24]:
r2_score(y_test,y_pred)

0.990218403745801

# optuna hyperparameter tuning 

In [25]:
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [26]:
from sklearn.model_selection import cross_val_score

In [39]:
def objective(trial):
     n_estimators = trial.suggest_int('n_estimators', 100,1000)
     max_depth = trial.suggest_int('max_depth', 10,50)
     min_samples_split = trial.suggest_int('min_samples_split', 2,32)
     min_samples_leaf = trial.suggest_int('min_samples_leaf', 1,32)
     
     
     model = RandomForestRegressor(n_estimators=n_estimators, 
                                  max_depth = max_depth, 
                                 min_samples_split = min_samples_split,
                                min_samples_leaf = min_samples_leaf )
     
     score = cross_val_score(model, x_train,y_train,cv=5, scoring='neg_mean_squared_error', n_jobs=-1).mean()
     return score

In [40]:
study = optuna.create_study()

[I 2024-11-18 11:56:41,895] A new study created in memory with name: no-name-d1d89b53-80d5-4917-bc74-4e79d6707363


In [41]:
study.optimize(objective, n_trials=200)

[I 2024-11-18 11:56:57,935] Trial 0 finished with value: -2.5221402441587992 and parameters: {'n_estimators': 479, 'max_depth': 46, 'min_samples_split': 23, 'min_samples_leaf': 24}. Best is trial 0 with value: -2.5221402441587992.
[I 2024-11-18 11:56:59,246] Trial 1 finished with value: -2.9364949078447227 and parameters: {'n_estimators': 201, 'max_depth': 42, 'min_samples_split': 31, 'min_samples_leaf': 28}. Best is trial 1 with value: -2.9364949078447227.
[I 2024-11-18 11:57:04,226] Trial 2 finished with value: -2.101435432475344 and parameters: {'n_estimators': 736, 'max_depth': 11, 'min_samples_split': 29, 'min_samples_leaf': 13}. Best is trial 1 with value: -2.9364949078447227.
[I 2024-11-18 11:57:06,943] Trial 3 finished with value: -2.9313904784545897 and parameters: {'n_estimators': 435, 'max_depth': 38, 'min_samples_split': 13, 'min_samples_leaf': 28}. Best is trial 1 with value: -2.9364949078447227.
[I 2024-11-18 11:57:11,802] Trial 4 finished with value: -1.4422972239808816 

In [42]:
study.best_params

{'n_estimators': 101,
 'max_depth': 10,
 'min_samples_split': 22,
 'min_samples_leaf': 32}

In [43]:
best_params = study.best_params

In [50]:
import matplotlib.pyplot as plt
import plotly

In [53]:
best_n_estimators = best_params['n_estimators']
best_max_depth = best_params['max_depth']
best_min_samples_split = best_params['min_samples_split']
best_min_samples_leaf = best_params['min_samples_leaf']

In [54]:
best_model = RandomForestRegressor(n_estimators=best_n_estimators, max_depth=best_max_depth, min_samples_split=best_min_samples_split,min_samples_leaf=best_min_samples_leaf)



In [55]:
best_model.fit(x_train,y_train)

In [56]:
y_pred2 = best_model.predict(x_test)

In [57]:
r2_score(y_test,y_pred)

0.990218403745801