In [1]:
import pandas as pd 
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

import sklearn.datasets as data
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import r2_score, mean_squared_error,mean_absolute_error

In [2]:
# Load scikit-learn's bundled diabetes dataset.
# NOTE(review): despite its name, `df` is whatever `load_diabetes()` returns
# (read below through `.data` and `.target`) — presumably a Bunch-like
# container rather than a pandas DataFrame; confirm before treating it as one.
df = data.load_diabetes()

In [3]:
# Feature matrix and regression target, pulled from the loaded dataset object.
X, y = df.data, df.target

In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
# across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

In [7]:
# Baseline ensembles with default hyperparameters.
# Both estimators are stochastic, so each one is seeded: without a fixed
# random_state the fitted models — and every score derived from them,
# including the grid searches that clone these estimators — change on each
# kernel restart and cannot be reproduced.
rfr = RandomForestRegressor(random_state=42)
gbr = GradientBoostingRegressor(random_state=42)

In [8]:
# Fit the baseline random forest on the training split.
rfr.fit(X_train, y_train)

In [9]:
# Fit the baseline gradient-boosting model on the same training split.
gbr.fit(X_train, y_train)

In [10]:
# Baseline predictions on the held-out test split (random forest, then
# gradient boosting).
y_pred_r, y_pred_g = rfr.predict(X_test), gbr.predict(X_test)

In [11]:
# Baseline R² on the held-out test split, one line per model.
for _label, _preds in (("RFR R2 Score: ", y_pred_r), ("GBR R2 Score: ", y_pred_g)):
    print(_label, r2_score(y_test, _preds))

RFR R2 Score:  0.3052342666735628
GBR R2 Score:  0.2632786892776703


In [12]:
# Baseline mean squared error on the held-out test split, one line per model.
for _label, _preds in (("MSE RF  Score: ", y_pred_r), ("MSE GBR  Score: ", y_pred_g)):
    print(_label, mean_squared_error(y_test, _preds))

MSE RF  Score:  3846.224853932584
MSE GBR  Score:  4078.4910363312974


In [13]:
# Baseline mean absolute error on the held-out test split, one line per model.
for _label, _preds in (("MAE RF  Score: ", y_pred_r), ("MAE GBR  Score: ", y_pred_g)):
    print(_label, mean_absolute_error(y_test, _preds))

MAE RF  Score:  50.12696629213483
MAE GBR  Score:  52.3322724595972


In [20]:
# Hyperparameter tuning: grid-search space for the random forest.
# Every combination of the lists below is evaluated.
param_grid_rf = dict(
    n_estimators=list(range(50, 251, 50)),
    min_samples_split=[2, 4, 8, 10, 15],
    min_samples_leaf=[2, 5, 7, 9],
    max_depth=[10, 15, 20],
    criterion=["squared_error"],
)

In [21]:
# Grid-search space for the gradient-boosting model.
# Every combination of the lists below is evaluated.
param_grid_gbr = dict(
    n_estimators=list(range(50, 251, 50)),
    min_samples_split=[2, 4, 8, 10, 15],
    min_samples_leaf=[2, 5, 7, 9],
    max_depth=[2, 3, 5, 8, 10],
    criterion=["friedman_mse"],
)

In [22]:
# Cross-validated grid search for the random forest: 5 folds, with
# n_jobs=-1 to run the fits in parallel.
CV_rf = GridSearchCV(
    estimator=rfr,
    param_grid=param_grid_rf,
    cv=5,
    n_jobs=-1,
)

In [25]:
# Cross-validated grid search for gradient boosting. Note this uses 3 folds,
# whereas the random-forest search uses 5.
CV_gbr = GridSearchCV(
    estimator=gbr,
    param_grid=param_grid_gbr,
    cv=3,
    n_jobs=-1,
)

In [24]:
# Run the random-forest grid search on the training split only; the test
# split is not passed to the search.
CV_rf.fit(X_train, y_train)

  _data = np.array(data, dtype=dtype, copy=copy,


In [26]:
# Run the gradient-boosting grid search on the training split only; the test
# split is not passed to the search.
CV_gbr.fit(X_train, y_train)

In [27]:
# Test-split predictions from the tuned models. With the default refit=True,
# a fitted GridSearchCV predicts through its best_estimator_, so calling
# predict on the search object is the same as calling it on that estimator.
y_pred_rf2 = CV_rf.predict(X_test)
y_pred_gbr2 = CV_gbr.predict(X_test)

In [29]:
# R² of the grid-search-tuned models on the held-out test split.
for _label, _preds in (("RF R2 Score", y_pred_rf2), ("GBR R2 Score", y_pred_gbr2)):
    print(_label, r2_score(y_test, _preds))

RF R2 Score 0.35048118927860983
GBR R2 Score 0.36365814955690645


In [30]:
# Mean squared error of the grid-search-tuned models on the held-out test split.
for _label, _preds in (("RF MSE Score", y_pred_rf2), ("GBR MSE Score", y_pred_gbr2)):
    print(_label, mean_squared_error(y_test, _preds))

RF MSE Score 3595.737776145563
GBR MSE Score 3522.7900907739618


In [31]:
# Mean absolute error of the grid-search-tuned models on the held-out test
# split. The labels say MAE because the values come from
# mean_absolute_error; they previously read "MSE", which mislabeled them.
print("RF MAE Score", mean_absolute_error(y_test, y_pred_rf2))
print("GBR MAE Score", mean_absolute_error(y_test, y_pred_gbr2))

RF MSE Score 49.355464955031
GBR MSE Score 49.170286237362276


Hyperparameter optimization with Optuna

In [5]:
import optuna
import plotly
from sklearn.model_selection import cross_val_score

In [6]:
def objective(trial):
    """Optuna objective: cross-validated score of a random forest.

    Draws the forest's hyperparameters from ``trial``, evaluates the model
    with 5-fold cross-validation on ``X_train`` / ``y_train`` and returns the
    mean ``neg_mean_squared_error`` across folds (closer to zero is better,
    so the study should maximize it).

    If cross-validation raises, the error is printed and ``-inf`` is
    returned so the trial still yields a value.
    """
    # Sampling order is kept fixed (dict literals evaluate top to bottom):
    # with a seeded sampler, the order of suggest_* calls determines which
    # random draw each parameter receives.
    hyperparams = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 10, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 32),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 32),
    }
    forest = RandomForestRegressor(random_state=42, **hyperparams)

    try:
        fold_scores = cross_val_score(
            forest,
            X_train,
            y_train,
            cv=5,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
        )
        return np.mean(fold_scores)
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return float('-inf')

In [7]:
# Study that maximizes the objective's return value. The random sampler is
# seeded so the sequence of sampled hyperparameters is repeatable.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.RandomSampler(seed=42),
)

[I 2024-09-12 12:01:13,015] A new study created in memory with name: no-name-0b869183-6289-4203-acfe-df0d9d1f0864


In [8]:
# Run 100 trials of `objective`; each trial cross-validates one sampled
# random-forest configuration.
study.optimize(objective, n_trials=100)

[I 2024-09-12 12:01:18,572] Trial 0 finished with value: -3316.2652197835937 and parameters: {'n_estimators': 437, 'max_depth': 48, 'min_samples_split': 24, 'min_samples_leaf': 20}. Best is trial 0 with value: -3316.2652197835937.
[I 2024-09-12 12:01:20,996] Trial 1 finished with value: -3368.476407719507 and parameters: {'n_estimators': 240, 'max_depth': 16, 'min_samples_split': 3, 'min_samples_leaf': 28}. Best is trial 0 with value: -3316.2652197835937.
[I 2024-09-12 12:01:22,050] Trial 2 finished with value: -3424.485663215733 and parameters: {'n_estimators': 641, 'max_depth': 39, 'min_samples_split': 2, 'min_samples_leaf': 32}. Best is trial 0 with value: -3316.2652197835937.
[I 2024-09-12 12:01:23,557] Trial 3 finished with value: -3232.8546247967497 and parameters: {'n_estimators': 850, 'max_depth': 18, 'min_samples_split': 7, 'min_samples_leaf': 6}. Best is trial 3 with value: -3232.8546247967497.
[I 2024-09-12 12:01:24,192] Trial 4 finished with value: -3227.263529525083 and pa

In [9]:
# Hyperparameters and objective value of the best trial in the study.
best_params, best_score = study.best_params, study.best_value

In [10]:
# Show the winning configuration and its objective value, one per line.
print(best_params, best_score, sep="\n")

{'n_estimators': 106, 'max_depth': 30, 'min_samples_split': 14, 'min_samples_leaf': 8}
-3188.007294684348


In [11]:
# Optuna's optimization-history plot for the study; the figure is shown as
# the cell output.
optuna.visualization.plot_optimization_history(study)

In [12]:
# Optuna's parallel-coordinate plot of the study's trials; the figure is
# shown as the cell output.
optuna.visualization.plot_parallel_coordinate(study)

In [13]:
# Optuna's hyperparameter-importance plot for the study; the figure is shown
# as the cell output.
optuna.visualization.plot_param_importances(study)

In [14]:
# Optuna's slice plot, restricted to the four hyperparameters sampled in
# `objective`.
optuna.visualization.plot_slice(
    study,
    params=[
        'n_estimators',
        'max_depth',
        'min_samples_leaf',
        'min_samples_split',
    ],
)

In [15]:
# Unpack the best trial's hyperparameters into individual names.
best_n_estimators, best_max_depth, best_min_samples_split, best_min_samples_leaf = (
    best_params[key]
    for key in ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf')
)

In [16]:
# Refit the Optuna-selected forest on the whole training split and predict
# the held-out test split.
# random_state=42 matches the seed used inside `objective`: the
# hyperparameters were selected for a forest built with that seed, and
# without it the final model (and its reported metrics) would change on
# every run.
best_model = RandomForestRegressor(
    n_estimators=best_n_estimators,
    max_depth=best_max_depth,
    min_samples_split=best_min_samples_split,
    min_samples_leaf=best_min_samples_leaf,
    random_state=42,
)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

In [17]:
# Test-split metrics for the Optuna-tuned forest, printed in this order:
# mean absolute error, mean squared error, R².
for _metric in (mean_absolute_error, mean_squared_error, r2_score):
    print(_metric(y_test, y_pred))

49.092602087696065
3501.218962659991
0.36755466658644453
