In [1]:
import pandas as pd

dataset = pd.read_csv("../datasets/penguins.csv")

feature_names = [
    "Culmen Length (mm)",
    "Culmen Depth (mm)",
    "Flipper Length (mm)",
]
target_name = "Body Mass (g)"

dataset = dataset[feature_names + [target_name]].dropna(axis="rows", how="any")
dataset = dataset.sample(frac=1, random_state=0).reset_index(drop=True)
data, target = dataset[feature_names], dataset[target_name]

In [2]:
data

Unnamed: 0,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm)
0,39.6,18.1,186.0
1,46.1,18.2,178.0
2,37.5,18.5,199.0
3,45.2,17.8,198.0
4,39.2,19.6,195.0
...,...,...,...
337,49.8,17.3,198.0
338,49.6,16.0,225.0
339,35.7,17.0,189.0
340,36.0,17.9,190.0


In [3]:
target

0      4450.0
1      3250.0
2      4475.0
3      3950.0
4      4675.0
        ...  
337    3675.0
338    5700.0
339    3350.0
340    3450.0
341    5000.0
Name: Body Mass (g), Length: 342, dtype: float64

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

rfg = RandomForestRegressor(random_state=0)
dtr = DecisionTreeRegressor(random_state=0)

In [5]:
from sklearn.model_selection import cross_validate

cv_results_1 = cross_validate(estimator=rfg, cv=10, X=data,
                              y=target, return_train_score=True)
cv_results_2 = cross_validate(estimator=dtr, cv=10, X=data,
                              y=target, return_train_score=True)

In [6]:
cv_results_1['test_score']

array([0.78812533, 0.77436104, 0.88871396, 0.84661427, 0.78043644,
       0.85771461, 0.79185662, 0.73543268, 0.76485281, 0.8077531 ])

In [7]:
cv_results_2['test_score']

array([0.59650558, 0.66957713, 0.7193112 , 0.8184209 , 0.6242818 ,
       0.72483157, 0.53090702, 0.51655398, 0.58421785, 0.49038086])

In [8]:
rfg_2 = RandomForestRegressor(n_estimators=5, random_state=0)
rfg_3 = RandomForestRegressor(n_estimators=100, random_state=0)

cv_results_3 = cross_validate(estimator=rfg_2, cv=10, X=data,
                              y=target, return_train_score=True)
cv_results_4 = cross_validate(estimator=rfg_3, cv=10, X=data,
                              y=target, return_train_score=True)

In [9]:
cv_results_3['test_score']

array([0.77235183, 0.65802718, 0.85849519, 0.82029064, 0.78921964,
       0.85126513, 0.77881189, 0.6144532 , 0.76453205, 0.7611934 ])

In [10]:
cv_results_4['test_score']

array([0.78812533, 0.77436104, 0.88871396, 0.84661427, 0.78043644,
       0.85771461, 0.79185662, 0.73543268, 0.76485281, 0.8077531 ])

In [11]:
n_estimators = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000]

In [12]:
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    "n_estimators": n_estimators
}

search_cv = RandomizedSearchCV(
    RandomForestRegressor(n_jobs=2), param_distributions=param_distributions,
    scoring="neg_mean_absolute_error", n_iter=10, random_state=0, n_jobs=2, 
    return_train_score=True
)
search_cv.fit(data, target)

columns = [f"param_{name}" for name in param_distributions.keys()]


In [13]:
columns = [f"param_{name}" for name in param_distributions.keys()]
columns += ["mean_test_error", "mean_train_score", "std_test_error"]
cv_results = pd.DataFrame(search_cv.cv_results_)
cv_results["mean_test_error"] = -cv_results["mean_test_score"]
cv_results["mean_train_score"] = -cv_results["mean_train_score"]
cv_results["std_test_error"] = cv_results["std_test_score"]
cv_results[columns].sort_values(by="param_n_estimators")

Unnamed: 0,param_n_estimators,mean_test_error,mean_train_score,std_test_error
0,1,382.791986,143.110946,43.737078
1,2,324.25991,131.249967,18.396486
2,5,295.015772,120.56822,37.322468
3,10,294.731458,111.594349,31.431481
4,20,278.044917,106.74496,24.008084
5,50,274.966752,104.490129,23.811106
6,100,280.880435,102.983869,23.114316
7,200,279.38831,103.386167,22.720891
8,500,276.659154,102.56288,20.447168
9,1000,277.749462,102.29823,21.558315


In [15]:
param_distributions = {
    "n_estimators": n_estimators
}

search_cv = RandomizedSearchCV(
    RandomForestRegressor(max_depth=5 , n_jobs=2), param_distributions=param_distributions,
    scoring="neg_mean_absolute_error", n_iter=10, random_state=0, n_jobs=2, 
    return_train_score=True
)
search_cv.fit(data, target)

columns = [f"param_{name}" for name in param_distributions.keys()]


In [16]:
columns = [f"param_{name}" for name in param_distributions.keys()]
columns += ["mean_test_error", "mean_train_score", "std_test_error"]
cv_results = pd.DataFrame(search_cv.cv_results_)
cv_results["mean_test_error"] = -cv_results["mean_test_score"]
cv_results["mean_train_score"] = -cv_results["mean_train_score"]
cv_results["std_test_error"] = cv_results["std_test_score"]
cv_results[columns].sort_values(by="param_n_estimators")

Unnamed: 0,param_n_estimators,mean_test_error,mean_train_score,std_test_error
0,1,319.251829,242.84074,48.854719
1,2,314.250655,215.912414,25.598985
2,5,280.862445,205.26546,22.492504
3,10,276.349883,198.451468,17.789959
4,20,276.404466,193.9295,16.053199
5,50,267.525916,191.744625,19.573069
6,100,270.328163,190.332217,19.290908
7,200,268.549508,190.637074,19.297756
8,500,267.694631,190.861939,18.528596
9,1000,267.544332,190.846748,18.567463
