In [1]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np

In [2]:
# Fetch the California housing data through scikit-learn and assemble one
# frame: a column per feature, with the regression target appended last.
housing = fetch_california_housing()

housing_df = pd.DataFrame(
    housing.data,
    columns=housing.feature_names,
).assign(target=housing.target)
housing_df.head()

# Working from a local copy of the dataset? Load it in place of the fetch above:
# housing_df = pd.read_csv('path_to_california_housing.csv')
# housing_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [3]:
# Separate the predictors from the value being predicted.
X = housing_df.drop('target', axis=1)
y = housing_df['target']

# Hold out 20% of the rows for testing. The split is seeded so that
# Restart & Run All reproduces the same partition -- and therefore the same
# metrics -- instead of reshuffling the rows on every execution.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [11]:
# Baseline: a random forest left on its default hyperparameters (seed fixed).
mdl = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Predict the held-out rows.
y_preds_regressor = mdl.predict(X_test)

# Report all three metrics in a single call, one per line.
print(
    'Regression metrics on the test set',
    f"R2 score : {r2_score(y_test,y_preds_regressor)}",
    f"MAE : {mean_absolute_error(y_test,y_preds_regressor)}",
    f"MSE : {mean_squared_error(y_test,y_preds_regressor)}",
    sep='\n',
)


Regression metrics on the test set
R2 score : 0.8074577125685922
MAE : 0.33189567594476765
MSE : 0.2593172828839291


In [17]:
# Hand-picked configuration: 150 trees, depth capped at 20, larger minimum
# split/leaf sizes, and log2 feature subsampling. Rebinds `mdl`.
mdl = RandomForestRegressor(
    n_estimators=150,
    max_depth=20,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features='log2',
    random_state=42,
).fit(X_train, y_train)

# Predict the held-out rows.
y_preds_regressor = mdl.predict(X_test)

# Report all three metrics in a single call, one per line.
print(
    'Regression metrics on the test set',
    f"R2 score : {r2_score(y_test,y_preds_regressor)}",
    f"MAE : {mean_absolute_error(y_test,y_preds_regressor)}",
    f"MSE : {mean_squared_error(y_test,y_preds_regressor)}",
    sep='\n',
)


Regression metrics on the test set
R2 score : 0.8119463764619695
MAE : 0.33502558266572735
MSE : 0.25327191934255916
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   3.8s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   3.8s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   3.8s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   1.9s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   3.8s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   3.9s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   3.8s
[CV] END max_depth=10, max_features=log2, mi

In [12]:
# Candidate values the randomized search samples from.
param_dist = dict(
    n_estimators=[50, 100, 200],
    max_features=['sqrt', 'log2'],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Seeded base estimator handed to the search.
rf = RandomForestRegressor(random_state=42)

# Sample 50 configurations, score each with 5-fold CV on the training split,
# and run the fits on all available cores.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Winning configuration and its cross-validated score.
print("Best parameters found: ", random_search.best_params_)
print("Best score found: ", random_search.best_score_)

# Score the refitted winner on the held-out split.
best_rf_random = random_search.best_estimator_
y_preds_random = best_rf_random.predict(X_test)

print(
    'Randomized Search Regression metrics on the test set',
    f"R2 score : {r2_score(y_test, y_preds_random)}",
    f"MAE : {mean_absolute_error(y_test, y_preds_random)}",
    f"MSE : {mean_squared_error(y_test, y_preds_random)}",
    sep='\n',
)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   2.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   2.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   5.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   2.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   4.4s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   2.4s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   2.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_es

In [13]:
# Exhaustive grid: 2 values for each of 5 hyperparameters -> 32 candidates.
param_grid = {
    'n_estimators': [50, 100],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# The search gets its own seeded estimator instead of borrowing `rf` from the
# randomized-search cell, so this cell runs on a fresh kernel without first
# executing that cell. The search clones the estimator it is given, so the
# outcome is the same as passing an equivalent unfitted instance.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    verbose=2,
    n_jobs=-1
)

# Evaluate every candidate with 5-fold CV on the training split.
grid_search.fit(X_train, y_train)

# Winning configuration and its mean cross-validated score (no `scoring` is
# passed, so the estimator's default scorer is used).
print("Best parameters found: ", grid_search.best_params_)
print("Best score found: ", grid_search.best_score_)

# Score the refitted winner on the held-out split.
best_rf_grid = grid_search.best_estimator_
y_preds_grid = best_rf_grid.predict(X_test)

print('Grid Search Regression metrics on the test set')
print(f"R2 score : {r2_score(y_test, y_preds_grid)}")
print(f"MAE : {mean_absolute_error(y_test, y_preds_grid)}")
print(f"MSE : {mean_squared_error(y_test, y_preds_grid)}")


Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   5.5s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   2.8s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   3.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   3.7s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   7.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=  12.5s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=  10.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_e