[Reference](https://medium.com/@roiyeho/random-forests-98892261dc49)

In [7]:
import numpy as np

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Load the Iris dataset; the feature matrix is restricted to its first
# two columns, while the target vector is used unchanged.
iris = load_iris()
X, y = iris.data[:, :2], iris.target

In [8]:
# Hold out part of the data for evaluation (test_size is left at the
# library default). The seed is fixed so the very same rows land in the
# train and test sets on every Restart & Run All; without it the
# accuracies reported below change from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [9]:
# Fit a random forest on the training split.
# n_jobs=-1 asks scikit-learn to use all available cores.
# random_state pins the forest's internal randomness so the fitted model,
# and therefore the scores computed from it, are reproducible.
clf = RandomForestClassifier(n_jobs=-1, random_state=42)
clf.fit(X_train, y_train)

In [5]:
# Score the fitted classifier on both splits, then report each value
# formatted to four decimal places.
train_acc = clf.score(X_train, y_train)
test_acc = clf.score(X_test, y_test)

print(f'Training set accuracy: {train_acc:.4f}')
print(f'Test set accuracy: {test_acc:.4f}')

Training set accuracy: 0.9286
Test set accuracy: 0.7368


In [10]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each hyperparameter; RandomizedSearchCV samples
# n_iter combinations from these lists/arrays instead of trying them all.
param_grid = {
    'n_estimators': [10, 50, 100, 200, 500],
    'max_depth': np.arange(3, 10),            # integers 3..9
    'max_features': ['sqrt', 'log2', None],
    'max_samples': np.linspace(0.5, 1.0, 6)   # 0.5, 0.6, ..., 1.0
}

# Two independent sources of randomness are seeded here:
#   * the estimator, so every candidate forest is built deterministically;
#   * the search itself, so the same 25 candidates are sampled each run.
# Without both seeds, best_params_ can differ on every execution.
grid = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    n_iter=25,
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=42,
)
grid.fit(X_train, y_train)

print(grid.best_params_)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
{'n_estimators': 50, 'max_samples': 0.8, 'max_features': 'sqrt', 'max_depth': 4}


In [11]:
from sklearn.datasets import fetch_california_housing

# Fetch the California housing dataset and unpack the pieces used below.
# NOTE: X and y are rebound here, replacing the arrays from the earlier cells.
data = fetch_california_housing()
X = data.data
y = data.target
feature_names = data.feature_names

In [12]:
from sklearn.model_selection import train_test_split

# Keep 20% of the samples for testing. The seed is fixed so the split, and
# every score derived from it, is identical across Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [13]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest regressor with default hyperparameters.
# random_state makes the fitted model, and the R2 scores and feature
# importances read from it, reproducible between runs.
reg = RandomForestRegressor(random_state=42)
reg.fit(X_train, y_train)

In [14]:
# Evaluate the fitted regressor on both splits up front...
train_score, test_score = reg.score(X_train, y_train), reg.score(X_test, y_test)

# ...then report each score rounded to four decimals.
print('R2 score on the training set:', np.round(train_score, 4))
print('R2 score on the test set:', np.round(test_score, 4))

R2 score on the training set: 0.9731
R2 score on the test set: 0.809


In [15]:
# Read the importances once and reuse the array for sorting and indexing.
importances = reg.feature_importances_

# Sort the features by their importances in decreasing order
idx = np.argsort(importances)[::-1]

# Keep the sorted names under their own name. Rebinding `feature_names`
# would make this cell non-idempotent: on a second run `idx` would be
# applied to names that are already permuted, pairing each importance
# with the wrong feature.
sorted_feature_names = np.asarray(feature_names)[idx]
feature_importances = importances[idx]

# Print the features alongside their importances
for name, score in zip(sorted_feature_names, feature_importances):
    print(f'{name}: {score:.4f}')

MedInc: 0.5199
AveOccup: 0.1364
Longitude: 0.0878
Latitude: 0.0871
HouseAge: 0.0556
AveRooms: 0.0501
Population: 0.0324
AveBedrms: 0.0307


In [16]:
from sklearn.ensemble import ExtraTreesRegressor

# Fit an extra-trees regressor on the same training split.
# NOTE: this rebinds `reg`, so the random forest fitted earlier is no
# longer reachable under that name after this cell runs.
# random_state makes the fitted model and its scores reproducible.
reg = ExtraTreesRegressor(random_state=42)
reg.fit(X_train, y_train)

In [17]:
# Compute R2 for the current `reg` model on each split before printing.
train_score = reg.score(X_train, y_train)
test_score = reg.score(X_test, y_test)

# Report both scores, rounded to four decimal places.
print('R2 score on the training set:', np.round(train_score, 4))
print('R2 score on the test set:', np.round(test_score, 4))

R2 score on the training set: 1.0
R2 score on the test set: 0.8194
