# Hyper-parameter tuning: ExtraTreesRegressor

In [0]:
# Loading libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from tqdm import tqdm

from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

In [4]:
from google.colab import drive

# Directory inside the Colab VM under which Google Drive is exposed;
# mounting asks for an authorization code when run interactively.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Locations of the input arrays inside the mounted Drive folder
data_dir = os.path.abspath('/content/drive/My Drive/Projects/YapAiTek_Challenge/_data')

# Each file path is the data directory joined with the array's file name
train_data_fp, test_data_fp, y_true_fp = (
    os.path.join(data_dir, file_name)
    for file_name in ('X_train_final.npy', 'X_test_final.npy', 'y_train_final.npy')
)

In [0]:
# Read the saved NumPy arrays from disk: the training matrix, its target
# vector, and the test matrix (loaded in that order).
train, target, test = (
    np.load(array_fp) for array_fp in (train_data_fp, y_true_fp, test_data_fp)
)

In [0]:
# Shared experiment constants
rs, cv = (
    13,  # seed meant to be passed as random_state
    3,   # number of cross-validation folds
)

In [0]:
def evaluate(actual, prediction, ndigits=5):
    """Summarise regression quality with three metrics.

    Parameters
    ----------
    actual : array-like
        Ground-truth target values.
    prediction : array-like
        Predicted values, aligned element-wise with ``actual``.
    ndigits : int, optional
        Number of decimal places every metric is rounded to. Defaults to 5,
        the precision this function has always reported.

    Returns
    -------
    dict
        ``{'MAE': ..., 'MSE': ..., 'R^2': ...}`` holding the mean absolute
        error, mean squared error and R^2 score, each rounded to ``ndigits``.
    """
    # Label -> scoring function; dict order fixes the order of the report.
    metrics = (
        ('MAE', mean_absolute_error),
        ('MSE', mean_squared_error),
        ('R^2', r2_score),
    )
    return {label: round(score(actual, prediction), ndigits)
            for label, score in metrics}

In [0]:
# Train and test sets splitting
# Hold out 30% of the labelled data for evaluation. The split is seeded with
# `rs` so that re-running the notebook reproduces the same partition; without
# a seed every run draws a different hold-out set and the scores printed by
# the experiment cells below cannot be compared between runs.
X_train, X_test, y_train, y_test = train_test_split(
    train, target, test_size=0.3, random_state=rs
)

## Hyper-parameter grid search for ExtraTrees estimator

In [0]:
# Base estimator handed to the grid search. It is seeded with `rs` so that
# every candidate is fitted with the same source of randomness and the
# cross-validation scores are repeatable across runs.
extree = ExtraTreesRegressor(random_state=rs)

# Search space: 3 x 1 x 3 x 3 = 27 parameter combinations.
extree_params = {
    'n_estimators': [30, 150, 250],
    'max_depth': [None],
    'min_samples_leaf': [2, 3, 5],
    'max_features': [0.5, 'sqrt', 'log2']
}

In [0]:
%%time

# Exhaustive search over `extree_params`: each candidate is scored by negated
# mean absolute error over `cv` folds, using all available cores.
et_gs = GridSearchCV(extree, extree_params,
                     scoring='neg_mean_absolute_error', n_jobs=-1, cv=cv)

# NOTE(review): the search is fitted on the full `train`/`target` arrays, which
# include the rows later held out as X_test/y_test - confirm this is intended.
et_gs_fit = et_gs.fit(train, target)

# Per-candidate cross-validation results as a table for inspection
et_gs_df = pd.DataFrame(et_gs_fit.cv_results_)



CPU times: user 6min 4s, sys: 2.44 s, total: 6min 7s
Wall time: 1h 43min 42s


In [0]:
# Ten best candidates; scores are negated MAE, so the largest value ranks first
et_gs_df.sort_values('mean_test_score', ascending=False).head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,param_min_samples_leaf,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
8,376.983705,1.880727,8.699601,0.31766,,0.5,5,250,"{'max_depth': None, 'max_features': 0.5, 'min_...",-1.532763,-1.445745,-1.548743,-1.509084,0.04526,1
7,225.829487,2.588778,5.244814,0.216941,,0.5,5,150,"{'max_depth': None, 'max_features': 0.5, 'min_...",-1.534947,-1.445772,-1.549509,-1.510076,0.045857,2
6,45.186885,0.576765,1.115092,0.020978,,0.5,5,30,"{'max_depth': None, 'max_features': 0.5, 'min_...",-1.53978,-1.449379,-1.549058,-1.512739,0.044962,3
5,387.918339,6.218791,9.662776,0.583447,,0.5,3,250,"{'max_depth': None, 'max_features': 0.5, 'min_...",-1.544441,-1.445463,-1.549947,-1.513284,0.048009,4
4,232.311618,1.832171,5.751192,0.305489,,0.5,3,150,"{'max_depth': None, 'max_features': 0.5, 'min_...",-1.548584,-1.44737,-1.550746,-1.515567,0.04823,5
3,46.383177,0.543971,1.166503,0.048956,,0.5,3,30,"{'max_depth': None, 'max_features': 0.5, 'min_...",-1.542995,-1.45156,-1.553447,-1.516001,0.045765,6
11,192.487503,1.797238,9.264178,0.536588,,sqrt,2,250,"{'max_depth': None, 'max_features': 'sqrt', 'm...",-1.536449,-1.469222,-1.558363,-1.521345,0.037926,7
2,408.392567,5.27475,10.594235,0.673083,,0.5,2,250,"{'max_depth': None, 'max_features': 0.5, 'min_...",-1.552138,-1.455319,-1.556614,-1.521357,0.046732,8
1,244.701711,1.406167,6.340677,0.353477,,0.5,2,150,"{'max_depth': None, 'max_features': 0.5, 'min_...",-1.555777,-1.454275,-1.558337,-1.522796,0.048463,9
10,115.361754,1.207529,5.5982,0.14417,,sqrt,2,150,"{'max_depth': None, 'max_features': 'sqrt', 'm...",-1.539033,-1.471187,-1.560245,-1.523489,0.037983,10


## Test various hyper-parameters for ExtraTrees estimator

In [0]:
# %%time

# ext = ExtraTreesRegressor(n_jobs=-1)
# ext.fit(X_train, y_train)
# preds = ext.predict(X_test)
# print(evaluate(y_test, preds))

{'MAE': 1.39475, 'MSE': 6.08288, 'R^2': 0.7503}
CPU times: user 7min 8s, sys: 2.09 s, total: 7min 10s
Wall time: 3min 38s


In [0]:
# %%time

# ext = ExtraTreesRegressor(n_estimators=250, max_depth=None, min_samples_leaf=10, max_features=0.5, n_jobs=-1)
# ext.fit(X_train, y_train)
# preds = ext.predict(X_test)
# print(evaluate(y_test, preds))

{'MAE': 1.37425, 'MSE': 5.37356, 'R^2': 0.77941}
CPU times: user 6min 39s, sys: 597 ms, total: 6min 39s
Wall time: 3min 22s


In [0]:
# %%time

# ext = ExtraTreesRegressor(n_estimators=150, max_depth=None, min_samples_leaf=2, max_features=0.5, n_jobs=-1)
# ext.fit(X_train, y_train)
# preds = ext.predict(X_test)
# print(evaluate(y_test, preds))

{'MAE': 1.31926, 'MSE': 5.06323, 'R^2': 0.79215}
CPU times: user 4min 21s, sys: 748 ms, total: 4min 21s
Wall time: 2min 13s


In [0]:
# %%time

# ext = ExtraTreesRegressor(n_estimators=150, max_depth=None, min_samples_leaf=3, max_features=0.5, n_jobs=-1)
# ext.fit(X_train, y_train)
# preds = ext.predict(X_test)
# print(evaluate(y_test, preds))

{'MAE': 1.32574, 'MSE': 5.04271, 'R^2': 0.793}
CPU times: user 4min 6s, sys: 351 ms, total: 4min 7s
Wall time: 2min 5s


### Best hyper-parameters: ExtraTrees

In [0]:
%%time

# Fit the chosen ExtraTrees configuration on the training split and report
# MAE / MSE / R^2 on the held-out split. `random_state=rs` fixes the
# estimator's own randomness so that refitting on the same data gives the
# same model and the same printed metrics.
ext = ExtraTreesRegressor(n_estimators=200, max_depth=90, min_samples_leaf=2,
                          max_features=0.5, random_state=rs, n_jobs=-1)
ext.fit(X_train, y_train)
preds = ext.predict(X_test)
print(evaluate(y_test, preds))

{'MAE': 1.31907, 'MSE': 5.06138, 'R^2': 0.79223}
CPU times: user 5min 49s, sys: 835 ms, total: 5min 50s
Wall time: 2min 57s


## Comparing with RandomForest Estimator

In [0]:
# %%time

# rf = RandomForestRegressor(n_jobs=-1)
# rf.fit(X_train, y_train)
# preds = rf.predict(X_test)
# print(evaluate(y_test, preds))

{'MAE': 1.32202, 'MSE': 5.23507, 'R^2': 0.7851}
CPU times: user 7min 26s, sys: 714 ms, total: 7min 27s
Wall time: 3min 47s


In [0]:
# %%time

# rf = RandomForestRegressor(n_estimators=150, min_samples_leaf=4, max_features=0.5, n_jobs=-1)
# rf.fit(X_train, y_train)
# preds = rf.predict(X_test)
# print(evaluate(y_test, preds))

{'MAE': 1.30659, 'MSE': 4.91599, 'R^2': 0.7982}
CPU times: user 5min 25s, sys: 424 ms, total: 5min 26s
Wall time: 2min 45s


In [10]:
# %%time

# rf = RandomForestRegressor(n_estimators=150, min_samples_leaf=2, max_features=0.4, n_jobs=-1)
# rf.fit(X_train, y_train)
# preds = rf.predict(X_test)
# print(evaluate(y_test, preds))

{'MAE': 1.3002, 'MSE': 4.90852, 'R^2': 0.7957}
CPU times: user 5min 24s, sys: 1.62 s, total: 5min 25s
Wall time: 2min 45s


In [11]:
# %%time

# rf = RandomForestRegressor(n_estimators=150, min_samples_leaf=2, max_features=0.6, n_jobs=-1)
# rf.fit(X_train, y_train)
# preds = rf.predict(X_test)
# print(evaluate(y_test, preds))

{'MAE': 1.29845, 'MSE': 4.91533, 'R^2': 0.79542}
CPU times: user 7min 9s, sys: 909 ms, total: 7min 10s
Wall time: 3min 37s


### Best hyper-parameters: RandomForest

In [12]:
%%time

# Fit the chosen RandomForest configuration on the training split and report
# MAE / MSE / R^2 on the held-out split. `random_state=rs` fixes the
# estimator's own randomness (bootstrap samples and feature sub-sampling) so
# that refitting on the same data gives the same model and metrics.
rf = RandomForestRegressor(n_estimators=150, min_samples_leaf=2,
                           max_features=0.5, random_state=rs, n_jobs=-1)
rf.fit(X_train, y_train)
preds = rf.predict(X_test)
print(evaluate(y_test, preds))

{'MAE': 1.29955, 'MSE': 4.91485, 'R^2': 0.79544}
CPU times: user 6min 18s, sys: 702 ms, total: 6min 19s
Wall time: 3min 12s
