# Model selection

## Imports

In [2]:
import os
import sys
sys.path.append("..")

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import src.Cleaning_pipeline as fn
from scipy import stats
import math




from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.compose import TransformedTargetRegressor

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor


In [4]:
train = pd.read_csv('../INPUT/diamonds_train.csv') 
test = pd.read_csv('../INPUT/diamonds_test.csv')

## Data Engineering

In [5]:
train_clean = fn.clean_data(train, test_data=False)
test_clean = fn.clean_data(test, test_data=True)

In [6]:
X_clean = train_clean.drop(['price'], axis= 1)
y = train_clean.price

### Model testing

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_clean, y, test_size=0.2, random_state = 0)

**Regressors of classification models:**

In [9]:
models = {
    "Kneighbors" : KNeighborsRegressor(n_neighbors=10, weights='distance', algorithm='auto', leaf_size=30,),

    #"RadiusKneighbors" : RadiusNeighborsRegressor(radius=1.0, weights='distance', algorithm='auto'),
    #"GradDescReg" : TransformedTargetRegressor(SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, random_state=0),
                                               #func=np.log1p,
                                                #inverse_func=np.expm1),
    "RandForest" : RandomForestRegressor(n_estimators=700, criterion='mse', max_depth=None, min_samples_split=5, min_samples_leaf=2, random_state=0),

    "DecTreeReg": DecisionTreeRegressor(criterion='mse', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, random_state=0)
}

for modelName, model in models.items():
    print(f"Training model: {modelName}")
    model.fit(X_train, y_train)
print('Training done')

Training model: Kneighbors
Training model: RandForest
Training model: DecTreeReg
Training done


In [10]:

for name, model in models.items():
    print(f"---------[{name}]---------:")
    y_pred = model.predict(X_test)
    print("MSE = ",round(mean_squared_error(y_test,y_pred),5))
    print("RMSE = ",np.sqrt(mean_squared_error(y_test,y_pred)))
    print(r2_score(y_test, y_pred))


---------[Kneighbors]---------:
MSE =  368110.0155
RMSE =  606.7207063407085
0.9769077984437536
---------[RandForest]---------:
MSE =  299902.91396
RMSE =  547.6339233076549
0.9811865522676602
---------[DecTreeReg]---------:
MSE =  544468.15876
RMSE =  737.8808567483858
0.9658445357815645


**Strictly regression models (with transformed target):**

In [11]:
models = {
    "LinearRegression" : TransformedTargetRegressor(LinearRegression(),
                                            func=np.log1p,
                                            inverse_func=np.expm1),
    "Ridge" : TransformedTargetRegressor(Ridge(alpha=1, tol=0.001, solver='auto', random_state=0),
                                         func=np.log1p,
                                            inverse_func=np.expm1),
    "Lasso" : TransformedTargetRegressor(Lasso(alpha=1, max_iter=1000, tol=0.0001),
                                         func=np.log1p,
                                            inverse_func=np.expm1),
    "ElasticNet" : TransformedTargetRegressor(ElasticNet(alpha=1.0, l1_ratio=0.5, fit_intercept=True, tol=0.0001, random_state=0),
                                              func=np.log1p,
                                            inverse_func=np.expm1),
    "BayesianRidge": TransformedTargetRegressor(BayesianRidge(n_iter=300, tol=0.001, alpha_1=1e-06, alpha_2=1e-06, lambda_1=1e-06, lambda_2=1e-06),
                                                func=np.log1p,
                                            inverse_func=np.expm1)
}

for modelName, model in models.items():
    print(f"Training model: {modelName}")
    model.fit(X_train, y_train)

Training model: LinearRegression
Training model: Ridge
Training model: Lasso
Training model: ElasticNet
Training model: BayesianRidge


In [13]:
for name, model in models.items():
    print(f"---------[{name}]---------:")
    y_pred = model.predict(X_test)

    print("MSE = ",round(mean_squared_error(y_test,y_pred),5))
    print("RMSE = ",np.sqrt(mean_squared_error(y_test,y_pred)))
    print(r2_score(y_test, y_pred))



---------[LinearRegression]---------:
MSE =  3035322.39142
RMSE =  1742.217664765384
0.8095887818885439
---------[Ridge]---------:
MSE =  3033338.02299
RMSE =  1741.6480766753332
0.8097132648797751
---------[Lasso]---------:
MSE =  18174480.5053
RMSE =  4263.153821445274
-0.14011776190210545
---------[ElasticNet]---------:
MSE =  10460828.00923
RMSE =  3234.320331883458
0.34377349525609324
---------[BayesianRidge]---------:
MSE =  3034226.89004
RMSE =  1741.9032378516704
0.8096575046552779


### Model blending

Re-split the train data in train and validation data

In [14]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state = 0)


Fit two models:

In [17]:
model1 = RandomForestRegressor(n_estimators=700, criterion='mse', max_depth=None, min_samples_split=5, min_samples_leaf=4, random_state=0)
model1.fit(X_train, y_train)
val_pred1=model1.predict(X_val)
test_pred1=model1.predict(X_test)
val_pred1=pd.DataFrame(val_pred1)
test_pred1=pd.DataFrame(test_pred1)

model2 = GradientBoostingRegressor()
model2.fit(X_train,y_train)
val_pred2=model2.predict(X_val)
test_pred2=model2.predict(X_test)
val_pred2=pd.DataFrame(val_pred2)
test_pred2=pd.DataFrame(test_pred2)

Use the predictions as features for other model:

In [18]:
df_val=pd.concat([X_val.reset_index(), val_pred1,val_pred2],axis=1)
df_test=pd.concat([X_test.reset_index(), test_pred1,test_pred2],axis=1)

In [19]:
model = RandomForestRegressor(n_estimators=700, criterion='mse', max_depth=None, min_samples_split=5, min_samples_leaf=4, random_state=0)
model.fit(df_val,y_val)
model.score(df_test,y_test)

0.9803899733255049

In [20]:
y_pred = model.predict(df_test)
print("MSE = ",round(mean_squared_error(y_test,y_pred),5))
print("RMSE = ",np.sqrt(mean_squared_error(y_test,y_pred)))
print(r2_score(y_test, y_pred))

MSE =  312601.08334
RMSE =  559.1073987513707
0.9803899733255049


**Working on the best model**

In [22]:
parameters = {
    'n_estimators':[100, 200, 300,700,1000],
    'criterion':['mse'],
    'max_features':["auto"],
    'min_samples_split':[2,3,4,5,6,7],
    'min_samples_leaf':[2,3,4,5,6,7]
}
rndfor = RandomForestRegressor()
rndfor_select = GridSearchCV(rndfor, parameters, verbose=4)
rndfor_select.fit(X_train, y_train)



Fitting 5 folds for each of 180 candidates, totalling 900 fits
[CV] criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=100 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=100, score=0.980, total=   4.8s
[CV] criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=100 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.8s remaining:    0.0s


[CV]  criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=100, score=0.977, total=   4.6s
[CV] criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=100 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    9.5s remaining:    0.0s


[CV]  criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=100, score=0.981, total=   4.6s
[CV] criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=100 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   14.1s remaining:    0.0s


[CV]  criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=100, score=0.981, total=   4.7s
[CV] criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=100 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=100, score=0.980, total=   4.7s
[CV] criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=200 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=200, score=0.979, total=   9.3s
[CV] criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=200 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=200, score=0.977, total=   9.3s
[CV] criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=200 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=200, score

[CV]  criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=3, n_estimators=700, score=0.980, total=  32.5s
[CV] criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=3, n_estimators=700 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=3, n_estimators=700, score=0.977, total=  32.3s
[CV] criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=3, n_estimators=700 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=3, n_estimators=700, score=0.981, total=  32.5s
[CV] criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=3, n_estimators=700 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=3, n_estimators=700, score=0.981, total=  32.4s
[CV] criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=3, n_estimators=700 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=3, n_estimators=700, score

[CV]  criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=100, score=0.981, total=   4.5s
[CV] criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=100 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=100, score=0.981, total=   4.5s
[CV] criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=100 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=100, score=0.980, total=   4.5s
[CV] criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=200 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=200, score=0.980, total=   9.0s
[CV] criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=200 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=200, score

[CV]  criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=6, n_estimators=300, score=0.980, total=  13.2s
[CV] criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=6, n_estimators=700 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=6, n_estimators=700, score=0.980, total=  30.7s
[CV] criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=6, n_estimators=700 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=6, n_estimators=700, score=0.977, total=  30.6s
[CV] criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=6, n_estimators=700 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=6, n_estimators=700, score=0.981, total=  30.5s
[CV] criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=6, n_estimators=700 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=2, min_samples_split=6, n_estimators=700, score

[CV]  criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=2, n_estimators=100, score=0.977, total=   4.3s
[CV] criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=2, n_estimators=100 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=2, n_estimators=100, score=0.981, total=   4.3s
[CV] criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=2, n_estimators=100 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=2, n_estimators=100, score=0.981, total=   4.3s
[CV] criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=2, n_estimators=100 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=2, n_estimators=100, score=0.980, total=   4.3s
[CV] criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=2, n_estimators=200 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=2, n_estimators=200, score

[CV]  criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=3, n_estimators=300, score=0.981, total=  12.7s
[CV] criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=3, n_estimators=300 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=3, n_estimators=300, score=0.980, total=  12.7s
[CV] criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=3, n_estimators=700 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=3, n_estimators=700, score=0.980, total=  29.7s
[CV] criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=3, n_estimators=700 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=3, n_estimators=700, score=0.977, total=  29.5s
[CV] criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=3, n_estimators=700 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=3, n_estimators=700, score

[CV]  criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=5, n_estimators=100, score=0.980, total=   4.4s
[CV] criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=5, n_estimators=100 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=5, n_estimators=100, score=0.977, total=   4.2s
[CV] criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=5, n_estimators=100 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=5, n_estimators=100, score=0.981, total=   4.2s
[CV] criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=5, n_estimators=100 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=5, n_estimators=100, score=0.981, total=   4.2s
[CV] criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=5, n_estimators=100 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=5, n_estimators=100, score

[CV]  criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=6, n_estimators=300, score=0.981, total=  12.8s
[CV] criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=6, n_estimators=300 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=6, n_estimators=300, score=0.981, total=  12.9s
[CV] criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=6, n_estimators=300 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=6, n_estimators=300, score=0.980, total=  12.8s
[CV] criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=6, n_estimators=700 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=6, n_estimators=700, score=0.980, total=  29.7s
[CV] criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=6, n_estimators=700 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=6, n_estimators=700, score

[CV]  criterion=mse, max_features=auto, min_samples_leaf=3, min_samples_split=7, n_estimators=1000, score=0.980, total=  41.9s
[CV] criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=100 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=100, score=0.979, total=   4.0s
[CV] criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=100 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=100, score=0.976, total=   4.0s
[CV] criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=100 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=100, score=0.980, total=   4.0s
[CV] criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=100 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=100, scor

[CV]  criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=3, n_estimators=300, score=0.977, total=  11.9s
[CV] criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=3, n_estimators=300 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=3, n_estimators=300, score=0.980, total=  11.9s
[CV] criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=3, n_estimators=300 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=3, n_estimators=300, score=0.981, total=  11.9s
[CV] criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=3, n_estimators=300 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=3, n_estimators=300, score=0.980, total=  12.0s
[CV] criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=3, n_estimators=700 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=3, n_estimators=700, score

[CV]  criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=4, n_estimators=1000, score=0.981, total=  39.9s
[CV] criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=4, n_estimators=1000 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=4, n_estimators=1000, score=0.980, total=  39.9s
[CV] criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=100 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=100, score=0.980, total=   4.0s
[CV] criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=100 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=100, score=0.976, total=   4.0s
[CV] criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=100 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=100, sc

[CV]  criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=6, n_estimators=300, score=0.979, total=  12.0s
[CV] criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=6, n_estimators=300 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=6, n_estimators=300, score=0.977, total=  12.0s
[CV] criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=6, n_estimators=300 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=6, n_estimators=300, score=0.980, total=  11.9s
[CV] criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=6, n_estimators=300 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=6, n_estimators=300, score=0.981, total=  11.9s
[CV] criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=6, n_estimators=300 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=6, n_estimators=300, score

[CV]  criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=7, n_estimators=1000, score=0.980, total=  39.7s
[CV] criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=7, n_estimators=1000 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=7, n_estimators=1000, score=0.981, total=  39.8s
[CV] criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=7, n_estimators=1000 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=4, min_samples_split=7, n_estimators=1000, score=0.980, total=  40.0s
[CV] criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=2, n_estimators=100 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=2, n_estimators=100, score=0.979, total=   3.8s
[CV] criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=2, n_estimators=100 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=2, n_estimators=100, 

[CV]  criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=3, n_estimators=200, score=0.979, total=   7.6s
[CV] criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=3, n_estimators=300 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=3, n_estimators=300, score=0.979, total=  11.5s
[CV] criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=3, n_estimators=300 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=3, n_estimators=300, score=0.976, total=  11.4s
[CV] criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=3, n_estimators=300 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=3, n_estimators=300, score=0.980, total=  11.4s
[CV] criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=3, n_estimators=300 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=3, n_estimators=300, score

[CV]  criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=4, n_estimators=1000, score=0.976, total=  37.9s
[CV] criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=4, n_estimators=1000 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=4, n_estimators=1000, score=0.980, total=  37.9s
[CV] criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=4, n_estimators=1000 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=4, n_estimators=1000, score=0.981, total=  38.0s
[CV] criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=4, n_estimators=1000 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=4, n_estimators=1000, score=0.980, total=  38.1s
[CV] criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=5, n_estimators=100 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=5, n_estimators=100

[CV]  criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=6, n_estimators=200, score=0.981, total=   7.7s
[CV] criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=6, n_estimators=200 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=6, n_estimators=200, score=0.979, total=   7.6s
[CV] criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=6, n_estimators=300 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=6, n_estimators=300, score=0.979, total=  11.5s
[CV] criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=6, n_estimators=300 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=6, n_estimators=300, score=0.976, total=  11.3s
[CV] criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=6, n_estimators=300 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=6, n_estimators=300, score

[CV]  criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=7, n_estimators=1000, score=0.979, total=  38.1s
[CV] criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=7, n_estimators=1000 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=7, n_estimators=1000, score=0.976, total=  37.9s
[CV] criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=7, n_estimators=1000 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=7, n_estimators=1000, score=0.980, total=  38.2s
[CV] criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=7, n_estimators=1000 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=7, n_estimators=1000, score=0.981, total=  38.0s
[CV] criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=7, n_estimators=1000 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=5, min_samples_split=7, n_estimators=10

[CV]  criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=3, n_estimators=200, score=0.980, total=   7.3s
[CV] criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=3, n_estimators=200 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=3, n_estimators=200, score=0.980, total=   7.3s
[CV] criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=3, n_estimators=200 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=3, n_estimators=200, score=0.979, total=   7.3s
[CV] criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=3, n_estimators=300 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=3, n_estimators=300, score=0.979, total=  11.1s
[CV] criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=3, n_estimators=300 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=3, n_estimators=300, score

[CV]  criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=4, n_estimators=700, score=0.979, total=  25.9s
[CV] criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=4, n_estimators=1000 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=4, n_estimators=1000, score=0.979, total=  36.8s
[CV] criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=4, n_estimators=1000 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=4, n_estimators=1000, score=0.976, total=  36.5s
[CV] criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=4, n_estimators=1000 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=4, n_estimators=1000, score=0.980, total=  36.9s
[CV] criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=4, n_estimators=1000 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=4, n_estimators=100

[CV]  criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=6, n_estimators=200, score=0.976, total=   7.3s
[CV] criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=6, n_estimators=200 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=6, n_estimators=200, score=0.980, total=   7.3s
[CV] criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=6, n_estimators=200 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=6, n_estimators=200, score=0.980, total=   7.3s
[CV] criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=6, n_estimators=200 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=6, n_estimators=200, score=0.979, total=   7.3s
[CV] criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=6, n_estimators=300 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=6, n_estimators=300, score

[CV]  criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=7, n_estimators=700, score=0.981, total=  25.6s
[CV] criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=7, n_estimators=700 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=7, n_estimators=700, score=0.979, total=  25.7s
[CV] criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=7, n_estimators=1000 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=7, n_estimators=1000, score=0.979, total=  36.7s
[CV] criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=7, n_estimators=1000 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=7, n_estimators=1000, score=0.976, total=  36.6s
[CV] criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=7, n_estimators=1000 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=6, min_samples_split=7, n_estimators=1000,

[CV]  criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=3, n_estimators=200, score=0.979, total=   7.1s
[CV] criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=3, n_estimators=200 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=3, n_estimators=200, score=0.976, total=   7.1s
[CV] criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=3, n_estimators=200 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=3, n_estimators=200, score=0.980, total=   7.1s
[CV] criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=3, n_estimators=200 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=3, n_estimators=200, score=0.980, total=   7.1s
[CV] criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=3, n_estimators=200 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=3, n_estimators=200, score

[CV]  criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=4, n_estimators=700, score=0.980, total=  24.9s
[CV] criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=4, n_estimators=700 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=4, n_estimators=700, score=0.980, total=  24.9s
[CV] criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=4, n_estimators=700 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=4, n_estimators=700, score=0.979, total=  25.0s
[CV] criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=4, n_estimators=1000 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=4, n_estimators=1000, score=0.979, total=  35.8s
[CV] criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=4, n_estimators=1000 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=4, n_estimators=1000, s

[CV]  criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=6, n_estimators=100, score=0.979, total=   3.6s
[CV] criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=6, n_estimators=200 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=6, n_estimators=200, score=0.979, total=   7.1s
[CV] criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=6, n_estimators=200 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=6, n_estimators=200, score=0.976, total=   7.1s
[CV] criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=6, n_estimators=200 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=6, n_estimators=200, score=0.980, total=   7.1s
[CV] criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=6, n_estimators=200 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=6, n_estimators=200, score

[CV]  criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=7, n_estimators=700, score=0.976, total=  24.9s
[CV] criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=7, n_estimators=700 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=7, n_estimators=700, score=0.980, total=  24.9s
[CV] criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=7, n_estimators=700 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=7, n_estimators=700, score=0.980, total=  24.9s
[CV] criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=7, n_estimators=700 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=7, n_estimators=700, score=0.979, total=  25.3s
[CV] criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=7, n_estimators=1000 
[CV]  criterion=mse, max_features=auto, min_samples_leaf=7, min_samples_split=7, n_estimators=1000, sco

[Parallel(n_jobs=1)]: Done 900 out of 900 | elapsed: 273.4min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n

In [23]:
rndfor_select.best_params_

{'criterion': 'mse',
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 6,
 'n_estimators': 700}

Best model:

In [24]:
models = {
    
    
    "RandForest" : RandomForestRegressor(n_estimators=rndfor_select.best_params_['n_estimators'], 
                                                     criterion=rndfor_select.best_params_['criterion'],
                                                     max_features=rndfor_select.best_params_['max_features'], 
                                                     max_depth=None, 
                                                     min_samples_split=rndfor_select.best_params_['min_samples_split'], 
                                                     min_samples_leaf=rndfor_select.best_params_['min_samples_leaf'], random_state=0),

}



for modelName, model in models.items():
    print(f"Training model: {modelName}")
    model.fit(X_train, y_train)
print('Training done')
for name, model in models.items():
    print(f"---------[{name}]---------:")
    y_pred = model.predict(X_test)
    #y_pred_backtransformed =np.array([math.exp(e)-1 for e in y_pred])
    print("MSE = ",round(mean_squared_error(y_test,y_pred),5))
    print("RMSE = ",np.sqrt(mean_squared_error(y_test,y_pred)))
    print(r2_score(y_test, y_pred))

Training model: RandForest
Training done
---------[RandForest]---------:
MSE =  306147.28114
RMSE =  553.3057754482417
0.9807948319134133
