# Example: Model selection

Compare candidate models and select the best one according to a chosen criterion.

In [1]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

from dstools import model_selection

%matplotlib inline

Generate a synthetic regression data set to use as training data

In [2]:
# Generate a synthetic regression problem using make_regression's default
# sizes. The seed is pinned (matching the random_state=0 used by the
# estimators in this notebook) so the generated data is the same on every
# "Restart Kernel -> Run All"; without it each run draws a different data set.
X, y = make_regression(random_state=0)

Specify the candidate pipelines and their parameter grids

In [3]:
# Candidate 1: PCA followed by an elastic net regressor.
# Per the scikit-learn docs, a fractional n_components asks PCA to keep
# enough components to explain that share of the variance (here 99%).
elnet_pipe = make_pipeline(
    PCA(n_components=0.99, random_state=0), ElasticNet(random_state=0)
)

# Candidate 2: the same PCA preprocessing followed by a random forest.
rf_pipe = make_pipeline(
    PCA(n_components=0.99, random_state=0), RandomForestRegressor(random_state=0)
)

# One (pipeline, parameter grid) pair per candidate. Grid keys are the plain
# parameter names of each pipeline's final estimator.
pipe_grid_specs = (
    (
        elnet_pipe,
        {
            'alpha': [0.01, 0.1, 1],
            'l1_ratio': [0.3, 0.5, 0.7],
        },
    ),
    (
        rf_pipe,
        {
            'max_depth': [200, 500, 700],
        },
    ),
)

Format the grid specifications

In [4]:
# Convert the (pipeline, grid) pairs into the structure consumed by
# compare_estimators in the next step.
# NOTE(review): pipeline=True presumably tells dstools that the estimators
# are sklearn pipelines, so the plain grid keys get mapped onto the right
# pipeline step -- confirm against the dstools documentation.
pipes_and_parameters = model_selection.parameter_grid(
    pipe_grid_specs,
    pipeline=True,
)

Compare the models, then report the best alternative under each selection criterion

In [5]:
# Evaluate every candidate on the same data and keep the results for the
# best-model reports. The call prints one performance report per model
# (training scores, test scores and their difference).
pipe_results = model_selection.compare_estimators(
    X,
    y,
    pipes_and_parameters,
    test_size=0.3,
    folds=10,
    # Negated MSE: scores are negative and values closer to zero are better.
    scoring='neg_mean_squared_error',
)

Model performance report 
-------------------------
Name: elasticnet
Training scores: -30221.97 +/- 2482.352
Test scores: -32519.288 +/- 3203.032
Train-test difference: 2297.317575399884

Model performance report 
-------------------------
Name: randomforestregressor
Training scores: -50167.07 +/- 8036.687
Test scores: -40338.023 +/- 2426.482
Train-test difference: -9829.047548844304



In [6]:
# Pick the best candidate by the 'bias' criterion.
# NOTE(review): in the recorded output the reported best score equals the
# winning model's mean test score -- confirm that this is how dstools
# defines 'bias'.
model_selection.report_best_model(
    pipe_results,
    criteria='bias',
)

Best model report 
--------------------
Name: elasticnet
Criteria: bias
Best scores: -32519.287581346813


In [7]:
# Pick the best candidate by the 'variance' criterion.
# NOTE(review): in the recorded output the reported best score equals the
# winning model's train-test difference -- confirm that this is how dstools
# defines 'variance'.
model_selection.report_best_model(
    pipe_results,
    criteria='variance',
)

Best model report 
--------------------
Name: randomforestregressor
Criteria: variance
Best scores: -9829.047548844304
