# Model selection

In [1]:
import numpy as np
from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")

In [2]:
iris = datasets.load_iris()
features = iris.data
target = iris.target
logistic = linear_model.LogisticRegression()
penalty = ["l1", "l2"]
C = np.logspace(0, 4, 10)
hyperparameters = dict(C=C, penalty=penalty)

In [3]:
gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, verbose=0)
best_model = gridsearch.fit(features, target)

In [4]:
print("Best penalty:", best_model.best_estimator_.get_params()["penalty"])
print("Best C:", best_model.best_estimator_.get_params()["C"])

Best penalty: l2
Best C: 7.742636826811269


In [5]:
best_model.predict(features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [6]:
from scipy.stats import uniform
from sklearn.model_selection import RandomizedSearchCV

In [7]:
iris = datasets.load_iris()
features = iris.data
target = iris.target
logistic = linear_model.LogisticRegression()
penalty = ["l1", "l2"]
C = uniform(loc=0, scale=4)
hyperparams = dict(C=C, penalty=penalty)

In [8]:
randomizedsearch = RandomizedSearchCV(logistic,
                                      hyperparams,
                                      random_state=1,
                                      n_iter=100,
                                      cv=5,
                                      verbose=0,
                                      n_jobs=-1)

In [9]:
best_model = randomizedsearch.fit(features, target)
print("Best penalty:", best_model.best_estimator_.get_params()["penalty"])
print("Best C:", best_model.best_estimator_.get_params()["C"])

Best penalty: l2
Best C: 3.730229437354635


In [10]:
best_model.predict(features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

In [12]:
np.random.seed()

In [13]:
iris = datasets.load_iris()
features = iris.data
target = iris.target
pipe = Pipeline([("classifier", RandomForestClassifier())])

In [14]:
search_space = [{"classifier": [linear_model.LogisticRegression()],
                 "classifier__penalty": ["l1", "l2"],
                 "classifier__C": np.logspace(0, 4, 10)},
                {"classifier": [RandomForestClassifier()],
                 "classifier__n_estimators": [10, 100, 1000],
                 "classifier__max_features": [1, 2, 3]}]

In [15]:
gridsearch = GridSearchCV(pipe, search_space, cv=5, verbose=0)
best_model = gridsearch.fit(features, target)

In [16]:
best_model.best_estimator_.get_params()["classifier"]

LogisticRegression(C=7.742636826811269)

In [17]:
best_model.predict(features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [18]:
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [19]:
np.random.seed(0)

In [20]:
iris = datasets.load_iris()
features = iris.data
target = iris.target

In [21]:
preprocess = FeatureUnion([("std", StandardScaler()), ("pca", PCA())])
pipe = Pipeline([("preprocess", preprocess), ("classifier", linear_model.LogisticRegression())])

In [22]:
search_space = [{"preprocess__pca__n_components": [1, 2, 3],
                 "classifier__penalty": ["l1", "l2"],
                 "classifier__C": np.logspace(0, 4, 10)}]

In [23]:
clf = GridSearchCV(pipe, search_space, cv=5, verbose=0, n_jobs=-1)

In [24]:
best_model = clf.fit(features, target)

In [25]:
best_model.best_estimator_.get_params()["preprocess__pca__n_components"]

2

In [26]:
logistic = linear_model.LogisticRegression()
penalty = ["l1", "l2"]
C = np.logspace(0 ,4, 1000)
hyperparams = dict(C=C, penalty=penalty)
gridsearch = GridSearchCV(logistic, hyperparams, cv=5, n_jobs=-1, verbose=1)
best_model = gridsearch.fit(features, target)

Fitting 5 folds for each of 2000 candidates, totalling 10000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 1200 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 3200 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 6000 tasks      | elapsed:   16.5s
[Parallel(n_jobs=-1)]: Done 9600 tasks      | elapsed:   26.8s
[Parallel(n_jobs=-1)]: Done 10000 out of 10000 | elapsed:   27.9s finished
