# Model Selection

In [4]:
import numpy as np
from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV

## Selecting Best Models Using Exhaustive Search

In [4]:
iris = datasets.load_iris()
features = iris.data
target = iris.target
logsitic = linear_model.LogisticRegression(solver='liblinear')
penalty = ['l1','l2']
C = np.logspace(0, 4, 10)
hyperparameters = dict(C=C, penalty=penalty)
gridsearch = GridSearchCV(logsitic, hyperparameters, cv=5, verbose=0)
best_model = gridsearch.fit(features, target)



In [5]:
best_model.best_estimator_.get_params()

{'C': 7.742636826811269,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l1',
 'random_state': None,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [6]:
best_model.best_estimator_.get_params()['C']

7.742636826811269

In [7]:
best_model.predict(features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

## Selecting Best Models Using Randomized Search

In [9]:
from scipy.stats import uniform
from sklearn.model_selection import RandomizedSearchCV

C = uniform(loc=0, scale=4)
hyperparameters = dict(C=C, penalty=penalty)
randomizedsearch = RandomizedSearchCV(logsitic, hyperparameters, random_state=1,n_iter=100,cv=5,verbose=0,n_jobs=-1)
best_model = randomizedsearch.fit(features, target)

In [10]:
best_model.best_estimator_.get_params()

{'C': 1.668088018810296,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l1',
 'random_state': None,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [11]:
uniform(loc=0, scale=4).rvs(10)

array([2.67372851, 3.87164982, 1.552851  , 0.24602678, 2.73355972,
       0.01378602, 2.89560132, 3.16320152, 2.94876982, 1.51280739])

In [12]:
best_model.predict(features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

## Selecting Best Models from Multiple Learning Algorithms

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
np.random.seed(0)
pipe = Pipeline([("classifier", RandomForestClassifier())])
search_space = [{"classifier":[LogisticRegression(solver='liblinear')],
                "classifier__penalty":['l1','l2'],
                "classifier__C":np.logspace(0,4,10)},
               {"classifier":[RandomForestClassifier()],
               "classifier__n_estimators":[10,100,1000],
               "classifier__max_features":[1,2,3]}]
gridsearch = GridSearchCV(pipe, search_space,cv=5,verbose=0)
best_model = gridsearch.fit(features, target)



In [16]:
best_model.best_estimator_.get_params()["classifier"]

LogisticRegression(C=7.742636826811269, penalty='l1', solver='liblinear')

In [17]:
best_model.predict(features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

## Selecting Best Models When Preprocessing

In [18]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
np.random.seed(0)
preprocess = FeatureUnion([("std", StandardScaler()),("pca"，PCA())])
pipe = Pipeline([("preprocess", preprocess),
                ("classifier",LogisticRegression('liblinear'))])
search_space = [{"preprocess__pca__n_components":[1,2,3],
                "classifier__penalty":["l1","l2"],
                "classifier__C":np.logspace(0,4,10)}]
clf = GridSearchCV(pipe, search_space, cv=5, verbose=0, n_jobs=-1)
best_model = clf.fit(features, target)

SyntaxError: invalid character in identifier (<ipython-input-18-786166b923c9>, line 5)

## Speeding Up Model Selection with Parallelization 

In [19]:
logsitic = linear_model.LogisticRegression('liblinear')
penalty = ["l1", "l2"]
C = np.logspace(0,4,1000)
hyperparameters = dict(C=C, penalty=penalty)
gridsearch = GridSearchCV(logsitic, hyperparameters, cv=5, n_jobs=-1,verbose=1)
best_model = gridsearch.fit(features, target)

Fitting 5 folds for each of 2000 candidates, totalling 10000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 664 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 2664 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done 5464 tasks      | elapsed:   15.8s
[Parallel(n_jobs=-1)]: Done 9064 tasks      | elapsed:   24.9s
[Parallel(n_jobs=-1)]: Done 10000 out of 10000 | elapsed:   27.2s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Speeding Up Model Selection Using Algorithm- Specific Methods

In [22]:
logit = linear_model.LogisticRegressionCV(Cs=100)
logit.fit(features, target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegressionCV(Cs=100)

## Evaluating Performance After Model Selection

In [24]:
from sklearn.model_selection import cross_val_score
logit = linear_model.LogisticRegression()
C = np.logspace(0,4,20)
hyperparameters = dict(C=C)
gridsearch = GridSearchCV(logit, hyperparameters, cv=5,n_jobs=-1,verbose=0)
cross_val_score(gridsearch, features, target).mean()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.9733333333333334