## Apress - Industrialized Machine Learning Examples

Andreas Francois Vermeulen
2019

### This example is an add-on to the book and is covered by that book's copyright.

## Chapter-008-008-Features-01

In [1]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

In [2]:
# Load the iris data and separate the feature matrix from the class labels.
iris = load_iris()
X = iris.data
y = iris.target

# Feature extractor 1: whitened PCA, seeded so repeated runs give the same result.
pca = PCA(n_components=2, whiten=True, copy=True, random_state=1968)

# Feature extractor 2: univariate selection keeping the single best-scoring feature.
selection = SelectKBest(k=1)

# Concatenate the output of both extractors into one combined feature space.
combined_features = FeatureUnion(
    transformer_list=[
        ('pca', pca),
        ('univ_select', selection),
    ]
)

# Fit the union once on the full data to report how many columns it produces
# with the initial settings above.
combined_features.fit(X, y)
X_features = combined_features.transform(X)
print("Combined space has", X_features.shape[1], 'features')

# Classifier that is stacked on top of the combined features.
svm = SVC(kernel='rbf', gamma='auto', class_weight='balanced')

pipeline = Pipeline(steps=[("features", combined_features), ('svm', svm)])

# Search jointly over the PCA component count, the number of selected features
# and the SVM penalty C. Keys use the <step>__<sub-step>__<parameter> naming
# for parameters of nested estimators.
param_grid = {
    'features__pca__n_components': [1, 2, 3],
    'features__univ_select__k': [1, 2, 3],
    'svm__C': [0.1, 1.0, 20],
}

# 25-fold cross-validation, parallelised with n_jobs=-1 and verbose progress logs.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=25,
    n_jobs=-1,
    verbose=10,
)
grid_search.fit(X, y)
print(grid_search.best_estimator_)

Combined space has 3 features
Fitting 25 folds for each of 27 candidates, totalling 675 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1964s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0900s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done  54 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 117 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 200 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 304 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 424 tasks      | elapsed:    3.5s


Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=None,
       transformer_list=[('pca', PCA(copy=True, iterated_power='auto', n_components=2, random_state=1968,
  svd_solver='auto', tol=0.0, whiten=True)), ('univ_select', SelectKBest(k=2, score_func=<function f_classif at 0x00000267E5876378>))],
       trans...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])


[Parallel(n_jobs=-1)]: Done 544 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 592 out of 675 | elapsed:    3.9s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done 675 out of 675 | elapsed:    3.9s finished


## Done

In [3]:
import datetime

# Record the wall-clock time at which the notebook finished and report it.
# print() converts the datetime with str(), so no explicit cast is needed.
now = datetime.datetime.now()
print('Done!', now)

Done! 2019-04-22 21:45:33.438933
