In [None]:
from sklearn.feature_selection import VarianceThreshold # Feature selector
from sklearn.pipeline import Pipeline # For setting up pipeline
# Various pre-processing steps
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler, PowerTransformer, MaxAbsScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV # For optimization

Choose a small number of different machine learning algorithms and hyperparameters, along with sensible value ranges, and additional ML pipeline components. This could include, for example, a feature imputation step, a feature recoding step, and an ensembling or stacking step in addition to the learning algorithm. If those components have hyperparameters, you must choose ranges and tune them as well. In addition, your pipeline should allow the hyperparameter optimization to turn individual components on and off, e.g. to use one-hot encoding or not.

You can use implementations of AutoML systems (e.g. auto-sklearn), scientific papers, or the documentation of the library you are using to determine the hyperparameters to tune and the value ranges. Note that there is no single correct way to do this, but you should define a reasonable search space (e.g. do not tune whether debug output is turned on, and do not include random forests with 1,000,000 trees). Your ML pipeline needs to be reasonably complex, i.e. it must have at least three components.

Determine the best ML pipeline. Make sure to optimize the entire ML pipeline as a whole rather than tuning its parts separately. Choose a suitable hyperparameter optimizer; you could also use several and, e.g., compare the results achieved by random search and Bayesian optimization. Make sure that the way you evaluate this avoids bias and overfitting. You could use statistical tests to make this determination.

In [None]:
#import models, packages
import numpy
from sklearn import linear_model, ensemble
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder

# Candidate classifiers to compare.
models = [
    linear_model.RidgeClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.RandomForestClassifier(),
]

# Four-step pipeline: scaling -> one-hot encoding -> variance-based feature
# selection -> classifier. The concrete objects placed here are only
# defaults; a search over `parameters` below swaps each step out.
#
# OneHotEncoder uses handle_unknown="ignore" because the default ("error")
# makes transform() raise as soon as a validation fold contains a value that
# was not present in the training fold, which would abort cross-validation.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('onehot', OneHotEncoder(handle_unknown="ignore")),
    ('selector', VarianceThreshold()),
    ('classifier', KNeighborsClassifier()),
])

# Search space over whole pipeline steps: each key is a step name of `pipe`
# and each value lists the objects to try in that slot. The string
# "passthrough" switches the step off, so the optimizer can enable or disable
# scaling, encoding and feature selection independently.
parameters = {
    "scaler": [
        StandardScaler(),
        MinMaxScaler(),
        Normalizer(),
        MaxAbsScaler(),
        "passthrough",
    ],
    "onehot": [OneHotEncoder(handle_unknown="ignore"), "passthrough"],
    "selector": [VarianceThreshold(), "passthrough"],
    "classifier": [
        linear_model.RidgeClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.RandomForestClassifier(),
    ],
}

# Candidate hyperparameter values, presumably for linear_model.RidgeClassifier
# (the names match its constructor arguments -- confirm against the caller).
# The keys carry no "<step>__" prefix, so as written they address a bare
# estimator rather than a named step inside a Pipeline.
ridge_param_grid = dict(
    alpha=[1.0, 1.1, 2.0, 5.0],
    tol=[.0001, 0.001, 0.01, 0.1],
    solver=["svd", "cholesky", "lsqr", "sparse_cg"],
    # None is included as one of the max_iter candidates.
    max_iter=[100, 200, 500, 1000, 10000, None],
)

# Candidate hyperparameter values, presumably for ensemble.BaggingClassifier
# (the names match its constructor arguments -- confirm against the caller).
# NOTE(review): max_samples / max_features mix floats and ints; scikit-learn
# reads a float as a fraction and an int as an absolute count, so 2 and 5
# would mean "2 or 5 samples/features" -- confirm this is intended.
bagging_param_grid = dict(
    n_estimators=[100, 500, 1000, 10000],
    max_samples=[0.1, 1.0, 2, 5],
    max_features=[0.1, 1.0, 2, 5],
)

# Candidate hyperparameter values, presumably for
# ensemble.RandomForestClassifier (the names match its constructor
# arguments -- confirm against the caller).
# NOTE(review): "log_loss" is only accepted as a criterion by newer
# scikit-learn releases -- confirm the installed version supports it.
random_forest_param_grid = {
    "n_estimators": [100, 500, 1000, 10000],
    "criterion": ["gini", "entropy", "log_loss"],
    # None is included as one of the max_depth candidates.
    "max_depth": [None, 2, 3, 5, 10],
}

# Parameter grid keyed by "<step>__<param>" names.
# NOTE(review): the keys address steps called "lr" and "tfidf_pipeline",
# which are not steps of the `pipe` defined in this cell (its steps are
# scaler / onehot / selector / classifier) -- confirm which pipeline this
# grid belongs to before passing it to a search.
grid_params = {
    'lr__penalty': ['l1', 'l2'],
    'lr__C': [1, 5, 10],
    'lr__max_iter': [20, 50, 100],
    # Ten evenly spaced values from 0.1 to 1.0 inclusive. The module is
    # imported as `numpy` in this file, so the `np` alias is not defined.
    'tfidf_pipeline__tfidf_vectorizer__max_df': numpy.linspace(0.1, 1, 10),
    'tfidf_pipeline__tfidf_vectorizer__binary': [True],
}

parameters = {"scaler":[StandardScaler(), MinMaxScaler(),
 Normalizer(), MaxAbsScaler()],
 'selector__threshold': [0, 0.001, 0.01],
 'classifier__n_neighbors': [1, 3, 5, 7, 10],
 'classifier__p': [1, 2],
 'classifier__leaf_size': [1, 5, 10, 15]