In [None]:
#install and import packages
!pip install --upgrade scikit-learn
!pip install pandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sklearn
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler, PowerTransformer, MaxAbsScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV

from sklearn import linear_model, ensemble
from sklearn.model_selection import cross_val_score
from sklearn import model_selection
import numpy

from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC



In [None]:
#load the semicolon-separated CSV into a DataFrame
data = pd.read_csv("/content/winequality-red.csv", delimiter=";")
#target is the 'quality' column; every remaining column is a feature
y = data['quality']
x = data.drop('quality', axis=1)

In [None]:
#expand search space
#compare performance of different pipelines

In [None]:
!pip install scikit-optimize
import skopt
from skopt.space import Real, Categorical, Integer
from skopt import BayesSearchCV

In [None]:
#define hyperparameter grids for each type of classifier
#https://scikit-optimize.github.io/stable/modules/generated/skopt.space.space.Real.html
#https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html

#Bayesian search over SVC: regularisation strength, kernel type and kernel shape
svc_param_grid = BayesSearchCV(
    estimator=SVC(),
    search_spaces={
        'C': Real(low=1e-6, high=1e+6, prior='log-uniform'),
        'gamma': Real(low=1e-6, high=1e+1, prior='log-uniform'),
        'degree': Integer(low=1, high=8),
        'kernel': Categorical(categories=['linear', 'poly', 'rbf']),
    },
    scoring="balanced_accuracy",
    random_state=0,
    n_iter=32,
)
#Bayesian search over k-nearest-neighbours: neighbour count, lookup algorithm and leaf size
kn_param_grid = BayesSearchCV(
    estimator=KNeighborsClassifier(),
    search_spaces={
        'n_neighbors': Integer(low=1, high=100, prior='log-uniform'),
        'algorithm': Categorical(categories=['ball_tree', 'kd_tree', 'brute']),
        'leaf_size': Integer(low=1, high=50, prior='log-uniform'),
    },
    scoring="balanced_accuracy",
    random_state=0,
    n_iter=32,
)
#Bayesian search over the ridge classifier: solver choice, tolerance and regularisation strength
ridge_param_grid = BayesSearchCV(
    estimator=RidgeClassifier(),
    search_spaces={
        'tol': Real(low=0.01, high=0.1, prior='log-uniform'),
        'solver': Categorical(categories=["svd", "cholesky", "sparse_cg", 'saga', 'lsqr']),
        'alpha': Real(low=0.1, high=1.0, prior='log-uniform'),
    },
    scoring="balanced_accuracy",
    random_state=0,
    n_iter=32,
)

#Bayesian search over a decision tree: depth, features considered per split, and minimum split size
dt_param_grid = BayesSearchCV(
    DecisionTreeClassifier(),
    {
        'max_depth': Integer(1, 10, prior='log-uniform'),
        #"auto" is no longer accepted by DecisionTreeClassifier (removed in scikit-learn 1.3;
        #for classifiers it was an alias of "sqrt"), so it is not offered as a candidate
        'max_features': Categorical([None, "sqrt", "log2"]),
        #a float here is interpreted as a fraction of the training samples
        'min_samples_split': Real(0.1, 1.0, prior='log-uniform'),
    },
    n_iter=32,
    random_state=0,
    scoring="balanced_accuracy",
)

#Bayesian search over a bagging ensemble: number of base estimators and feature fraction per estimator
bagging_param_grid = BayesSearchCV(
    ensemble.BaggingClassifier(),
    {
        "n_estimators": Integer(50, 500, prior='log-uniform'),
        #a float max_features is a fraction of the features and must lie in (0, 1];
        #the previous upper bound of 5 produced values the estimator rejects
        "max_features": Real(0.1, 1.0, prior='log-uniform'),
    },
    n_iter=32,
    random_state=0,
    scoring="balanced_accuracy",
)

#Bayesian search over a random forest: forest size, split criterion and tree depth
random_forest_param_grid = BayesSearchCV(
    ensemble.RandomForestClassifier(),
    {
        #upper bound capped at 1000 trees: this search is itself refit many times inside the
        #outer grid search and cross-validation, so forests of up to 100000 trees were impractical
        "n_estimators": Integer(100, 1000, prior='log-uniform'),
        "criterion": Categorical(["gini", "entropy", "log_loss"]),
        "max_depth": Integer(1, 10, prior="log-uniform"),
    },
    n_iter=32,
    random_state=0,
    scoring="balanced_accuracy",
)
#construct a pipeline with a scaler, encoder, feature selector, and estimator/classifier
#the encoder uses handle_unknown="ignore": with the default ("error") transform() raises on any
#feature value that was not present in the data the encoder was fitted on, which breaks
#scoring on validation/test folds
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('onehot', OneHotEncoder(handle_unknown="ignore")),
    ('selector', VarianceThreshold()),
    ('estimator', KNeighborsClassifier())
])

#train/test split (80/20, fixed seed for reproducibility)
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(x, y, test_size=0.2, random_state=42)

#define a grid search over different estimators in the pipeline
#each step can be swapped for an alternative or disabled with "passthrough";
#each 'estimator' candidate is itself a BayesSearchCV, so its hyperparameters are tuned per pipeline variant
#NOTE(review): svc_param_grid is not among the estimator candidates - confirm whether that is intentional
grid = GridSearchCV(
    estimator=pipe,
    param_grid={
        "scaler": [StandardScaler(), MinMaxScaler(), Normalizer(), MaxAbsScaler(), "passthrough"],
        #the candidate encoder must also ignore unseen values, since it replaces the pipeline default
        "onehot": [OneHotEncoder(handle_unknown="ignore"), "passthrough"],
        "selector"  : [VarianceThreshold(), "passthrough"],
        'estimator': [ridge_param_grid, kn_param_grid, dt_param_grid, bagging_param_grid, random_forest_param_grid],
    },
    scoring = 'balanced_accuracy',
    cv = 3,
    return_train_score = True
)
#run the full search on the training split
grid.fit(x_train, y_train)

#report the search's score on the training split and on the held-out test split
print(f"Training set score: {grid.score(x_train, y_train)!s}")
print(f"Test set score: {grid.score(x_test, y_test)!s}")
#cross-validate the whole search over 10 folds of the training split and tabulate the per-fold results
cv_results = pd.DataFrame(
    cross_validate(
        grid,
        x_train,
        y_train,
        scoring="balanced_accuracy",
        cv=10,
        return_estimator=True,
    )
)
cv_test_scores = cv_results["test_score"]
#summarise the per-fold test scores as mean ± standard deviation
print(
    "Generalization score with hyperparameters tuning:\n"
    + "{:.3f} ± {:.3f}".format(cv_test_scores.mean(), cv_test_scores.std())
)
#show the best score and parameter combination recorded on the grid search
print("Best Score: ", grid.best_score_)
print("Best Params: ", grid.best_params_)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 982, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 253, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 345, in _score
    y_pred = method_caller(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 87, in _cached_call
    result, _ = _get_response_values(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_response.py", line 210, in _get_response_values
    y_pred = prediction_method(X)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 602, in predict
    Xt = transform.transform(Xt)
  File "/u

In [None]:
#GridSearchCV clones each candidate before fitting, so the BayesSearchCV objects stored in
#grid.param_grid stay unfitted and have no best_params_. The fitted inner search is the
#'estimator' step of the refit best pipeline, so this prints the tuned hyperparameters of
#the winning estimator (per-candidate fitted searches are not retained by GridSearchCV).
print(grid.best_estimator_.named_steps["estimator"].best_params_)