In [1]:
#install and import packages
!pip install --upgrade scikit-learn
!pip install pandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sklearn
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler, PowerTransformer, MaxAbsScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV

from sklearn import linear_model, ensemble
from sklearn.model_selection import cross_val_score
from sklearn import model_selection
import numpy

from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate

Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
Successfully installed scikit-learn-1.3.2


In [2]:
#read and display data
data = pd.read_csv("/content/winequality-red.csv", sep = ";")
#split into features/target
x = data.drop(columns = ['quality'])
y = data['quality']

In [None]:
#define hyperparameter grids for each type of classifier
kn_param_grid = GridSearchCV(KNeighborsClassifier(),
    param_grid = {
        'n_neighbors' : [1, 5, 10, 50, 100],
        'algorithm' :  ['ball_tree', 'kd_tree', 'brute'],
        'leaf_size' : [1, 5, 10, 50]
                }
                             )
ridge_param_grid = GridSearchCV(RidgeClassifier(),
    param_grid = {
        'tol' : [.0001, 0.001, 0.01, 0.1],
        'solver' : ["svd", "cholesky","sparse_cg", 'saga', 'lsqr'],
        'alpha' : [0.1, 0.2, 0.5, 1.0]
                }
                                )

dt_param_grid = GridSearchCV(DecisionTreeClassifier(),
    param_grid = {
        'max_depth' : [1,2,5, 10],
        'max_features' : [None, "auto", "sqrt", "log2"],
        'min_samples_split':[0.1, 0.2, 0.5, 1.0]
                }
                             )

bagging_param_grid = GridSearchCV(estimator = ensemble.BaggingClassifier(), param_grid = {
    "n_estimators" : [50, 100, 500],
    "max_features" : [0.1, 1.0, 5]
})

random_forest_param_grid = {"n_estimators" : [100, 500, 1000, 10000],
    "criterion" : ["gini", "entropy", "log_loss"],
    "max_depth" : [None, 2, 3, 5, 10]
}

#construct a pipeline with a scaler, encoder, feature selector, and estimator/classifier
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('onehot', OneHotEncoder()),
    ('selector', VarianceThreshold()),
    ('estimator', KNeighborsClassifier())
])

#train/test split
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(x, y, test_size=0.2, random_state=42)

#define a grid search over different estimators in the pipeline
grid = GridSearchCV(
    estimator=pipe,
    param_grid={
        "scaler": [StandardScaler(), MinMaxScaler(), Normalizer(), MaxAbsScaler(), "passthrough"],
        "onehot": [OneHotEncoder(), "passthrough"],
        "selector"  : [VarianceThreshold(), "passthrough"],
        'estimator': [ridge_param_grid, kn_param_grid, dt_param_grid, bagging_param_grid, random_forest_param_grid],
    },
    scoring = 'balanced_accuracy',
    cv = 3,
    return_train_score = True
)
#fit on training data
grid.fit(x_train, y_train)

#score over test
print('Training set score: ' + str(grid.score(x_train, y_train)))
print('Test set score: ' + str(grid.score(x_test, y_test)))
#10-fold cv over training set
cv_results = cross_validate(
        grid, x_train, y_train, cv=10, return_estimator=True, scoring = "balanced_accuracy"
    )
cv_results = pd.DataFrame(cv_results)
cv_test_scores = cv_results["test_score"]
#display results
print(
        "Generalization score with hyperparameters tuning:\n"
        f"{cv_test_scores.mean():.3f} ± {cv_test_scores.std():.3f}"
    )
#display best hyperparameter configuration
print("Best Score: ", grid.best_score_)
print("Best Params: ", grid.best_params_)

240 fits failed out of a total of 400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
80 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_ridge.py", line 1435, in fit
    super().fit(X, Y, sample_weight=sample_weight)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_ridge.py", line 823, in fit
    raise ValueError(
ValueErro

In [None]:
print(grid.param_grid["estimator"][1].best_params_)