In [2]:
#install and import packages
!pip install --upgrade scikit-learn
!pip install pandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sklearn

Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
Successfully installed scikit-learn-1.3.2


In [3]:
#print python version for report
!python ‐‐version
import sys; print(sys.version)

python3: can't open file '/content/‐‐version': [Errno 2] No such file or directory
3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]


In [5]:
#load the semicolon-delimited wine-quality table into a DataFrame and display it
data = pd.read_csv("/content/winequality-red.csv", delimiter=";")
data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [6]:
#display information about dataset for report
#a bare expression is only rendered when it is the last line of a cell, so print the column names explicitly
print(list(data.columns))
#number of rows per quality grade (shows the class imbalance)
print(data.groupby(['quality']).count())

         fixed acidity  volatile acidity  citric acid  residual sugar  \
quality                                                                 
3                   10                10           10              10   
4                   53                53           53              53   
5                  681               681          681             681   
6                  638               638          638             638   
7                  199               199          199             199   
8                   18                18           18              18   

         chlorides  free sulfur dioxide  total sulfur dioxide  density   pH  \
quality                                                                       
3               10                   10                    10       10   10   
4               53                   53                    53       53   53   
5              681                  681                   681      681  681   
6              638  

In [8]:
#split into features/target
y = data['quality']
#normalization (sklearn.preprocessing.normalize(x, axis=0)) was attempted at one point
#but yielded lower performance, so the raw feature columns are used
x = data.drop('quality', axis=1)

In [None]:
#function to report relevant metrics
def return_metrics(y_test, y_pred):
    """Print classification metrics for a set of predictions.

    Prints, in this order: the accuracy (two decimals, labelled), the
    balanced accuracy, the classification report and the confusion matrix,
    all computed with sklearn.metrics from y_test and y_pred.

    Args:
        y_test: true labels.
        y_pred: predicted labels.

    Returns:
        None. Despite the name, the metrics are printed, not returned.
    """
    from sklearn import metrics as skm

    accuracy = skm.accuracy_score(y_test, y_pred)
    print("Accuracy: %.2f" % accuracy)
    #the remaining metrics are printed unlabelled, one after the other
    for compute in (skm.balanced_accuracy_score, skm.classification_report, skm.confusion_matrix):
        print(compute(y_test, y_pred))

In [None]:
#suppress warnings about class imbalances
#NOTE(review): no category or message is given, so this filter ignores every warning, not only the class-imbalance ones
import warnings
warnings.filterwarnings("ignore")

#import models, packages
from sklearn import linear_model, ensemble
from sklearn.model_selection import cross_val_score
from sklearn import model_selection
import numpy

def nested_resampling(m, x, y):
    """Estimate the balanced accuracy of model m with repeated hold-out splits.

    Three random 80/20 train/test splits are drawn, each with its own seed
    (42, 43, 44), so the three outer evaluations are different but
    reproducible. On each training part a 10-fold cross-validation of m is
    run; the fold estimator with the highest validation balanced accuracy is
    then scored on the held-out test part.

    Note that the hyperparameters of m are fixed: the inner cross-validation
    only chooses among the ten fitted copies of m, it does not tune anything.

    Args:
        m: unfitted scikit-learn classifier.
        x: feature data accepted by train_test_split / cross_validate.
        y: target labels aligned with x.

    Returns:
        Mean balanced accuracy over the three held-out test parts.
    """
    #import the submodules explicitly instead of relying on them being loaded as a side effect
    from sklearn import metrics, model_selection

    model_score = []
    for i in range(0, 3): #3 outer hold-out repetitions
        #vary the seed per repetition; a constant seed would reproduce the same split three times
        x_train, x_test, y_train, y_test = model_selection.train_test_split(
            x, y, test_size=0.2, random_state=42 + i
        )
        #10-fold inner cross-validation, keeping the estimator fitted on each fold
        scores = model_selection.cross_validate(
            m, x_train, y_train, cv=10, scoring="balanced_accuracy", return_estimator=True
        )
        #index of the fold with the highest validation score (NaN scores from failed fits are skipped)
        best = int(numpy.nanargmax(scores["test_score"]))
        #obtain model with highest performance
        best_model = scores["estimator"][best]
        #test best performing model on outer test set
        y_pred = best_model.predict(x_test)
        #add to list of best model scores
        model_score.append(metrics.balanced_accuracy_score(y_test, y_pred))
    #return average of highest performing models
    return numpy.array(model_score).mean()

def ridge_hpo(x, y):
    """Grid-search RidgeClassifier settings and print the best one.

    Every combination of alpha, tol, solver and max_iter is scored with
    nested_resampling(model, x, y). The first combination reaching the
    highest score (strictly greater than 0) is printed with its score.

    Args:
        x: feature data forwarded unchanged to nested_resampling.
        y: target data forwarded unchanged to nested_resampling.

    Returns:
        None. The result is only printed.
    """
    #hyperparameter values to explore
    alphas = [1.0, 1.1, 2.0, 5.0]
    tolerances = [.0001, 0.001, 0.01, 0.1]
    solvers = ["svd", "cholesky", "lsqr", "sparse_cg"]
    iteration_caps = [100, 200, 500, 1000, 10000, None]
    #full cartesian product, alpha varying slowest and max_iter fastest
    candidates = [
        [alpha, tol, solver, max_iter]
        for alpha in alphas
        for tol in tolerances
        for solver in solvers
        for max_iter in iteration_caps
    ]

    top_score = 0
    top_config = []
    for candidate in candidates:
        alpha, tol, solver, max_iter = candidate
        classifier = linear_model.RidgeClassifier(alpha=alpha, tol=tol, solver=solver, max_iter=max_iter)
        #score this configuration
        candidate_score = nested_resampling(classifier, x, y)
        if candidate_score > top_score:
            #strictly better than everything seen so far
            top_score, top_config = candidate_score, candidate
    #print best configuration and balanced accuracy
    print("alpha, tol, solver, max_iter", top_config, "mean score", top_score)

#same as above for bagging classifier
def bagging_hpo(x, y):
    """Grid-search BaggingClassifier settings and print the best one.

    Every combination of n_estimators, max_samples and max_features is scored
    with nested_resampling(model, x, y). The first combination reaching the
    highest score (strictly greater than 0.0) is printed with its score.

    Args:
        x: feature data forwarded unchanged to nested_resampling.
        y: target data forwarded unchanged to nested_resampling.

    Returns:
        None. The result is only printed.
    """
    estimator_counts = [100, 500, 1000, 10000]
    sample_options = [0.1, 1.0, 2, 5]
    feature_options = [0.1, 1.0, 2, 5]

    top_score = 0.0
    top_config = []
    #n_estimators varies slowest, max_features fastest
    for n_estimators in estimator_counts:
        for max_samples in sample_options:
            for max_features in feature_options:
                settings = [n_estimators, max_samples, max_features]
                classifier = ensemble.BaggingClassifier(
                    n_estimators=n_estimators,
                    max_samples=max_samples,
                    max_features=max_features,
                )
                candidate_score = nested_resampling(classifier, x, y)
                if not candidate_score > top_score:
                    continue
                #strictly better than everything seen so far
                top_score = candidate_score
                top_config = settings
    print("n_estimators, max_samples, max_features", top_config, "mean score", top_score)

#same as above for random forest
def random_forest_hpo(x, y):
    """Grid-search RandomForestClassifier settings and print the best one.

    Every combination of n_estimators, criterion and max_depth is scored with
    nested_resampling(model, x, y). The first combination reaching the
    highest score (strictly greater than 0.0) is printed with its score.

    Args:
        x: feature data forwarded unchanged to nested_resampling.
        y: target data forwarded unchanged to nested_resampling.

    Returns:
        None. The result is only printed.
    """
    best_rf = 0.0
    best_config = []
    for n_estimators in [100, 500, 1000, 10000]:
        for criterion in ["gini", "entropy", "log_loss"]:
            for max_depth in [None, 2, 3, 5, 10]:
                m = ensemble.RandomForestClassifier(n_estimators = n_estimators, criterion = criterion, max_depth = max_depth)
                mean_balanced_accuracy = nested_resampling(m, x, y)
                #the comparison must run for every max_depth, not only the last one of each criterion
                if mean_balanced_accuracy > best_rf:
                    best_rf = mean_balanced_accuracy
                    #record the hyperparameters that are actually searched in this function
                    best_config = [n_estimators, criterion, max_depth]
    print("n_estimators, criterion, max_depth", best_config, "mean score", best_rf)

In [1]:
#perform HPO for Ridge classifier
#needs x, y and ridge_hpo from the earlier cells to exist in the running kernel
ridge_hpo(x, y)

NameError: ignored

In [None]:
#perform HPO for bagging classifier
#NOTE(review): the grid in bagging_hpo goes up to 10000 estimators, so this call is presumably slow — confirm the runtime before a full re-run
bagging_hpo(x, y)

KeyboardInterrupt: ignored

In [None]:
#perform HPO for random forest classifier
#needs x, y and random_forest_hpo from the earlier cells to exist in the running kernel
random_forest_hpo(x, y)

In [9]:
#import grid search and cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate

#hyperparameter grids for the three classifiers (same values as the manual search loops)
ridge_param_grid = dict(
    alpha=[1.0, 1.1, 2.0, 5.0],
    tol=[.0001, 0.001, 0.01, 0.1],
    solver=["svd", "cholesky", "lsqr", "sparse_cg"],
    max_iter=[100, 200, 500, 1000, 10000, None],
)

bagging_param_grid = dict(
    n_estimators=[100, 500, 1000, 10000],
    max_samples=[0.1, 1.0, 2, 5],
    max_features=[0.1, 1.0, 2, 5],
)

random_forest_param_grid = dict(
    n_estimators=[100, 500, 1000, 10000],
    criterion=["gini", "entropy", "log_loss"],
    max_depth=[None, 2, 3, 5, 10],
)

#another function to perform nested resampling using built-in libraries
def nested_resampling_2(model, param_grid, x, y, scoring="balanced_accuracy"):
    """Run nested cross-validation with GridSearchCV and print the outer score.

    A 3-fold GridSearchCV over param_grid (inner loop) is wrapped in a
    10-fold cross_validate (outer loop). The mean and standard deviation of
    the ten outer test scores are printed.

    Args:
        model: unfitted scikit-learn estimator to tune.
        param_grid: hyperparameter grid passed to GridSearchCV.
        x: feature data.
        y: target labels aligned with x.
        scoring: metric used both for selecting hyperparameters and for the
            outer evaluation. Defaults to "balanced_accuracy" so the result is
            comparable with the manual nested_resampling search.

    Returns:
        None. The result is only printed.
    """
    #inner loop: 3-fold grid search selects the hyperparameters on each outer training split
    model_grid_search = GridSearchCV(model, param_grid=param_grid, n_jobs=2, cv=3, scoring=scoring)
    #outer loop: 10-fold cross-validation of the whole search; cross_validate fits the search
    #itself on every outer training split, so a separate fit on the full data is not needed
    cv_results = cross_validate(model_grid_search, x, y, cv=10, scoring=scoring)

    cv_test_scores = pd.Series(cv_results["test_score"])
    #display results
    print(
        "Generalization score with hyperparameters tuning:\n"
        f"{cv_test_scores.mean():.3f} ± {cv_test_scores.std():.3f}"
    )

In [None]:
#import models, packages
from sklearn import linear_model, ensemble
from sklearn.model_selection import cross_val_score
from sklearn import model_selection
import numpy

#models to compare, with a display name and a hyperparameter grid for each
models = [
    linear_model.RidgeClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.RandomForestClassifier(),
]
model_names = ["Ridge Classifier", "Bagging Classifier", "RandomForest Classifier"]
param_grids = [ridge_param_grid, bagging_param_grid, random_forest_param_grid]
#run the library-based nested resampling once per model, announcing the model first
for name, model, grid in zip(model_names, models, param_grids):
    print(name)
    nested_resampling_2(model, grid, x, y)


Ridge Classifier
