In [30]:
import pandas as pd
import numpy as np

import optuna

In [31]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score

In [32]:
main_df = pd.read_csv("./data/winequality-white.csv", sep=";")
main_df.head(3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6


In [33]:
main_df.describe() # high level overview of the dataset

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0


In [34]:
main_df.isna() # checking if any column has null values

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,False,False,False,False,False,False,False,False,False,False,False,False
4894,False,False,False,False,False,False,False,False,False,False,False,False
4895,False,False,False,False,False,False,False,False,False,False,False,False
4896,False,False,False,False,False,False,False,False,False,False,False,False


In [35]:
main_df["quality"] -= main_df["quality"].min()
main_df["quality"].value_counts()

3    2198
2    1457
4     880
5     175
1     163
0      20
6       5
Name: quality, dtype: int64

In [36]:
main_df["quality"] = pd.Categorical(main_df.quality)
main_df.dtypes

fixed acidity            float64
volatile acidity         float64
citric acid              float64
residual sugar           float64
chlorides                float64
free sulfur dioxide      float64
total sulfur dioxide     float64
density                  float64
pH                       float64
sulphates                float64
alcohol                  float64
quality                 category
dtype: object

In [37]:
from sklearn.model_selection import train_test_split

X = main_df.drop(columns=["quality"], inplace=False)
y = main_df["quality"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

In [38]:
normalizer = StandardScaler()

I will be using the guide on https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html for selecting the algorithms.

In [39]:
from sklearn.svm import LinearSVC

svm = make_pipeline(normalizer, LinearSVC(dual=False))
svm.fit(X_train, y_train)

svm.score(X_test, y_test)

0.5408163265306123

In [40]:
from sklearn.neighbors import KNeighborsClassifier

knn = make_pipeline(normalizer, KNeighborsClassifier(n_neighbors=7, weights="distance"))
knn.fit(X_train, y_train)

knn.score(X_test, y_test)

0.6653061224489796

In [41]:
from sklearn.ensemble import RandomForestClassifier

random_forest_classifier = make_pipeline(normalizer, RandomForestClassifier())
random_forest_classifier.fit(X_train, y_train)

random_forest_classifier.score(X_test, y_test)

0.686734693877551

In [53]:
def sklearn_models(trial):
    classifier_name = trial.suggest_categorical("classifier", ["SupportVectorMachines", "KNearestNeighbors", "RandomForest"])

    if classifier_name == "SupportVectorMachines":
        params = {
            "dual": False,
            "tol": trial.suggest_float("tol", 1e-5, 1e-3, log=True),
            "random_state": 42
        }
        classifier = LinearSVC()
        classifier.set_params(**params)
    elif classifier_name == "KNearestNeighbors":
        params = {
            "n_neighbors": trial.suggest_int("n_neigbors", 3, 15, 2),
            "weights": trial.suggest_categorical("weights", ["uniform", "distance"]),
            "algorithm": trial.suggest_categorical("algorithm", ["ball_tree", "kd_tree", "brute"])            
        }
        classifier = KNeighborsClassifier()
        classifier.set_params(**params)
    else: # RandomForest
        params = {
            "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
            "max_depth": trial.suggest_loguniform("max_depth", 3, 99),
            "random_state": 42
        }
        classifier = RandomForestClassifier()
        classifier.set_params(**params)

    classifier_pipeline = make_pipeline(StandardScaler(), classifier)
    classifier_pipeline.fit(X_train, y_train)
    return classifier_pipeline.score(X_test, y_test)


study = optuna.create_study(direction="maximize")
study.optimize(sklearn_models, n_trials=30, timeout=45)
print(study.best_trial)

[32m[I 2021-11-19 12:07:31,092][0m A new study created in memory with name: no-name-764a2cd5-646e-4e5b-bff5-b05b1b453a6f[0m
[32m[I 2021-11-19 12:07:32,266][0m Trial 0 finished with value: 0.6857142857142857 and parameters: {'classifier': 'RandomForest', 'criterion': 'entropy', 'max_depth': 14.567457964101791}. Best is trial 0 with value: 0.6857142857142857.[0m
[32m[I 2021-11-19 12:07:32,316][0m Trial 1 finished with value: 0.5408163265306123 and parameters: {'classifier': 'SupportVectorMachines', 'tol': 4.9461772008605155e-05}. Best is trial 0 with value: 0.6857142857142857.[0m
[32m[I 2021-11-19 12:07:32,850][0m Trial 2 finished with value: 0.5724489795918367 and parameters: {'classifier': 'RandomForest', 'criterion': 'gini', 'max_depth': 6.996538628150811}. Best is trial 0 with value: 0.6857142857142857.[0m
[32m[I 2021-11-19 12:07:32,975][0m Trial 3 finished with value: 0.5489795918367347 and parameters: {'classifier': 'KNearestNeighbors', 'n_neigbors': 3, 'weights': 'un

FrozenTrial(number=24, values=[0.6948979591836735], datetime_start=datetime.datetime(2021, 11, 19, 12, 7, 47, 586318), datetime_complete=datetime.datetime(2021, 11, 19, 12, 7, 48, 460870), params={'classifier': 'RandomForest', 'criterion': 'gini', 'max_depth': 20.905451204789117}, distributions={'classifier': CategoricalDistribution(choices=('SupportVectorMachines', 'KNearestNeighbors', 'RandomForest')), 'criterion': CategoricalDistribution(choices=('gini', 'entropy')), 'max_depth': LogUniformDistribution(high=99.0, low=3.0)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=24, state=TrialState.COMPLETE, value=None)


In [29]:
import xgboost as xgb

def XGB_Model(trial):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_test, label=y_test)

    param = {
        "verbosity": 0,
        "objective": "multi:softmax",
        "num_class": 7,
        "eval_metric": "mlogloss",
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-10, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-10, 1.0, log=True),
    }

    if param["booster"] in ["gbtree", "dart", "gblinear"]:
        param["max_depth"] = trial.suggest_int("max_depth", 3, 11, step=1)
        param["eta"] = trial.suggest_float("eta", 0.1, 1.0, log=True)
        param["gamma"] = trial.suggest_float("gamma", 0.1, 1.0, log=True)

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-10, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-10, 1.0, log=True)

    bst = xgb.train(param, dtrain)
    preds = bst.predict(dvalid)
    pred_labels = np.rint(preds)
    accuracy = accuracy_score(y_test, pred_labels)
    return accuracy

study = optuna.create_study(direction="maximize")
study.optimize(XGB_Model, n_trials=20, timeout=200)

print("Trials: {}".format(len(study.trials)))
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2021-11-19 11:34:28,135][0m A new study created in memory with name: no-name-056bab9c-904c-4b11-8eb4-f55977d09692[0m
[32m[I 2021-11-19 11:34:28,683][0m Trial 0 finished with value: 0.6683673469387755 and parameters: {'booster': 'gbtree', 'lambda': 0.4030868297378068, 'alpha': 0.00011981769393911374, 'max_depth': 9, 'eta': 0.40516758559358423, 'gamma': 0.2887802273364587}. Best is trial 0 with value: 0.6683673469387755.[0m
[32m[I 2021-11-19 11:34:29,006][0m Trial 1 finished with value: 0.6816326530612244 and parameters: {'booster': 'gbtree', 'lambda': 0.00017566829295131156, 'alpha': 0.24441369519725625, 'max_depth': 10, 'eta': 0.3531366842915677, 'gamma': 0.3571857702532052}. Best is trial 1 with value: 0.6816326530612244.[0m
[32m[I 2021-11-19 11:34:29,253][0m Trial 2 finished with value: 0.6295918367346939 and parameters: {'booster': 'dart', 'lambda': 0.028131255352962528, 'alpha': 5.463500898492538e-05, 'max_depth': 6, 'eta': 0.5767593995010245, 'gamma': 0.462498689

Trials: 20
  Value: 0.6877551020408164
  Params: 
    booster: dart
    lambda: 1.866912618242207e-06
    alpha: 8.770102168196541e-07
    max_depth: 11
    eta: 0.4026011783482789
    gamma: 0.1032257935325859
    sample_type: uniform
    normalize_type: forest
    rate_drop: 0.24890723275162133
    skip_drop: 1.1711605934884348e-10
