In [4]:
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
from pathlib import Path

In [None]:
# Load the white-wine quality data (semicolon-separated CSV). The file's own
# header row (header=0) is replaced by the explicit column names below.
data_path = Path("./winequality-white.csv")
column_names = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
    "quality",
]
wine_df = pd.read_csv(data_path, sep=";", header=0, names=column_names)

# Summary statistics for every column.
print(wine_df.describe())

# Distinct quality grades that occur in the data.
qualities = wine_df["quality"].unique()
print(f"Number of unique 'qualities': {len(qualities)}")
print(f"Qualities: {sorted(qualities)}")

# Per-grade sample counts: entry i of `binc` is the number of wines with
# quality i (grades that never occur show up as 0).
binc = np.bincount(wine_df["quality"].to_numpy())
no_inst = len(wine_df)
class_fractions = np.round(binc / no_inst, 4) * 100  # in percent
print(f"\nClass counts: {binc}")
print(f"\nNumber of instances: {no_inst} ")
print(f"\nClass fractions: {class_fractions}")

In [18]:
# Sanity check of the loaded frame's dimensions as (rows, columns).
wine_df.shape # (4898, 12) on the recorded run

(4898, 12)

In [23]:
from numpy.random import default_rng
from sklearn.metrics import accuracy_score

# Number of shuffled rows held out for evaluation; everything else is used for
# training. Defined once so the four slices below cannot drift apart.
N_TEST = 898

# Work on an explicit copy: the shuffle below is in place, and without the copy
# `to_numpy()` may hand back a view of `wine_df` for single-dtype frames, which
# would silently reorder the DataFrame and make this cell non-idempotent.
data = wine_df.to_numpy(copy=True)

# Fixed seed -> identical row order (and therefore identical split) on every run.
rng = default_rng(42)
rng.shuffle(data)  # shuffles along axis 0 (rows), in place

# Last column is the label ("quality"); all preceding columns are features.
X_train = data[N_TEST:, :-1]
y_train = data[N_TEST:, -1]
X_test = data[:N_TEST, :-1]
y_test = data[:N_TEST, -1]

# The two partitions must cover every row exactly once.
assert len(X_train) + len(X_test) == len(data)
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)

X_train.shape, y_train.shape, X_test.shape, y_test.shape


((4000, 11), (4000,), (898, 11), (898,))

In [41]:
# Baseline: a single decision tree.
# NOTE: despite the `_reg` suffix this is a classifier; the name is kept so
# that any other cell referring to `tree_reg` keeps working.
# `random_state` pins the tree's internal randomness (features are randomly
# permuted when searching for the best split), so the accuracy printed below
# is reproducible from run to run.
tree_reg = DecisionTreeClassifier(max_depth=20, random_state=42)
tree_reg.fit(X_train, y_train)

y_pred = tree_reg.predict(X_test)

# Fraction of held-out samples whose predicted quality matches the true one.
acc = accuracy_score(y_test, y_pred)
print(f"acc: {acc}")

acc: 0.5924276169265034


In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Boosted ensemble of depth-10 decision trees.
# `random_state` seeds the boosting procedure and is handed down to each base
# tree, so the reported accuracy is reproducible from run to run.
# NOTE(review): algorithm="SAMME.R" is deprecated in newer scikit-learn
# releases (1.4+) and is expected to be rejected from 1.6 on; it is kept here
# because the recorded result was obtained with it — confirm against the
# installed version before upgrading.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=10),
    n_estimators=600,
    algorithm="SAMME.R",
    learning_rate=0.6,
    random_state=42,
)

ada_clf.fit(X_train, y_train)
y_pred = ada_clf.predict(X_test)

# Accuracy on the held-out test split.
print(accuracy_score(y_test, y_pred))

Result recorded from a previous run with the following configuration: <br>
ada_clf = AdaBoostClassifier( <br>
    DecisionTreeClassifier(max_depth=10), <br>
    n_estimators=600, <br>
    algorithm="SAMME.R", <br>
    learning_rate=0.6 <br>
) <br>
<br>
Running time: 25.7 s <br>
Accuracy on the test set: 0.7160 <br>