In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
data_path = Path("./winequality-white.csv")
field_names = ["fixed acidity", "volatile acidity", "citric acid", "residual sugar", "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density", 
"pH", "sulphates", "alcohol", "quality"]

wine_df = pd.read_csv(data_path, header=0, names=field_names, sep=";")

# data overview:
#print(wine_df.describe())

# which quality classes do we have? :
qualities = wine_df["quality"].unique()
print(f"Number of unique 'qualities': {len(qualities)}")
print(f"Qualities: {sorted(qualities)}")
binc = np.bincount([q for q in wine_df["quality"]])
no_inst = len(wine_df)
print(f"\nClass counts: {binc[-len(qualities):]}")
print(f"\nNumber of instances: {no_inst} ")
print(f"\nClass fractions: {np.round(binc/no_inst,4) * 100}")

Number of unique 'qualities': 7
Qualities: [3, 4, 5, 6, 7, 8, 9]

Class counts: [  20  163 1457 2198  880  175    5]

Number of instances: 4898 

Class fractions: [ 0.    0.    0.    0.41  3.33 29.75 44.88 17.97  3.57  0.1 ]


In [3]:
wine_df.shape # (4898,12)

(4898, 12)

In [4]:
from numpy.random import default_rng
from sklearn.metrics import accuracy_score

data = wine_df.to_numpy()

rng = default_rng(42)
rng.shuffle(data)

X_train = data[898:, :-1]
y_train = data[898:, -1]
X_test = data[:898, :-1]
y_test = data[:898, -1]

X_train.shape, y_train.shape, X_test.shape, y_test.shape


((4000, 11), (4000,), (898, 11), (898,))

In [5]:
from sklearn.tree import DecisionTreeClassifier

tree_reg = DecisionTreeClassifier(max_depth=20)
tree_reg.fit(X_train, y_train)

y_pred = tree_reg.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print(f"acc: {acc}")

acc: 0.5991091314031181


In [6]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=25, min_samples_split=40, min_samples_leaf=10),
    n_estimators=400, 
    algorithm="SAMME.R",
    learning_rate=1.0
)

ada_clf.fit(X_train, y_train)
print(f"train acc: {ada_clf.score(X_train, y_train):.2f}")
print(f"test acc: {ada_clf.score(X_test, y_test):.2f}")

train acc: 0.99
test acc: 0.69


AdaBoostClassifier( <br>
    DecisionTreeClassifier(max_depth=10), <br>
    n_estimators=600, <br>
    algorithm="SAMME.R", <br>
    learning_rate=0.6 <br>
) <br>
<br>
running-time: 25.7s <br>
accuracy: 0.7160 <br>

In [7]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators = 500,
    max_samples =2800,
    bootstrap = True,
    n_jobs = -1,
    oob_score = True,
    bootstrap_features=True,
    max_features = 0.8
)

bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
print(f"pred accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(f"bag_clf.oob_score_ : {bag_clf.oob_score_:.2f}")

pred accuracy: 0.70
bag_clf.oob_score_ : 0.68


BaggingClassifier( <br>
    DecisionTreeClassifier(), <br>
    n_estimators = 500, <br>
    max_samples =2800, <br>
    bootstrap = True, <br>
    n_jobs = -1, <br>
    oob_score = True, <br>
    bootstrap_features=True, <br>
    max_features = 0.8 <br>
) <br>
 <br>
pred accuracy: 0.70 <br>
bag_clf.oob_score_ : 0.68 <br>
running time: 6.7s <br>

In [8]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_samples=2700,
)

rnd_clf.fit(X_train, y_train)
y_pred=rnd_clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"pred acc: {acc:.2f}")

pred acc: 0.70


RandomForestClassifier( <br>
    n_estimators=500, <br>
    max_samples=2700, <br>
) <br>
 <br>
pred acc: 0.71 <br>
running time: 5.2s <br>

In [9]:
# take a look at the importance of the features:

feature_names = field_names[:-1]
for name, scores in zip(feature_names, rnd_clf.feature_importances_):
    print(f"{name} : {scores:.2f}")

fixed acidity : 0.08
volatile acidity : 0.10
citric acid : 0.08
residual sugar : 0.09
chlorides : 0.09
free sulfur dioxide : 0.09
total sulfur dioxide : 0.09
density : 0.10
pH : 0.09
sulphates : 0.08
alcohol : 0.11


In [10]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

gb_clf = GradientBoostingClassifier(
    max_depth=10,
    n_estimators=120
)

gb_clf.fit(X_train, y_train)
print(f"train scores: {accuracy_score(gb_clf.predict(X_train), y_train):.2f}")
print(f"test scores: {accuracy_score(gb_clf.predict(X_test), y_test):.2f}")



train scores: 1.00
test scores: 0.69


In [13]:
from sklearn.metrics import mean_squared_error # mse for CLASSIFICATION ?????
import copy

gbc = {}
best_gbc = {}
gbc = GradientBoostingClassifier(
    max_depth=20,
    subsample=0.5, 
    min_samples_split=50,
    min_samples_leaf=10,
    learning_rate=0.05,
    warm_start=True, 
    verbose=1,
)

val_errors = []
min_val_error=float("inf")
for n_estimators in range(100,300):
    gbc.n_estimators=n_estimators
    gbc.fit(X_train, y_train)
    y_pred=gbc.predict(X_test)
    val_error = accuracy_score(y_pred, y_test)
    val_errors.append(val_error)
    if(val_error < min_val_error):
        min_val_error = val_error
        best_gbc=copy.deepcopy(gbc)

print(f"train acc: {accuracy_score(y_pred=best_gbc.predict(X_train), y_true=y_train):.2f}")
print(f"test acc: {accuracy_score(y_pred=best_gbc.predict(X_test), y_true=y_test):.2f}")
print(best_gbc)
#print(val_errors)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.2199           0.0246           12.96s
         2           1.1771           0.0286           16.06s
         3           1.1500           0.0253           15.64s
         4           1.1066           0.0219           15.69s
         5           1.0817           0.0203           15.17s
         6           1.0424           0.0171           15.21s
         7           1.0236           0.0137           15.11s
         8           0.9905           0.0160           15.06s
         9           0.9788           0.0141           14.79s
        10           0.9526           0.0124           14.72s
        20           0.7704           0.0042           13.69s
        30           0.6653           0.0012           12.12s
        40           0.5489          -0.0003           10.60s
        50           0.4739          -0.0004            8.81s
        60           0.4108          -0.0009            7.04s
       

train acc: 0.9975 <br>
test acc: 0.7048997772828508 <br>
GradientBoostingClassifier(max_depth=20, n_estimators=63, subsample=0.5,
                           warm_start=True)

train acc: 1.00 <br>
test acc: 0.65 <br>
GradientBoostingClassifier(max_depth=20, min_samples_leaf=10, <br>
                           min_samples_split=500, n_estimators=995, <br>
                           subsample=0.5, warm_start=True) <br>