In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

## Decision-Tree Classifiers and their Ensembles for Classification of Wine-Quality ##
Wine data-set downloaded from [csv-file](http://mng.bz/90Ol)

#### Load Data and Get an Overview: ####

In [2]:
data_path = Path("./winequality-white.csv")
field_names = ["fixed acidity", "volatile acidity", "citric acid", "residual sugar", "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density", 
"pH", "sulphates", "alcohol", "quality"]

wine_df = pd.read_csv(data_path, header=0, names=field_names, sep=";")

# data overview:
#print(wine_df.describe())

# which quality classes do we have? :
print(f"wine_df.shape : {wine_df.shape}")
qualities = wine_df["quality"].unique()
print(f"Number of unique 'qualities': {len(qualities)}")
print(f"Qualities: {sorted(qualities)}")
binc = np.bincount([q for q in wine_df["quality"]])
no_inst = len(wine_df)
print(f"\nClass counts: {binc[-len(qualities):]}")
print(f"\nNumber of instances: {no_inst} ")
print(f"\nClass fractions: {np.round(binc/no_inst,4) * 100}")

wine_df.shape : (4898, 12)
Number of unique 'qualities': 7
Qualities: [3, 4, 5, 6, 7, 8, 9]

Class counts: [  20  163 1457 2198  880  175    5]

Number of instances: 4898 

Class fractions: [ 0.    0.    0.    0.41  3.33 29.75 44.88 17.97  3.57  0.1 ]


#### Split Data into Train- and Test-Set - Save those to Disk: ####

In [5]:
"""
#IF YOU EXECUTE THIS CODE THE MODELS OF THE STACK HAVE TO BE RETRAINED !!!

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

X = wine_df.iloc[:,:-1]
y = wine_df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)
print(X_train.shape)
X_train = X_train.to_numpy()
print(X_train.shape)
X_test = X_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

# write to disk for later use:
#Z = pd.DataFrame(np.append(X_train, np.expand_dims(y_train, axis=1 ), axis=1))
#Z.to_csv("./train.csv", index=False)
Z = np.append(X_train, np.expand_dims(y_train, axis=1 ), axis=1)
print(Z.shape)
np.savetxt("./train.csv", Z, delimiter=",")
#Z = pd.DataFrame(np.append(X_test, np.expand_dims(y_test, axis=1), axis=1))
#Z.to_csv("./test.csv", index=False)
Z = np.append(X_test, np.expand_dims(y_test, axis=1), axis=1)
print(Z.shape)
np.savetxt("./test.csv", Z, delimiter=",")
"""

(3918, 11)
(3918, 11)
(3918, 12)
(980, 12)


#### Load the Train- / Test-Data from Disk (without Pandas version): ####

In [6]:
# load data from the separate csv files - for stack- and blender training

X_train = np.loadtxt("./train.csv", delimiter=",")
y_train = X_train[:,-1]
print(f"y_train.shape : {y_train.shape}")
X_train = X_train[:,:-1]
print(f"X_train.shape : {X_train.shape}")
X_test = np.loadtxt("./test.csv", delimiter=",")
y_test = X_test[:,-1]
X_test = X_test[:,:-1]
print(f"X_test.shape : {X_test.shape}")


y_train.shape : (3918,)
X_train.shape : (3918, 11)
X_test.shape : (980, 11)


In [9]:
type(X_train), type(y_train)

(numpy.ndarray, numpy.ndarray)

#### Train a Simple Decision-Tree Classifier - Check it's Accuracy: ####

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

tree_clf = DecisionTreeClassifier(max_depth=20)
tree_clf.fit(X_train, y_train)

y_pred = tree_clf.predict(X_test)
print(y_pred[0])
acc = accuracy_score(y_test, y_pred)
print(f"acc: {acc}")

7.0
acc: 0.6163265306122448


### Save the Decision-Tree-Classifier for Use in Stacked Voting: ###

In [12]:
# save the Decision-Tree-Classifier for use in Stacked Voting:

import pickle

# save the model to disk
filename = 'DecisionTree_061_model.dct'
pickle.dump(tree_clf, open(filename, 'wb'))
 


##### Check if the Model is Reloadable:

In [13]:
# Check if the model is reloadable:

# load the model from disk
tree_clf_reloaded = pickle.load(open(filename, 'rb'))

# apply the reloaded model for inference:
result = tree_clf_reloaded.score(X_test, y_test)
print(f"{result:.2f}")

0.62


In [28]:
# check the accuracy by hand: 

y_pred = tree_clf_reloaded.predict(X_test)
y_pred_prob = tree_clf_reloaded.predict_proba(X_test)
y_pred_prob = np.argmax(y_pred_prob, axis=1) + 3 # <--- The classes 0,1,2 do not appear in the data! we have to add this as a "bias"
(y_pred_prob == y_test).sum()/len(y_test), (y_pred_prob == y_pred).sum()/len(y_test)

(0.6163265306122448, 1.0)

### Train an AdaBoost Classifier with a Decision-Tree-Classfier: ###

In [30]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=10),
    n_estimators=600, 
    algorithm="SAMME.R",
    learning_rate=0.6
)

ada_clf.fit(X_train, y_train)
print(f"train acc: {ada_clf.score(X_train, y_train):.2f}")
print(f"test acc: {ada_clf.score(X_test, y_test):.2f}")

train acc: 1.00
test acc: 0.72


#### Save the AdaBoost Classifier and Check if it is Reloadable: ####

In [31]:
# save the AdaBoostClassifier:

import pickle

# save the model to disk
filename = 'AdaBoost_071_model.dct'
pickle.dump(ada_clf, open(filename, 'wb'))
 
# load the model from disk
ada_clf_reloaded = pickle.load(open(filename, 'rb'))

# apply the reloaded model for inference:
result = ada_clf_reloaded.score(X_test, y_test)
print(f"{result:.2f}")

0.72


AdaBoostClassifier( <br>
    DecisionTreeClassifier(max_depth=10), <br>
    n_estimators=600, <br>
    algorithm="SAMME.R", <br>
    learning_rate=0.6 <br>
) <br>
<br>
running-time: 25.7s <br>
accuracy: 0.7160 <br>

### Train a Bagging-Classifier with a Decision-Tree Classifier: ###
Compare Accuracy and Out-Of-Bag-Score.

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators = 500,
    max_samples =2800,
    bootstrap = True,
    n_jobs = -1,
    oob_score = True,
    bootstrap_features=True,
    max_features = 0.8
)

bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
print(f"pred accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(f"bag_clf.oob_score_ : {bag_clf.oob_score_:.2f}")

BaggingClassifier( <br>
    DecisionTreeClassifier(), <br>
    n_estimators = 500, <br>
    max_samples =2800, <br>
    bootstrap = True, <br>
    n_jobs = -1, <br>
    oob_score = True, <br>
    bootstrap_features=True, <br>
    max_features = 0.8 <br>
) <br>
 <br>
pred accuracy: 0.70 <br>
bag_clf.oob_score_ : 0.68 <br>
running time: 6.7s <br>

### Train a Random-Forrest Classifier: ###

In [None]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_samples=2700,
)

rnd_clf.fit(X_train, y_train)
y_pred=rnd_clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"pred acc: {acc:.2f}")

RandomForestClassifier( <br>
    n_estimators=500, <br>
    max_samples=2700, <br>
) <br>
 <br>
pred acc: 0.71 <br>
running time: 5.2s <br>

#### Take a Look at the Importance of the Features: ####

In [None]:
# take a look at the importance of the features:

feature_names = field_names[:-1]
for name, scores in zip(feature_names, rnd_clf.feature_importances_):
    print(f"{name} : {scores:.2f}")

### Train a GradientBoosting Classifier: ###

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

gb_clf = GradientBoostingClassifier(
    max_depth=10,
    n_estimators=120
)

gb_clf.fit(X_train, y_train)
print(f"train scores: {accuracy_score(gb_clf.predict(X_train), y_train):.2f}")
print(f"test scores: {accuracy_score(gb_clf.predict(X_test), y_test):.2f}")



#### Find the best Number of Trees in the Forrest: ####

In [None]:
from sklearn.metrics import mean_squared_error # mse for CLASSIFICATION ?????
import copy

gbc = {}
best_gbc = {}
gbc = GradientBoostingClassifier(
    max_depth=20,
    subsample=0.5, 
    min_samples_split=50,
    min_samples_leaf=10,
    learning_rate=0.05,
    warm_start=True, 
    verbose=1,
)

val_errors = []
min_val_error=float("inf")
for n_estimators in range(100,300):
    gbc.n_estimators=n_estimators
    gbc.fit(X_train, y_train)
    y_pred=gbc.predict(X_test)
    val_error = accuracy_score(y_pred, y_test)
    val_errors.append(val_error)
    if(val_error < min_val_error):
        min_val_error = val_error
        best_gbc=copy.deepcopy(gbc) # deep-copy otherwize further training will spoil the classifier

print(f"train acc: {accuracy_score(y_pred=best_gbc.predict(X_train), y_true=y_train):.2f}")
print(f"test acc: {accuracy_score(y_pred=best_gbc.predict(X_test), y_true=y_test):.2f}")
print(best_gbc)
#print(val_errors)

train acc: 0.9975 <br>
test acc: 0.7048997772828508 <br>
GradientBoostingClassifier(max_depth=20, n_estimators=63, subsample=0.5,
                           warm_start=True)

train acc: 1.00 <br>
test acc: 0.65 <br>
GradientBoostingClassifier(max_depth=20, min_samples_leaf=10, <br>
                           min_samples_split=500, n_estimators=995, <br>
                           subsample=0.5, warm_start=True) <br>