In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

In [2]:
wine_ds = pd.read_csv('WQ_Dataset.csv')

In [3]:
x = wine_ds.drop("quality", axis=1)
y = wine_ds["quality"].apply(lambda y_value: "perfect" if y_value == 10 else 
                             ("good" if (y_value >= 7 and y_value < 10) else 
                             ("average" if (y_value >= 5 and y_value < 7) else 
                             ("bad" if (y_value >= 3 and y_value < 5) else "inedible"))))


In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=5)

In [5]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)


In [6]:
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
rf = RandomForestClassifier(random_state=5)
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=5, n_jobs=-1, verbose=2)
grid_search_rf.fit(x_train, y_train)
best_rf = grid_search_rf.best_estimator_

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [7]:
param_grid_gbm = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.05],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 1.0],
    'min_samples_split': [2, 5, 10]
}
gbm = GradientBoostingClassifier(random_state=5)
grid_search_gbm = GridSearchCV(estimator=gbm, param_grid=param_grid_gbm, cv=5, n_jobs=-1, verbose=2)
grid_search_gbm.fit(x_train, y_train)
best_gbm = grid_search_gbm.best_estimator_

Fitting 5 folds for each of 162 candidates, totalling 810 fits


In [8]:
param_grid_svm = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.01, 0.001],
    'kernel': ['linear', 'rbf', 'poly']
}
svm = SVC(probability=True, random_state=5)
grid_search_svm = GridSearchCV(estimator=svm, param_grid=param_grid_svm, cv=5, n_jobs=-1, verbose=2)
grid_search_svm.fit(x_train, y_train)
best_svm = grid_search_svm.best_estimator_

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [9]:
stack = StackingClassifier(
    estimators=[("rf", best_rf), ("gbm", best_gbm), ("svm", best_svm)],
    final_estimator=GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=5),
    cv=5
)

In [10]:
stack.fit(x_train, y_train)
y_pred_stack = stack.predict(x_test)

In [11]:
print(f"Optimized Ensemble model accuracy: {accuracy_score(y_test, y_pred_stack)}")
print(classification_report(y_test, y_pred_stack))

Optimized Ensemble model accuracy: 0.8875
              precision    recall  f1-score   support

     average       0.90      0.97      0.94       267
         bad       0.25      0.14      0.18         7
        good       0.83      0.52      0.64        46

    accuracy                           0.89       320
   macro avg       0.66      0.54      0.59       320
weighted avg       0.88      0.89      0.88       320

