In [1]:
import numpy as np
import pandas as pd
import sqlite3
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, VotingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import Lasso
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import BayesianRidge
from keras.models import Sequential
from keras.layers import Dense

In [2]:
# Connect to SQLite database and retrieve data
conn = sqlite3.connect('wine_quality.db')
query = "SELECT * FROM wine_quality"
wine_df = pd.read_sql_query(query, conn)
conn.close()

In [3]:
print("Data has been successfully retrieved from wine_quality.db")

Data has been successfully retrieved from wine_quality.db


In [4]:
# Set up a binary identifier for quality
wine_df['quality'] = wine_df['quality'].apply(lambda x: 1 if x >= 7 else 0)
wine_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0,Red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0,Red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0,Red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,0,Red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0,Red


In [5]:
# Count how many "good" wines
good_wines = wine_df[wine_df['quality'] == 1]
quality_counts = good_wines['quality'].count()
print(quality_counts)

1277


In [6]:
# Separate features and target variable
X = wine_df.drop(['quality', 'type'], axis=1)
y = wine_df['quality']

In [7]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
# Function to evaluate and print model performance
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return classification_report(y_test, y_pred, output_dict=True), confusion_matrix(y_test, y_pred), accuracy

In [51]:
models = [
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('Linear Discriminant Analysis', LinearDiscriminantAnalysis()),
    ('Support Vector Machine', SVC(random_state=42)),
    ('Decision Tree Classifier', DecisionTreeClassifier(random_state=42)),
    ('Random Forest Classifier', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting Classifier', GradientBoostingClassifier(random_state=42)),
    ('AdaBoost', AdaBoostClassifier(random_state=42)),
    ('Bagging Classifier', BaggingClassifier(random_state=42)),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Gaussian Naive Bayes', GaussianNB()),
    ('Quadratic Discriminant Analysis', QuadraticDiscriminantAnalysis()),
    ('Multilayer Perceptron', MLPClassifier(random_state=42)),
    ('Ridge Classifier', RidgeClassifier(random_state=42)),
    ('ExtraTrees Classifier', ExtraTreesClassifier(random_state=42)),
    ('Isolation Forest', IsolationForest(random_state=42))
]

In [52]:
# DataFrame to store results
results_df = pd.DataFrame(columns=[
    'Model', 'Precision', 'Recall', 'F1-Score', 'Support', 'Accuracy',
    'Predicted Positive Actual Positive', 'Predicted Positive Actual Negative',
    'Predicted Negative Actual Positive', 'Predicted Negative Actual Negative'
])

In [53]:
# Train and evaluate each model
for name, model in models:
    print(f"Evaluating {name}...")
    try:
        if 'PCA' in name or 'K-Means' in name:
            model.fit(X_train)
            y_pred = model.predict(X_test)
            y_pred = (y_pred > 0.5).astype(int) if 'PCA' in name else y_pred
            report = classification_report(y_test, y_pred, output_dict=True)
            cm = confusion_matrix(y_test, y_pred)
            accuracy = accuracy_score(y_test, y_pred)
        else:
            model.fit(X_train, y_train)
            report, cm, accuracy = evaluate_model(model, X_test, y_test)

        # Store the results in the DataFrame
        results_df = results_df.append({
            'Model': name,
            'Precision': report['1']['precision'],
            'Recall': report['1']['recall'],
            'F1-Score': report['1']['f1-score'],
            'Support': report['1']['support'],
            'Accuracy': accuracy,
            'Predicted Positive Actual Positive': cm[1, 1],
            'Predicted Positive Actual Negative': cm[0, 1],
            'Predicted Negative Actual Positive': cm[1, 0],
            'Predicted Negative Actual Negative': cm[0, 0]
        }, ignore_index=True)
    except Exception as e:
        print(f"Failed to evaluate {name}: {e}")


Evaluating Logistic Regression...
Evaluating Linear Discriminant Analysis...
Evaluating Support Vector Machine...


  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({


Evaluating Decision Tree Classifier...
Evaluating Random Forest Classifier...


  results_df = results_df.append({


Evaluating Gradient Boosting Classifier...


  results_df = results_df.append({


Evaluating AdaBoost...


  results_df = results_df.append({


Evaluating Bagging Classifier...


  results_df = results_df.append({


Evaluating K-Nearest Neighbors...


  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({


Evaluating Gaussian Naive Bayes...
Evaluating Quadratic Discriminant Analysis...
Evaluating Multilayer Perceptron...


  results_df = results_df.append({
  results_df = results_df.append({


Evaluating Ridge Classifier...
Evaluating ExtraTrees Classifier...


  results_df = results_df.append({


Evaluating Isolation Forest...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  results_df = results_df.append({


THE BEST MODEL FOR WHITE WINE WAS FOUND TO BE RANDOMFOREST CLASSIFIER.

THE BEST MODEL FOR RED WINE WAS FOUND TO BE THE EXTRATREESCLASSIFIER AND THE BAGGING CLASSIFIER DEPENDING ON IF RECALL IS MORE/LESS IMPORTANT.

In [13]:
# Create a Neural Network Model.
nn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
nn_model.fit(X_train, y_train, epochs=50, batch_size=10, validation_data=(X_test, y_test))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1bb0a3b78b0>

In [12]:
# Assuming X and y are your features and labels
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert y_test to a numpy array
y_test = y_test.to_numpy()

# Define different architectures
architectures = [
    [32, 16],   # Two hidden layers with 32 and 16 neurons
    [64, 32, 16],  # Three hidden layers with 64, 32, and 16 neurons
    [128, 64, 32],  # Three hidden layers with 128, 64, and 32 neurons
    [32, 32, 32],   # Three hidden layers with 32 neurons each
    [64, 64],  # Two hidden layers with 64 neurons each
]

best_accuracy = 0
best_model = None
best_architecture = None

for architecture in architectures:
    # Create the model
    model = Sequential()
    model.add(Dense(architecture[0], input_dim=X_train.shape[1], activation='relu'))
    
    for units in architecture[1:]:
        model.add(Dense(units, activation='relu'))
    
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Train the model
    model.fit(X_train, y_train, epochs=50, batch_size=10, validation_data=(X_test, y_test), verbose=0)
    
    # Evaluate the model
    y_pred_nn = (model.predict(X_test) > 0.5).astype("int32")
    accuracy = np.mean(y_pred_nn == y_test.reshape(-1, 1))
    
    print(f"Architecture: {architecture}")
    print(confusion_matrix(y_test, y_pred_nn))
    print(classification_report(y_test, y_pred_nn))
    print(f"Accuracy: {accuracy}\n")
    
    # Save the best model
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model
        best_architecture = architecture

print(f"Best Architecture: {best_architecture}")
print(f"Best Accuracy: {best_accuracy}")

Architecture: [32, 16]
[[1029   19]
 [ 221   31]]
              precision    recall  f1-score   support

           0       0.82      0.98      0.90      1048
           1       0.62      0.12      0.21       252

    accuracy                           0.82      1300
   macro avg       0.72      0.55      0.55      1300
weighted avg       0.78      0.82      0.76      1300

Accuracy: 0.8153846153846154

Architecture: [64, 32, 16]
[[1009   39]
 [ 193   59]]
              precision    recall  f1-score   support

           0       0.84      0.96      0.90      1048
           1       0.60      0.23      0.34       252

    accuracy                           0.82      1300
   macro avg       0.72      0.60      0.62      1300
weighted avg       0.79      0.82      0.79      1300

Accuracy: 0.8215384615384616

Architecture: [128, 64, 32]
[[1027   21]
 [ 224   28]]
              precision    recall  f1-score   support

           0       0.82      0.98      0.89      1048
           1      

Best Architecture: [64, 32, 16]

Best Accuracy: 0.8215384615384616

In [13]:
# Assuming X and y are your features and labels
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert y_test to a numpy array
y_test = y_test.to_numpy()

# Define different architectures
architectures = [
    [32, 16],   # Two hidden layers with 32 and 16 neurons
    [64, 32, 16],  # Three hidden layers with 64, 32, and 16 neurons
    [128, 64, 32],  # Three hidden layers with 128, 64, and 32 neurons
    [32, 32, 32],   # Three hidden layers with 32 neurons each
    [64, 64],  # Two hidden layers with 64 neurons each
]

# Define different activation functions
activation_functions = ['relu', 'tanh', 'sigmoid']

# Function to create model
def create_model(architecture, activation):
    model = Sequential()
    model.add(Dense(architecture[0], input_dim=X_train.shape[1], activation=activation))
    
    for units in architecture[1:]:
        model.add(Dense(units, activation=activation))
    
    model.add(Dense(1, activation='sigmoid'))
    return model

best_accuracy = 0
best_model = None
best_architecture = None
best_activation = None

for architecture in architectures:
    for activation in activation_functions:
        # Create the model
        model = create_model(architecture, activation)

        # Compile the model
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

        # Train the model
        model.fit(X_train, y_train, epochs=50, batch_size=10, validation_data=(X_test, y_test), verbose=0)
        
        # Evaluate the model
        y_pred_nn = (model.predict(X_test) > 0.5).astype("int32")
        accuracy = np.mean(y_pred_nn == y_test.reshape(-1, 1))
        
        print(f"Architecture: {architecture} | Activation: {activation}")
        print(confusion_matrix(y_test, y_pred_nn))
        print(classification_report(y_test, y_pred_nn))
        print(f"Accuracy: {accuracy}\n")
        
        # Save the best model
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = model
            best_architecture = architecture
            best_activation = activation

print(f"Best Architecture: {best_architecture}")
print(f"Best Activation Function: {best_activation}")
print(f"Best Accuracy: {best_accuracy}")

Architecture: [32, 16] | Activation: relu
[[1012   36]
 [ 199   53]]
              precision    recall  f1-score   support

           0       0.84      0.97      0.90      1048
           1       0.60      0.21      0.31       252

    accuracy                           0.82      1300
   macro avg       0.72      0.59      0.60      1300
weighted avg       0.79      0.82      0.78      1300

Accuracy: 0.8192307692307692

Architecture: [32, 16] | Activation: tanh
[[1021   27]
 [ 212   40]]
              precision    recall  f1-score   support

           0       0.83      0.97      0.90      1048
           1       0.60      0.16      0.25       252

    accuracy                           0.82      1300
   macro avg       0.71      0.57      0.57      1300
weighted avg       0.78      0.82      0.77      1300

Accuracy: 0.8161538461538461

Architecture: [32, 16] | Activation: sigmoid
[[985  63]
 [166  86]]
              precision    recall  f1-score   support

           0       0.86  

Best Architecture: [64, 64]
Best Activation Function: sigmoid
Best Accuracy: 0.83

41/41 [==============================] - 0s 1ms/step
Architecture: [32, 16] | Activation: relu
[[1012   36]
 [ 199   53]]
              precision    recall  f1-score   support

           0       0.84      0.97      0.90      1048
           1       0.60      0.21      0.31       252

    accuracy                           0.82      1300
   macro avg       0.72      0.59      0.60      1300
weighted avg       0.79      0.82      0.78      1300

Accuracy: 0.8192307692307692

41/41 [==============================] - 0s 1ms/step
Architecture: [32, 16] | Activation: tanh
[[1021   27]
 [ 212   40]]
              precision    recall  f1-score   support

           0       0.83      0.97      0.90      1048
           1       0.60      0.16      0.25       252

    accuracy                           0.82      1300
   macro avg       0.71      0.57      0.57      1300
weighted avg       0.78      0.82      0.77      1300

Accuracy: 0.8161538461538461

41/41 [==============================] - 0s 946us/step
Architecture: [32, 16] | Activation: sigmoid
[[985  63]
 [166  86]]
              precision    recall  f1-score   support

           0       0.86      0.94      0.90      1048
           1       0.58      0.34      0.43       252

    accuracy                           0.82      1300
   macro avg       0.72      0.64      0.66      1300
weighted avg       0.80      0.82      0.81      1300

Accuracy: 0.8238461538461539

41/41 [==============================] - 0s 1ms/step
Architecture: [64, 32, 16] | Activation: relu
[[962  86]
 [144 108]]
              precision    recall  f1-score   support

           0       0.87      0.92      0.89      1048
           1       0.56      0.43      0.48       252

    accuracy                           0.82      1300
   macro avg       0.71      0.67      0.69      1300
weighted avg       0.81      0.82      0.81      1300

Accuracy: 0.823076923076923

41/41 [==============================] - 0s 1ms/step
Architecture: [64, 32, 16] | Activation: tanh
[[1023   25]
 [ 204   48]]
              precision    recall  f1-score   support

           0       0.83      0.98      0.90      1048
           1       0.66      0.19      0.30       252

    accuracy                           0.82      1300
   macro avg       0.75      0.58      0.60      1300
weighted avg       0.80      0.82      0.78      1300

Accuracy: 0.8238461538461539

41/41 [==============================] - 0s 1ms/step
Architecture: [64, 32, 16] | Activation: sigmoid
[[1010   38]
 [ 192   60]]
              precision    recall  f1-score   support

           0       0.84      0.96      0.90      1048
           1       0.61      0.24      0.34       252

    accuracy                           0.82      1300
   macro avg       0.73      0.60      0.62      1300
weighted avg       0.80      0.82      0.79      1300

Accuracy: 0.823076923076923

41/41 [==============================] - 0s 1ms/step
Architecture: [128, 64, 32] | Activation: relu
[[962  86]
 [156  96]]
              precision    recall  f1-score   support

           0       0.86      0.92      0.89      1048
           1       0.53      0.38      0.44       252

    accuracy                           0.81      1300
   macro avg       0.69      0.65      0.67      1300
weighted avg       0.80      0.81      0.80      1300

Accuracy: 0.8138461538461539

41/41 [==============================] - 0s 1ms/step
Architecture: [128, 64, 32] | Activation: tanh
[[1040    8]
 [ 234   18]]
              precision    recall  f1-score   support

           0       0.82      0.99      0.90      1048
           1       0.69      0.07      0.13       252

    accuracy                           0.81      1300
   macro avg       0.75      0.53      0.51      1300
weighted avg       0.79      0.81      0.75      1300

Accuracy: 0.8138461538461539

41/41 [==============================] - 0s 1ms/step
Architecture: [128, 64, 32] | Activation: sigmoid
[[1020   28]
 [ 209   43]]
              precision    recall  f1-score   support

           0       0.83      0.97      0.90      1048
           1       0.61      0.17      0.27       252

    accuracy                           0.82      1300
   macro avg       0.72      0.57      0.58      1300
weighted avg       0.79      0.82      0.77      1300

Accuracy: 0.8176923076923077

41/41 [==============================] - 0s 1ms/step
Architecture: [32, 32, 32] | Activation: relu
[[996  52]
 [172  80]]
              precision    recall  f1-score   support

           0       0.85      0.95      0.90      1048
           1       0.61      0.32      0.42       252

    accuracy                           0.83      1300
   macro avg       0.73      0.63      0.66      1300
weighted avg       0.80      0.83      0.81      1300

Accuracy: 0.8276923076923077

41/41 [==============================] - 0s 1ms/step
Architecture: [32, 32, 32] | Activation: tanh
[[832 216]
 [ 86 166]]
              precision    recall  f1-score   support

           0       0.91      0.79      0.85      1048
           1       0.43      0.66      0.52       252

    accuracy                           0.77      1300
   macro avg       0.67      0.73      0.69      1300
weighted avg       0.81      0.77      0.78      1300

Accuracy: 0.7676923076923077

41/41 [==============================] - 0s 1ms/step
Architecture: [32, 32, 32] | Activation: sigmoid
[[989  59]
 [175  77]]
              precision    recall  f1-score   support

           0       0.85      0.94      0.89      1048
           1       0.57      0.31      0.40       252

    accuracy                           0.82      1300
   macro avg       0.71      0.62      0.65      1300
weighted avg       0.79      0.82      0.80      1300

Accuracy: 0.82

41/41 [==============================] - 0s 1ms/step
Architecture: [64, 64] | Activation: relu
[[986  62]
 [179  73]]
              precision    recall  f1-score   support

           0       0.85      0.94      0.89      1048
           1       0.54      0.29      0.38       252

    accuracy                           0.81      1300
   macro avg       0.69      0.62      0.63      1300
weighted avg       0.79      0.81      0.79      1300

Accuracy: 0.8146153846153846

41/41 [==============================] - 0s 1ms/step
Architecture: [64, 64] | Activation: tanh
[[1027   21]
 [ 207   45]]
              precision    recall  f1-score   support

           0       0.83      0.98      0.90      1048
           1       0.68      0.18      0.28       252

    accuracy                           0.82      1300
   macro avg       0.76      0.58      0.59      1300
weighted avg       0.80      0.82      0.78      1300

Accuracy: 0.8246153846153846

41/41 [==============================] - 0s 1ms/step
Architecture: [64, 64] | Activation: sigmoid
[[1001   47]
 [ 174   78]]
              precision    recall  f1-score   support

           0       0.85      0.96      0.90      1048
           1       0.62      0.31      0.41       252

    accuracy                           0.83      1300
   macro avg       0.74      0.63      0.66      1300
weighted avg       0.81      0.83      0.81      1300

Accuracy: 0.83

Best Architecture: [64, 64]
Best Activation Function: sigmoid
Best Accuracy: 0.83
​

