In [25]:
# Import Libraries
# ------------------------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from IPython.display import display
from tensorflow.keras import Input

In [26]:
# Load and Preprocess Data
# ------------------------------------------
df = pd.read_csv("winequality-red.csv", sep=';')  
df.columns = df.columns.str.strip()  

# Convert to binary classification
df['quality_binary'] = df['quality'].apply(lambda x: 1 if x >= 6 else 0)
df.drop('quality', axis=1, inplace=True)


# Features and target
X = df.drop('quality_binary', axis=1)
y = df['quality_binary']

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Shuffle
X_scaled, y = shuffle(X_scaled, y, random_state=42)

In [27]:
# Stratified 10-Fold Cross Validation
# ------------------------------------------
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
folds = []
for train_index, test_index in kf.split(X_scaled, y):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    folds.append((X_train, X_test, y_train, y_test))

In [28]:
# Model Training Functions
# ------------------------------------------
def train_random_forest(X_train, y_train):
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    return model

def train_knn(X_train, y_train):
    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(X_train, y_train)
    return model

def build_gru_model(input_shape):
    model = Sequential()
    model.add(Input(shape=input_shape))  
    model.add(GRU(32))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
    return model

In [29]:
# Manual Metric Calculation
# ------------------------------------------
def calculate_metrics(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    TN, FP, FN, TP = cm.ravel()

    total = TP + TN + FP + FN
    accuracy = (TP + TN) / total
    error_rate = (FP + FN) / total

    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    fpr = FP / (FP + TN) if (FP + TN) > 0 else 0
    fnr = FN / (TP + FN) if (TP + FN) > 0 else 0
    tss = recall - fpr
    hss_num = 2 * (TP * TN - FP * FN)
    hss_den = ((TP + FN) * (FN + TN)) + ((TP + FP) * (FP + TN))
    hss = hss_num / hss_den if hss_den != 0 else 0

    return {
        "TP": TP, "TN": TN, "FP": FP, "FN": FN,
        "Accuracy": accuracy,
        "Error Rate": error_rate,
        "Recall": recall,
        "Precision": precision,
        "F1 Score": f1,
        "FPR": fpr,
        "FNR": fnr,
        "TSS": tss,
        "HSS": hss
    }

In [30]:
# Train Models & Collect Metrics
# ------------------------------------------
metrics_rf, metrics_knn, metrics_gru = [], [], []

for fold_idx, (X_train, X_test, y_train, y_test) in enumerate(folds, start=1):
    print(f"\nFold {fold_idx}")

    # Random Forest
    rf_model = train_random_forest(X_train, y_train)
    rf_preds = rf_model.predict(X_test)
    metrics_rf.append(calculate_metrics(y_test, rf_preds))

    # KNN
    knn_model = train_knn(X_train, y_train)
    knn_preds = knn_model.predict(X_test)
    metrics_knn.append(calculate_metrics(y_test, knn_preds))

    # GRU
    X_train_gru = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
    X_test_gru = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))
    gru_model = build_gru_model((1, X_train.shape[1]))
    es = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=0)
    gru_model.fit(X_train_gru, y_train, epochs=20, batch_size=32, 
                  validation_split=0.1, callbacks=[es], verbose=0)
    gru_preds = (gru_model.predict(X_test_gru) > 0.5).astype("int32").flatten()
    metrics_gru.append(calculate_metrics(y_test, gru_preds))


Fold 1
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 

Fold 2
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 

Fold 3
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 

Fold 4
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 

Fold 5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 

Fold 6
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 

Fold 7
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 

Fold 8
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 

Fold 9
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 

Fold 10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step


In [31]:
# Summarize Results
# ------------------------------------------
def summarize_metrics(metrics_list):
    df = pd.DataFrame(metrics_list)
    df.loc['Average'] = df.mean(numeric_only=True)
    df.index = [f'Fold {i+1}' for i in range(len(metrics_list))] + ['Average']
    return df.round(4)
print("\n📊 Summary for Random Forest")
df_rf = summarize_metrics(metrics_rf)
display(df_rf)

print("\n📊 Summary for KNN")
df_knn = summarize_metrics(metrics_knn)
display(df_knn)

print("\n📊 Summary for GRU")
df_gru = summarize_metrics(metrics_gru)
display(df_gru)


📊 Summary for Random Forest


Unnamed: 0,TP,TN,FP,FN,Accuracy,Error Rate,Recall,Precision,F1 Score,FPR,FNR,TSS,HSS
Fold 1,73.0,61.0,13.0,13.0,0.8375,0.1625,0.8488,0.8488,0.8488,0.1757,0.1512,0.6732,0.6732
Fold 2,70.0,63.0,11.0,16.0,0.8312,0.1688,0.814,0.8642,0.8383,0.1486,0.186,0.6653,0.6622
Fold 3,77.0,57.0,17.0,9.0,0.8375,0.1625,0.8953,0.8191,0.8556,0.2297,0.1047,0.6656,0.6707
Fold 4,76.0,62.0,12.0,10.0,0.8625,0.1375,0.8837,0.8636,0.8736,0.1622,0.1163,0.7216,0.7229
Fold 5,76.0,65.0,9.0,10.0,0.8812,0.1188,0.8837,0.8941,0.8889,0.1216,0.1163,0.7621,0.7614
Fold 6,69.0,59.0,16.0,16.0,0.8,0.2,0.8118,0.8118,0.8118,0.2133,0.1882,0.5984,0.5984
Fold 7,70.0,57.0,18.0,15.0,0.7938,0.2062,0.8235,0.7955,0.8092,0.24,0.1765,0.5835,0.5849
Fold 8,71.0,56.0,19.0,14.0,0.7938,0.2062,0.8353,0.7889,0.8114,0.2533,0.1647,0.582,0.5843
Fold 9,70.0,67.0,8.0,15.0,0.8562,0.1438,0.8235,0.8974,0.8589,0.1067,0.1765,0.7169,0.7129
Fold 10,64.0,61.0,13.0,21.0,0.7862,0.2138,0.7529,0.8312,0.7901,0.1757,0.2471,0.5773,0.5733



📊 Summary for KNN


Unnamed: 0,TP,TN,FP,FN,Accuracy,Error Rate,Recall,Precision,F1 Score,FPR,FNR,TSS,HSS
Fold 1,64.0,50.0,24.0,22.0,0.7125,0.2875,0.7442,0.7273,0.7356,0.3243,0.2558,0.4199,0.4207
Fold 2,70.0,51.0,23.0,16.0,0.7562,0.2438,0.814,0.7527,0.7821,0.3108,0.186,0.5031,0.5065
Fold 3,69.0,47.0,27.0,17.0,0.725,0.275,0.8023,0.7188,0.7582,0.3649,0.1977,0.4375,0.4416
Fold 4,75.0,49.0,25.0,11.0,0.775,0.225,0.8721,0.75,0.8065,0.3378,0.1279,0.5343,0.5414
Fold 5,72.0,54.0,20.0,14.0,0.7875,0.2125,0.8372,0.7826,0.809,0.2703,0.1628,0.5669,0.5702
Fold 6,67.0,47.0,28.0,18.0,0.7125,0.2875,0.7882,0.7053,0.7444,0.3733,0.2118,0.4149,0.4182
Fold 7,65.0,47.0,28.0,20.0,0.7,0.3,0.7647,0.6989,0.7303,0.3733,0.2353,0.3914,0.3938
Fold 8,64.0,48.0,27.0,21.0,0.7,0.3,0.7529,0.7033,0.7273,0.36,0.2471,0.3929,0.3948
Fold 9,65.0,55.0,20.0,20.0,0.75,0.25,0.7647,0.7647,0.7647,0.2667,0.2353,0.498,0.498
Fold 10,62.0,52.0,22.0,23.0,0.717,0.283,0.7294,0.7381,0.7337,0.2973,0.2706,0.4321,0.4317



📊 Summary for GRU


Unnamed: 0,TP,TN,FP,FN,Accuracy,Error Rate,Recall,Precision,F1 Score,FPR,FNR,TSS,HSS
Fold 1,66.0,54.0,20.0,20.0,0.75,0.25,0.7674,0.7674,0.7674,0.2703,0.2326,0.4972,0.4972
Fold 2,64.0,55.0,19.0,22.0,0.7438,0.2562,0.7442,0.7711,0.7574,0.2568,0.2558,0.4874,0.4861
Fold 3,75.0,50.0,24.0,11.0,0.7812,0.2188,0.8721,0.7576,0.8108,0.3243,0.1279,0.5478,0.5546
Fold 4,63.0,56.0,18.0,23.0,0.7438,0.2562,0.7326,0.7778,0.7545,0.2432,0.2674,0.4893,0.487
Fold 5,65.0,58.0,16.0,21.0,0.7688,0.2312,0.7558,0.8025,0.7784,0.2162,0.2442,0.5396,0.5371
Fold 6,65.0,51.0,24.0,20.0,0.725,0.275,0.7647,0.7303,0.7471,0.32,0.2353,0.4447,0.4461
Fold 7,61.0,54.0,21.0,24.0,0.7188,0.2812,0.7176,0.7439,0.7305,0.28,0.2824,0.4376,0.4366
Fold 8,66.0,58.0,17.0,19.0,0.775,0.225,0.7765,0.7952,0.7857,0.2267,0.2235,0.5498,0.5489
Fold 9,59.0,59.0,16.0,26.0,0.7375,0.2625,0.6941,0.7867,0.7375,0.2133,0.3059,0.4808,0.477
Fold 10,61.0,63.0,11.0,24.0,0.7799,0.2201,0.7176,0.8472,0.7771,0.1486,0.2824,0.569,0.5626


***Discussion***

**Random Forest outperformed KNN and GRU across all evaluation metrics. 
This is expected as Random Forest handles noisy tabular data well, builds multiple trees, and reduces overfitting through ensembling. 
KNN was fast and easy to train but showed weakness in precision due to class overlap and sensitivity to feature scale. 
GRU, while competitive, is primarily designed for sequential data (e.g., time-series) and does not have a structural advantage over Random Forest in this tabular, non-temporal context.** 