# K-Nearest Neighbours Wine Quality Classifier
Group K: 
Shehab Hassani 06071687


## Table of Contents

1. [Imports](#1-imports)
2. [Load Data](#2-load-data)
3. [Create Binary Target](#3-create-binary-target)
4. [Data Splitting Function](#4-data-splitting-function)
5. [Z-Score Normalization Function](#5-z-score-normalization-function)
6. [Train and Evaluate k-NN Function](#6-train-and-evaluate-k-nn-function)
7. [Test Evaluation Function](#7-test-evaluation-function)
8. [Validation Curve Plotting Function](#8-validation-curve-plotting-function)
9. [Experiment 1: Original Split (900/300/400)](#9-experiment-1-original-split-900300400)
10. [Experiment 2: New Split (400/400/800)](#10-experiment-2-new-split-400400800)
11. [Results Comparison](#11-results-comparison)

## 1. Imports

In [3]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

## 2. Load Data

In [4]:
# Read the wine dataset; the first CSV column is used as the row index.
filepath = 'sparklingwine.csv'
df = pd.read_csv(filepath, index_col=0)

# Quick sanity check of what was loaded.
print("Data loaded successfully. Shape:", df.shape)
print("Columns:", df.columns.tolist())

Data loaded successfully. Shape: (1600, 12)
Columns: ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']


## 3. Create Binary Target

In [5]:
# A wine counts as "good" (label 1) when its quality score reaches the
# threshold; everything below it is labelled 0.
threshold = 6
df['good_wine'] = df['quality'].ge(threshold).astype(int)

# Report the class balance of the new binary target.
print("\nBinary target created:")
print(f"  Good wines (quality >= {threshold}): {df['good_wine'].sum()}")
print(f"  Not good wines (quality < {threshold}): {df['good_wine'].eq(0).sum()}")


Binary target created:
  Good wines (quality >= 6): 1073
  Not good wines (quality < 6): 527


## 4. Data Splitting Function

In [6]:
def split_data(df, train_size, val_size, test_size):
    """Split the data into sequential train/validation/test partitions.

    Rows are taken in their existing order (no shuffling): the first
    ``train_size`` rows become the training set, the next ``val_size`` rows
    the validation set, and the following ``test_size`` rows the test set.
    Every column except ``quality`` and ``good_wine`` is used as a feature;
    ``good_wine`` is the target.

    Args:
        df: DataFrame holding the feature columns and a ``good_wine`` column.
        train_size: Number of rows for the training set.
        val_size: Number of rows for the validation set.
        test_size: Number of rows for the test set.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)`` of NumPy
        arrays.

    Raises:
        ValueError: If any size is negative, or if the three sizes together
            exceed the number of rows in ``df`` (slicing would otherwise
            silently return smaller partitions than requested).
    """
    sizes = (train_size, val_size, test_size)
    if any(size < 0 for size in sizes):
        raise ValueError(
            f"Split sizes must be non-negative, got train={train_size}, "
            f"val={val_size}, test={test_size}"
        )

    n_rows = len(df)
    if sum(sizes) > n_rows:
        raise ValueError(
            f"Requested {sum(sizes)} samples "
            f"({train_size}/{val_size}/{test_size}) but the data has only "
            f"{n_rows} rows"
        )

    feature_cols = [col for col in df.columns if col not in ['quality', 'good_wine']]

    X = df[feature_cols].values
    y = df['good_wine'].values

    # Partition boundaries along the row axis.
    val_start = train_size
    test_start = val_start + val_size
    test_end = test_start + test_size

    X_train, y_train = X[:val_start], y[:val_start]
    X_val, y_val = X[val_start:test_start], y[val_start:test_start]
    X_test, y_test = X[test_start:test_end], y[test_start:test_end]

    print(f"\nData split:")
    print(f"  Training set: {X_train.shape[0]} samples")
    print(f"  Validation set: {X_val.shape[0]} samples")
    print(f"  Test set: {X_test.shape[0]} samples")

    return X_train, y_train, X_val, y_val, X_test, y_test

## 5. Z-Score Normalization Function

In [7]:
def zscore_normalize(X_train, X_val, X_test):
    """Standardise all three partitions with training-set statistics.

    The per-feature mean and standard deviation are computed on ``X_train``
    only and then applied to the training, validation and test arrays, so no
    information from the held-out partitions enters the scaling.

    Returns:
        Tuple ``(X_train_norm, X_val_norm, X_test_norm)``.
    """
    mu = np.mean(X_train, axis=0)
    sigma = np.std(X_train, axis=0)

    # A feature that is constant in the training set has zero spread;
    # dividing by 1 instead avoids a division by zero.
    sigma[sigma == 0] = 1

    normalized = tuple((part - mu) / sigma for part in (X_train, X_val, X_test))

    print("\nZ-score normalization applied using training set statistics.")

    return normalized

## 6. Train and Evaluate k-NN Function

In [8]:
def train_and_evaluate_knn(X_train, y_train, X_val, y_val, k_values):
    """Fit one k-NN classifier per candidate k and score it on validation data.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels.
        k_values: Candidate values for ``n_neighbors``, tried in order.

    Returns:
        Tuple ``(best_k, best_accuracy, validation_accuracies,
        validation_errors)``. The two lists are aligned with ``k_values``;
        ``best_k`` is the entry of ``k_values`` at the position ``np.argmax``
        selects in the accuracy list.
    """
    print(f"\nTraining k-NN classifiers for k = 1 to {max(k_values)}...")

    validation_accuracies = []
    for n_neighbors in k_values:
        model = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)
        validation_accuracies.append(accuracy_score(y_val, model.predict(X_val)))

    # Error is the complement of accuracy for every candidate k.
    validation_errors = [1 - acc for acc in validation_accuracies]

    best_idx = np.argmax(validation_accuracies)
    best_k = k_values[best_idx]
    best_accuracy = validation_accuracies[best_idx]

    print("\nValidation Results:")
    print(f"  Best k: {best_k}")
    print(f"  Best validation accuracy: {best_accuracy:.4f}")
    print(f"  Best validation error: {1 - best_accuracy:.4f}")

    return best_k, best_accuracy, validation_accuracies, validation_errors

## 7. Test Evaluation Function

In [9]:
def evaluate_on_test(X_train, y_train, X_test, y_test, best_k):
    """Refit k-NN with the chosen k and report its test-set performance.

    Prints the test accuracy, the generalisation error (``1 - accuracy``),
    a classification report labelled ``'Not Good'`` / ``'Good'`` and the
    confusion matrix.

    Returns:
        Tuple ``(test_accuracy, test_error, knn)`` where ``knn`` is the
        fitted classifier.
    """
    knn = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)
    predictions = knn.predict(X_test)

    test_accuracy = accuracy_score(y_test, predictions)
    test_error = 1 - test_accuracy

    print(f"\nTest Set Results (k={best_k}):")
    print(f"  Test accuracy: {test_accuracy:.4f}")
    print(f"  Generalisation error: {test_error:.4f}")

    print("\nClassification Report:")
    class_labels = ['Not Good', 'Good']
    print(classification_report(y_test, predictions, target_names=class_labels))

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))

    return test_accuracy, test_error, knn

## 8. Validation Curve Plotting Function

In [None]:
def plot_validation_curve(k_values, validation_errors, title, filename):
    """Plot validation error against k and save the figure as HTML.

    The lowest-error point (first minimum found by ``np.argmin``) is
    highlighted with a star marker. The figure is written as an interactive
    HTML file and then displayed.

    Args:
        k_values: Candidate k values (x axis).
        validation_errors: Validation error for each k, aligned with
            ``k_values``.
        title: Figure title.
        filename: Output file name. Whatever extension it carries (for
            example ``.png``) is replaced by ``.html``; a name without an
            extension gets ``.html`` appended.
    """
    import os

    best_idx = np.argmin(validation_errors)
    best_k = k_values[best_idx]
    best_error = validation_errors[best_idx]

    hover_text = [f'k={k}<br>Error={err:.4f}<br>Accuracy={1-err:.4f}'
                  for k, err in zip(k_values, validation_errors)]

    fig = go.Figure()

    # Full validation curve.
    fig.add_trace(go.Scatter(
        x=k_values,
        y=validation_errors,
        mode='lines+markers',
        name='Validation Error',
        line=dict(color='#3366CC', width=2),
        marker=dict(size=6, color='#3366CC'),
        hovertext=hover_text,
        hoverinfo='text'
    ))

    # Star marker on the best (lowest-error) k.
    fig.add_trace(go.Scatter(
        x=[best_k],
        y=[best_error],
        mode='markers',
        name=f'Best k={best_k} (Error={best_error:.4f})',
        marker=dict(size=16, color='#DC3912', symbol='star'),
        hovertext=f'<b>BEST</b><br>k={best_k}<br>Error={best_error:.4f}<br>Accuracy={1-best_error:.4f}',
        hoverinfo='text'
    ))

    fig.update_layout(
        title=dict(text=title, font=dict(size=16)),
        xaxis=dict(
            title='k (Number of Neighbours)',
            tickmode='linear',
            dtick=10,
            gridcolor='rgba(128,128,128,0.2)'
        ),
        yaxis=dict(
            title='Validation Error',
            gridcolor='rgba(128,128,128,0.2)'
        ),
        legend=dict(x=0.7, y=0.98),
        hovermode='closest',
        template='plotly_white',
        width=900,
        height=500
    )

    # Swap only the trailing extension. A plain str.replace('.png', '.html')
    # would also rewrite '.png' occurring elsewhere in the path and would
    # leave names with any other (or no) extension without an '.html' suffix.
    root, _ = os.path.splitext(filename)
    html_filename = root + '.html'
    fig.write_html(html_filename)
    fig.show()
    print(f"\nInteractive plot saved as '{html_filename}'")

## 9. Experiment 1: Original Split (900/300/400)

In [None]:
# Experiment 1: sequential 900/300/400 train/validation/test split.
print(f"\n{'=' * 70}")
print("EXPERIMENT: Original Split")
print("Split: Train=900, Validation=300, Test=400")
print('=' * 70)

(X_train_1, y_train_1,
 X_val_1, y_val_1,
 X_test_1, y_test_1) = split_data(df, 900, 300, 400)

# Scale every partition with statistics from the training partition.
X_train_norm_1, X_val_norm_1, X_test_norm_1 = zscore_normalize(
    X_train_1, X_val_1, X_test_1
)

# Model selection over k = 1..100 on the validation partition.
k_values = list(range(1, 101))
best_k_1, best_val_acc_1, val_accuracies_1, val_errors_1 = train_and_evaluate_knn(
    X_train=X_train_norm_1,
    y_train=y_train_1,
    X_val=X_val_norm_1,
    y_val=y_val_1,
    k_values=k_values,
)

# Score the selected k on the held-out test partition.
test_acc_1, test_error_1, classifier_1 = evaluate_on_test(
    X_train=X_train_norm_1,
    y_train=y_train_1,
    X_test=X_test_norm_1,
    y_test=y_test_1,
    best_k=best_k_1,
)

plot_validation_curve(
    k_values=k_values,
    validation_errors=val_errors_1,
    title='k-NN Validation Error vs k (Original Split)',
    filename='validation_curve_original_split.png',
)

# Summary consumed by the results-comparison cell.
results1 = dict(
    best_k=best_k_1,
    best_validation_accuracy=best_val_acc_1,
    validation_error=1 - best_val_acc_1,
    test_accuracy=test_acc_1,
    generalisation_error=test_error_1,
    validation_errors=val_errors_1,
)


EXPERIMENT: Original Split
Split: Train=900, Validation=300, Test=400

Data split:
  Training set: 900 samples
  Validation set: 300 samples
  Test set: 400 samples

Z-score normalization applied using training set statistics.

Training k-NN classifiers for k = 1 to 100...

Validation Results:
  Best k: 1
  Best validation accuracy: 0.7567
  Best validation error: 0.2433

Test Set Results (k=1):
  Test accuracy: 0.6900
  Generalisation error: 0.3100

Classification Report:
              precision    recall  f1-score   support

    Not Good       0.53      0.51      0.52       132
        Good       0.76      0.78      0.77       268

    accuracy                           0.69       400
   macro avg       0.65      0.64      0.65       400
weighted avg       0.69      0.69      0.69       400

Confusion Matrix:
[[ 67  65]
 [ 59 209]]



Interactive plot saved as 'validation_curve_original_split.html'


## 10. Experiment 2: New Split (400/400/800)

In [None]:
# Experiment 2: sequential 400/400/800 train/validation/test split.
print(f"\n{'=' * 70}")
print("EXPERIMENT: New Split")
print("Split: Train=400, Validation=400, Test=800")
print('=' * 70)

(X_train_2, y_train_2,
 X_val_2, y_val_2,
 X_test_2, y_test_2) = split_data(df, 400, 400, 800)

# Scale every partition with statistics from the training partition.
X_train_norm_2, X_val_norm_2, X_test_norm_2 = zscore_normalize(
    X_train_2, X_val_2, X_test_2
)

# Model selection over k = 1..100 on the validation partition.
k_values = list(range(1, 101))
best_k_2, best_val_acc_2, val_accuracies_2, val_errors_2 = train_and_evaluate_knn(
    X_train=X_train_norm_2,
    y_train=y_train_2,
    X_val=X_val_norm_2,
    y_val=y_val_2,
    k_values=k_values,
)

# Score the selected k on the held-out test partition.
test_acc_2, test_error_2, classifier_2 = evaluate_on_test(
    X_train=X_train_norm_2,
    y_train=y_train_2,
    X_test=X_test_norm_2,
    y_test=y_test_2,
    best_k=best_k_2,
)

plot_validation_curve(
    k_values=k_values,
    validation_errors=val_errors_2,
    title='k-NN Validation Error vs k (New Split)',
    filename='validation_curve_new_split.png',
)

# Summary consumed by the results-comparison cell.
results2 = dict(
    best_k=best_k_2,
    best_validation_accuracy=best_val_acc_2,
    validation_error=1 - best_val_acc_2,
    test_accuracy=test_acc_2,
    generalisation_error=test_error_2,
    validation_errors=val_errors_2,
)


EXPERIMENT: New Split
Split: Train=400, Validation=400, Test=800

Data split:
  Training set: 400 samples
  Validation set: 400 samples
  Test set: 800 samples

Z-score normalization applied using training set statistics.

Training k-NN classifiers for k = 1 to 100...

Validation Results:
  Best k: 5
  Best validation accuracy: 0.7375
  Best validation error: 0.2625

Test Set Results (k=5):
  Test accuracy: 0.7475
  Generalisation error: 0.2525

Classification Report:
              precision    recall  f1-score   support

    Not Good       0.62      0.58      0.60       259
        Good       0.81      0.83      0.82       541

    accuracy                           0.75       800
   macro avg       0.71      0.70      0.71       800
weighted avg       0.74      0.75      0.75       800

Confusion Matrix:
[[151 108]
 [ 94 447]]



Interactive plot saved as 'validation_curve_new_split.html'


## 11. Results Comparison

In [None]:
# Side-by-side comparison of the two experiments. Reads the `results1` and
# `results2` dictionaries built by the experiment cells, prints a metric
# table, then overlays both validation-error curves in one interactive plot.
print("\n" + "=" * 70)
print("COMPARISON OF RESULTS")
print("=" * 70)

# Fixed-width table: a 30-character metric column followed by two
# 25-character result columns, floats shown to four decimals.
print(f"\n{'Metric':<30} {'Original (900/300/400)':<25} {'New (400/400/800)':<25}")
print("-" * 80)
print(f"{'Best k':<30} {results1['best_k']:<25} {results2['best_k']:<25}")
print(f"{'Validation Accuracy':<30} {results1['best_validation_accuracy']:<25.4f} {results2['best_validation_accuracy']:<25.4f}")
print(f"{'Validation Error':<30} {results1['validation_error']:<25.4f} {results2['validation_error']:<25.4f}")
print(f"{'Test Accuracy':<30} {results1['test_accuracy']:<25.4f} {results2['test_accuracy']:<25.4f}")
print(f"{'Generalisation Error':<30} {results1['generalisation_error']:<25.4f} {results2['generalisation_error']:<25.4f}")

# NOTE(review): this banner is printed but no textual analysis follows it in
# this cell; only the comparison plot below is produced.
print("\n" + "=" * 70)
print("ANALYSIS AND EXPLANATION")
print("=" * 70)

# Same k grid (1..100) the experiment cells used, so x values line up with
# the stored validation-error lists.
k_values = list(range(1, 101))

# Per-point hover labels for each curve.
hover_text_1 = [f'<b>Original Split</b><br>k={k}<br>Error={err:.4f}<br>Accuracy={1-err:.4f}' 
                for k, err in zip(k_values, results1['validation_errors'])]
hover_text_2 = [f'<b>New Split</b><br>k={k}<br>Error={err:.4f}<br>Accuracy={1-err:.4f}' 
                for k, err in zip(k_values, results2['validation_errors'])]

fig = go.Figure()

# Trace 1: validation-error curve of the original 900/300/400 split (blue).
fig.add_trace(go.Scatter(
    x=k_values,
    y=results1['validation_errors'],
    mode='lines+markers',
    name=f'Original Split (900/300/400), Best k={results1["best_k"]}',
    line=dict(color='#3366CC', width=2),
    marker=dict(size=5, color='#3366CC'),
    hovertext=hover_text_1,
    hoverinfo='text'
))

# Trace 2: validation-error curve of the new 400/400/800 split (red).
fig.add_trace(go.Scatter(
    x=k_values,
    y=results2['validation_errors'],
    mode='lines+markers',
    name=f'New Split (400/400/800), Best k={results2["best_k"]}',
    line=dict(color='#DC3912', width=2),
    marker=dict(size=5, color='#DC3912'),
    hovertext=hover_text_2,
    hoverinfo='text'
))

# Position of the lowest validation error in each curve. The marker
# coordinates come from these indices while the labels use the stored
# 'best_k'; the two are expected to agree because error = 1 - accuracy.
best_idx_1 = np.argmin(results1['validation_errors'])
best_idx_2 = np.argmin(results2['validation_errors'])

# Trace 3: star marker on the best k of the original split.
fig.add_trace(go.Scatter(
    x=[k_values[best_idx_1]],
    y=[results1['validation_errors'][best_idx_1]],
    mode='markers',
    name=f'Best Original k={results1["best_k"]}',
    marker=dict(size=14, color='#3366CC', symbol='star', line=dict(width=2, color='white')),
    hovertext=f'<b>BEST (Original)</b><br>k={results1["best_k"]}<br>Error={results1["validation_errors"][best_idx_1]:.4f}',
    hoverinfo='text'
))

# Trace 4: star marker on the best k of the new split.
fig.add_trace(go.Scatter(
    x=[k_values[best_idx_2]],
    y=[results2['validation_errors'][best_idx_2]],
    mode='markers',
    name=f'Best New k={results2["best_k"]}',
    marker=dict(size=14, color='#DC3912', symbol='star', line=dict(width=2, color='white')),
    hovertext=f'<b>BEST (New)</b><br>k={results2["best_k"]}<br>Error={results2["validation_errors"][best_idx_2]:.4f}',
    hoverinfo='text'
))

# Shared layout: x-axis ticks every 10, horizontal legend centred at the top.
fig.update_layout(
    title=dict(text='Comparison of Validation Errors: Original vs New Split', font=dict(size=16)),
    xaxis=dict(
        title='k (Number of Neighbours)',
        tickmode='linear',
        dtick=10,
        gridcolor='rgba(128,128,128,0.2)'
    ),
    yaxis=dict(
        title='Validation Error',
        gridcolor='rgba(128,128,128,0.2)'
    ),
    legend=dict(x=0.5, y=0.98, xanchor='center', orientation='h'),
    hovermode='closest',
    template='plotly_white',
    width=950,
    height=550
)

# Save the interactive figure to disk, then display it.
fig.write_html('validation_comparison.html')
fig.show()
print("\nInteractive comparison plot saved as 'validation_comparison.html'")


COMPARISON OF RESULTS

Metric                         Original (900/300/400)    New (400/400/800)        
--------------------------------------------------------------------------------
Best k                         1                         5                        
Validation Accuracy            0.7567                    0.7375                   
Validation Error               0.2433                    0.2625                   
Test Accuracy                  0.6900                    0.7475                   
Generalisation Error           0.3100                    0.2525                   

ANALYSIS AND EXPLANATION



Interactive comparison plot saved as 'validation_comparison.html'
