In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
def load_data(filepath):
    """Read a CSV file into a DataFrame and report its shape and columns.

    The first column of the file is used as the row index.

    Parameters
    ----------
    filepath
        Location of the CSV file; handed unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    table = pd.read_csv(filepath, index_col=0)

    dimensions = table.shape
    column_names = list(table.columns)
    print(f"Data loaded successfully. Shape: {dimensions}")
    print(f"Columns: {column_names}")

    return table

In [None]:
def create_binary_target(df, threshold=6):
    """Return a copy of ``df`` with a binary ``good_wine`` label column added.

    A row is labelled 1 when its ``quality`` value is greater than or equal
    to ``threshold`` and 0 otherwise. The input frame is NOT modified, so the
    cell that calls this function can be re-run safely and any earlier
    reference to the raw frame stays valid.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a ``quality`` column.
    threshold : int or float, default 6
        Minimum quality for a row to be labelled as good.

    Returns
    -------
    pandas.DataFrame
        New frame with every original column plus ``good_wine`` (0/1 integers).

    Raises
    ------
    KeyError
        If ``df`` has no ``quality`` column.
    """
    # ``assign`` builds a new frame instead of writing into the caller's object.
    labelled = df.assign(good_wine=(df['quality'] >= threshold).astype(int))

    n_good = labelled['good_wine'].sum()
    n_not_good = (labelled['good_wine'] == 0).sum()

    print("\nBinary target created:")
    print(f"  Good wines (quality >= {threshold}): {n_good}")
    print(f"  Not good wines (quality < {threshold}): {n_not_good}")

    return labelled

In [None]:
def split_data(df, train_size, val_size, test_size):
    """Split ``df`` sequentially into training, validation and test arrays.

    Rows are taken in their existing order (no shuffling): the first
    ``train_size`` rows form the training set, the next ``val_size`` rows the
    validation set and the following ``test_size`` rows the test set. Any
    rows after that are left unused. Every column except ``quality`` and
    ``good_wine`` is used as a feature; ``good_wine`` is the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the feature columns and a ``good_wine`` column.
    train_size, val_size, test_size : int
        Number of rows for each partition; must be non-negative and sum to at
        most ``len(df)``.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val, X_test, y_test)`` as NumPy arrays.

    Raises
    ------
    ValueError
        If a size is negative or the requested sizes exceed the available
        rows. Plain slicing would otherwise silently return shorter (or
        overlapping) partitions than the caller asked for.
    KeyError
        If ``df`` has no ``good_wine`` column.
    """
    requested = {'train_size': train_size, 'val_size': val_size, 'test_size': test_size}
    negative = [name for name, size in requested.items() if size < 0]
    if negative:
        raise ValueError(f"Split sizes must be non-negative; got negative {', '.join(negative)}.")

    val_end = train_size + val_size
    test_end = val_end + test_size
    if test_end > len(df):
        raise ValueError(
            f"Requested {test_end} rows (train={train_size}, val={val_size}, "
            f"test={test_size}) but the data has only {len(df)} rows."
        )

    feature_cols = [col for col in df.columns if col not in ('quality', 'good_wine')]

    X = df[feature_cols].values
    y = df['good_wine'].values

    X_train, y_train = X[:train_size], y[:train_size]
    X_val, y_val = X[train_size:val_end], y[train_size:val_end]
    X_test, y_test = X[val_end:test_end], y[val_end:test_end]

    print("\nData split:")
    print(f"  Training set: {X_train.shape[0]} samples")
    print(f"  Validation set: {X_val.shape[0]} samples")
    print(f"  Test set: {X_test.shape[0]} samples")

    return X_train, y_train, X_val, y_val, X_test, y_test

In [None]:
def zscore_normalize(X_train, X_val, X_test):
    """Standardise three feature arrays with the training set's statistics.

    The per-column mean and standard deviation (NumPy default, ``ddof=0``)
    are computed on ``X_train`` only and then applied to all three arrays, so
    no information from the validation or test data enters the scaling.
    Columns whose standard deviation is zero are divided by 1 instead, which
    avoids a division by zero.

    Parameters
    ----------
    X_train, X_val, X_test : numpy.ndarray
        Feature arrays sharing the same column layout.

    Returns
    -------
    tuple
        ``(X_train_norm, X_val_norm, X_test_norm)``.
    """
    mu = np.mean(X_train, axis=0)
    sigma = np.std(X_train, axis=0)

    # Constant columns: keep them finite by scaling with 1.
    sigma[sigma == 0] = 1

    def _standardise(block):
        return (block - mu) / sigma

    scaled = tuple(_standardise(part) for part in (X_train, X_val, X_test))

    print("\nZ-score normalization applied using training set statistics.")

    return scaled

In [None]:
def train_and_evaluate_knn(X_train, y_train, X_val, y_val, k_values):
    """Fit one k-NN classifier per candidate ``k`` and score it on validation data.

    Parameters
    ----------
    X_train, y_train
        Training features and labels used to fit every classifier.
    X_val, y_val
        Validation features and labels used to score every classifier.
    k_values : sequence of int
        Candidate neighbour counts, tried in the given order. The progress
        message is worded as "k = 1 to max(k_values)".

    Returns
    -------
    tuple
        ``(best_k, best_accuracy, validation_accuracies, validation_errors)``
        where the two lists are aligned with ``k_values`` and each error is
        ``1 - accuracy``. When several ``k`` share the top accuracy, the one
        appearing first in ``k_values`` is selected.
    """
    print(f"\nTraining k-NN classifiers for k = 1 to {max(k_values)}...")

    def _accuracy_for(n_neighbors):
        # Fit on the training data, then score on the validation data.
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
        model.fit(X_train, y_train)
        return accuracy_score(y_val, model.predict(X_val))

    validation_accuracies = [_accuracy_for(k) for k in k_values]
    validation_errors = [1 - acc for acc in validation_accuracies]

    winner = np.argmax(validation_accuracies)
    best_k = k_values[winner]
    best_accuracy = validation_accuracies[winner]

    print("\nValidation Results:")
    print(f"  Best k: {best_k}")
    print(f"  Best validation accuracy: {best_accuracy:.4f}")
    print(f"  Best validation error: {1 - best_accuracy:.4f}")

    return best_k, best_accuracy, validation_accuracies, validation_errors

In [None]:
def evaluate_on_test(X_train, y_train, X_test, y_test, best_k):
    """Fit a k-NN classifier with ``best_k`` neighbours and report test performance.

    Prints the test accuracy, the generalisation error (``1 - accuracy``), a
    classification report whose two classes are named 'Not Good' and 'Good',
    and the confusion matrix.

    Parameters
    ----------
    X_train, y_train
        Training features and labels used to fit the classifier.
    X_test, y_test
        Held-out features and labels used for the final evaluation.
    best_k : int
        Number of neighbours to use.

    Returns
    -------
    tuple
        ``(test_accuracy, test_error, knn)`` where ``knn`` is the fitted
        classifier.
    """
    knn = KNeighborsClassifier(n_neighbors=best_k)
    knn.fit(X_train, y_train)

    predictions = knn.predict(X_test)
    test_accuracy = accuracy_score(y_test, predictions)
    test_error = 1 - test_accuracy

    class_names = ['Not Good', 'Good']

    print(f"\nTest Set Results (k={best_k}):")
    print(f"  Test accuracy: {test_accuracy:.4f}")
    print(f"  Generalisation error: {test_error:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, predictions, target_names=class_names))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))

    return test_accuracy, test_error, knn

In [None]:
def plot_validation_curve(k_values, validation_errors, title, filename):
    """Plot validation error against ``k``, mark the minimum and save the figure.

    Parameters
    ----------
    k_values : sequence of int
        Neighbour counts shown on the x-axis.
    validation_errors : sequence of float
        Validation error for each entry of ``k_values``.
    title : str
        Figure title.
    filename : str
        Path the figure is written to (150 dpi).

    The point with the lowest validation error (the first one on ties) is
    highlighted in red. The figure is saved, shown, and a confirmation line
    is printed.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(k_values, validation_errors, 'b-', linewidth=1.5, marker='o',
            markersize=3, label='Validation Error')

    # Highlight the k that achieves the smallest validation error.
    lowest = np.argmin(validation_errors)
    best_k = k_values[lowest]
    best_error = validation_errors[lowest]
    ax.scatter([best_k], [best_error], color='red', s=150, zorder=5,
               label=f'Best k={best_k} (Error={best_error:.4f})')

    ax.set_xlabel('k (Number of Neighbours)', fontsize=12)
    ax.set_ylabel('Validation Error', fontsize=12)
    ax.set_title(title, fontsize=14)
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3)
    ax.set_xticks(range(0, max(k_values) + 1, 10))

    fig.tight_layout()
    fig.savefig(filename, dpi=150)
    plt.show()
    print(f"\nPlot saved as '{filename}'")

In [None]:
def run_experiment(df, train_size, val_size, test_size, experiment_name, k_values=None):
    """Run one complete k-NN model-selection experiment for a given data split.

    Steps, in order: split ``df`` sequentially, z-score the features using
    training statistics, score a k-NN classifier on the validation set for
    every candidate ``k``, evaluate the best ``k`` on the test set, and save
    a validation-error plot named
    ``validation_curve_<experiment name, lower-cased, spaces as underscores>.png``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with feature columns and a ``good_wine`` target column.
    train_size, val_size, test_size : int
        Number of rows for each partition.
    experiment_name : str
        Label used in printed headers, the plot title and the plot filename.
    k_values : iterable of int, optional
        Candidate neighbour counts. Defaults to 1..100 inclusive, which is
        the grid this function always used before the parameter existed.

    Returns
    -------
    dict
        Summary with the keys ``experiment_name``, ``train_size``,
        ``val_size``, ``test_size``, ``k_values``, ``best_k``,
        ``best_validation_accuracy``, ``validation_error``,
        ``test_accuracy``, ``generalisation_error`` and
        ``validation_errors`` (one entry per candidate ``k``).
    """
    # Materialise the grid: it is iterated, indexed and plotted more than once.
    k_values = list(range(1, 101)) if k_values is None else list(k_values)

    print("\n" + "=" * 70)
    print(f"EXPERIMENT: {experiment_name}")
    print(f"Split: Train={train_size}, Validation={val_size}, Test={test_size}")
    print("=" * 70)

    X_train, y_train, X_val, y_val, X_test, y_test = split_data(
        df, train_size, val_size, test_size
    )

    X_train_norm, X_val_norm, X_test_norm = zscore_normalize(X_train, X_val, X_test)

    best_k, best_val_acc, _val_accuracies, val_errors = train_and_evaluate_knn(
        X_train_norm, y_train, X_val_norm, y_val, k_values
    )

    # The fitted classifier is not needed here; only the scores are reported.
    test_acc, test_error, _ = evaluate_on_test(
        X_train_norm, y_train, X_test_norm, y_test, best_k
    )

    plot_filename = f'validation_curve_{experiment_name.lower().replace(" ", "_")}.png'
    plot_validation_curve(
        k_values, val_errors,
        f'k-NN Validation Error vs k ({experiment_name})',
        plot_filename
    )

    return {
        'experiment_name': experiment_name,
        'train_size': train_size,
        'val_size': val_size,
        'test_size': test_size,
        'k_values': k_values,
        'best_k': best_k,
        'best_validation_accuracy': best_val_acc,
        'validation_error': 1 - best_val_acc,
        'test_accuracy': test_acc,
        'generalisation_error': test_error,
        'validation_errors': val_errors
    }

In [None]:
def main(filepath='sparklingwine.csv'):
    """Run both split experiments, print a comparison table and plot both curves.

    The data is loaded from ``filepath``, labelled with a binary target
    (quality >= 6), and evaluated with a 900/300/400 split and a 400/400/800
    split. Table headings and plot labels are built from the values stored in
    each result dictionary, so they always describe the splits that were
    actually run. The comparison figure is written to
    ``validation_comparison.png``.

    Parameters
    ----------
    filepath : str, default 'sparklingwine.csv'
        CSV file to analyse.

    Returns
    -------
    tuple of dict
        ``(results1, results2)`` as returned by ``run_experiment``.
    """
    df = load_data(filepath)
    df = create_binary_target(df, threshold=6)

    results1 = run_experiment(
        df,
        train_size=900,
        val_size=300,
        test_size=400,
        experiment_name="Original Split"
    )

    results2 = run_experiment(
        df,
        train_size=400,
        val_size=400,
        test_size=800,
        experiment_name="New Split"
    )

    def _split_label(result):
        # "train/val/test" sizes exactly as recorded for that experiment.
        return f"{result['train_size']}/{result['val_size']}/{result['test_size']}"

    split1 = _split_label(results1)
    split2 = _split_label(results2)

    print("\n" + "=" * 70)
    print("COMPARISON OF RESULTS")
    print("=" * 70)

    heading1 = f"Original ({split1})"
    heading2 = f"New ({split2})"
    print(f"\n{'Metric':<30} {heading1:<25} {heading2:<25}")
    print("-" * 80)
    print(f"{'Best k':<30} {results1['best_k']:<25} {results2['best_k']:<25}")

    float_rows = (
        ('Validation Accuracy', 'best_validation_accuracy'),
        ('Validation Error', 'validation_error'),
        ('Test Accuracy', 'test_accuracy'),
        ('Generalisation Error', 'generalisation_error'),
    )
    for metric, key in float_rows:
        print(f"{metric:<30} {results1[key]:<25.4f} {results2[key]:<25.4f}")

    print("\n" + "=" * 70)
    print("ANALYSIS AND EXPLANATION")
    print("=" * 70)

    fig, ax = plt.subplots(figsize=(12, 6))
    longest_curve = 0
    for result, line_style, split in ((results1, 'b-', split1), (results2, 'r-', split2)):
        errors = result['validation_errors']
        # One error per candidate k; the x-axis assumes the grid is 1, 2, ..., len(errors).
        k_axis = list(range(1, len(errors) + 1))
        longest_curve = max(longest_curve, len(errors))
        ax.plot(k_axis, errors, line_style, linewidth=1.5,
                label=f"{result['experiment_name']} ({split}), Best k={result['best_k']}")

    ax.set_xlabel('k (Number of Neighbours)', fontsize=12)
    ax.set_ylabel('Validation Error', fontsize=12)
    ax.set_title('Comparison of Validation Errors: Original vs New Split', fontsize=14)
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3)
    ax.set_xticks(range(0, longest_curve + 1, 10))
    fig.tight_layout()
    fig.savefig('validation_comparison.png', dpi=150)
    plt.show()
    print("\nComparison plot saved as 'validation_comparison.png'")

    return results1, results2

# Entry point: run both experiments and keep the two result dictionaries
# at module level so they remain available for further inspection.
if __name__ == "__main__":
    results1, results2 = main()