# Predicting wine quality using the k-nearest neighbours algorithm

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [13]:
# The file is read with ";" as the field separator instead of the default comma.
data = pd.read_csv("winequality-white.csv", sep=";")
# Preview the first rows to check that the columns were parsed as expected.
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


**dropping categorical columns**

In [14]:
# Collect the labels of every column stored with the generic `object` dtype
# and remove those columns, leaving the remaining columns for the later steps.
categorical_cols = data.select_dtypes(include=["object"]).columns
data = data.drop(columns=list(categorical_cols))
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


**handling missing values**

In [15]:
# Percentage of missing values in each column, measured once before any
# filling or row-dropping takes place.
missing_info = data.isnull().sum() / len(data) * 100
for column in data.columns:
    if missing_info[column] < 10:
        # Few gaps (< 10 % missing): impute with the column mean.
        # The filled column is assigned back rather than calling
        # `data[column].fillna(..., inplace=True)`: that chained in-place call
        # acts on an intermediate object, raises a pandas FutureWarning, and
        # no longer modifies `data` from pandas 3.0 onwards.
        # NOTE(review): the loop also visits the "quality" target column, so a
        # missing label would be replaced by a (possibly non-integer) mean —
        # confirm that this is intended.
        data[column] = data[column].fillna(data[column].mean())
    else:
        # Many gaps (>= 10 % missing): drop the rows where this column is empty.
        data.dropna(subset=[column], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].mean(), inplace=True)


**setting up features**

In [16]:
# Feature matrix: every column except the target, as a plain NumPy array.
X = data.drop("quality", axis=1).values
# Target vector: the values of the "quality" column.
y = data.loc[:, "quality"].values

**splitting data into train and test sets**

In [17]:
# Hold out 30 % of the rows for testing; the fixed random_state keeps the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

**scaling the train and test sets using StandardScaler**

In [18]:
# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both splits so the test set never influences the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

**choosing our model**

In [19]:
# Base estimator created with default settings; it is handed to GridSearchCV
# below, which searches over n_neighbors, weights and metric.
knn = KNeighborsClassifier()

**setting up our parameter grid for cross-validation and testing**

In [20]:
# Search space: neighbourhood sizes 1-20, both vote-weighting schemes and two
# distance metrics (20 * 2 * 2 = 80 candidate configurations).
param_grid = dict(
    n_neighbors=range(1, 21),
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

# 5-fold cross-validated search scored by accuracy.
# NOTE(review): X_train_scaled was produced by a scaler fitted on the whole
# training set, so each validation fold has already contributed to the scaling
# statistics; the cross-validation score may be slightly optimistic.
grid = GridSearchCV(estimator=knn, param_grid=param_grid, scoring='accuracy', cv=5)
grid.fit(X_train_scaled, y_train)

print("Best Hyperparameters:", grid.best_params_)
print("best cross-validation score:", grid.best_score_)

# Evaluate the best estimator found by the search on the held-out test set.
best_knn = grid.best_estimator_
y_pred = best_knn.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Test Set Accuracy:", accuracy)

Best Hyperparameters: {'metric': 'manhattan', 'n_neighbors': 20, 'weights': 'distance'}
best cross-validation score: 0.6455665978591646
Test Set Accuracy: 0.6802721088435374


In [21]:
# Spot check: show the first five test-set predictions next to their true labels.
print("Predictions:", y_pred[:5])
print("Actual labels:", y_test[:5])

Predictions: [7 8 7 5 7]
Actual labels: [7 8 8 5 7]


# Conclusion: The previous version of this code had an accuracy of around 54%. With grid search cross-validation and hyperparameter tuning, the accuracy rose to 68% on the test set. On the first 5 test samples, the model predicted 4 out of 5 labels correctly.