In [1]:
import pandas as pd
import numpy as np

from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

from sklearn.metrics import accuracy_score

In [2]:
main_df = pd.read_csv("./data/winequality-white.csv", sep=";")
main_df.head(3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6


In [3]:
main_df.describe() # high level overview of the dataset

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0


In [18]:
main_df.isnull().any() # checking if any column has null values

fixed acidity           False
volatile acidity        False
citric acid             False
residual sugar          False
chlorides               False
free sulfur dioxide     False
total sulfur dioxide    False
density                 False
pH                      False
sulphates               False
alcohol                 False
quality                 False
dtype: bool

In [5]:
main_df["quality"] -= main_df["quality"].min()
main_df["quality"].value_counts()

3    2198
2    1457
4     880
5     175
1     163
0      20
6       5
Name: quality, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split

X = main_df.drop(columns=["quality"], inplace=False)
y = main_df["quality"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

In [7]:
normalizer = StandardScaler()

I will be using the guide on https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html for selecting the algorithms.

In [8]:
svm = LinearSVC(dual=False) # dual=True is the default, "Prefer dual=False when n_samples > n_features" as stated in scikit-learn docs.
svm.fit(X_train, y_train)

svm.score(X_test, y_test)

0.5142857142857142

In [9]:
normalized_svm = make_pipeline(normalizer, LinearSVC(dual=False))
normalized_svm.fit(X_train, y_train)

normalized_svm.score(X_test, y_test)

0.5061224489795918

In [10]:
knn = KNeighborsClassifier(n_neighbors=7, weights="distance")
knn.fit(X_train, y_train)

knn.score(X_test, y_test)

0.5948979591836735

In [11]:
normalized_knn = make_pipeline(normalizer, KNeighborsClassifier(n_neighbors=7, weights="distance"))
normalized_knn.fit(X_train, y_train)

normalized_knn.score(X_test, y_test)

0.6581632653061225

In [12]:
xgb_model = XGBClassifier(use_label_encoder=False, max_depth=8)
xgb_model.fit(X_train, y_train)

xgb_predictions = xgb_model.predict(X_test)
accuracy_score(y_test, xgb_predictions)



0.6918367346938775

In [16]:
normalized_xgb_model = make_pipeline(normalizer, XGBClassifier(use_label_encoder=False, max_depth=8))
normalized_xgb_model.fit(X_train, y_train)

normalized_xgb_predictions = normalized_xgb_model.predict(X_test)
accuracy_score(y_test, normalized_xgb_predictions)



0.6908163265306122