In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb

In [45]:
df = pd.read_csv('winequality_data.csv')
df = df.drop(columns=["Id"])
df = df[df['total sulfur dioxide'] < 200]
df['quality'] = np.where(df['quality'] <= 5, 0, 1)


In [46]:
x = df[df.columns[:-1]]
y = df['quality']
sc = StandardScaler()
x = sc.fit_transform(x)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=42)

# K Nearest Neighbors Classifier

### Tunning the model

In [47]:
param_grid = {'n_neighbors': range(1, 21)}
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid_search.fit(x_train, y_train)

print("Best n_neighbors:", grid_search.best_params_)


Best n_neighbors: {'n_neighbors': 14}


In [48]:
n3 = KNeighborsClassifier(n_neighbors = 14)
n3.fit(x_train, y_train)
pred_n3 = n3.predict(x_test)
print(classification_report(y_test, pred_n3))

              precision    recall  f1-score   support

           0       0.69      0.75      0.72       102
           1       0.79      0.72      0.75       127

    accuracy                           0.74       229
   macro avg       0.74      0.74      0.74       229
weighted avg       0.74      0.74      0.74       229



# Random Forest Classifier

In [49]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)
pred_rf = rf.predict(x_test)
print(classification_report(y_test, pred_rf))

              precision    recall  f1-score   support

           0       0.74      0.78      0.76       102
           1       0.82      0.78      0.80       127

    accuracy                           0.78       229
   macro avg       0.78      0.78      0.78       229
weighted avg       0.78      0.78      0.78       229



# Decision Tree Classifier

In [50]:
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)
pred_dt = dt.predict(x_test)
print(classification_report(y_test, pred_dt))

              precision    recall  f1-score   support

           0       0.66      0.68      0.67       102
           1       0.74      0.72      0.73       127

    accuracy                           0.70       229
   macro avg       0.70      0.70      0.70       229
weighted avg       0.70      0.70      0.70       229



# Stochastic Gradient Descent

In [51]:
sgd = SGDClassifier()
sgd.fit(x_train, y_train)
pred_sgd = sgd.predict(x_test)
print(classification_report(y_test, pred_sgd))

              precision    recall  f1-score   support

           0       0.67      0.75      0.71       102
           1       0.78      0.70      0.74       127

    accuracy                           0.72       229
   macro avg       0.73      0.73      0.72       229
weighted avg       0.73      0.72      0.73       229



# Trying to improve results

In [52]:
# # number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start=50, stop=1000, num=10)]
# # number of features to consider at every split
# max_features = ['auto', 'sqrt']
# # max number of levels in tree
# max_depth = [int(x) for x in np.linspace(10, 110, num=10)] + [None]
# # min number of samples required to split a node
# min_samples_split = [2, 5, 10]
# # min number of samples required at each leaf node
# min_samples_leaf = [1, 2, 4]
# # method of selecting samples for training each tree
# bootstrap = [True, False]

# # random grid
# random_grid = {'n_estimators': n_estimators,
#               'max_features': max_features,
#               'max_depth': max_depth,
#               'min_samples_split': min_samples_split,
#               'min_samples_leaf': min_samples_leaf,
#               'bootstrap': bootstrap}

# rf_optimized = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100, cv=3, verbose=2, random_state=42)

# # print(rf_optimized.best_params_)

# rf_optimized.fit(x_train, y_train)
# pred_optimized = rf_optimized.predict(x_test)
# print(classification_report(y_test, pred_optimized))

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1000; total time=   4.3s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1000; total time=   4.2s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1000; total time=   3.9s
[CV] END bootstrap=True, max_depth=65, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=577; total time=   2.3s
[CV] END bootstrap=True, max_depth=65, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=577; total time=   2.3s
[CV] END bootstrap=True, max_depth=65, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=577; total time=   2.3s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=6

150 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
150 fits failed with the following error:
Traceback (most recent call last):
  File "/home/betty/anaconda3/envs/ML-env/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/betty/anaconda3/envs/ML-env/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/home/betty/anaconda3/envs/ML-env/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/home/betty/anaconda3/envs/ML-env/lib/python3.12/site-packages/sklearn/utils/_param_va

              precision    recall  f1-score   support

           0       0.74      0.78      0.76       102
           1       0.82      0.78      0.80       127

    accuracy                           0.78       229
   macro avg       0.78      0.78      0.78       229
weighted avg       0.78      0.78      0.78       229

