In [None]:
!pip install matplotlib

In [None]:
!pip install seaborn

In [None]:
!pip install scikit-learn

In [None]:
!pip install xgboost

In [None]:
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import RobustScaler, StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [None]:
# Locations of the two raw CSV files (one per wine color)
red_url = 'https://raw.githubusercontent.com/Coragon42/singlestore_wine/main/winequality-red.csv'
white_url = 'https://raw.githubusercontent.com/Coragon42/singlestore_wine/main/winequality-white.csv'

# Both files are semicolon-delimited with the column names in the first row
red_data, white_data = (
    pd.read_csv(csv_url, sep=';', header=0)
    for csv_url in (red_url, white_url)
)

# Preview the red-wine table
red_data.head()

In [None]:
# Preview the first rows of the white-wine table
white_data.head()

In [None]:
# combine red and white wine datasets, manually encoding a new binary color feature
# (1 = red, 0 = white)
red_data['color'] = 1
white_data['color'] = 0
# Stacking two frames row-wise builds a new frame anyway, so the `copy` keyword
# (being phased out by pandas Copy-on-Write) is left at its default.
wine_data = pd.concat([red_data, white_data], ignore_index=True)
# Bare last expression -> rich (truncated) notebook table instead of a plain-text print
wine_data

In [None]:
# Summary statistics for every column of the combined red + white table
wine_data.describe()

In [None]:
# Check column dtypes and non-null counts.
# Nice dataset: no missing values and all-numerical data, so less preprocessing is needed.
wine_data.info()

In [None]:
# One histogram per column of the combined data
wine_data.hist(xlabelsize=10,ylabelsize=10,figsize=(12,12))
plt.show()  # render the figure without echoing the returned array of Axes as cell output

In [None]:
# Box plot of every column on a shared axis; x tick labels are rotated 90 degrees
wine_data.boxplot(figsize=(14,7),rot=90)
plt.show()  # render the figure without echoing the returned Axes repr as cell output

In [None]:
# Pairwise scatter plots across all columns of the combined data
scatter_matrix(frame=wine_data, figsize=(20, 20))
plt.show()

In [None]:
# Quality ratings are only from 3 to 9.
# Let's binarize them (according to the median) into a classification problem:
# ratings <= 6 become 0, ratings > 6 become 1.
# The labels are derived from the untouched red/white frames (same row order as
# wine_data, which was built with ignore_index=True) instead of from the current
# column; thresholding the already-binarized column on a re-run would set every
# label to 0.
raw_quality = pd.concat([red_data['quality'], white_data['quality']], ignore_index=True)
wine_data['quality'] = (raw_quality > 6).astype(raw_quality.dtype)
wine_data.describe()

In [None]:
def percent_outliers(col):
    """Return the fraction of values in `col` that lie outside the 1.5*IQR fences.

    Parameters
    ----------
    col : pandas.Series
        Numeric column to inspect (called with DataFrame columns in this notebook).

    Returns
    -------
    float
        Share (0.0 to 1.0) of entries strictly below ``q1 - 1.5*(q3-q1)`` or
        strictly above ``q3 + 1.5*(q3-q1)``. Missing values are never counted as
        outliers but do count towards the denominator. An empty column yields 0.0.
    """
    if col.size == 0:
        # Nothing to inspect; avoid dividing by zero
        return 0.0
    # Ask for the quartiles directly. Looking them up as describe()[4] / describe()[6]
    # indexes a string-labelled Series by position, which pandas has deprecated.
    q1 = col.quantile(0.25)
    q3 = col.quantile(0.75)
    margin = 1.5*(q3-q1)
    upper = q3 + margin
    lower = q1 - margin
    # Vectorised count of the values beyond either fence
    outside = (col < lower) | (col > upper)
    return float(outside.sum() / col.size)

# some classifiers are sensitive to outliers, but we also want to preserve most original data
def cap_outliers(col):
    """Clamp the values of `col` to the 1.5*IQR fences, modifying `col` in place.

    Values above ``q3 + 1.5*(q3-q1)`` are replaced by that upper fence and values
    below ``q1 - 1.5*(q3-q1)`` by the lower fence; everything else is untouched.

    Parameters
    ----------
    col : pandas.Series
        Numeric column to cap. It is mutated; nothing is returned.

    Notes
    -----
    Only the Series object passed in is guaranteed to change. With pandas
    Copy-on-Write, a column selected from a DataFrame is not updated in the
    parent frame, so callers should assign the capped Series back themselves.
    """
    # Ask for the quartiles directly. Looking them up as describe()[4] / describe()[6]
    # indexes a string-labelled Series by position, which pandas has deprecated.
    q1 = col.quantile(0.25)
    q3 = col.quantile(0.75)
    margin = 1.5*(q3-q1)
    upper = q3 + margin
    lower = q1 - margin
    col.mask(col>upper,upper,inplace=True)
    col.mask(col<lower,lower,inplace=True)
    
# Report the outlier share of every feature and cap the features where outliers
# make up less than 5% of the rows. The target and the binary color flag are skipped.
for col in wine_data.columns:
    if col == 'quality' or col == 'color':
        continue
    percent = percent_outliers(wine_data[col])
    print(col + ': ' + str(percent))
    if (percent < 0.05):
        print(' ^capped')
        # cap_outliers mutates the Series it receives. Cap an explicit copy and
        # assign it back: with pandas Copy-on-Write, mutating the Series returned
        # by wine_data[col] would leave wine_data itself unchanged.
        capped = wine_data[col].copy()
        cap_outliers(capped)
        wine_data[col] = capped

In [None]:
# Detach the target from the features. pop() removes the column from wine_data in
# place and returns it. The guard lets this cell be re-run after 'quality' has
# already been removed (q_data from the earlier run is reused) instead of raising.
if 'quality' in wine_data.columns:
    q_data = wine_data.pop('quality')
# train-dev-test split, 80-10-10%: hold out 20%, then halve the hold-out into dev and test
# training data is for fitting, dev set is for model selection/parameter tuning/preventing overfitting, test set is for testing performance
wine_train,wine_temp,q_train,q_temp = train_test_split(wine_data,q_data,random_state=0,test_size=0.2)
wine_dev,wine_test,q_dev,q_test = train_test_split(wine_temp,q_temp,random_state=0,test_size=0.5)

In [None]:
# Absolute pairwise correlations between the training features and the target
corr_mat = pd.concat([wine_train,q_train],axis=1).corr().abs()
fig,ax = plt.subplots(figsize=(40,30))
sns.set(font_scale=1.05)
# Hide the diagonal and upper triangle with a positional boolean mask. Building the
# mask from the correlation values themselves would leave any exactly-zero
# correlation in the upper triangle visible.
upper_triangle = np.triu(np.ones_like(corr_mat, dtype=bool))
sns.heatmap(corr_mat,annot=True,mask=upper_triangle,ax=ax)
plt.show()

In [None]:
# Hand-tuned hyperparameters (use RandomizedSearchCV/GridSearchCV if you have time)
xgb_params = dict(
    n_estimators=2500,
    learning_rate=0.01,
    max_depth=8,
    min_child_weight=1,
    alpha=0.1,
    gamma=0.01,
    subsample=0.6,
    colsample_bytree=0.6,
    n_jobs=-1,
)
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(wine_train, q_train, verbose=False)

# Score on the dev split
xgb_dev_pred = xgb_model.predict(wine_dev)
print(f'accuracy of xgb_model = {accuracy_score(q_dev, xgb_dev_pred)}')
# print(f'mean cross_val_score of xgb_model = {cross_val_score(xgb_model,wine_train,q_train,n_jobs=-1).mean()}')

In [None]:
# Random forest baseline, seeded for repeatable results
rforest = RandomForestClassifier(
    n_estimators=200,
    n_jobs=-1,
    random_state=0,
)
rforest.fit(wine_train, q_train)

# Score on the dev split
rforest_dev_pred = rforest.predict(wine_dev)
print(f'accuracy of rforest = {accuracy_score(q_dev, rforest_dev_pred)}')

In [None]:
# Scaling step placed in front of every classifier below; set to emit pandas output
scaler = RobustScaler(unit_variance=True)
scaler.set_output(transform='pandas')

# Candidate classifiers
logreg = LogisticRegression(C=0.01, n_jobs=-1)
ridge = RidgeClassifier()
knclass = KNeighborsClassifier(n_jobs=-1)
svc = SVC(gamma=1)
naive = GaussianNB()
qda = QuadraticDiscriminantAnalysis()
# gpc = GaussianProcessClassifier(n_jobs=-1)
# mlp = MLPClassifier(max_iter=1000,early_stopping=True)

# Wrap each classifier in a scaler -> classifier pipeline
pipe_logreg, pipe_ridge, pipe_knclass, pipe_svc, pipe_naive, pipe_qda = (
    make_pipeline(scaler, classifier)
    for classifier in (logreg, ridge, knclass, svc, naive, qda)
)
# pipe_gpc = make_pipeline(scaler,gpc)
# pipe_mlp = make_pipeline(scaler,mlp)

# Fit each pipeline on the training split and report its accuracy on the dev split
scaled_pipelines = {
    'pipe_logreg': pipe_logreg,
    'pipe_ridge': pipe_ridge,
    'pipe_knclass': pipe_knclass,
    'pipe_svc': pipe_svc,
    'pipe_naive': pipe_naive,
    'pipe_qda': pipe_qda,
}
for pipe_name, pipe in scaled_pipelines.items():
    dev_score = pipe.fit(wine_train, q_train).score(wine_dev, q_dev)
    print(f'accuracy of {pipe_name} = {dev_score}')
# print(f'accuracy of pipe_gpc = {pipe_gpc.fit(wine_train,q_train).score(wine_dev,q_dev)}') # too gigantic
# print(f'accuracy of pipe_mlp = {pipe_mlp.fit(wine_train,q_train).score(wine_dev,q_dev)}')

In [None]:
# Base learners: well-performing models resistant to multicollinearity
# (https://repositorium.sdum.uminho.pt/bitstream/1822/10029/1/wine5.pdf)
models = [
    ('xgb_model', xgb_model),
    ('rforest', rforest),
    ('pipe_ridge', pipe_ridge),
    ('pipe_svc', pipe_svc),
    # ('pipe_mlp',pipe_mlp), ('pipe_gpc',pipe_gpc), ('pipe_qda',pipe_qda),
]
# Ridge as the final estimator: the default would be logistic regression, which is
# sensitive to multicollinearity
meta_learner = RidgeClassifier()
stack = StackingClassifier(estimators=models, final_estimator=meta_learner, n_jobs=-1)
stack.fit(wine_train, q_train)

stack_dev_pred = stack.predict(wine_dev)
print(f'accuracy of stack on validation data = {accuracy_score(q_dev, stack_dev_pred)}')

In [None]:
# Evaluate the stacked ensemble on the test split
stack_test_pred = stack.predict(wine_test)
print(f'accuracy of stack on test data = {accuracy_score(q_test, stack_test_pred)}')

In [None]:
# from joblib import dump, load
# dump(stack, 'wine_quality_classifier.joblib') # ran locally

https://repositorium.sdum.uminho.pt/bitstream/1822/10029/1/wine5.pdf

https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

https://machinelearningmastery.com/stacking-ensemble-machine-learning-with-python/

https://stackoverflow.com/questions/61859341/does-including-a-scaler-in-sklearns-pipeline-scale-the-target-variable

https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

https://stats.stackexchange.com/questions/73032/linear-kernel-and-non-linear-kernel-for-support-vector-machine

https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

https://scikit-learn.org/stable/model_persistence.html

https://runmercury.com/docs/input-widgets/