# Wine Quality

In [None]:
# Import the libraries and modules used throughout the notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Activation
from scikeras.wrappers import KerasClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import SGDClassifier, LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier


seed = 7
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

sns.set_theme()
%matplotlib inline

In [None]:
# Load the dataset from WineQT.csv (relative path, resolved against the current
# working directory) and preview the first rows.
df = pd.read_csv("WineQT.csv")
df.head()

In [None]:
# (rows, columns) of the frame as loaded
df.shape

In [None]:
# Drops unneeded ID column.
# errors='ignore' keeps the cell re-runnable: once 'Id' has been removed,
# running it again is a no-op instead of raising KeyError.
df = df.drop(columns=['Id'], errors='ignore')

In [None]:
# Count the missing (NaN) values in each column
df.isna().sum()

In [None]:
# Count the rows that exactly repeat an earlier row (first occurrences are not counted)
df.duplicated().sum()

In [None]:
# Drop duplicated rows, keeping the first occurrence of each (pandas default)
df = df.drop_duplicates()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column
df.describe()

### Outlier Detection

In [None]:
# One row per column: distribution (histogram + KDE) on the left,
# normal probability plot on the right.
n_columns = len(df.columns)
# Row count follows the frame instead of a hard-coded 12; the height keeps the
# original 80/12 inches per row. squeeze=False keeps `ax` 2-D even for one column.
fig, ax = plt.subplots(n_columns, 2, figsize=(30, 80 * n_columns / 12), squeeze=False)
for index, col in enumerate(df.columns):
    # sns.distplot is deprecated; histplot with a density-scaled histogram and
    # a KDE overlay is its documented replacement.
    sns.histplot(df[col], kde=True, stat='density', kde_kws={'cut': 3},
                 ax=ax[index, 0], color='blue')
    stats.probplot(df[col], plot=ax[index, 1])

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap
fig, ax = plt.subplots(figsize=(15, 15))
mat = df.corr()
sns.heatmap(
    mat,
    annot=True,
    annot_kws={'size': 15},
    ax=ax,  # the axes created just above (what pyplot would pick implicitly)
);

In [None]:
# Box plots for outliers: the columns are drawn in three groups
# (positions 0-3, 4-7 and 8 onwards), one panel per group.
fig, ax = plt.subplots(3, 1, figsize=(12, 18))

column_groups = (df.columns[0:4], df.columns[4:8], df.columns[8:])
for panel, group in zip(ax, column_groups):
    # Pass the target axes explicitly for every panel rather than relying on
    # pyplot's implicit "current axes" for the last one.
    sns.boxplot(data=df[group], ax=panel)

In [None]:
# Finds outliers based on z_score then removes them
def outlier_dect(df, column, threshold=3):
    """Return `df` without the rows whose `column` value is a z-score outlier.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    column : label
        Column whose values are standardised with ``scipy.stats.zscore``.
    threshold : float, default 3
        A row is kept while ``|z| < threshold``.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` that were kept.

    Notes
    -----
    Prints ``<column>  =  <n> outliers`` where ``n`` is the number of rows
    actually removed, so the message always agrees with the returned frame.
    Rows whose z-score is NaN (a constant column, or a column containing
    missing values) cannot be judged and are kept rather than dropped.
    """
    z_scores = np.abs(np.asarray(stats.zscore(df[column]), dtype=float))
    # NaN means "could not be standardised", not "outlier": keep those rows.
    keep = (z_scores < threshold) | np.isnan(z_scores)
    print(column, " = ", int((~keep).sum()), 'outliers')
    return df[keep]

# Run for columns
# The filter is applied sequentially: each column's z-scores are computed on
# the frame already filtered by the previous columns, and re-running this cell
# filters the already-filtered frame again.
# NOTE(review): df.columns still contains the `quality` target at this point
# (it is only split off into y further down), so rows with rare quality values
# may be removed as "outliers" too - confirm this is intended.
for column in df.columns:
    df = outlier_dect(df, column)

In [None]:
# Box plots after outlier removal: same three column groups
# (positions 0-3, 4-7 and 8 onwards), one panel per group.
fig, ax = plt.subplots(3, 1, figsize=(12, 18))

column_groups = (df.columns[0:4], df.columns[4:8], df.columns[8:])
for panel, group in zip(ax, column_groups):
    # Pass the target axes explicitly for every panel rather than relying on
    # pyplot's implicit "current axes" for the last one.
    sns.boxplot(data=df[group], ax=panel)

### Data Prepping and Models

In [None]:
# Target is the 'quality' column; the features are every other column
y = df['quality']
X = df.drop(columns=['quality'])

In [None]:
# Hold out 20% of the rows as the validation set
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
)

# Shapes of the four pieces, in the order X_train, y_train, X_val, y_val
for part in (X_train, y_train, X_val, y_val):
    print(part.shape)

df.sample(7)

### Non-Normalized Data Models  

In [None]:
# Keras Neural Network Model
def create_model(meta=None, n_features=11, n_classes=5):
    """Build and compile the fully connected network used as the neural-network model.

    The network is three ReLU hidden layers (128, 64, 32 units) followed by a
    softmax layer with one unit per class, compiled with Adam and sparse
    categorical cross-entropy so it can be trained on integer class labels.
    A single sigmoid unit with binary cross-entropy can only separate two
    classes, which does not fit a multi-class target.

    Parameters
    ----------
    meta : dict, optional
        Fit-time metadata that scikeras passes to model-building functions
        declaring a ``meta`` parameter. When given, its ``n_features_in_`` and
        ``n_classes_`` entries override the two defaults below.
    n_features : int, default 11
        Width of the input layer when `meta` is not supplied.
    n_classes : int, default 5
        Number of output units when `meta` is not supplied.
        NOTE(review): 5 is an assumed class count for the cleaned data - confirm.

    Returns
    -------
    tf.keras.Model
        The compiled Sequential model.
    """
    if meta is not None:
        n_features = meta.get("n_features_in_", n_features)
        n_classes = meta.get("n_classes_", n_classes)
    # Keras expects plain Python ints for layer sizes.
    n_features = int(n_features)
    n_classes = int(n_classes)

    model = tf.keras.models.Sequential([
        tf.keras.layers.Input(shape=(n_features,)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(n_classes, activation='softmax'),
    ])

    model.compile(
        optimizer=tf.optimizers.Adam(),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:
# Using a function to train models, this can be repurposed for normalized data

def _cv_and_validation_accuracy(estimator, X_train, y_train, X_val, y_val):
    """Return [mean CV accuracy on the training data, accuracy on the validation data]."""
    cv_scores = cross_val_score(estimator, X_train, y_train, cv=kfold, scoring='accuracy')
    # Refit on the full training data before scoring the held-out validation set.
    estimator.fit(X_train, y_train)
    return [cv_scores.mean(), estimator.score(X_val, y_val)]


def train_models(X_train, y_train, X_val, y_val, print_acc=False):
    """Fit a fixed set of classifiers and collect their accuracies.

    Every model is scored twice: the mean accuracy of cross-validation on the
    training data (using the notebook-level `kfold` splitter) and the accuracy
    on the validation data after refitting on the full training data.

    Parameters
    ----------
    X_train, X_val
        Feature tables for training and validation.
    y_train, y_val
        Class labels for training and validation. They are encoded to
        0..n_classes-1 internally with an encoder fitted on `y_train` only.
    print_acc : bool, default False
        When True, print both accuracies for each model.

    Returns
    -------
    dict
        ``{model name: [CV accuracy on training data, validation accuracy]}``,
        in the order the models are evaluated.

    Raises
    ------
    ValueError
        If `y_val` contains a label that does not occur in `y_train`
        (raised by ``LabelEncoder.transform``).
    """
    # Label Encoder: this will allow XGBoost to perform as y_train/y_val will start from 0
    # previous error: ValueError: Invalid classes inferred from unique values of `y`. Expected: [0 1 2 3 4], got [4 5 6 7 8] (XGB model)
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    # Reuse the mapping learned on the training labels. Refitting on y_val
    # would assign different integers whenever a class is missing from it.
    y_val = le.transform(y_val)

    # Evaluated in insertion order; the Dummy classifier is the baseline that
    # shows whether the other models learn anything.
    estimators = {
        'Dummy Classifier': DummyClassifier(strategy='prior'),
        'Bernoulli NB': BernoulliNB(),
        'Decision Tree': DecisionTreeClassifier(),
        'Ridge Classifier': RidgeClassifier(),
        'Logistic Regression': LogisticRegression(),
        'SGD Classifier': SGDClassifier(),
        'KNeighbors Classifier': KNeighborsClassifier(),
        'SVC': SVC(),
        'Random Forest Classifier': RandomForestClassifier(),
        'XGBoost Classifier': XGBClassifier(),
        # `model=` is the current scikeras name for the deprecated `build_fn=`.
        'Neural Network': KerasClassifier(model=create_model, epochs=150, batch_size=64, verbose=0),
    }

    dict_acc = {}
    for name, estimator in estimators.items():
        dict_acc[name] = _cv_and_validation_accuracy(estimator, X_train, y_train, X_val, y_val)

    # Printing the values
    if print_acc:
        for key, values in dict_acc.items():
            print("{} Accuracy on Training Data (CV): {}".format(key, values[0]))
            print("{} Accuracy on Validation Data: {}".format(key, values[1]))
            print()

    return dict_acc

In [None]:
# Train and score every model on the non-normalized split, printing both accuracies per model
dict_models = train_models(X_train, y_train, X_val, y_val, print_acc=True)

In [None]:

# Tabulate the scores: one row per model, one column per accuracy
df_acc = pd.DataFrame.from_dict(
    dict_models,
    orient='index',
    columns=['Acc Training (CV)', 'Acc Validation'],
)
df_acc