In [None]:
# %% Deep learning - Section 10.82
#    The wine quality dataset

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [None]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import seaborn             as sns
import copy
import torch.nn.functional as F
import pandas              as pd
import scipy.stats         as stats

from torch.utils.data                 import DataLoader,TensorDataset
from sklearn.model_selection          import train_test_split
from google.colab                     import files
from torchsummary                     import summary
from IPython                          import display
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')


In [None]:
# %% A few notes about the wine dataset
#    It comes from the UCI (University of California, Irvine) Machine Learning Repository,
#    where you can find many more datasets: https://archive.ics.uci.edu/


In [None]:
# %% Import data into a pandas dataframe

# Location of the red-wine CSV file in the UCI repository
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"

# The file uses semicolons (not commas) as field separator
data = pd.read_csv(
    filepath_or_buffer=url,
    sep=';',
)

# Bare last expression: the notebook renders the dataframe
data


In [None]:
# %% Explore descriptive statistics of data

# pandas has a .describe() method that returns summary statistics of the
# dataframe's columns; as the last expression of the cell, its result is displayed
data.describe()


In [None]:
# %% List unique values per column

# For every column, count how many distinct values it contains
for column_name in data.columns:
    n_unique = len(np.unique(data[column_name]))
    print(f'{column_name} has {n_unique} unique values')


In [None]:
# %% Pairwise plot

# Pick some columns: three acidity-related features plus the quality rating,
# which is also used to colour the points (hue)
features2plot = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'quality',
]
subset = data[features2plot]
sns.pairplot(subset, kind='reg', hue='quality')

# Save the figure to disk, render it, then hand the file to the Colab download helper
figure_file = 'figure1_wine_dataset.png'
plt.savefig(figure_file)
plt.show()
files.download(figure_file)


In [None]:
# %% Plot some data

# Remove some outliers: keep only the rows with total sulfur dioxide below 200.
# The explicit .copy() turns the filtered result into an independent dataframe,
# so that later column assignments on `data` (z-scoring, adding new columns)
# do not operate on a slice of the original frame and do not raise pandas'
# SettingWithCopyWarning.
data = data[data['total sulfur dioxide']<200].copy()

# Golden ratio, used as width/height ratio of the figure
phi    = ( 1 + np.sqrt(5) ) / 2
fig,ax = plt.subplots(1,figsize=(9*phi,6))

# Draw the boxplots on the axes created above (passed explicitly rather than
# relying on matplotlib's "current axes" state)
ax = sns.boxplot(data=data,ax=ax)
# Rotate the column names so that they do not overlap
ax.set_xticklabels(ax.get_xticklabels(),rotation=45)

plt.savefig('figure2_wine_dataset.png')

plt.show()

files.download('figure2_wine_dataset.png')


In [None]:
# %% Z-score all the variables except for quality

# The boxplots above show that the variables live on very different scales;
# this is an issue for DL models, hence the normalisation.

# Columns to normalise: every column but the quality rating
cols2zscore = data.columns.drop('quality')

# Z-score, explicit version (for clarity): subtract the mean, then divide by
# the population standard deviation (ddof=0, i.e. what np.std computes)
for col in cols2zscore:
    mean_val  = data[col].mean()
    std_val   = data[col].std(ddof=0)
    data[col] = (data[col] - mean_val) / std_val

# Z-score, compact version: scipy's zscore applied column by column
# (a second pass over the already normalised columns, shown as the short form)
zscored           = data[cols2zscore].apply(stats.zscore)
data[cols2zscore] = zscored

# Look at the descriptives now
data.describe()


In [None]:
# %% Re-plot the data and have a look

# Figure with a golden-ratio aspect (width = 6*phi, height = 6)
phi          = ( 1 + np.sqrt(5) ) / 2
width,height = 6*phi,6
fig,ax       = plt.subplots(1,figsize=(width,height))

# One box per column of the dataframe, drawn on the axes created above
ax          = sns.boxplot(data=data,ax=ax)
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels,rotation=45)

# Save, show, download
figure_file = 'figure4_wine_dataset.png'
plt.savefig(figure_file)
plt.show()
files.download(figure_file)


In [None]:
# %% A closer look at the quality column

# Set the global font size BEFORE any artist is created: matplotlib applies the
# rcParams defaults when text objects are built, so an update issued after the
# labels exist is not applied consistently to this figure on a first run (and
# the cell would look different when re-run). The update is still global, so
# figures drawn by later cells keep using it.
plt.rcParams.update({'font.size': 15})

# Figure with a golden-ratio aspect (width = 6*phi, height = 6)
phi = ( 1 + np.sqrt(5) ) / 2
fig = plt.figure(figsize=(6*phi,6))

# Number of rows per quality rating, shown as a bar chart
counts = data['quality'].value_counts()
plt.bar(list(counts.keys()),counts)
plt.xlabel('Quality rating')
plt.ylabel('Count')

plt.savefig('figure5_wine_dataset.png')

plt.show()

files.download('figure5_wine_dataset.png')


In [None]:
# %% Binarise quality

# Such an imbalance in the distribution is a problem, binarising can be
# a possible solution

# The assignments use a single .loc[row_mask, column] call instead of chained
# indexing (data[column][row_mask] = value): a chained assignment may write to
# a temporary copy, triggers pandas warnings, and leaves `data` unchanged when
# pandas runs in Copy-on-Write mode.
data['boolean_quality'] = 0
data.loc[data['quality']>5,'boolean_quality'] = 1
data.loc[data['quality']<6,'boolean_quality'] = 0 # Implicit but here for clarity

# Show the original rating next to its binarised version
data[['quality','boolean_quality']]


In [None]:
# %% Reorganise data from dataframe to tensors

# Features: the columns listed in cols2zscore; labels: the binarised quality.
# Both are converted to float tensors.
features_np = data[cols2zscore].values
labels_np   = data['boolean_quality'].values
data_t      = torch.tensor(features_np).float()
labels      = torch.tensor(labels_np).float()

print(data_t.shape)
print(labels.shape)

# Labels need to be multidimensional for PyTorch (one column), not a flat array
labels = labels.unsqueeze(1)
print(labels.shape)


In [None]:
# %% Split into train and test data

# Split with scikit-learn: 10% of the samples are held out as test set
split = train_test_split(data_t,labels,test_size=0.1)
train_data,test_data,train_labels,test_labels = split

# Pair features and labels into PyTorch datasets
train_data = TensorDataset(train_data,train_labels)
test_data  = TensorDataset(test_data,test_labels)

# DataLoader objects: shuffled mini-batches for training (an incomplete last
# batch is dropped), and the whole test set as one single batch
batch_size   = 64
n_test       = len(test_data)
train_loader = DataLoader(train_data,batch_size=batch_size,shuffle=True,drop_last=True)
test_loader  = DataLoader(test_data,batch_size=n_test)


In [None]:
# %% Check batches sizes

# Each iteration of the loader yields one mini-batch: features X and labels y
for X,y in train_loader:
    shapes_msg = f'X shape: {X.shape}; y shape: {y.shape}'
    print(shapes_msg)


In [None]:
# %% Exercise 1
#    In a later video, we will use DL to predict residual sugar. Use seaborn to make a histogram of that data column.
#    Spend a minute to explore the visualization options in sns.histplot. For example, you can add a kernel density
#    estimate, make the histogram bars purple, and so on.

# As already visible from the boxplots, the distribution is quite skewed. The code also
# contains a gratuitously complicated way of computing the optimal number of bins

# Number of bins using Freedman–Diaconis' rule:
#   bin width = 2 * IQR / cube root of n ; bins = data range / bin width (rounded up)
residual_sugar = data['residual sugar']
n              = len(residual_sugar)
q75,q25        = np.percentile(residual_sugar, [75, 25])
iqr            = q75 - q25
bin_width      = 2 * iqr / np.cbrt(n)
data_range     = residual_sugar.max() - residual_sugar.min()
bins           = int(np.ceil(data_range / bin_width))

# Same histogram twice: first without, then with a kernel density estimate;
# each version is saved, shown, and handed to the Colab download helper
for kde,figure_file in ( (False,'figure6_wine_dataset_extra1.png'),
                         (True, 'figure7_wine_dataset_extra1.png') ):

    sns.histplot(data=data, x='residual sugar', bins=bins, kde=kde)
    plt.title('Histogram of residual sugar')
    plt.xlabel('Residual sugar')
    plt.ylabel('Count')

    plt.savefig(figure_file)
    plt.show()
    files.download(figure_file)


In [None]:
# %% Exercise 2
#    (Warning: This exercise is for people who are familiar with statistics.) Loop over all the variables in the dataset,
#    and perform an independent-samples t-test on the data for the binarized wine quality. Which variables are significantly
#    different between "low" and "high" quality wine?

# Loved this one. Indeed, a lot of features differ significantly for low- and high-quality
# wines; since assuming equal variances is not always appropriate (as per Levene's test),
# the loop uses Welch's t-test (equal_var=False) whenever Levene's test rejects the
# hypothesis of equal variances, and the classic Student's t-test otherwise

# Significance threshold applied to Levene's test
alpha = 0.05

# Split groups
qual_low  = data[data['boolean_quality'] == 0]
qual_high = data[data['boolean_quality'] == 1]

print('Statistics on z-scored data:\n')

# Loop over all features, i.e. every column except 'quality' and 'boolean_quality'
for col in data.columns.drop(['quality','boolean_quality']):

    # Drop NaNs for safety
    x0 = qual_low[col].dropna()
    x1 = qual_high[col].dropna()

    # Levene's test for equal variances
    _,p_levene = stats.levene(x0, x1)

    # Independent t-test: pooled-variance (Student) version only if Levene's
    # test does not reject equal variances, Welch's version otherwise
    equal_var      = bool(p_levene >= alpha)
    test_name      = "Student (equal variances)" if equal_var else "Welch (unequal variances)"
    t_stat,p_ttest = stats.ttest_ind(x0,x1,equal_var=equal_var)

    print(f"Feature: {col}")
    print(f"  Levene p-value: {p_levene:.4f} (test for equal variances)")
    print(f"  Independent t-test p-value: {p_ttest:.4f}")
    print(f"  t-test variant: {test_name}")
    print(f"  Mean (low quality): {x0.mean():.3f}, Mean (high quality): {x1.mean():.3f}")
    print( )
