In [1]:
import numpy as np
import pandas as pd
import sklearn.preprocessing, sklearn.cluster, sklearn.metrics
import scipy.spatial
import matplotlib.pyplot as plt
import seaborn as sns

# Data description

Data source: https://archive.ics.uci.edu/dataset/186/wine+quality or https://www.kaggle.com/datasets/uciml/red-wine-quality-cortez-et-al-2009/data

The dataset contains 11 attributes/variables based on physicochemical tests and 1 variable describing quality.

|Attribute|Description|
|:-------|:-------|
|fixed acidity|most acids involved with wine are fixed or nonvolatile (do not evaporate readily)|
|volatile acidity|the amount of acetic acid in wine, which at too high of levels can lead to an unpleasant, vinegar taste|
|citric acid|found in small quantities, citric acid can add 'freshness' and flavor to wines|
|residual sugar|the amount of sugar remaining after fermentation stops, it's rare to find wines with less than 1 gram/liter and wines with greater than 45 grams/liter are considered sweet|
|chlorides|the amount of salt in the wine|
|free sulfur dioxide|the free form of SO2 exists in equilibrium between molecular SO2 (as a dissolved gas) and bisulfite ion; it prevents microbial growth and the oxidation of wine|
|total sulfur dioxide|amount of free and bound forms of SO2; in low concentrations, SO2 is mostly undetectable in wine, but at free SO2 concentrations over 50 ppm, SO2 becomes evident in the nose and taste of wine|
|density|the density of wine is close to that of water depending on the percent alcohol and sugar content|
|pH|describes how acidic or basic a wine is on a scale from 0 (very acidic) to 14 (very basic); most wines are between 3-4 on the pH scale|
|sulphates|a wine additive which can contribute to sulfur dioxide gas (SO2) levels, which acts as an antimicrobial and antioxidant|
|alcohol|the percent alcohol content of the wine|
|quality|output variable (based on sensory data, score between 0 and 10)|

# Data loading

In [None]:
# Load the red-wine quality table; the path is relative to the notebook's working directory.
df = pd.read_csv('data_clustering/winequality-red.csv')
df

In [None]:
# Summary statistics per column, useful for comparing the value ranges of the features.
df.describe()

In [None]:
# Keep only the numeric columns; this frame is reused by later cells.
df_only_numeric = df.select_dtypes(np.number)

# Draw one histogram per numeric column, all panels side by side in a single row.
n_rows, n_cols = df_only_numeric.shape
fig, axes = plt.subplots(nrows=1, ncols=n_cols, figsize=(40, 6))
for ax, column in zip(axes, df_only_numeric.columns):
    hist_ax = sns.histplot(data=df_only_numeric, x=column, ax=ax)
    hist_ax.set_title(column)

In [None]:
# Number of wines per quality score, ordered by score.
df.quality.value_counts().sort_index()

In [None]:
# Bucket the quality score into three classes:
#   quality <= 4 -> 'bad', quality >= 7 -> 'good', everything else -> 'normal'.
is_bad = df.quality <= 4
is_good = df.quality >= 7
quality_class = pd.Series('normal', index=df.index)
quality_class = quality_class.mask(is_bad, 'bad').mask(is_good, 'good')
df['quality_class'] = quality_class
df.quality_class.value_counts()

In [None]:
# Rank (Spearman) correlation between all numeric columns, annotated and
# drawn on a fixed [-1, 1] colour scale.
spearman_corr = df_only_numeric.corr(method='spearman')
sns.heatmap(
    spearman_corr,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
)

In [9]:
# Hard-coded feature subset used later in the DBSCAN section.
# NOTE(review): presumably read off the `quality` row of the Spearman heatmap above - confirm.
top_5_features_for_quality = ['alcohol', 'sulphates', 'volatile acidity', 'citric acid', 'chlorides']

# Data preprocessing before clustering
- Why do we need to preprocess the data?

In [10]:
# Feature matrix for clustering: every numeric column except `quality`, as a NumPy array.
X = df_only_numeric.drop(columns=['quality']).values

In [None]:
# Learn the per-feature min/max from X, then rescale every feature with them.
scaler = sklearn.preprocessing.MinMaxScaler().fit(X)
X_min_max_scaled = scaler.transform(X)
X_min_max_scaled

# Clustering

## KMeans + clustering quality
- https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
- Do you know any technique to determine the best number of clusters?

In [None]:
# KMeans starts from randomly chosen centroids; fix the seed (same value as the
# other KMeans cells in this notebook) so that the labels, inertia and silhouette
# shown in the following cells are reproducible after Restart & Run All.
clustering = sklearn.cluster.KMeans(n_clusters=5, random_state=13)
clustering.fit(X_min_max_scaled)

In [None]:
# Cluster label of each sample, in the same order as the rows of X_min_max_scaled.
clustering.labels_

In [None]:
# Cluster sizes: how many samples received each label.
pd.Series(clustering.labels_).value_counts()

In [None]:
# Sum of squared distances of the samples to their closest centroid (the SSE used by the elbow method).
clustering.inertia_

In [None]:
# Mean silhouette coefficient over all samples; values lie in [-1, 1], higher means better separated clusters.
sklearn.metrics.silhouette_score(X_min_max_scaled, clustering.labels_)

## Elbow method

#### Calculate SSE and Silhouette for different numbers of clusters $k \in \langle 2, 15 \rangle$. Create a visualization for both clustering quality criteria

In [None]:
# Fit KMeans for k = 2..15 and record both quality criteria for every k.
clustering_scores = []
for k in range(2, 16):
    clustering = sklearn.cluster.KMeans(n_clusters=k, random_state=13)
    clustering.fit(X_min_max_scaled)
    silhouette = sklearn.metrics.silhouette_score(X_min_max_scaled, clustering.labels_)
    record = {'k': k, 'sse': clustering.inertia_, 'silhouette': silhouette}
    clustering_scores.append(record)

# One row per k, columns: k, sse, silhouette.
df_clustering_scores = pd.DataFrame(clustering_scores)
df_clustering_scores

In [None]:
# Elbow plot: SSE as a function of the number of clusters k.
sns.lineplot(data=df_clustering_scores, x='k', y='sse')

In [None]:
# Mean silhouette score as a function of the number of clusters k.
sns.lineplot(data=df_clustering_scores, x='k', y='silhouette')

#### How many clusters do you see?

#### Can you think of other quality criteria to use on a real dataset? Do you know the difference between external and internal criteria?

## Clustering interpretation

In [None]:
# Final KMeans model used for the interpretation below.
# NOTE(review): k=6 is presumably read off the elbow / silhouette plots above - confirm.
clustering = sklearn.cluster.KMeans(n_clusters=6, random_state=13)
clustering.fit(X_min_max_scaled)

In [21]:
# Attach the cluster label of every row to the original (unscaled) frame for interpretation.
df['cluster_id'] = clustering.labels_

In [None]:
# Number of wines in each cluster.
sns.countplot(data=df, x='cluster_id')

#### Let's take a look at the centroids

In [None]:
# Per-cluster mean of every numeric column, in the original (unscaled) units.
df.select_dtypes(np.number).groupby('cluster_id').mean()

#### Use describe() for quick inspection of numeric values in dataset.

In [None]:
# Per-cluster summary of the object-typed (non-numeric) columns only.
# option_context lifts the column display limit just for this output.
with pd.option_context('display.max_columns', None):
    display(df.groupby('cluster_id').describe(include='object'))

In [None]:
# Per-cluster summary statistics of every column that is not object-typed.
# option_context lifts the column display limit just for this output.
with pd.option_context('display.max_columns', None):
    display(df.groupby('cluster_id').describe(exclude='object'))

In [None]:
# Compact alternative to describe(): only mean, min and max of each numeric column per cluster.
with pd.option_context('display.max_columns', None):
    display(df.select_dtypes(np.number).groupby('cluster_id').agg(['mean', 'min', 'max']))

### Graphical illustration of values in the clusters

In [None]:
# One histogram + KDE per feature, coloured by cluster, to compare the per-cluster distributions.
for column in ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar']:
    sns.displot(data=df, x=column, hue='cluster_id', palette='tab10', kde=True, bins=50)
    plt.title(column)
    plt.show()  # render each figure separately inside the loop

In [None]:
# One box plot per feature with the clusters on the x axis.
features_to_plot = ('fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar')
for column in features_to_plot:
    box_ax = sns.boxplot(data=df, y=column, x='cluster_id')
    box_ax.set_title(column)
    plt.show()

#### Are there any differences in attribute values between the clusters?

#### Clusters possible interpretation

For cluster 0 there are lower *volatile acidity* values. 

The cluster 1 represents the wines with high *fixed acidity* and *citric acid* values.

The biggest cluster 2 (consisting of 536 points out of 1599 points in total) has small sulfur values and relatively small residual sugar.

etc...

### Can we see differences in wine quality for different clusters?

In [None]:
# Distribution of the quality score within each cluster.
sns.boxplot(data=df, y='quality', x='cluster_id')

In [None]:
# Count of each quality score inside each cluster (first 20 rows only).
df.groupby('cluster_id').quality.value_counts().head(20)

In [None]:
# Long table of (cluster_id, quality, count) ...
quality_counts_long = (
    df.groupby('cluster_id')
    .quality.value_counts()
    .reset_index(name='count')
)
# ... reshaped to quality (rows) x cluster_id (columns); combinations that
# never occur are filled with 0.
df_quality_in_clusters = quality_counts_long.pivot(
    index='quality', columns=['cluster_id'], values='count'
).fillna(0)
df_quality_in_clusters

In [None]:
# Share of each quality score within a cluster: every column is divided by
# its own total, so each column sums to 100 %.
cluster_totals = df_quality_in_clusters.sum(axis=0)
quality_share_per_cluster = df_quality_in_clusters.div(cluster_totals, axis='columns')
sns.heatmap(
    quality_share_per_cluster.sort_index(ascending=False),
    cmap='YlOrBr',
    vmin=0,
    vmax=1,
    annot=True,
    fmt='.1%',
)

In [None]:
# Share of each cluster within a quality score: every row is divided by
# its own total, so each row sums to 100 %.
quality_totals = df_quality_in_clusters.sum(axis=1)
cluster_share_per_quality = df_quality_in_clusters.div(quality_totals, axis='index')
sns.heatmap(
    cluster_share_per_quality.sort_index(ascending=False),
    cmap='YlOrBr',
    vmin=0,
    vmax=1,
    annot=True,
    fmt='.1%',
)

#### How can we interpret previous illustrations?

#### Description of clusters based on their quality values:

- TODO: Fill in some observations

## DBSCAN + distance analysis
- https://www.kdnuggets.com/2020/04/dbscan-clustering-algorithm-machine-learning.html
- https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html

We are going to use only top 5 selected features with the highest correlation to quality for following clustering.

In [None]:
# Reminder of the feature subset used in this section.
top_5_features_for_quality

In [None]:
# Pairwise scatter plots of the selected features, coloured by the quality class.
sns.pairplot(data=df[top_5_features_for_quality + ['quality_class']], hue='quality_class')

In [40]:
# Rescale only the five selected features.
# NOTE: this rebinds `scaler` and `X_min_max_scaled`, which the KMeans section fitted on all
# features - re-running an earlier KMeans cell after this one would use the 5-feature matrix.
scaler = sklearn.preprocessing.MinMaxScaler()
X_min_max_scaled = scaler.fit_transform(df[top_5_features_for_quality].values)

In [None]:
# Baseline run with scikit-learn's default parameters (eps=0.5, min_samples=5).
clustering = sklearn.cluster.DBSCAN()
clustering.fit(X_min_max_scaled)

In [None]:
# Number of samples per DBSCAN label.
pd.Series(clustering.labels_).value_counts()

What does the label -1 mean?
#### Find better parameters *eps* and *min_pts* (`min_samples` in scikit-learn) and visualize your result

In [None]:
# Trial parameter setting; the counts show how many samples fall under each label.
clustering = sklearn.cluster.DBSCAN(eps=0.25, min_samples=10).fit(X_min_max_scaled)
pd.Series(clustering.labels_).value_counts()

#### Let's get insight into the distances in our dataset, which will allow us to find a better configuration of parameters for DBSCAN

In [None]:
# Pairwise distances between all rows (scipy's default p=2, i.e. Euclidean);
# the result is a square n x n matrix with zeros on the diagonal.
distance_matrix = scipy.spatial.distance_matrix(X_min_max_scaled, X_min_max_scaled)
distance_matrix

In [None]:
# Histogram over all entries of the square matrix: every pair of points is counted twice
# and the zero self-distances on the diagonal are included.
sns.histplot(distance_matrix.flatten())

In [None]:
# Sort every row in ascending order: column 0 then holds the self-distance (0)
# and column j the distance to the j-th nearest neighbour of that point.
pairwise_distances = scipy.spatial.distance_matrix(X_min_max_scaled, X_min_max_scaled)
distance_matrix = np.sort(pairwise_distances, axis=1)
distance_matrix

In [None]:
# Column 1 of the row-sorted matrix: distance from each point to its nearest neighbour
# (column 0 is the point itself).
sns.histplot(distance_matrix[:, 1])

In [None]:
# Column 10 of the row-sorted matrix: distance from each point to its 10th nearest neighbour.
# NOTE(review): scikit-learn's `min_samples` counts the point itself, so for min_samples=10
# the exact k-distance would be column 9 - confirm which one is intended.
sns.histplot(distance_matrix[:, 10], bins=50)

#### What can we notice in previous visualizations?

In [None]:
# DBSCAN with a much smaller eps.
# NOTE(review): eps=0.07 is presumably read off the neighbour-distance histograms above - confirm.
clustering = sklearn.cluster.DBSCAN(eps=0.07, min_samples=10).fit(X_min_max_scaled)
pd.Series(clustering.labels_).value_counts()

#### Description of clusters (and probably noise points too)

- We should consider removing -1 cluster out of interpretation section

In [50]:
# Overwrites the KMeans labels stored in `cluster_id` earlier with the DBSCAN labels.
df['cluster_id'] = clustering.labels_

In [None]:
# Count of each quality class inside each DBSCAN label.
df.groupby('cluster_id').quality_class.value_counts()

In [None]:
# Counts of every quality score per DBSCAN label, as a long table ...
quality_counts_long = (
    df.groupby('cluster_id')
    .quality.value_counts()
    .reset_index(name='count')
)
# ... reshaped to quality (rows) x cluster_id (columns); missing combinations become 0.
df_quality_in_clusters = quality_counts_long.pivot(
    index='quality', columns=['cluster_id'], values='count'
).fillna(0)
# Show the best quality at the top.
df_quality_in_clusters.sort_index(ascending=False)

In [None]:
# Normalise every column by its total so that each cluster's quality shares sum to 100 %.
label_totals = df_quality_in_clusters.sum(axis=0)
quality_share_per_label = df_quality_in_clusters.div(label_totals, axis='columns')
sns.heatmap(
    quality_share_per_label.sort_index(ascending=False),
    cmap='YlOrBr',
    vmin=0,
    vmax=1,
    annot=True,
    fmt='.1%',
)

In [None]:
# Map cluster ids 0..9 to the tab10 colours and give label -1 a light grey.
# The palette is looked up once instead of being rebuilt on every iteration.
tab10_colors = sns.color_palette('tab10')
colorpalette = {cluster_id: tab10_colors[cluster_id] for cluster_id in range(10)}
colorpalette[-1] = (0.8,0.8,0.8)
colorpalette

In [None]:
# Pairwise scatter plots of the selected features coloured by DBSCAN label (label -1 in grey).
sns.pairplot(data=df[top_5_features_for_quality + ['cluster_id']], hue='cluster_id', palette=colorpalette, plot_kws={'alpha':0.6})

In [None]:
# Same pair plot without the rows labelled -1.
# NOTE(review): the descending sort presumably controls the draw / legend order - confirm.
sns.pairplot(data=df[df.cluster_id != -1][top_5_features_for_quality + ['cluster_id']].sort_values('cluster_id', ascending=False), hue='cluster_id', palette=colorpalette, plot_kws={'alpha':0.6})

### Discussion: What to do with group cluster_id=-1? Should it be included or excluded from clustering interpretation?

#### There goes further description of clusters...

## Is the scaling of the feature good?

In [None]:
# Distribution of the raw (unscaled) 'fixed acidity' values.
sns.histplot(data=df_only_numeric, x='fixed acidity')

In [None]:
# Distribution of the raw (unscaled) 'total sulfur dioxide' values.
sns.histplot(data=df, x='total sulfur dioxide')

In [None]:
# Distribution of the raw (unscaled) 'residual sugar' values.
sns.histplot(data=df, x='residual sugar')

In [61]:
def _scaled_pairwise_distances(column):
    """Return the pairwise distance matrix of one MinMax-scaled feature.

    The column is scaled once and the scaled values are reused for both
    arguments of ``distance_matrix``.

    Note: this refits the notebook-level ``scaler`` on the given column, so
    after the call ``scaler`` no longer holds the previously fitted ranges.
    """
    scaled_values = scaler.fit_transform(df[[column]])
    return scipy.spatial.distance_matrix(scaled_values, scaled_values)


# Pairwise distances within a single feature, to compare how MinMax scaling
# spreads the values of differently distributed features.
distance_matrix_first_column = _scaled_pairwise_distances('fixed acidity')
distance_matrix_second_column = _scaled_pairwise_distances('total sulfur dioxide')
distance_matrix_third_column = _scaled_pairwise_distances('residual sugar')

In [None]:
# Entries strictly above the diagonal (k=1): every pair of points once, self-distances excluded.
distance_matrix_first_column[np.triu_indices_from(distance_matrix_first_column, k=1)]

In [None]:
# Distribution of pairwise distances in scaled 'fixed acidity' (each pair counted once).
sns.histplot(distance_matrix_first_column[np.triu_indices_from(distance_matrix_first_column, k=1)], bins=50)

In [None]:
# Distribution of pairwise distances in scaled 'total sulfur dioxide' (each pair counted once).
sns.histplot(distance_matrix_second_column[np.triu_indices_from(distance_matrix_second_column, k=1)], bins=50)

In [None]:
# Distribution of pairwise distances in scaled 'residual sugar' (each pair counted once).
sns.histplot(distance_matrix_third_column[np.triu_indices_from(distance_matrix_third_column, k=1)], bins=50)

### How would you treat non-numerical attributes during preprocessing?

### For more info about Sklearn clustering, take a look into documentation https://scikit-learn.org/stable/modules/clustering.html#clustering

# Tasks (2p)

## Part 1 
Continue with clustering analysis:

MinMax scaling is not the only (nor always the best) option for scaling numerical attributes.

1. Take a look into documentation (https://scikit-learn.org/stable/modules/classes.html?highlight=preprocessing#module-sklearn.preprocessing) and choose other method for scaling your features.

2. Apply clustering method of your choice (https://scikit-learn.org/stable/modules/clustering.html) for differently scaled data.

3. Describe detected clusters.

As a bonus (voluntary), it may be interesting to combine differently scaled features - e.g. feature1 scaled using MinMax, feature2 scaled using PowerTransform etc. - and use this dataset for clustering.

## Part 2
Pick one from the following tasks:
- a\) Implement function for Entropy or Gini index as a **clustering external validation** during the "elbow method".

or

- b\) Use a Mahalanobis distance with some clustering method. (Hint: take a look into https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html and look for Sklearn clustering method able to work with "precomputed" distance matrix)