In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.cluster import KMeans

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the CSV at `data` (a path relative to the notebook) into `df`
# and preview its first five rows.
data = '../datasets/winequality-red.csv'
df = pd.read_csv(filepath_or_buffer=data)
df.head(n=5)

In [None]:
# Drop the columns that are not analysed in this notebook.
# `errors='ignore'` keeps the cell re-runnable: on a second run the columns
# are already gone and a strict drop would raise KeyError.
unused_columns = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'chlorides',
]
df = df.drop(columns=unused_columns, errors='ignore')

# Only the last expression of a cell is displayed, so a bare `df.shape`
# in the middle would be silently discarded; print it explicitly.
print(df.shape)
df.describe()

In [None]:
# Histogram with a KDE overlay for each selected column, one panel per column.
# `sns.distplot` is deprecated; `histplot` with `kde=True` and
# `stat='density'` is its documented replacement (`cut=3` keeps the KDE
# extending past the data range the way distplot drew it).
fig, axes = plt.subplots(1, 3, figsize=(16, 4))
fig.subplots_adjust(hspace=0.5, wspace=0.5)  # same for every panel, so set once
for ax, col in zip(axes, ['alcohol', 'pH', 'sulphates']):
    sns.histplot(df[col], bins=32, kde=True, stat='density',
                 kde_kws={'cut': 3}, ax=ax)
    ax.set_title(f'Histogram of {col}')
plt.show()

In [None]:
# First clustering (pH and sulphates)
# Elbow search: fit K-Means for every cluster count from 1 to 10 and
# collect each fitted model's inertia in `inertia`.

X1 = df[['pH', 'sulphates']].to_numpy()
inertia = []
for n in range(1, 11):
    model = KMeans(
        n_clusters=n,
        init='k-means++',
        max_iter=500,
        random_state=42,
    ).fit(X1)
    inertia.append(model.inertia_)

In [None]:
# Elbow curve for the first clustering: inertia against the number of clusters.
# The x values are derived from `inertia` so the plot stays aligned with
# however many cluster counts were actually fitted.
ks = np.arange(1, len(inertia) + 1)

plt.figure(figsize=(15, 6))
plt.plot(ks, inertia, 'o')
plt.plot(ks, inertia, '-', alpha=0.5)
plt.xlabel('Number of clusters')
# The plotted quantity is the K-Means inertia, not the wine quality score.
plt.ylabel('Inertia')
plt.show()

In [None]:
# First clustering
# K-Means on pH and sulphates, drawn as one scatter series per cluster.

# One (colour, legend label) pair per cluster. The model below is sized from
# this list, so the plot can never ask for a cluster id the model does not
# produce (which would add an empty entry to the legend).
cluster_styles = [
    ('red', 'Cluster A'),
    ('blue', 'Cluster B'),
    ('green', 'Cluster C'),
    ('cyan', 'Cluster D'),
]

model = KMeans(n_clusters=len(cluster_styles),
               init='k-means++',
               max_iter=500,
               random_state=42)
# fit_predict fits the model and returns the labels in a single pass; a
# separate fit() beforehand would only repeat the same work.
y_kmeans = model.fit_predict(X1)
labels = model.labels_
centroids = model.cluster_centers_

plt.figure(figsize=(20, 10))
for cluster_id, (colour, name) in enumerate(cluster_styles):
    members = X1[y_kmeans == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=colour, label=name)
plt.title('Clusters of Wine - pH * sulfates')
plt.xlabel('pH')
plt.ylabel('sulphates')
plt.legend()
plt.show()

In [None]:
# Second clustering (alcohol and quality)
# Elbow search: fit K-Means for every cluster count from 1 to 10, record
# the inertia of each fit, then plot inertia against the cluster count.

X2 = df.loc[:, ['alcohol', 'quality']].values
inertia = []
for n in range(1, 11):
    model = KMeans(n_clusters=n,
                   init='k-means++',
                   max_iter=500,
                   random_state=42)
    model.fit(X2)
    inertia.append(model.inertia_)

# x values follow the length of `inertia` so both stay in step.
ks = np.arange(1, len(inertia) + 1)

plt.figure(figsize=(20, 10))
plt.plot(ks, inertia, 'o')
plt.plot(ks, inertia, '-', alpha=0.5)
plt.xlabel('Number of clusters')
# The plotted quantity is the K-Means inertia, not the wine quality score.
plt.ylabel('Inertia')
plt.show()

In [None]:
# Second clustering
# K-Means on alcohol and quality, drawn as one scatter series per cluster.

# One (colour, legend label) pair per cluster; the model is sized from this
# list so the number of plotted series always matches the number of clusters.
cluster_styles = [
    ('red', 'Cluster 1'),
    ('blue', 'Cluster 2'),
    ('green', 'Cluster 3'),
    ('cyan', 'Cluster 4'),
    ('magenta', 'Cluster 5'),
]

model = KMeans(n_clusters=len(cluster_styles),
               init='k-means++',
               max_iter=500,
               random_state=42)
# fit_predict fits the model and returns the labels in a single pass; a
# separate fit() beforehand would only repeat the same work.
y_kmeans = model.fit_predict(X2)
labels = model.labels_
centroids = model.cluster_centers_

plt.figure(figsize=(20, 10))
for cluster_id, (colour, name) in enumerate(cluster_styles):
    members = X2[y_kmeans == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=colour, label=name)
plt.title('Clusters of Wine - alcohol * quality')
plt.xlabel('alcohol')
plt.ylabel('quality')
plt.legend()
plt.show()

In [None]:
# Third clustering (alcohol, quality and pH)
# Elbow search: fit K-Means for every cluster count from 1 to 10, record
# the inertia of each fit, then plot inertia against the cluster count.
# NOTE(review): the three columns are clustered without any rescaling --
# confirm that is intended, since K-Means is distance-based.

X3 = df.loc[:, ['alcohol', 'quality', 'pH']].values
inertia = []
for n in range(1, 11):
    model = KMeans(n_clusters=n,
                   init='k-means++',
                   max_iter=500,
                   random_state=42)
    model.fit(X3)
    inertia.append(model.inertia_)

# x values follow the length of `inertia` so both stay in step.
ks = np.arange(1, len(inertia) + 1)

plt.figure(figsize=(20, 10))
plt.plot(ks, inertia, 'o')
plt.plot(ks, inertia, '-', alpha=0.5)
plt.xlabel('Number of clusters')
# The plotted quantity is the K-Means inertia, not the wine quality score.
plt.ylabel('Inertia')
plt.show()

In [None]:
# Third clustering
# K-Means on alcohol, quality and pH; each row's cluster label is stored
# in a new `cluster` column of `df`.

model = KMeans(n_clusters=6,
               init='k-means++',
               max_iter=500,
               random_state=42)
model.fit(X3)
labels = model.labels_

# Plain column assignment overwrites any existing `cluster` column, so
# re-running this cell is safe.
df['cluster'] = labels

# Show a short preview rather than the whole frame.
df.head()

In [None]:
# 3D scatter of alcohol, quality and pH, coloured by the `cluster` column.

# Plotly Express draws a numeric colour column as a continuous colour scale,
# in which case `category_orders` has no effect. Casting the labels to
# strings gives every cluster its own discrete colour and legend entry.
# A copy is used so the integer `cluster` column in `df` is left untouched.
plot_df = df.assign(cluster=df['cluster'].astype(str))

# Legend order is derived from the labels actually present, so it covers
# every cluster regardless of how many the model was fitted with.
cluster_order = [str(c) for c in sorted(df['cluster'].unique())]

fig = px.scatter_3d(plot_df,
                    x="alcohol",
                    y="quality",
                    z="pH",
                    color='cluster',
                    hover_data=["alcohol",
                                "quality",
                                "pH"],
                    category_orders={"cluster": cluster_order},
                    )

fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
fig.show()