# Case study on Unsupervised Learning
Do the following on the wine dataset.

1) Read the dataset into the Python environment.

2) Try out different clustering models on the wine dataset.

3) Find the optimum number of clusters for each model and create the model with the optimum number of clusters.


In [None]:
# Importing the required libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore") 

In [None]:
# Loading the data file into the pandas dataframe.
# NOTE(review): the path below is an absolute path on one specific Windows
# machine; the notebook only runs elsewhere if Wine.csv is placed at this
# location or the path is adjusted.

data = pd.read_csv(r'C:\Users\Aneesh\Downloads\Wine.csv')
# Bare expression: as the last line of a notebook cell it displays the dataframe.
data

In [None]:
# Count the missing values in each column.
data.isnull().sum()

In [None]:
# Summary statistics of the columns.
data.describe()

In [None]:
# Data type of each column.
data.dtypes

# Checking correlation

In [None]:
# Pairwise correlation matrix of the columns, drawn as an annotated heatmap.
correlation = data.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(20, 10))
sns.heatmap(
    correlation,
    annot=True,
    xticklabels=True,
    yticklabels=True,
    cmap='RdYlGn_r',
    ax=heatmap_ax,
)
plt.show()

From the heatmap above, we can conclude that the following pairs of variables are strongly correlated:

"Flavanoids" and "Total phenols" (0.86); "OD280" and "Flavanoids" (0.79); "OD280" and "Total phenols" (0.7).
In particular, Flavanoids is highly correlated with both Total phenols and the OD280 of diluted wines.

# PCA

# Normalize data
We are going to normalize the data by removing the mean and scaling to unit variance using preprocessing.StandardScaler.
The table below shows the general descriptive statistics of the normalized data.

In [None]:
from sklearn.preprocessing import Normalizer

# Apply scikit-learn's Normalizer to the data and wrap the result in a
# dataframe that keeps the original column names and row index.
normalized_values = Normalizer().fit_transform(data)
no = pd.DataFrame(normalized_values, columns=data.columns, index=data.index)
no

In [None]:
# Standard scaling: centre each column (with_mean) and scale it to unit
# variance (with_std).

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler(with_mean=True, with_std=True)
# Learn the per-column statistics and apply them in a single call.
scaled_data = scaler.fit_transform(data)
scaled_data

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Fit a PCA on the scaled data; no n_components is passed, so the estimator's
# default is used.
pca=PCA()
pca.fit(scaled_data)

In [None]:
pip install pca

In [None]:
# Imported under an alias: the bare name `pca` is used elsewhere in this
# notebook for scikit-learn PCA instances, and an unaliased import would
# silently overwrite that variable.
from pca import pca as pca_explorer

# Initialize to reduce the data up to the number of components that explains 95% of the variance.
model = pca_explorer(n_components = 0.95)

# Fit transform
results = model.fit_transform(scaled_data)

# Plot explained variance
fig, ax = model.plot()


We observe that there is an elbow at 3

In [None]:
# Scatter first 2 PCs
fig, ax = model.scatter()

In [None]:
# Make biplot with the number of features
# NOTE(review): n_feat = 13 is hard-coded — presumably the number of feature
# columns in the dataset; confirm it matches the loaded data.
fig, ax = model.biplot(n_feat = 13,cmap = 'copper')

In [None]:
# Refit PCA keeping only three components and project the scaled data onto them.
pca = PCA(n_components=3)
x_pca = pca.fit_transform(scaled_data)

In [None]:
# Wrap the component scores in a dataframe with one named column per component.
pcadf = pd.DataFrame(x_pca,columns=["PC1","PC2","PC3"])
pcadf

In [None]:
# Share of the total variance explained by each of the three kept components.
pca.explained_variance_ratio_

# K means clustering

In [None]:
from sklearn.cluster import KMeans

# Elbow method: fit K-Means for k = 1..10 on the PCA-reduced data and record
# the within-cluster sum of squares (inertia) obtained for each k.
number_clusters = range(1, 11)
wcss = []
for i in number_clusters:
    # Fixed seed so the curve (and the elbow read off it) is reproducible and
    # consistent with the final model, which also uses random_state = 42.
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(pcadf)
    wcss.append(kmeans.inertia_)

plt.plot(number_clusters, wcss)
plt.title('The Elbow plot')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

We observe the elbow at k=3

In [None]:
from sklearn.metrics import silhouette_samples, silhouette_score

# Silhouette analysis: fit K-Means for k = 2..10 (the score needs at least two
# clusters) and record the mean silhouette score of each clustering.
number_clusters = range(2, 11)
score = []
for i in number_clusters:
    # Fixed seed so the plotted scores are reproducible between runs and
    # consistent with the final model, which also uses random_state = 42.
    km = KMeans(n_clusters=i, random_state=42)
    labels = km.fit_predict(pcadf)
    score.append(silhouette_score(pcadf, labels, metric='euclidean'))

plt.plot(number_clusters, score)
plt.title('The Silhouette score plot')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette score ')
plt.show()


We observe the highest Silhouette score is for k=3

Hence we select k=3 for our KMeans clustering algorithm

In [None]:
# Final K-Means model: 3 clusters, k-means++ initialisation, fixed seed,
# fitted on the PCA-reduced data.
kmeans = KMeans(n_clusters = 3, init = 'k-means++', random_state = 42).fit(pcadf)

In [None]:
# Cluster label assigned to each row of pcadf.
kmeans.labels_

In [None]:
# Within-cluster sum of squares of the fitted model.
kmeans.inertia_

In [None]:
# Number of iterations the fit ran for.
kmeans.n_iter_

In [None]:
# Cluster centres, expressed in the PC1..PC3 space the model was fitted on.
kmeans.cluster_centers_

In [None]:
# Number of observations in each cluster.
from collections import Counter
Counter(kmeans.labels_)

In [None]:
# K-Means clusters plotted on the first two principal components.
plt.figure(figsize = (8,8))
# x and y are passed by keyword: seaborn >= 0.12 rejects positional data
# vectors here, and earlier versions only accepted them with a warning.
sns.scatterplot(x = x_pca[:,0], y = x_pca[:,1], hue = kmeans.labels_, palette = 'flare')
plt.show()

The data has been clustered into 3 clusters labelled as 0,1,2 using K-Means clustering algorithm

# Hierarchical Clustering
1) Agglomerative Hierarchical Clustering

In [None]:
# Finding the optimal number of clusters using the dendrogram

import scipy.cluster.hierarchy as shc

# Hierarchical merge tree of the PCA-reduced data, built with Ward linkage.
ward_linkage = shc.linkage(pcadf, method="ward")

plt.figure(figsize=(10, 10))
dendro = shc.dendrogram(ward_linkage)
# Dashed reference line at the height used to read off the number of clusters.
plt.axhline(y=25, color="black", linestyle="--")
plt.title("Dendrogram Plot")
plt.ylabel("Euclidean Distances")
plt.xlabel("Wines")
plt.show()

The longest vertical line spans the range 12-28, so we draw a horizontal line at y=25 to obtain the number of clusters.
Since the horizontal line cuts 3 vertical lines, the number of clusters is 3.

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Try every distance / linkage combination below with 3 clusters and keep the
# one with the highest silhouette score. "ward" is not part of this grid.
linkage = ['complete', 'average', 'single']
affinity = ["euclidean","manhattan","cosine"]
s_scores = []
max_score = -1
for i in affinity:
    for j in linkage:
        # NOTE(review): newer scikit-learn releases rename `affinity` to
        # `metric`; confirm against the installed version.
        agg = AgglomerativeClustering(n_clusters = 3, affinity = i, linkage = j).fit(pcadf)
        s_scores.append(silhouette_score(pcadf, agg.labels_))
        if s_scores[-1] > max_score:
            # Record the new best score too. Without this update every
            # combination beats the initial -1 and the last one tried wins.
            max_score = s_scores[-1]
            aff = i
            link = j
print(aff,link)

In [None]:
cos_single = silhouette_score(pcadf,AgglomerativeClustering(n_clusters = 3, affinity = "cosine", linkage = "single").fit(pcadf).labels_ )
cos_single

In [None]:
euclidean_ward = silhouette_score(pcadf,AgglomerativeClustering(n_clusters = 3, affinity = "euclidean", linkage = "ward").fit(pcadf).labels_ )
euclidean_ward

In [None]:
# Final agglomerative model: 3 clusters, Euclidean distance, Ward linkage,
# fitted on the PCA-reduced data.
agg = AgglomerativeClustering(n_clusters =3 , affinity = 'euclidean', linkage = 'ward',)
agg.fit(pcadf)

In [None]:
# Cluster label assigned to each row of pcadf.
agg.labels_

In [None]:
#number of observations in each cluster

from collections import Counter
Counter(agg.labels_)

In [None]:
# Agglomerative clusters plotted on the first two principal components.
plt.figure(figsize = (10,10))
# x and y are passed by keyword: seaborn >= 0.12 rejects positional data
# vectors here, and earlier versions only accepted them with a warning.
sns.scatterplot(x = x_pca[:,0], y = x_pca[:,1], hue = agg.labels_, palette = 'Set1')
plt.show()

The data has been clustered into 3 clusters labelled as 0,1,2 using Agglomerative Hierarchical clustering algorithm

# Model Comparison
We will plot the clustering given by different models

In [None]:
# Draw the K-Means and hierarchical clusterings next to each other on the
# first two principal components.
ax = []
names = ["K-means clustering","Hierarchical clustering"]
comp = [kmeans.labels_,agg.labels_]
fig = plt.figure(figsize = (10,10))
for i, (title, cluster_labels) in enumerate(zip(names, comp)):
    ax.append(fig.add_subplot(3,2,(i+1)))
    # x and y are passed by keyword: seaborn >= 0.12 rejects positional data
    # vectors here, and earlier versions only accepted them with a warning.
    sns.scatterplot(x = x_pca[:,0], y = x_pca[:,1], hue = cluster_labels, ax = ax[i], palette = "Set1")
    ax[i].set_title(title)
plt.show()

Not much difference is visible between the clusterings produced by the different algorithms.
Evaluating the clustering algorithms
The Davies–Bouldin index is a metric for evaluating clustering algorithms in which the quality of the clustering is validated using quantities and features inherent to the dataset.
The lower the DB index value, the better the clustering.

In [None]:
from sklearn.metrics import davies_bouldin_score

# One column per algorithm, holding its name, silhouette score and
# Davies-Bouldin index, all computed on the PCA-reduced data.
row_names = ['Algorithm','Silhouette score','DB index']
alg = ["K-Means","Hierarchical"]
comp = [kmeans.labels_,agg.labels_]
d = {
    position: [
        algorithm,
        silhouette_score(pcadf, cluster_labels),
        davies_bouldin_score(pcadf, cluster_labels),
    ]
    for position, (algorithm, cluster_labels) in enumerate(zip(alg, comp))
}
table = pd.DataFrame(d, index=row_names)
table

From the above table it can be observed that the clustering algorithms perform similarly.
The Silhouette score is highest and the DB index is lowest for the K-Means algorithm.
Hence it can be concluded that K-Means is the best clustering algorithm for our data.

In [None]:
# Work on a copy so that adding the "Cluster" column does not also modify `no`
# (plain assignment only creates a second name for the same dataframe).
newdf = no.copy()
newdf["Cluster"] = kmeans.labels_
newdf

In [None]:
# One boxplot per column for the first 13 columns of newdf, grouped by
# cluster and laid out on a 5 x 3 grid.
ax = []
names = newdf.columns
fig = plt.figure(figsize=(20, 20))
for position in range(1, 14):
    axis = fig.add_subplot(5, 3, position)
    ax.append(axis)
    sns.boxplot(x="Cluster", y=names[position - 1], data=newdf, ax=axis, palette="Set2")

plt.show()

Hence we can infer that the different wines in this dataset can be divided into three groups on the basis of the different variables