In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler, StandardScaler, FunctionTransformer
from sklearn.cluster import DBSCAN, KMeans
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from ipywidgets import interact
import ipywidgets as widgets
import seaborn as sns

In [2]:
from sklearn.datasets import load_wine

In [3]:
# NOTE: despite its name, `df` is the dataset object returned by load_wine();
# later cells read its .data, .target, .target_names and .feature_names
# attributes. The pandas DataFrame is only built further down as `wine_data`.
df = load_wine()

In [4]:
# Feature matrix of the dataset; the next cell shows its shape, (178, 13).
wine = df.data

In [5]:
wine.shape

(178, 13)

In [6]:
# One class id per sample; the next cell shows its shape, (178,).
wine_label = df.target

In [7]:
wine_label.shape

(178,)

In [8]:
df.target_names

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

In [9]:
# Turn the 1-D label vector into a single column so it can be joined to the
# feature matrix. Using -1 lets NumPy infer the number of rows instead of
# hard-coding the dataset size (178).
labels = np.reshape(wine_label, (-1, 1))

In [10]:
# Append the label column to the right of the feature columns.
final_wine_data = np.hstack((wine, labels))

In [11]:
final_wine_data.shape

(178, 14)

In [12]:
# Wrap the combined array in a DataFrame; at this point the columns still carry
# pandas' default integer labels.
wine_data = pd.DataFrame(data=final_wine_data)
wine_data.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0.0


In [13]:
# Feature names shipped with the dataset; used below as column headers.
features = df.feature_names

In [14]:
# Column headers for the full table: the feature names followed by 'label'.
feature_label = np.concatenate((features, ['label']))

In [15]:
# Replace the default integer column labels with the feature names plus 'label'.
# This mutates `wine_data` in place.
wine_data.columns = feature_label

In [16]:
wine_data

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,label
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0.0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0.0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0.0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0.0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2.0
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2.0
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2.0


In [17]:
# Feature frame: every column of wine_data except the last one ('label').
X = wine_data.iloc[:,: -1]

X.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [18]:
# Target series: the last column ('label'), which holds the class ids as floats
# (0.0, 1.0, 2.0 in the table shown above).
y = wine_data.iloc[:,-1]

## Using K-Means Clustering Algorithm

In [19]:
cols = [
    "alcohol", "malic_acid", "ash", "alcalinity_of_ash",
    "magnesium", "total_phenols", "flavanoids", "nonflavanoid_phenols",
    "proanthocyanins", "color_intensity", "hue", "od280/od315_of_diluted_wines", "proline",
]


def make_kmeans(attributes=cols, scaler=MinMaxScaler(), dimension_reduction='pca', components=2, k=3, show='features'):
    """Cluster the wine samples with K-Means and draw a pair plot coloured by cluster.

    Reads the notebook-level feature frame ``X`` and label series ``y``.

    Parameters
    ----------
    attributes : iterable of str
        Columns of ``X`` that are scaled, optionally projected, and clustered.
    scaler : transformer
        Object exposing ``fit_transform``; applied to the selected columns.
    dimension_reduction : {'pca', 'lda', None}
        'pca' keeps the first ``components`` principal components.
        'lda' projects with Linear Discriminant Analysis, which is supervised
        and therefore uses the true class labels ``y``.
        Any other value clusters the scaled values directly.
    components : int
        Requested number of projected dimensions. It is capped at the number
        of selected attributes and, for LDA, at ``n_classes - 1``.
    k : int
        Number of K-Means clusters.
    show : {'features', 'values'}
        'features' plots all original columns of ``X``; 'values' plots the
        (scaled / projected) values that were actually clustered.

    Raises
    ------
    ValueError
        If ``attributes`` is empty or ``show`` is not a supported option.
    """
    attributes = list(attributes)
    if not attributes:
        raise ValueError("Select at least one feature.")
    if show not in ('features', 'values'):
        raise ValueError("show must be 'features' or 'values', got %r" % (show,))

    scaled_values = scaler.fit_transform(X[attributes])
    components = min(components, len(attributes))

    if dimension_reduction == 'pca':
        values = PCA().fit_transform(scaled_values)[:, :components]
    elif dimension_reduction == 'lda':
        # LDA needs class labels as its target. The previous code passed a
        # continuous feature column instead and ignored both the selected
        # attributes and the scaler.
        class_ids = y.astype(int)
        # scikit-learn requires n_components <= min(n_features, n_classes - 1).
        max_components = min(len(attributes), class_ids.nunique() - 1)
        lda = LinearDiscriminantAnalysis(n_components=min(components, max_components))
        values = lda.fit_transform(scaled_values, class_ids)
    else:
        values = scaled_values

    # Fixed seed and explicit n_init so that re-running the cell (or moving a
    # slider back and forth) reproduces the same clustering.
    cluster = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(values)

    if show == 'features':
        df_plot = X.astype(float).copy()
    else:
        df_plot = pd.DataFrame(values)

    df_plot['cluster'] = cluster

    sns.pairplot(df_plot, hue="cluster", diag_kind='hist', diag_kws={'alpha': 0.5},
                 vars=[c for c in df_plot.columns if c != 'cluster'])

# Shared look-and-feel for every control of the K-Means explorer.
style = {'description_width': '150px'}
layout = widgets.Layout(width='400px')

# Keyword arguments that every widget below receives unchanged.
kmeans_widget_kwargs = dict(
    layout=layout,
    style=style,
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
)

# Wire one widget to each parameter of make_kmeans; changing a control re-runs it.
i = interact(
    make_kmeans,
    attributes=widgets.SelectMultiple(
        options=cols, value=cols, rows=len(cols),
        description='Features', **kmeans_widget_kwargs),
    scaler=widgets.RadioButtons(
        options=[
            ('Z-Transform', StandardScaler()),
            ('Min-Max', MinMaxScaler()),
            ('None', FunctionTransformer(validate=False)),
        ],
        description='Scaler', **kmeans_widget_kwargs),
    dimension_reduction=widgets.RadioButtons(
        options=[('PCA', 'pca'), ('LDA', 'lda'), ('without', None)],
        description='Dimensionality Reduction', **kmeans_widget_kwargs),
    components=widgets.SelectionSlider(
        options=range(1, 5), value=4,
        description='PCA components', **kmeans_widget_kwargs),
    k=widgets.SelectionSlider(
        options=range(2, 8),
        description='k', **kmeans_widget_kwargs),
    show=widgets.RadioButtons(
        options=[('Features', 'features'), ('Values', 'values')],
        description='Show', **kmeans_widget_kwargs),
)


interactive(children=(SelectMultiple(description='Features', index=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),…

## Using DBSCAN Clustering Algorithm

In [20]:
cols = ["alcohol", "malic_acid", "ash" , "alcalinity_of_ash" , 
        "magnesium", "total_phenols", "flavanoids", "nonflavanoid_phenols",
        "proanthocyanins", "color_intensity" ,"hue" , "od280/od315_of_diluted_wines",  "proline"]
def make_dbscan(attributes=cols, scaler=MinMaxScaler(), dimension_reduction=None, components=5, eps=.5,
min_samples=5, show='features'):
    """Cluster the wine samples with DBSCAN and draw a pair plot coloured by cluster.

    Reads the notebook-level feature frame ``X`` and label series ``y``, and
    prints the number of clusters and outliers found.

    Parameters
    ----------
    attributes : iterable of str
        Columns of ``X`` that are scaled, optionally projected, and clustered.
    scaler : transformer
        Object exposing ``fit_transform``; applied to the selected columns.
    dimension_reduction : {'pca', 'lda', None}
        'pca' keeps the first ``components`` principal components.
        'lda' projects with Linear Discriminant Analysis, which is supervised
        and therefore uses the true class labels ``y``.
        Any other value clusters the scaled values directly.
    components : int
        Requested number of projected dimensions. It is capped at the number
        of selected attributes and, for LDA, at ``n_classes - 1``.
    eps : float
        DBSCAN neighbourhood radius.
    min_samples : int
        DBSCAN minimum neighbourhood size for a core point.
    show : {'features', 'values'}
        'features' plots all original columns of ``X``; 'values' plots the
        (scaled / projected) values that were actually clustered.

    Raises
    ------
    ValueError
        If ``attributes`` is empty or ``show`` is not a supported option.
    """
    attributes = list(attributes)
    if not attributes:
        raise ValueError("Select at least one feature.")
    if show not in ('features', 'values'):
        raise ValueError("show must be 'features' or 'values', got %r" % (show,))

    scaled_values = scaler.fit_transform(X[attributes])
    components = min(components, len(attributes))

    if dimension_reduction == 'pca':
        values = PCA().fit_transform(scaled_values)[:, :components]
    elif dimension_reduction == 'lda':
        # The widget sends 'lda'; the old check for 'tsne' never matched, so
        # picking LDA silently skipped the projection. LDA also needs class
        # labels as its target, not a continuous feature column.
        class_ids = y.astype(int)
        # scikit-learn requires n_components <= min(n_features, n_classes - 1).
        max_components = min(len(attributes), class_ids.nunique() - 1)
        lda = LinearDiscriminantAnalysis(n_components=min(components, max_components))
        values = lda.fit_transform(scaled_values, class_ids)
    else:
        values = scaled_values

    cluster = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(values)
    # DBSCAN marks noise points with the label -1; real clusters are numbered from 0.
    print('Found', 1+np.max(cluster), 'clusters with', np.sum(cluster == -1), 'outliers')

    if show == 'features':
        df_plot = X.astype(float).copy()
    else:
        df_plot = pd.DataFrame(values)

    df_plot['cluster'] = cluster

    sns.pairplot(df_plot, hue="cluster", diag_kind='hist', diag_kws={'alpha': 0.5},
                 vars=[c for c in df_plot.columns if c != 'cluster'])

# Shared look-and-feel for every control of the DBSCAN explorer.
style = {'description_width': '150px'}
layout = widgets.Layout(width='400px')

# Keyword arguments that every widget below receives unchanged.
dbscan_widget_kwargs = dict(
    layout=layout,
    style=style,
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
)

# Wire one widget to each parameter of make_dbscan; changing a control re-runs it.
i = interact(
    make_dbscan,
    attributes=widgets.SelectMultiple(
        options=cols, value=cols, rows=len(cols),
        description='Features', **dbscan_widget_kwargs),
    scaler=widgets.RadioButtons(
        options=[
            ('Z-Transform', StandardScaler()),
            ('Min-Max', MinMaxScaler()),
            ('None', FunctionTransformer(validate=False)),
        ],
        description='Scaler', **dbscan_widget_kwargs),
    dimension_reduction=widgets.RadioButtons(
        options=[('PCA', 'pca'), ('LDA', 'lda'), ('without', None)],
        description='Dimensionality Reduction', **dbscan_widget_kwargs),
    components=widgets.SelectionSlider(
        options=range(1, 5), value=4,
        description='Number components', **dbscan_widget_kwargs),
    eps=widgets.SelectionSlider(
        options=[.1, .25, .5, .6, .7, .8, .9, 1., 1.25, 1.5, 2.], value=.5,
        description='Epsilon', **dbscan_widget_kwargs),
    min_samples=widgets.SelectionSlider(
        options=range(1, 20), value=5,
        description='Min. samples', **dbscan_widget_kwargs),
    show=widgets.RadioButtons(
        options=[('Features', 'features'), ('Values', 'values')],
        description='Show', **dbscan_widget_kwargs),
)

interactive(children=(SelectMultiple(description='Features', index=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),…

## Using Hierarchical Clustering Algorithm

In [38]:
from sklearn.cluster import AgglomerativeClustering
cols = ["alcohol", "malic_acid", "ash" , "alcalinity_of_ash" , 
        "magnesium", "total_phenols", "flavanoids", "nonflavanoid_phenols",
        "proanthocyanins", "color_intensity" ,"hue" , "od280/od315_of_diluted_wines",  "proline"]

def make_agg_clustering(attributes=cols, scaler=MinMaxScaler(), dimension_reduction=None, components=5, n_clusters=3, linkage='ward', show='features'):
    """Cluster the wine samples with agglomerative clustering and draw a pair plot.

    Reads the notebook-level feature frame ``X`` and label series ``y``, and
    prints the number of distinct clusters found.

    Parameters
    ----------
    attributes : iterable of str
        Columns of ``X`` that are scaled, optionally projected, and clustered.
    scaler : transformer
        Object exposing ``fit_transform``; applied to the selected columns.
    dimension_reduction : {'pca', 'lda', None}
        'pca' keeps the first ``components`` principal components.
        'lda' projects with Linear Discriminant Analysis, which is supervised
        and therefore uses the true class labels ``y``.
        Any other value clusters the scaled values directly.
    components : int
        Requested number of projected dimensions. It is capped at the number
        of selected attributes and, for LDA, at ``n_classes - 1``.
    n_clusters : int
        Number of clusters passed to AgglomerativeClustering.
    linkage : str
        Linkage criterion passed to AgglomerativeClustering.
    show : {'features', 'values'}
        'features' plots all original columns of ``X``; 'values' plots the
        (scaled / projected) values that were actually clustered.

    Raises
    ------
    ValueError
        If ``attributes`` is empty or ``show`` is not a supported option.
    """
    attributes = list(attributes)
    if not attributes:
        raise ValueError("Select at least one feature.")
    if show not in ('features', 'values'):
        raise ValueError("show must be 'features' or 'values', got %r" % (show,))

    scaled_values = scaler.fit_transform(X[attributes])
    components = min(components, len(attributes))

    if dimension_reduction == 'pca':
        values = PCA().fit_transform(scaled_values)[:, :components]
    elif dimension_reduction == 'lda':
        # The widget sends 'lda'; the old check for 'tsne' never matched, so
        # picking LDA silently skipped the projection. LDA also needs class
        # labels as its target, not a continuous feature column.
        class_ids = y.astype(int)
        # scikit-learn requires n_components <= min(n_features, n_classes - 1).
        max_components = min(len(attributes), class_ids.nunique() - 1)
        lda = LinearDiscriminantAnalysis(n_components=min(components, max_components))
        values = lda.fit_transform(scaled_values, class_ids)
    else:
        values = scaled_values

    cluster = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage).fit_predict(values)
    print('Found', np.unique(cluster).size, 'clusters')

    if show == 'features':
        df_plot = X.astype(float).copy()
    else:
        df_plot = pd.DataFrame(values)

    df_plot['cluster'] = cluster

    sns.pairplot(df_plot, hue="cluster", diag_kind='hist', diag_kws={'alpha': 0.5},
                 vars=[c for c in df_plot.columns if c != 'cluster'])

# Shared look-and-feel for every control of the agglomerative-clustering explorer.
style = {'description_width': '150px'}
layout = widgets.Layout(width='400px')

# Wire one widget to each parameter of make_agg_clustering; changing a control re-runs it.
i = interact(
    make_agg_clustering,
    attributes=widgets.SelectMultiple(
        options=cols, value=cols, rows=len(cols),
        description='Features', layout=layout, style=style,
        disabled=False, continuous_update=False, orientation='horizontal', readout=True),
    scaler=widgets.RadioButtons(
        options=[
            ('Z-Transform', StandardScaler()),
            ('Min-Max', MinMaxScaler()),
            ('None', FunctionTransformer(validate=False)),
        ],
        description='Scaler', layout=layout, style=style,
        disabled=False, continuous_update=False, orientation='horizontal', readout=True),
    dimension_reduction=widgets.RadioButtons(
        options=[('PCA', 'pca'), ('LDA', 'lda'), ('without', None)],
        description='Dimensionality Reduction', layout=layout, style=style,
        disabled=False, continuous_update=False, orientation='horizontal', readout=True),
    components=widgets.SelectionSlider(
        options=range(1, 5), value=4,
        description='Number components', layout=layout, style=style,
        disabled=False, continuous_update=False, orientation='horizontal', readout=True),
    n_clusters=widgets.SelectionSlider(
        options=range(2, 8), value=3,
        description='Number of Clusters', layout=layout, style=style,
        disabled=False, continuous_update=False, orientation='horizontal', readout=True),
    # The closing parenthesis of RadioButtons used to sit right after `options`,
    # so description/layout/style/... were handed to interact() instead of this
    # widget. They now belong to the linkage control like on every other widget.
    linkage=widgets.RadioButtons(
        options=['ward', 'complete', 'average'],
        description='Linkage Type', layout=layout, style=style,
        disabled=False, continuous_update=False, orientation='horizontal', readout=True),
    show=widgets.RadioButtons(
        options=[('Features', 'features'), ('Values', 'values')],
        description='Show', layout=layout, style=style,
        disabled=False, continuous_update=False, orientation='horizontal', readout=True),
)

interactive(children=(SelectMultiple(description='Features', index=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),…