In [3]:
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import warnings 
warnings.filterwarnings('ignore')

# Nutrient Dataset


In [12]:
# Read the nutrient table; the first CSV column becomes the row index.
nutrient = pd.read_csv(filepath_or_buffer='nutrient.csv', index_col=0)

# Bare last expression so the notebook renders the frame.
nutrient

Unnamed: 0_level_0,energy,protein,fat,calcium,iron
Food_Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BEEF BRAISED,340,20,28,9,2.6
HAMBURGER,245,21,17,9,2.7
BEEF ROAST,420,15,39,7,2.0
BEEF STEAK,375,19,32,9,2.6
BEEF CANNED,180,22,10,17,3.7
CHICKEN BROILED,115,20,3,8,1.4
CHICKEN CANNED,170,25,7,12,1.5
BEEF HEART,160,26,5,14,5.9
LAMB LEG ROAST,265,20,20,9,2.6
LAMB SHOULDER ROAST,300,18,25,9,2.3


In [13]:
# Build the scaler first, then ask it to hand back DataFrames instead of
# bare arrays so the scaled data keeps its row and column labels.
scaler = StandardScaler()
scaler.set_output(transform='pandas')

# Fit on the raw nutrient table and scale it in one pass.
nutrient_sclaed = scaler.fit_transform(nutrient)

In [14]:
# Grid-search DBSCAN over `eps` and `min_samples`, scoring every usable setting
# by the silhouette of its non-noise points, then show the best-scoring row(s).
eps_range = [0.2, 0.4, 0.6, 1]
mp_range = [2, 3, 4, 5]

# Pick the feature columns by name, once, before the loop. Slicing with
# `.iloc[:, :-1]` dropped the last real feature until the 'Clust' column had
# been appended, so early settings were clustered on fewer columns than later
# ones (and fewer than the columns used for scoring). Excluding 'Clust' by name
# also makes the cell give the same result when it is re-run.
feature_cols = [col for col in nutrient_sclaed.columns if col != 'Clust']
features = nutrient_sclaed[feature_cols]

cnt = 0
a = []
for i in eps_range:
    for j in mp_range:
        clust_DB = DBSCAN(eps=i, min_samples=j)
        clust_DB.fit(features)
        labels = clust_DB.labels_

        # DBSCAN marks noise with -1; only clustered points are scored.
        inlier_mask = labels != -1
        n_clusters = len(set(labels[inlier_mask]))

        # The silhouette needs at least two clusters and fewer clusters than
        # scored points. Counting clusters without the noise label also accepts
        # a clean two-cluster result that has no noise at all.
        if 2 <= n_clusters < inlier_mask.sum():
            # Running index of the accepted settings.
            cnt = cnt + 1
            # Kept as the trailing column because later cells slice it off
            # positionally; it holds the labels of the latest accepted setting.
            nutrient_sclaed['Clust'] = labels
            sil_sc = silhouette_score(features.loc[inlier_mask], labels[inlier_mask])
            a.append([cnt, i, j, sil_sc])
            print(i, j, sil_sc)

# Build the table straight from the records: this keeps integer columns as
# integers and yields an empty table (instead of going through an empty
# array) when no setting qualified.
pa = pd.DataFrame(a, columns=['Sr','eps','min_point','silouette score'])
print("Best Paramters:")
pa[pa['silouette score'] == pa['silouette score'].max()]


0.2 2 0.6892754870042915
0.4 2 0.9040570858308878
0.6 2 0.7752441663222646
1 2 0.43236957939711584
1 3 0.43147150560271585
Best Paramters:


Unnamed: 0,Sr,eps,min_point,silouette score
1,2.0,0.4,2.0,0.904057


In [15]:
# DBSCAN with Best Parameters

# Read the winning setting from the grid-search table instead of retyping it,
# so this cell cannot drift from the search results above.
best_row = pa.loc[pa['silouette score'].idxmax()]
best_eps = float(best_row['eps'])
# The table may hold this as a float; DBSCAN expects an integer count.
best_min_samples = int(best_row['min_point'])

# Exclude the label column by name. `.iloc[:, :-1]` was only correct when
# 'Clust' happened to be the last column; without it, a real feature was dropped.
feature_cols = [col for col in nutrient_sclaed.columns if col != 'Clust']

clust_DB = DBSCAN(eps=best_eps, min_samples=best_min_samples)
clust_DB.fit(nutrient_sclaed[feature_cols])
print(clust_DB.labels_)
# Number of distinct labels, counting the noise label (-1) when present.
print(len(set(clust_DB.labels_)))

[ 0 -1 -1  0 -1 -1  1 -1 -1 -1  0  0  0 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1  1 -1]
3


In [16]:
# Profile the clusters on the original (unscaled) nutrient values.
cluster_labels = clust_DB.labels_
is_clustered = cluster_labels != -1   # -1 is DBSCAN's noise label

# Keep only the foods that landed in a cluster, then attach their labels.
nutrient_clust = nutrient[is_clustered].copy()
nutrient_clust['cluster'] = cluster_labels[is_clustered]

# Member-level view, grouped visually by cluster.
members_by_cluster = nutrient_clust.sort_values('cluster')
print(members_by_cluster)

# Per-cluster average of every nutrient.
nutrient_clust.groupby('cluster').mean()


                energy  protein  fat  calcium  iron  cluster
Food_Item                                                   
BEEF BRAISED       340       20   28        9   2.6        0
BEEF STEAK         375       19   32        9   2.6        0
SMOKED HAM         340       20   28        9   2.5        0
PORK ROAST         340       19   29        9   2.5        0
PORK SIMMERED      355       19   30        9   2.4        0
CHICKEN CANNED     170       25    7       12   1.5        1
TUNA CANNED        170       25    7        7   1.2        1


Unnamed: 0_level_0,energy,protein,fat,calcium,iron
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,350.0,19.4,29.4,9.0,2.52
1,170.0,25.0,7.0,9.5,1.35
