# HIERARCHICAL CLUSTERING

In [1]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline

Hierarchical clustering is a clustering algorithm which uses the following steps to develop clusters:
1. Start with each data point in a single cluster.
2. Find the two clusters with the shortest distance between them (using an appropriate distance measure and linkage criterion) and
merge them to form a single cluster.
3. Repeat step 2 until all data points are merged together to form a single cluster. The resulting hierarchy can then be cut at the level that yields the desired number of clusters.

In [2]:
# Read the beer dataset from the working directory and preview its first rows.
# NOTE(review): the preview shows an `Unnamed: 0` column, i.e. the CSV carries a
# saved row index; passing `index_col=0` would absorb it if it is not wanted.
df = pd.read_csv(filepath_or_buffer="beer.csv")
df.head(n=5)

Unnamed: 0,name,calories,sodium,alcohol,cost
0,Budweiser,144,15,4.7,0.43
1,Schlitz,151,19,4.9,0.43
2,Lowenbrau,157,15,0.9,0.48
3,Kronenbourg,170,7,5.2,0.73
4,Heineken,152,11,5.0,0.77


In [3]:
from sklearn.cluster import AgglomerativeClustering

In [4]:
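# We will create clusters using AgglomerativeClustering and store the new cluster labels in the h_clusterid variable.
# The features are standardized first (StandardScaler) so that variables measured on larger scales,
# such as calories, do not dominate the distance computation.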
from sklearn.preprocessing import StandardScaler

In [5]:
# Standardize the four numeric attributes so they are on a comparable scale
# before distances are computed. The result is a NumPy array (see the preview
# in the next cell), not a DataFrame, despite the variable's name.
scaler = StandardScaler()
scaled_beer_df = scaler.fit_transform(
    df.loc[:, ["calories", "sodium", "alcohol", "cost"]]
)

In [9]:
# First five standardized rows; columns are calories, sodium, alcohol, cost.
scaled_beer_df[:5]

array([[ 0.38791334,  0.00779468,  0.43380786, -0.45682969],
       [ 0.6250656 ,  0.63136906,  0.62241997, -0.45682969],
       [ 0.82833896,  0.00779468, -3.14982226, -0.10269815],
       [ 1.26876459, -1.23935408,  0.90533814,  1.66795955],
       [ 0.65894449, -0.6157797 ,  0.71672602,  1.95126478]])

In [10]:
# Build three clusters bottom-up on the standardized features, leaving every
# other AgglomerativeClustering option at its library default, and attach the
# resulting label of each beer to the original frame as `h_clusterid`.
h_clusters = AgglomerativeClustering(n_clusters=3)
df["h_clusterid"] = h_clusters.fit_predict(scaled_beer_df)

In [11]:
# Confirm that the `h_clusterid` column was appended to the frame.
df.head(n=5)

Unnamed: 0,name,calories,sodium,alcohol,cost,h_clusterid
0,Budweiser,144,15,4.7,0.43,1
1,Schlitz,151,19,4.9,0.43,1
2,Lowenbrau,157,15,0.9,0.48,0
3,Kronenbourg,170,7,5.2,0.73,2
4,Heineken,152,11,5.0,0.77,2


In [12]:
# Beers assigned to cluster 0. In the recorded output these are mostly the
# "Light" beers with lower calories, plus Lowenbrau with its 0.9 alcohol value.
df.loc[df["h_clusterid"] == 0]

Unnamed: 0,name,calories,sodium,alcohol,cost,h_clusterid
2,Lowenbrau,157,15,0.9,0.48,0
8,Miller_Lite,99,10,4.3,0.43,0
9,Budweiser_Light,113,8,3.7,0.4,0
11,Coors_Light,102,15,4.1,0.46,0
12,Michelob_Light,135,11,4.2,0.5,0
15,Pabst_Extra_Light,68,15,2.3,0.38,0
18,Olympia_Goled_Light,72,6,2.9,0.46,0
19,Schlitz_Light,97,7,4.2,0.47,0


In [13]:
# Beers assigned to cluster 1. In the recorded output these have the highest
# sodium values (15-27) and low cost (0.28-0.44).
df.loc[df["h_clusterid"] == 1]

Unnamed: 0,name,calories,sodium,alcohol,cost,h_clusterid
0,Budweiser,144,15,4.7,0.43,1
1,Schlitz,151,19,4.9,0.43,1
5,Old_Milwaukee,145,23,4.6,0.28,1
6,Augsberger,175,24,5.5,0.4,1
7,Srohs_Bohemian_Style,149,27,4.7,0.42,1
10,Coors,140,18,4.6,0.44,1
16,Hamms,139,19,4.4,0.43,1
17,Heilemans_Old_Style,144,24,4.9,0.43,1


In [14]:
# Beers assigned to cluster 2. In the recorded output these are the four most
# expensive beers (cost 0.73-0.79, versus at most 0.50 in the other clusters).
df.loc[df["h_clusterid"] == 2]

Unnamed: 0,name,calories,sodium,alcohol,cost,h_clusterid
3,Kronenbourg,170,7,5.2,0.73,2
4,Heineken,152,11,5.0,0.77,2
13,Becks,150,19,4.7,0.76,2
14,Kirin,149,6,5.0,0.79,2
