## Module 18.6.3 Running Hierarchical Clustering

In [None]:
# Mount Google Drive inside the Colab runtime so that files under
# /content/drive (such as the CSV loaded later) are readable.
# Running this again when the drive is already mounted only prints a notice.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
! pip install hvplot



In [56]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
import hvplot.pandas
import hvplot


In [57]:
# Read the iris measurements from the mounted Google Drive folder.
file_path = '/content/drive/MyDrive/Data/new_iris_data.csv'
df_iris = pd.read_csv(file_path)

# Preview the first five rows.
df_iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [61]:
# Standardize every column of df_iris before running PCA.
scaler = StandardScaler()
iris_scaled = scaler.fit_transform(df_iris)

# Show the first five standardized rows.
print(iris_scaled[:5])

[[-0.90068117  1.03205722 -1.3412724  -1.31297673]
 [-1.14301691 -0.1249576  -1.3412724  -1.31297673]
 [-1.38535265  0.33784833 -1.39813811 -1.31297673]
 [-1.50652052  0.10644536 -1.2844067  -1.31297673]
 [-1.02184904  1.26346019 -1.3412724  -1.31297673]]


In [62]:
# Reduce the scaled dataset from four features to two principal components.
pca = PCA(n_components=2)
iris_pca = pca.fit_transform(iris_scaled)

# Wrap the projected array in a DataFrame with one named column per component.
pc_names = ["principal component 1", "principal component 2"]
df_iris_pca = pd.DataFrame(iris_pca, columns=pc_names)
df_iris_pca.head()

Unnamed: 0,principal component 1,principal component 2
0,-2.264542,0.505704
1,-2.086426,-0.655405
2,-2.36795,-0.318477
3,-2.304197,-0.575368
4,-2.388777,0.674767


In [63]:
# Fraction of the total variance captured by each of the two principal
# components (the values shown below sum to roughly 0.958).
pca.explained_variance_ratio_

array([0.72770452, 0.23030523])

In [64]:
# Import Plotly's figure factory, which is used below to draw the dendrogram for the hierarchical clustering.
import plotly.figure_factory as ff



In [65]:
# Draw a dendrogram of the PCA-reduced observations.
# Only the two principal-component columns are passed, so re-running this
# cell after the "class" label column has been added to df_iris_pca does not
# let the cluster labels influence the distances.
# color_threshold=0 makes all the branches the same color.
# NOTE(review): create_dendrogram appears to default to complete linkage,
# whereas AgglomerativeClustering defaults to Ward linkage, so this tree may
# not show the same merges as the fitted model -- confirm which is intended.
pc_columns = ["principal component 1", "principal component 2"]
fig = ff.create_dendrogram(df_iris_pca[pc_columns], color_threshold=0)
fig.update_layout(width=800, height=500)
fig.show()

In [66]:
# Run agglomerative (bottom-up hierarchical) clustering with three clusters.
# Fit on the two principal-component columns only, so that re-running this
# cell after the "class" label column has been added to df_iris_pca does not
# feed the previous labels back in as a feature.
pc_columns = ["principal component 1", "principal component 2"]
agg = AgglomerativeClustering(n_clusters=3)

# fit() returns the fitted estimator, so `model` and `agg` are the same object.
model = agg.fit(df_iris_pca[pc_columns])

In [67]:
# Add the cluster label of each observation to df_iris_pca as a new "class"
# column (the frame is modified in place; the plotting cell relies on it).
df_iris_pca["class"] = model.labels_
df_iris_pca.head()

Unnamed: 0,principal component 1,principal component 2,class
0,-2.264542,0.505704,0
1,-2.086426,-0.655405,0
2,-2.36795,-0.318477,0
3,-2.304197,-0.575368,0
4,-2.388777,0.674767,0


In [68]:
import ipywidgets as w
from IPython.display import display

In [69]:
%matplotlib notebook
%matplotlib inline

In [70]:

# Build a scatter plot of the two principal components, grouped by the
# cluster label stored in the "class" column.
scatter_kwargs = {
    "x": "principal component 1",
    "y": "principal component 2",
    "hover_cols": ["class"],
    "by": "class",
}
plot = df_iris_pca.hvplot.scatter(**scatter_kwargs)

In [71]:
# Export the scatter plot to test.html; the path is relative, so the file is
# written to the notebook's current working directory.
hvplot.save(plot, 'test.html')

In [None]:
# ! pip install selenium

In [72]:
# Optional, for PNG export: in an Anaconda prompt, run the following command to install Firefox and geckodriver
# conda install -c conda-forge firefox geckodriver

In [None]:
# hvplot.save(plot, 'test.png')