## Hierarchical Clustering Algorithm

### Read the data into a dataframe

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

grades = pd.read_csv("data/grades.csv")
grades.head()

Unnamed: 0,Student,English,Math,Science
0,1,99,96,97
1,2,99,96,97
2,3,98,97,97
3,4,95,100,95
4,5,95,96,96


### The 3D plot of the grades of the students

In [39]:
from mpl_toolkits.mplot3d import Axes3D
%matplotlib notebook
%matplotlib notebook


fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Plot the values
ax.scatter(grades.iloc[:, 1], grades.iloc[:, 2], grades.iloc[:, 3], c=None, edgecolor='k', alpha=0.5)

ax.set_xlabel('English')
ax.set_ylabel('Maths')
ax.set_zlabel('Science')
ax.set_title('Original Data')

plt.show()

<IPython.core.display.Javascript object>

### Agglomerative Clustering is applied to the datset (3 clusters)
The counter suggests that the clusters are uniform, with almost same number of observations in each cluster

In [41]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances
from collections import Counter

agg = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='complete')
labels = agg.fit_predict(grades)
print(Counter(labels))

labels

Counter({0: 245, 1: 201, 2: 174})


array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2,

### Plot the clusters in 3D for visualizing them better (3 clusters)

In [43]:
def set_colors(labels, colors='rgbykcm'):
    colored_labels = []
    for label in labels:
        colored_labels.append(colors[label])
    return colored_labels


from mpl_toolkits.mplot3d import Axes3D
%matplotlib notebook
%matplotlib notebook

colors = set_colors(labels)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Plot the values
ax.scatter(grades.iloc[:, 1], grades.iloc[:, 2], grades.iloc[:, 3], c=colors, edgecolor='k', alpha=0.5)

ax.set_xlabel('English')
ax.set_ylabel('Maths')
ax.set_zlabel('Science')
ax.set_title('3 Clusters')

plt.show()

<IPython.core.display.Javascript object>

### Agglomerative Clustering is applied to the datset (4 clusters)

In [37]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances
from collections import Counter

agg_4 = AgglomerativeClustering(n_clusters=4, affinity='euclidean', linkage='complete')
labels4 = agg_4.fit_predict(grades)
print(Counter(labels4))

labels4

Counter({0: 201, 2: 174, 1: 137, 3: 108})


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2,

### 3D plot (4 clusters)
As we can see from the below graph, choosing 4 clusters doesnot group the data well. There is a lot of overlapping between gree and yellow observations

In [38]:
def set_colors(labels, colors='rgbykcm'):
    colored_labels = []
    for label in labels:
        colored_labels.append(colors[label])
    return colored_labels


from mpl_toolkits.mplot3d import Axes3D
%matplotlib notebook
%matplotlib notebook

colors = set_colors(labels4)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Plot the values
ax.scatter(grades.iloc[:, 1], grades.iloc[:, 2], grades.iloc[:, 3], c=colors, edgecolor='k', alpha=0.5)

ax.set_xlabel('English')
ax.set_ylabel('Maths')
ax.set_zlabel('Science')
ax.set_title('4 Clusters')

plt.show()

<IPython.core.display.Javascript object>

### References

https://github.com/mlnjsh/DSBDA-Btech/blob/master/grades_km_input.csv<br>
Data Science and Big Data Analytics by Wiley Publications<br>
http://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_iris.html#sphx-glr-auto-examples-cluster-plot-cluster-iris-py<br>
http://marcharper.codes/2016-07-11/Clustering+with+Scikit-Learn.html