# Hierarchical Agglomerative Clustering (HAC)
We chose **Hierarchical Agglomerative Clustering (HAC)** as our main method of analysis.

This clustering applies a *bottom-up approach*, where each data point starts in its own cluster, and the clusters are then successively merged according to a chosen linkage method. HAC will allow us to group players based on similarities in the data points generated from our *Principal Component Analysis*.

We will first evaluate each linkage method by plotting a dendrogram and considering the general distribution of the data to arrive at the optimal method. The dendrogram will allow us to determine the ideal number of clusters, while the value counts of the respective clusters will let us know the suitability of the clusters found in continuing our analysis.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from pyclustertend import hopkins
from sklearn.cluster import AgglomerativeClustering
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster
import plotly.express as px
from mpl_toolkits.mplot3d import Axes3D
import numpy as np

In [None]:
def create_cluster(model, max_d):
    """Cut a fitted hierarchy at height ``max_d`` and return flat cluster labels.

    Parameters
    ----------
    model
        Fitted estimator exposing ``children_`` (merge pairs) and
        ``distances_`` (merge heights).
    max_d
        Distance threshold handed to ``fcluster`` with
        ``criterion='distance'``.

    Returns
    -------
    The array of cluster labels produced by ``fcluster``, one per observation.
    """
    merges = model.children_
    # The fourth linkage column (observation counts) is filled with zeros.
    sizes = np.zeros(merges.shape[0])
    Z = np.column_stack([merges, model.distances_, sizes]).astype(float)
    return fcluster(Z, t=max_d, criterion='distance')


def plotDendrogram(model):
    """Draw the full dendrogram for a fitted hierarchical clustering model.

    ``model`` must expose ``children_`` and ``distances_``. The linkage
    matrix is assembled from those two attributes plus a zero-filled
    observation-count column, then plotted on a new 15x6 figure.
    """
    merges = model.children_
    sizes = np.zeros(merges.shape[0])
    Z = np.column_stack([merges, model.distances_, sizes]).astype(float)

    plt.figure(figsize=(15, 6))
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('Data Points')
    plt.ylabel('distance')
    dendrogram(
        Z,
        orientation='top',  # root at the top, descendant links going downwards
        distance_sort='descending',
        show_leaf_counts=True,
    )
    plt.show()

def truncatedDendrogram(model, n, y_min = 0, max_d = 0):
    """Plot a truncated dendrogram showing only the last ``n`` merged clusters.

    Parameters
    ----------
    model
        Fitted estimator exposing ``children_`` (merge pairs) and
        ``distances_`` (merge heights).
    n
        Passed to ``dendrogram`` as ``p`` together with
        ``truncate_mode='lastp'``.
    y_min
        Lower limit applied to the y-axis (default 0).
    max_d
        When truthy, a horizontal black line is drawn at this height.

    Each plotted merge is marked with a dot and annotated with its height.
    """
    children = model.children_
    n_merges = children.shape[0]
    # A complete binary merge tree over N observations has N - 1 merges.
    n_samples = n_merges + 1

    # Number of original observations under each merge. Without real counts
    # the contracted leaves of a truncated dendrogram are all labelled "(0)".
    counts = np.zeros(n_merges)
    for merge_idx, pair in enumerate(children):
        size = 0
        for child in pair:
            if child < n_samples:
                size += 1  # an original observation
            else:
                size += counts[child - n_samples]  # an earlier merge
        counts[merge_idx] = size

    linkage_matrix = np.column_stack(
        [children, model.distances_, counts]
    ).astype(float)

    plt.title('Hierarchical Clustering Dendrogram (truncated)')
    plt.xlabel('sample index')
    plt.ylabel('distance')
    dendro = dendrogram(
        linkage_matrix,
        truncate_mode='lastp',  # show only the last p merged clusters
        p=n,
        leaf_rotation=90.,
        leaf_font_size=12.,
        show_contracted=True,  # to get a distribution impression in truncated branches
    )
    # Mark every drawn merge at the midpoint of its horizontal bar and label
    # it with the merge height.
    for i, d, c in zip(dendro['icoord'], dendro['dcoord'], dendro['color_list']):
        x = 0.5 * sum(i[1:3])
        y = d[1]
        plt.plot(x, y, 'o', c=c)
        plt.annotate("%.3g" % y, (x, y), xytext=(0, -5),
                     textcoords='offset points',
                     va='top', ha='center')
    if max_d:
        plt.axhline(y=max_d, c='k')

    plt.ylim(ymin = y_min)
    plt.show()

These helper functions will be used to support our hierarchical clustering analysis.

In [None]:
# Load the PCA output and keep only the component columns
# (the identifier columns 'Player' and 'Team' are dropped).
pcaData = pd.read_csv('./Data/PCAData.csv')
pcaData = pcaData.drop(columns=['Player', 'Team'])
pcaData.describe().round(1)

In [None]:
# Standardise the principal components before clustering
scaler = StandardScaler()
scaler.fit(pcaData)
pcaDataScaled = scaler.transform(pcaData)

In [None]:
# Wrap the scaled array in a labelled DataFrame and check that the
# components are standardised
pcaDataScaled = pd.DataFrame(
    data=pcaDataScaled,
    columns=[f'PC{k}' for k in range(1, 6)],
)
pcaDataScaled.describe().round(1)

The **Principal Components** are standardised before we begin clustering the data.

## Complete Linkage

In [None]:
# Build the full merge tree (distance_threshold=0, n_clusters=None) with
# complete linkage. The model is fitted on the principal-component columns
# only, so cluster-label columns that get appended to pcaDataScaled can never
# be picked up as features if this cell is re-run.
completeLinkage = AgglomerativeClustering(distance_threshold=0, n_clusters=None, linkage='complete')
completeLinkage = completeLinkage.fit(pcaDataScaled[['PC1', 'PC2', 'PC3', 'PC4', 'PC5']])
plotDendrogram(completeLinkage)

In [None]:
# Show the last 5 merges of the complete-linkage tree, with the y-axis starting at 6.5
truncatedDendrogram(completeLinkage, n=5, y_min=6.5)

In [None]:
# Cut the complete-linkage tree at height 6.5 and count the rows in each cluster
pcaDataScaled['completeCluster'] = create_cluster(completeLinkage, max_d=6.5)
pcaDataScaled.groupby(by='completeCluster').count()

## Ward Linkage

In [None]:
# Build the full merge tree with Ward linkage.
# Fit on the principal-component columns only: by this point pcaDataScaled
# also carries cluster-label columns (e.g. 'completeCluster'), and feeding
# those labels in as features would bias the distances.
# NOTE(review): cut heights that were read off a dendrogram fitted with label
# columns included should be re-validated against the new dendrogram.
wardLinkage = AgglomerativeClustering(distance_threshold=0, n_clusters=None, linkage='ward')
wardLinkage = wardLinkage.fit(pcaDataScaled[['PC1', 'PC2', 'PC3', 'PC4', 'PC5']])
plotDendrogram(wardLinkage)

In [None]:
# Show the last 10 merges of the Ward tree, with the y-axis starting at 10.2
truncatedDendrogram(wardLinkage, n=10, y_min=10.2)

In [None]:
# Cut the Ward tree at height 10.2 and count the rows in each cluster
pcaDataScaled['wardCluster'] = create_cluster(wardLinkage, max_d=10.2)
pcaDataScaled.groupby(by='wardCluster').count()

## Single Linkage

In [None]:
# Build the full merge tree with single linkage.
# Fit on the principal-component columns only: by this point pcaDataScaled
# also carries cluster-label columns from earlier linkage methods, and
# feeding those labels in as features would bias the distances.
# NOTE(review): cut heights that were read off a dendrogram fitted with label
# columns included should be re-validated against the new dendrogram.
singleLinkage = AgglomerativeClustering(distance_threshold=0, n_clusters=None, linkage='single')
singleLinkage = singleLinkage.fit(pcaDataScaled[['PC1', 'PC2', 'PC3', 'PC4', 'PC5']])
plotDendrogram(singleLinkage)

In [None]:
# Show the last 5 merges of the single-linkage tree, with the y-axis starting at 2
truncatedDendrogram(singleLinkage, n=5, y_min=2)

In [None]:
# Cut the single-linkage tree at height 2 and count the rows in each cluster
pcaDataScaled['singleCluster'] = create_cluster(singleLinkage, max_d=2)
pcaDataScaled.groupby(by='singleCluster').count()

## Average Linkage

In [None]:
# Build the full merge tree with average linkage.
# Fit on the principal-component columns only: by this point pcaDataScaled
# also carries cluster-label columns from earlier linkage methods, and
# feeding those labels in as features would bias the distances.
# NOTE(review): cut heights that were read off a dendrogram fitted with label
# columns included should be re-validated against the new dendrogram.
avgLinkage = AgglomerativeClustering(distance_threshold=0, n_clusters=None, linkage='average')
avgLinkage = avgLinkage.fit(pcaDataScaled[['PC1', 'PC2', 'PC3', 'PC4', 'PC5']])
plotDendrogram(avgLinkage)

In [None]:
# Show the last 10 merges of the average-linkage tree, with the y-axis starting at 3.5
truncatedDendrogram(avgLinkage, n=10, y_min=3.5)

In [None]:
# Cut the average-linkage tree at height 3.5 and count the rows in each cluster
pcaDataScaled['averageCluster'] = create_cluster(avgLinkage, max_d=3.5)
pcaDataScaled.groupby(by='averageCluster').count()

## Chosen Linkage Method: Ward Linkage

Our ideal results would include clusters that are best able to explain the variance in player behaviour within their teams. Basketball teams field **9-10 players** every game, hence our desired number of clusters should be close to this number. Based on this, we eliminate *Complete Linkage* as a method. Next, by observation we notice that the *Average* and *Single Linkage* methods produce a desirable number of clusters, but the clusters are not well distributed (several clusters contain only 1-2 data points). Hence, the best method to deploy would be **Ward Linkage**, which returns 9 clusters that are fairly well distributed and should account for the maximum variance in the data.

In [None]:

# 3-D scatter of the first three principal components, one series per Ward cluster.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
# Group by the distinct cluster labels so that each cluster is drawn exactly
# once (a single colour and a single legend entry per cluster), instead of
# being re-plotted once for every row that belongs to it.
for clusterLabel, members in pcaDataScaled.groupby('wardCluster'):
    ax.scatter(
        members['PC1'].to_numpy(),
        members['PC2'].to_numpy(),
        members['PC3'].to_numpy(),
        marker='o',
        s=40,
        label=str(clusterLabel),
    )
plt.title('Scatterplot of Clusters')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')
# Legend entries come from the per-series labels, so they always match the
# clusters actually present rather than a hard-coded list.
ax.legend(title='wardCluster')
plt.show()

In [None]:
# Interactive 3-D scatter of the first three components, coloured by Ward cluster label
fig2 = px.scatter_3d(
    data_frame=pcaDataScaled,
    x='PC1',
    y='PC2',
    z='PC3',
    color='wardCluster',
)
fig2

Plotting the first 3 Principal Components gives us a preliminary glance at the clustering that we can achieve using the *Ward Linkage* method for **Hierarchical Agglomerative Clustering**.

We can observe that there is noticeable separation in clusters.

In [None]:
# Re-attach the player/team identifiers from the original PCA file, keep only
# the Ward cluster label alongside the five components, and export the result.
pcaDataFull = pd.read_csv('./Data/PCAData.csv')
pcaDataScaled['PLAYER'] = pcaDataFull['Player'].values
pcaDataScaled['TEAM'] = pcaDataFull['Team'].values
pcaDataScaled = (
    pcaDataScaled[['PLAYER', 'TEAM', 'wardCluster', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']]
    .rename(columns={'wardCluster': 'hCLUSTER'})
)
pcaDataScaled.to_csv('Data/hierarchicalClustering.csv', index=False)