# K-Means Clustering

A secondary analysis using K-Means Clustering will allow us to cross-check our results from Hierarchical Agglomerative Clustering and affirm our conclusions about the data.

In [None]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Import Sci-Kit Learn
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn import datasets
from kneed import KneeLocator
import plotly.express as px

# Default plot params
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*' and
# newer releases no longer accept the old names, so use whichever name this
# installation actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
cmap = 'tab10'  # qualitative colormap used for the cluster scatter plot

In [None]:
# Load the PCA output and drop the 'Player' / 'Team' label columns so that only
# the columns fed to scaling and clustering remain.
pcaData = pd.read_csv('./Data/PCAData.csv').drop(columns=['Player', 'Team'])

# Summary statistics of the remaining columns, rounded to one decimal place.
pcaData.describe().round(1)

In [None]:
# Display the loaded table (label columns already removed) before scaling.
pcaData

In [None]:
# Standardise the principal components: learn each column's mean and standard
# deviation, then rescale every column to zero mean and unit variance.
scaler = StandardScaler().fit(pcaData)
pcaDataScaled = scaler.transform(pcaData)


In [None]:
# Pick the number of clusters k with the elbow method.
# For every candidate k we fit K-Means and record its inertia (the sum of squared
# distances of samples to their closest centroid, i.e. the SSE). The "elbow" is
# the k after which adding clusters stops producing a large drop in SSE.
X = pcaDataScaled
k_values = range(2, 10)  # candidate k: 2..9, shared by the loop and the plot
distorsions = []

# Calculate SSE for different K.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to 'auto',
# so leaving it implicit makes the curve depend on the installed version.
for k in k_values:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=301)
    kmeans.fit(X)
    distorsions.append(kmeans.inertia_)

# Plot values of SSE
plt.figure(figsize=(15,8))
plt.subplot(121, title='Elbow curve')
plt.xlabel('k')
plt.ylabel('SSE (inertia)')
plt.plot(k_values, distorsions)
plt.grid(True)

In [None]:
# Generate the mean silhouette coefficient for each candidate k (2..9).
# Higher values mean samples sit closer to their own cluster than to the others.
X = pcaDataScaled
silhouette_plot = []
for k in range(2, 10):
    # n_init is pinned explicitly: scikit-learn changed its default from 10 to
    # 'auto', so leaving it implicit makes the scores depend on the installed version.
    clusters = KMeans(n_clusters=k, n_init=10, random_state=10)
    cluster_labels = clusters.fit_predict(X)
    silhouette_avg = metrics.silhouette_score(X, cluster_labels)
    silhouette_plot.append(silhouette_avg)

In [None]:
# Plot the silhouette coefficient against k; the dashed red line marks the mean
# coefficient across all candidate k as a visual baseline.
fig = plt.figure(figsize=(15,8))
ax = fig.add_subplot(121, title='Silhouette coefficients over k')
ax.set_xlabel('k')
ax.set_ylabel('silhouette coefficient')
ax.plot(range(2, 10), silhouette_plot)
ax.axhline(y=np.mean(silhouette_plot), color="red", linestyle="--")
ax.grid(True)

Based on the *Elbow Plot* and *Silhouette Coefficient*, we can conclude that the optimal number of clusters is **6**

In [None]:
# Predict K-Means cluster membership for the chosen k = 6.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to 'auto',
# so leaving it implicit makes the assignment depend on the installed version.
km_neat = KMeans(n_clusters=6, n_init=10, random_state=2).fit_predict(pcaDataScaled)

# Scatter of the first two standardised components, coloured by cluster label.
plt.figure(figsize=(15,8))
plt.subplot(121, title='Cluster with PCA test1')
plt.xlabel('PC1')
plt.ylabel('PC2')
# Trailing semicolon suppresses the PathCollection repr in the cell output.
plt.scatter(pcaDataScaled[:,0], pcaDataScaled[:,1], c=km_neat, cmap=cmap);

In [None]:
# Renumber the clusters from 1 instead of 0 for ease of analysis of the 6 clusters.
# The shift is anchored on the current minimum label rather than a blind "+ 1",
# so the cell is idempotent: re-running it leaves already-shifted labels unchanged.
km_neat = km_neat - km_neat.min() + 1
km_neat

In [None]:
# Wrap the scaled array in a DataFrame with one named column per component so
# later cells can refer to components by name.
pc_columns = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5']
pcaDataScaled = pd.DataFrame(pcaDataScaled, columns=pc_columns)

In [None]:
# 3-D scatter of the first three components, coloured by cluster.
# The labels are passed as strings so Plotly treats them as categories (one
# discrete colour per cluster) instead of mapping the integer ids onto a
# continuous colour scale, which would suggest an ordering between clusters.
fig2 = px.scatter_3d(
    pcaDataScaled, x='PC1', y='PC2', z='PC3',
    color=np.asarray(km_neat).astype(str),
    labels={'color': 'Cluster'},
)
fig2

We can observe that there is noticeable separation between the clusters after plotting the first 3 Principal Components.

In [None]:
# Re-read the source file to recover the 'Player' / 'Team' labels that were
# dropped before clustering.
pcaDataFull = pd.read_csv('./Data/PCAData.csv')

# Attach the labels positionally (as plain arrays, ignoring the index) together
# with each row's K-Means cluster id.
pcaDataScaled['PLAYER'] = pcaDataFull['Player'].to_numpy()
pcaDataScaled['TEAM'] = pcaDataFull['Team'].to_numpy()
pcaDataScaled['kCLUSTER'] = km_neat

# Persist the labelled, clustered table without the DataFrame index.
pcaDataScaled.to_csv('Data/kmeansClustering.csv', index=False)