## k-means clustering

## 1. Prepare your workstation

In [None]:
# Import the packages used for numerical work and tabular data handling.
import numpy as np
import pandas as pd

# Suppress every warning for the rest of the session. Note that this also
# hides deprecation/future warnings emitted by the libraries used below.
import warnings  
warnings.filterwarnings("ignore")

In [None]:
# Load 'ais.csv' (path is relative to the working directory) into a DataFrame.
df_ais = pd.read_csv('ais.csv')

# Print the column names, dtypes and non-null counts.
df_ais.info()

In [None]:
# Count the missing values in each column.
df_ais.isna().sum()

## 2. Evaluate the variables

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
df_ais.describe()

## 3. Drop unneeded columns

In [None]:
# Report how many distinct values the 'sex' column holds.
n_sex_values = df_ais['sex'].nunique(dropna=False)
print(n_sex_values)

# The author excludes 'sex' from the cluster analysis, so the column is
# removed from df_ais in place.
# NOTE(review): the original rationale calls it a "unique identifier"; the
# count printed above shows whether that is really the case - confirm.
df_ais.drop(columns='sex', inplace=True)

In [None]:
# Number of distinct sports; the column is kept unchanged.
df_ais['sport'].nunique(dropna=False)

In [None]:
# Display the column labels that remain in df_ais.
df_ais.columns 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns 

# Bar chart of how many rows fall into each sport.
plt.figure(figsize=(12, 12))
ax = sns.countplot(x="sport", data=df_ais)
plt.title('Blood characteristics of athletes')
plt.xlabel('sport type')
plt.ylabel('frequency')

# Write each bar's height just above the bar.
# The label is anchored at the top-centre of the bar and shifted by a fixed
# number of points rather than by a fixed amount in data units, so it stays
# next to the bar (and inside the axes) whatever the size of the counts.
for p in ax.patches:
    ax.annotate(
        '{:.1f}'.format(p.get_height()),
        (p.get_x() + p.get_width() / 2, p.get_height()),
        xytext=(0, 5),
        textcoords='offset points',
        ha='center',
        va='bottom',
    )

## 4. Specify the target variable 

In [None]:
#define the target variable
X = df_ais
y = df_ais['sport']

In [None]:
# Import the LabelEncoder class:
from sklearn.preprocessing import LabelEncoder

#convert the target variable to integers
le = LabelEncoder()

#replace in the existing DataFrame with the integer values
X['sport'] = le.fit_transform(X['sport'])
y = le.transform(y)

In [None]:
X.info()

In [None]:
X.head()
#to ensure that label encoding happend correctly

## 5. Normalise the data set

In [None]:
# Keep the column labels of X so they can be restored after scaling.
x_cols = X.columns

# Import the MinMaxScaler class.
from sklearn.preprocessing import MinMaxScaler 

# Create the scaler and rescale every column of X to values between 0 and 1.
ms = MinMaxScaler() 
scaled_values = ms.fit_transform(X)

# The scaler returns a plain array by default, so rebuild the DataFrame.
# x_cols is passed directly: wrapping it in a list ([x_cols]) makes pandas
# build a one-level MultiIndex instead of ordinary column labels.
# The original row index is carried over as well.
X = pd.DataFrame(scaled_values, columns=x_cols, index=X.index)

# Check the contents of the modified DataFrame and compare it with the
# unscaled values shown earlier.
X.head() 

## 6. Apply the clustering algorithm

In [None]:
# [1] Import the KMeans class.
from sklearn.cluster import KMeans 

# Fit k-means with two clusters.
# n_init is given explicitly so the number of restarts matches the
# elbow-method loop further down and does not depend on the library default,
# which differs between scikit-learn versions.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0) 

kmeans.fit(X)

In [None]:
# Coordinates of the fitted cluster centres (one row per cluster).
kmeans.cluster_centers_

In [None]:
# Sum of squared distances of the samples to their closest cluster centre.
kmeans.inertia_

## 7. Evaluate the output

In [None]:
# Evaluate the clustering against the known sport labels.
labels = kmeans.labels_

# Cluster ids are arbitrary integers, so `y == labels` only counts chance
# coincidences between cluster ids and encoded sports. Instead, let every
# cluster stand for its most common sport and count the samples that agree
# with that majority (cluster purity).
contingency = pd.crosstab(labels, y)
correct_labels = int(contingency.max(axis=1).sum())

print("Result: %d out of %d samples were correctly labeled." % (correct_labels, y.size))

## 8. Improve the accuracy (elbow method)

In [None]:
# Elbow chart: fit k-means for 1 to 10 clusters and record the inertia of
# each fit, to help choose the number of clusters.
cluster_counts = range(1, 11)
cs = []
for n_clusters in cluster_counts:
    kmeans = KMeans(
        n_clusters=n_clusters,
        init='k-means++',
        max_iter=300,
        n_init=10,
        random_state=0,
    ).fit(X)
    cs.append(kmeans.inertia_)

plt.plot(cluster_counts, cs)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('CS')
plt.show()
# From the chart above the author takes 4 as the optimal number of clusters.

In [None]:
# For comparison, evaluate a three-cluster solution.
# n_init is explicit so the fit uses the same number of restarts as the
# elbow-method loop, whatever the scikit-learn default is.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)

kmeans.fit(X)

labels = kmeans.labels_

# Cluster ids are arbitrary integers, so comparing them directly with the
# encoded sports is meaningless. Let every cluster stand for its most common
# sport and count the samples that agree with that majority (purity).
contingency = pd.crosstab(labels, y)
correct_labels = int(contingency.max(axis=1).sum())

print("Result: %d out of %d samples were correctly labeled." % (correct_labels, y.size))
print('Accuracy score: {0:0.2f}'.format(correct_labels / float(y.size)))

In [None]:
# Evaluate a four-cluster solution; change n_clusters to experiment.
# n_init is explicit so the fit uses the same number of restarts as the
# elbow-method loop, whatever the scikit-learn default is.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0)

kmeans.fit(X)

labels = kmeans.labels_

# Store the cluster assignment next to the original (unscaled) measurements
# so it can be used when plotting.
df_ais['labels'] = labels

# Cluster ids are arbitrary integers, so comparing them directly with the
# encoded sports is meaningless. Let every cluster stand for its most common
# sport and count the samples that agree with that majority (purity).
contingency = pd.crosstab(labels, y)
correct_labels = int(contingency.max(axis=1).sum())

print("Result: %d out of %d samples were correctly labeled." % (correct_labels, y.size))
print('Accuracy score: {0:0.2f}'.format(correct_labels / float(y.size)))

## 9. Visualise the clusters

In [None]:
# 3D scatter of height, weight and lean body mass, coloured by cluster label.
fig = plt.figure(figsize=(26, 26))
ax = fig.add_subplot(1, 3, 1, projection='3d')
ax.scatter(df_ais['ht'], df_ais['wt'], df_ais['lbm'], s=15, c=labels)
ax.set(xlabel='ht', ylabel='wt', zlabel='lbm')
plt.show()

In [None]:
# 3D scatter of red blood cell count, white blood cell count and hemoglobin
# concentration, coloured by cluster label.

fig = plt.figure(figsize=(26, 26))
ax = fig.add_subplot(1, 3, 1, projection='3d')
ax.scatter(df_ais['rcc'], df_ais['wcc'], df_ais['hg'], s=15, c=labels)
ax.set(xlabel='rcc', ylabel='wcc', zlabel='hg')
plt.show()

In [None]:
# 3D scatter of body mass index, height and weight, coloured by cluster label.

fig = plt.figure(figsize=(26, 26))
ax = fig.add_subplot(1, 3, 1, projection='3d')
ax.scatter(df_ais['bmi'], df_ais['ht'], df_ais['wt'], s=15, c=labels)
ax.set(xlabel='bmi', ylabel='ht', zlabel='wt')
plt.show()