# K-Means Clustering

## Importing the libraries

In [None]:
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd 

## Importing the dataset

In [None]:
# Load the customer records from the CSV file in the working directory.
dataset = pd.read_csv("Mall_Customers.csv")

# Only two feature columns are kept so the clusters can be drawn on a 2-D
# scatter plot; the remaining non-ID columns could also be used as features.
# NOTE(review): positions 3 and 4 are presumably annual income and spending
# score -- confirm against the CSV header.
feature_positions = [3, 4]
X = dataset.iloc[:, feature_positions].to_numpy()   # all rows, selected columns, as a NumPy array

## Using the elbow method to find the optimal number of clusters

In [None]:
# Elbow method: fit K-Means once for every candidate number of clusters and
# record its WCSS (within-cluster sum of squares). The "elbow" of the
# resulting curve suggests a suitable number of clusters.

from sklearn.cluster import KMeans

# Candidate cluster counts 1..10, shared by the fitting loop and the x-axis.
cluster_counts = range(1, 11)

# init='k-means++' avoids the random initialisation trap; the fixed
# random_state makes the curve reproducible. inertia_ is the fitted model's WCSS.
wcss = [
    KMeans(n_clusters=k, init='k-means++', random_state=42).fit(X).inertia_
    for k in cluster_counts
]

plt.plot(cluster_counts, wcss)
plt.title("Elbow Method")
plt.xlabel("Number of clusters")
plt.ylabel("WCSS")
plt.show()

## Training the K-Means model on the dataset

In [None]:
# Final model: 5 clusters (presumably the elbow read off the WCSS curve --
# confirm), k-means++ seeding and a fixed random_state for reproducible labels.
kmeans = KMeans(n_clusters=5, init="k-means++", random_state=42)
kmeans.fit(X)

# One cluster index per row of X. This is an array of labels, not a modified
# copy of the dataset; it is the same array fit_predict(X) would return.
y_pred = kmeans.labels_

In [None]:
# Display the cluster index (0-4) that K-Means assigned to each row of X.
print(y_pred)

## Visualising the clusters

In [None]:
# Scatter-plot the customers, one colour per cluster, then overlay the centroids.
# Column 0 of X goes on the x-axis and column 1 on the y-axis.
cluster_colours = ['red', 'green', 'blue', 'cyan', 'magenta']   # one entry per cluster label 0..4

for cluster_id, colour in enumerate(cluster_colours):
    members = y_pred == cluster_id      # boolean mask: rows of X assigned to this cluster
    plt.scatter(
        X[members, 0],
        X[members, 1],
        s=100,
        c=colour,
        label='Cluster {}'.format(cluster_id + 1),   # legend numbering is 1-based
    )

# cluster_centers_ holds one [x, y] row per cluster because X is two-dimensional.
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], s=300, c="yellow", label="Centroids")

plt.title("Customers Clusters")
plt.xlabel("Annual Income (k$)")
plt.ylabel("Spending Score (1 - 100)")   # y-axis is the spending score, not an income
plt.legend()    # show the cluster / centroid legend
plt.show()