# K-Means Clustering
### Examples
- k-means on a randomly generated dataset
- k-means for customer segmentation
- Using the Elbow method to find the optimal number of clusters (k)

In [None]:
import random
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans

from sklearn.datasets import make_blobs

# Ex.1) k-Means on a randomly generated dataset

In [None]:
# Seed NumPy's global random generator so that later draws made through it
# are repeatable from run to run.
np.random.seed(42)

In [None]:
# Synthetic data set: 5000 two-dimensional points sampled around four chosen
# centres, every blob using a standard deviation of 0.9. `X` holds the
# coordinates and `y` the index of the blob each point was drawn from.
blob_centers = [[4, 4], [-2, -1], [2, -3], [1, 1]]
X, y = make_blobs(n_samples=5000, centers=blob_centers, cluster_std=0.9)
X

In [None]:
# Look at the raw points before clustering: column 0 on the x-axis,
# column 1 on the y-axis, drawn with small dot markers.
first_feature = X[:, 0]
second_feature = X[:, 1]
plt.scatter(first_feature, second_feature, marker='.')

# Setting up K-Means

In [None]:
# Estimator configured for four clusters with k-means++ seeding, twelve
# independent initialisations (the best run is kept) and progress output on.
k_means_model = KMeans(
    n_clusters=4,
    init="k-means++",
    n_init=12,
    verbose=1,
)

In [None]:
# Fit the configured KMeans estimator to the generated points in X.
k_means_model.fit(X)

In [None]:
# Cluster index the fitted model assigned to each sample.
k_means_model_labels = k_means_model.labels_

# Print the assignments followed by how many of them there are.
print(k_means_model_labels, len(k_means_model_labels), sep="\n")

In [None]:
# Centroid coordinates learned by the fitted model, one row per cluster.
k_means_model_centers = k_means_model.cluster_centers_

# Print the centroids for inspection.
print(k_means_model_centers)

In [None]:
# Draw every cluster in its own colour, with its centroid marked on top.
fig, ax = plt.subplots(figsize=(6, 4))

# Take the number of clusters from the fitted centroids rather than from a
# re-typed copy of the generating centres, so the plot stays correct if the
# model's n_clusters or the data set changes.
n_found_clusters = len(k_means_model_centers)

# One evenly spaced colour from the Spectral colormap per cluster.
colors = plt.cm.Spectral(np.linspace(0, 1, n_found_clusters))

for k, (cluster_center, col) in enumerate(zip(k_means_model_centers, colors)):
    # Boolean mask selecting the samples assigned to cluster k.
    my_members = (k_means_model_labels == k)

    # Plots the datapoints with color col.
    ax.plot(X[my_members, 0], X[my_members, 1], 'w', markerfacecolor=col, marker='.')

    # Plots the centroids with specified color, but with a darker outline
    ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,  markeredgecolor='k', markersize=6)

ax.set_title('KMeans')
plt.show()

# Customer Segmentation and Analysis

## Importing Libraries.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

## Data Exploration

In [None]:
# Load the customer records from Customer.csv (path is relative to the
# working directory) and preview the first rows.
customers_df = pd.read_csv('Customer.csv')
customers_df.head()

In [None]:
# Dimensions of the loaded table as (rows, columns).
customers_df.shape

In [None]:
# Data type pandas inferred for each column.
customers_df.dtypes

In [None]:
# Count of missing values in each column.
customers_df.isnull().sum()

In [None]:
# Show only the rows whose `Defaulted` value is missing.
missing_defaulted = customers_df['Defaulted'].isna()
customers_df.loc[missing_defaulted]

In [None]:
# Work on an independent copy that holds only the two columns used for
# clustering, so later edits do not touch `customers_df`.
feature_columns = ['Age', 'Income']
new_customer_df = customers_df[feature_columns].copy()
new_customer_df

In [None]:
# Scatter of the two selected features: Age on x, Income on y.
plt.scatter(new_customer_df['Age'], new_customer_df['Income'])

## Normalizing over the standard deviation

In [None]:
from sklearn.preprocessing import StandardScaler

# Raw feature matrix taken from the DataFrame, with any NaN replaced by 0.
X = np.nan_to_num(new_customer_df.values[:, :])

# Standardise each column to zero mean and unit variance.
scaler = StandardScaler()
Clus_dataSet = scaler.fit_transform(X)

# First standardised row, as a quick sanity check.
Clus_dataSet[0]

## Modeling

In [None]:
# Segment the customers into `clusterNum` groups with k-means.
clusterNum = 2

k_means = KMeans(init="k-means++", n_clusters=clusterNum, n_init=12)

# Fit on the standardised features rather than the raw matrix X. k-means
# uses Euclidean distance, so on unscaled data the feature with the larger
# numeric range would dominate the clustering.
k_means.fit(Clus_dataSet)

# Cluster index assigned to each customer, in row order.
labels = k_means.labels_

# Sanity check: one label per customer.
print(len(labels))

In [None]:
# Store each row's cluster label in a new `Clus_km` column (this modifies
# new_customer_df in place), then preview the first five rows.
new_customer_df["Clus_km"] = labels
new_customer_df.head(5)

In [None]:
# Mean of every column within each cluster, grouped by the assigned label.
new_customer_df.groupby('Clus_km').mean()

In [None]:
# Scatter the customers in the original feature units, coloured by cluster.
# The labels are cast with the builtin `float`: the `np.float` alias was
# removed in NumPy 1.24 and now raises AttributeError.
plt.scatter(X[:, 0], X[:, 1], c=labels.astype(float))
plt.xlabel('Age', fontsize=16)
plt.ylabel('Income', fontsize=16)

plt.show()

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
# Mean silhouette coefficient of the assignments in `labels`, computed over
# the feature matrix X (ranges from -1 to 1; higher means better-separated
# clusters).
silhouette_score(X,labels)

# K-Means Clustering with Elbow method

## Importing the dataset

In [None]:
# Load the mall-customer table and keep the columns at positions 3 and 4 as
# the feature matrix X. Presumably these are annual income and spending
# score, going by the axis labels used when the clusters are plotted --
# confirm against the CSV header.
dataset = pd.read_csv('Mall_Customers.csv')
X = dataset.iloc[:, [3, 4]].values
dataset.head()

## Using the elbow method to find the optimal number of clusters

In [None]:
# Elbow method: fit k-means for k = 1..10 and record the within-cluster sum
# of squares (the model's inertia) for each k.
cluster_counts = range(1, 11)
wcss = []
for i in cluster_counts:
    kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

# Plot against the actual k values. Plotting `wcss` alone would use the list
# indices 0..9 on the x-axis, so the elbow would be read one cluster too low.
plt.plot(cluster_counts, wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# Final model with five clusters (presumably the elbow read off the WCSS
# curve -- confirm against the plot). fit_predict trains the model and
# returns the cluster index of every row of X in one call.
kmeans = KMeans(
    n_clusters=5,
    init='k-means++',
    random_state=42,
)
y_kmeans = kmeans.fit_predict(X)

## Visualising the clusters

In [None]:
# One scatter layer per cluster, in cluster-index order, each with its own
# colour and a 1-based legend label.
cluster_colours = ['red', 'blue', 'green', 'cyan', 'magenta']
for cluster_idx, colour in enumerate(cluster_colours):
    in_cluster = y_kmeans == cluster_idx
    plt.scatter(
        X[in_cluster, 0],
        X[in_cluster, 1],
        s=50,
        c=colour,
        label=f'Cluster {cluster_idx + 1}',
    )

# Centroids drawn last, as larger black markers.
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], s=100, c='black', label='Centroids')

plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()

https://github.com/njiix/py4ds