# Unsupervised Learning | Clustering (K-Means)

Today's Outline:
- Intuition
- Full Case-study

### Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Apply seaborn's default theme so every matplotlib figure below picks up its styling.
sns.set()

==========

## Labeled vs. Unlabeled Data

### Labeled Data

In [None]:
from sklearn import datasets

# Labelled view of Iris: the four measurements plus the target stored in 'class'.
iris = datasets.load_iris()
iris_df = (
    pd.DataFrame(
        iris.data,
        columns=['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid'],
    )
    .assign(**{'class': iris.target})  # 'class' is a keyword, hence the dict unpacking
    .dropna(how="all")                 # discard rows in which every value is missing
)
iris_df

### Unlabeled Data

In [None]:
from sklearn import datasets

# Unlabelled view of Iris: the same four measurements, with no target column attached.
iris = datasets.load_iris()
iris_df = (
    pd.DataFrame(
        iris.data,
        columns=['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid'],
    )
    .dropna(how="all")  # discard rows in which every value is missing
)
iris_df

==========

## K-Means Clustering | Intuition (Blobs)

Scikit-Learn Clustering Module: https://scikit-learn.org/stable/api/sklearn.cluster.html

Visualizing K-Means Clustering: https://www.naftaliharris.com/blog/visualizing-k-means-clustering/

### Creating Data

In [None]:
from sklearn.datasets import make_blobs

In [None]:
# Synthetic, seeded toy dataset. `data` is a pair: data[0] holds the 2-D points,
# data[1] holds the index of the blob each point was generated from.
data = make_blobs(
    n_samples=200,
    n_features=2,
    centers=4,
    cluster_std=1.8,
    random_state=101,
)

### Exploring Data

In [None]:
# Scatter the generated points, coloured by the blob index returned by make_blobs (data[1]).
plt.scatter(data[0][:, 0], data[0][:, 1], c=data[1], cmap='coolwarm')
# Title and axis labels so the figure stands on its own, as in the other plotting cells.
plt.title('Generated Blobs (true labels)')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
# End with plt.show() so the cell does not echo the PathCollection repr.
plt.show()

### Model Training & Prediction

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Name the cluster count explicitly and pin both the seed and the number of
# restarts: K-Means initialisation is random, and the default `n_init` differs
# between scikit-learn releases, so an unpinned model is not reproducible.
# NOTE(review): the blobs were generated with centers=4 but this model asks for
# 5 clusters — confirm the mismatch is intentional for the demo.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=101)

In [None]:
# Fit on the point coordinates only (data[0]); the generating labels in data[1]
# are never shown to the model.
kmeans.fit(data[0])

In [None]:
# Coordinates of the learned cluster centres, one row per cluster.
kmeans.cluster_centers_

In [None]:
# Cluster index assigned to each of the fitted points.
kmeans.labels_

In [None]:
# Side-by-side comparison on a shared y-axis: K-Means assignments on the left,
# the labels the blobs were generated with on the right. Cluster indices are
# arbitrary, so compare the grouping of points, not the colours themselves.
fig, (ax_pred, ax_true) = plt.subplots(1, 2, sharey=True, figsize=(10, 6))
panels = (
    (ax_pred, 'K-Means', kmeans.labels_),
    (ax_true, "Original", data[1]),
)
for ax, title, colours in panels:
    ax.set_title(title)
    ax.scatter(data[0][:, 0], data[0][:, 1], c=colours, cmap='rainbow')
# End with plt.show() (as the other plotting cells do) so no artist repr is echoed.
plt.show()

==========

## K-Means Clustering | Full Case-study (Mall Customers)

### Importing Dataset

In [None]:
# Read the mall-customers table from the notebook-relative data/ folder and display it.
mall = pd.read_csv('data/mall-customers.csv')
mall

### Extracting Features

In [None]:
# Pull the two clustering features out by position (columns 3 and 4) as a NumPy array.
# NOTE(review): presumably annual income and spending score, going by the axis
# labels used in the plots below — confirm against the CSV header.
X = mall.iloc[:, [3, 4]].to_numpy()
X

### Exploring Data

In [None]:
# Raw feature space before clustering: first feature on x, second on y.
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1])
ax.set_title('Mall Customers')
ax.set_xlabel('Annual Income (K$)')
ax.set_ylabel('Spending Score (1-100)')
plt.show()

### Model Training & Prediction

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Five clusters, k-means++ seeding, fixed seed. `n_init` is pinned as well: its
# default changed between scikit-learn releases, so leaving it implicit makes the
# fitted centres (and inertia) depend on the installed version.
kmeans = KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=42)

In [None]:
# Fit the model on the two extracted customer features.
kmeans.fit(X)

In [None]:
# Learned cluster centres in feature space, one row per cluster.
kmeans.cluster_centers_

In [None]:
# Inertia of the fitted model — the quantity collected as WCSS in the elbow-method cell.
kmeans.inertia_

In [None]:
# The model was already fitted on X in the cell above, so reuse the assignments it
# stored. Calling fit_predict(X) here would refit the whole model only to return
# this same labels_ array.
y_kmeans = kmeans.labels_
y_kmeans

In [None]:
# New frame with the cluster assignment appended; `mall` itself is left untouched.
mall_clustered = mall.assign(Clusters=y_kmeans)
mall_clustered

In [None]:
# Same scatter as before, now coloured by the cluster index of each customer.
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=kmeans.labels_, cmap='rainbow')
ax.set(
    title='Mall Customers',
    xlabel='Annual Income (K$)',
    ylabel='Spending Score (1-100)',
)
plt.show()

### Finding the Optimal Number of Clusters (Elbow / Knee Method)

In [None]:
from sklearn.cluster import KMeans

# Elbow method: fit one model per candidate K and record its inertia (WCSS).
# The throwaway models are built inside the comprehension so that the notebook's
# global `kmeans` is not silently replaced by a 10-cluster model, and no loop
# index leaks into the namespace.
k_values = range(1, 11)
wcss = [
    # n_init is pinned because its default differs between scikit-learn releases.
    KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42).fit(X).inertia_
    for k in k_values
]

plt.plot(k_values, wcss)
plt.xticks(list(k_values))  # integer ticks: K is discrete, so the elbow is read at whole numbers
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

### Final Model

In [None]:
# Final model with K=5 (presumably the elbow read off the WCSS curve — confirm).
# `n_init` is pinned alongside the seed because its default changed between
# scikit-learn releases; leaving it implicit makes the result version-dependent.
kmeans = KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=42)
# Fit and obtain the cluster index of every customer in one call.
y_kmeans = kmeans.fit_predict(X)

### Visualising Clusters

In [None]:
# Customers coloured by cluster, with the cluster centres overlaid as large black markers.
centroids = kmeans.cluster_centers_
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=kmeans.labels_, cmap='rainbow')
ax.scatter(centroids[:, 0], centroids[:, 1], s=200, c='k')
ax.set(
    title='Clusters of customers',
    xlabel='Annual Income (k$)',
    ylabel='Spending Score (1-100)',
)
plt.show()

==========

# THANK YOU!