In [None]:
import numpy as np
from time import time
from IPython.display import Image
from IPython.display import display
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline


from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans

from sklearn.datasets import make_blobs

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import silhouette_samples

## Vocabulary

__Supervised Learning:__

__Unsupervised Learning:__

__Clustering:__

__Centroid:__

__Inertia:__

__Silhouette Score:__

__Hierarchical Clustering:__

__Dendrogram:__

In [None]:
# Render the clustering overview figure, then print where it came from.
display(Image(filename='figs/Clustering.PNG'))
print("This image is a screenshot from the video by Luis Serrano https://www.youtube.com/watch?v=QXOkPvFM6NU")

In [None]:
# Render the three K-Means walkthrough figures in order, then print the source.
# The images are handed straight to display() instead of being bound to
# x / y / z: `y` holds the toy-data labels further down, so re-running this
# cell after the data cells would otherwise replace those labels with an Image.
kmeans_step_files = ('figs/KMeans1.PNG', 'figs/KMeans2.PNG', 'figs/KMeans3.PNG')
display(*(Image(filename=path) for path in kmeans_step_files))
print("This image is a screenshot from the video by Luis Serrano https://www.youtube.com/watch?v=QXOkPvFM6NU")

### Let's cluster some toy data

In [None]:
# Four blob centres on the corners of the unit square.
centers = [(0, 0), (0, 1), (1, 0), (1, 1)]
# Seeded so the toy data, and every figure built from it, is the same on each run.
X, y = make_blobs(centers=centers, cluster_std=0.1, random_state=0)
# Per-axis extents of the data; the plotting cells use them to pad the axis limits.
x_min, y_min = X.min(axis=0)
x_max, y_max = X.max(axis=0)

###### plot the toy data

In [None]:
# Toy data coloured by its generating blob; both axes get 10 % padding
# around the data range.
x_pad = 0.1 * (x_max - x_min)
y_pad = 0.1 * (y_max - y_min)

fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=y)
ax.set_xlabel('$X_1$', size=16)
ax.set_xlim(x_min - x_pad, x_max + x_pad)
ax.set_ylabel('$X_2$', size=16)
ax.set_ylim(y_min - y_pad, y_max + y_pad)
ax.set_title("True Classes", size=16)

In [None]:
# Fit K-Means with one cluster per generated blob.
# random_state pins the centroid initialisation so the fitted labels are the
# same on every run; n_init is spelled out because its default differs between
# scikit-learn releases.
k_means_clf = KMeans(n_clusters=4, n_init=10, random_state=0)
k_means_clf.fit(X)

In [None]:
# Same scatter as the "True Classes" figure, coloured by the K-Means labels.
plt.scatter(X[:, 0], X[:, 1], c=k_means_clf.labels_)
plt.xlabel('$X_1$', size=16)
plt.xlim(x_min - 0.1 * (x_max - x_min), x_max + 0.1 * (x_max - x_min))
# The y label is set here too, so this figure is labelled like the "True Classes" one.
plt.ylabel('$X_2$', size=16)
plt.ylim(y_min - 0.1 * (y_max - y_min), y_max + 0.1 * (y_max - y_min))
plt.title("Found Classes", size=16)
plt.show()


##### Why are the colors different?

#### Let's look at the model properties:

In [None]:
# Load the kmeans_properties figure from disk; as the cell's last expression it is rendered as the output.
Image(filename='figs/kmeans_properties.PNG')

#### What does k_means_clf.predict() do??

#### What does k_means_clf.score(X) do??

### Elbow plot

In [None]:
# Collect the K-Means inertia on X for k = 1..19; the next cell plots these
# values against k.
inertia = []
list_k = list(range(1, 20))

for k in list_k:
    # Seeded, with n_init spelled out, so the curve is reproducible and does
    # not depend on the installed scikit-learn release's default.
    km = KMeans(n_clusters=k, n_init=10, random_state=0)
    km.fit(X)
    inertia.append(km.inertia_)

In [None]:
# Elbow plot: inertia as a function of the number of clusters, one tick per k.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(list_k, inertia, '-o')
ax.set_xticks(list_k)
ax.set_xlabel(r'Number of clusters', size=16)
ax.set_ylabel('Cluster Inertia', size=16)

### Silhouette Analysis

$i$ .... a datapoint

$a(i)$......average distance $d(i,j)$ between $i$ and all other datapoints $j$ in the _same_ cluster

$b(i)$......smallest average distance between $i$ and the datapoints of any _other_ cluster, i.e. the mean distance $d(i,h)$ to all datapoints $h$ in the nearest neighboring cluster

# $s(i) = \frac{b(i)-a(i)}{\max\{a(i),b(i)\}}$

# $s = mean(s(i))$

In [None]:
# Cluster X into K groups and compute the silhouette coefficient of every sample.
# The model is seeded (and n_init spelled out) so the labels, and therefore the
# silhouette plot built from them, are the same on every run.
K = 3
km = KMeans(n_clusters=K, n_init=10, random_state=0)
labels = km.fit_predict(X)

silhouette_vals = silhouette_samples(X, labels)

In [None]:
# Map every cluster id to the silhouette coefficients of its members,
# sorted in ascending order.
result = {
    cluster_id: np.sort(silhouette_vals[labels == cluster_id])
    for cluster_id in range(K)
}

In [None]:
# Silhouette plot: one horizontal bar per sample, stacked cluster by cluster.
plt.figure(figsize=(5, 5))
y_bottom = 0
y_top = 0
y_ticks = []
for k in range(K):
    # Rows y_bottom .. y_top-1 belong to cluster k.
    y_top += len(result[k])
    # Put the cluster's tick label at the middle of its band.
    y_ticks.append(np.mean([y_bottom, y_top]))
    plt.barh(range(y_bottom, y_top), result[k], edgecolor='none', height=1)
    y_bottom = y_top
# Axis label spelling corrected ("Silhouette").
plt.xlabel("Silhouette coefficient", size=14)
# Dashed line marks the mean silhouette coefficient over all samples.
plt.axvline(x=np.mean(silhouette_vals), linestyle='--', linewidth=3, color='k')
plt.yticks(y_ticks, ["cluster " + str(k) for k in range(K)], size=14)
plt.title("{} Clusters ".format(K), size=18)

#### We can use the silhouette score to find the optimal number of clusters

In [None]:
# Mean silhouette coefficient of K-Means on X for k = 2..9.
# (The list keeps its existing name because the plotting cell reads it.)
silouette_score = []
list_k = list(range(2, 10))

for k in list_k:
    # Seeded, with n_init spelled out, so the scores are reproducible.
    km = KMeans(n_clusters=k, n_init=10, random_state=0)
    # Loop-local name: the globals `labels` / `silhouette_vals` belong to the
    # K-cluster silhouette plot and must not be overwritten here, otherwise
    # re-running those cells would mix K clusters with labels from another k.
    labels_k = km.fit_predict(X)
    silouette_score.append(np.mean(silhouette_samples(X, labels_k)))

In [None]:
# Mean silhouette score as a function of the number of clusters, one tick per k.
plt.figure(figsize=(8, 5))
plt.plot(list_k, silouette_score, '-o')
plt.xticks(list_k)
plt.xlabel(r'Number of clusters', size=16)
# Axis label spelling corrected ("Silhouette").
plt.ylabel('Silhouette Score', size=16)

### Let's try k-means on the Charity dataset

In [None]:
# Read the charity CSV into a DataFrame; the path is relative to the working directory.
data_train = pd.read_csv("clean_charity_ml.csv")

##### first we need to reduce the dataset. Why?

In [None]:
# Feature matrix: the five selected columns. Target: the "income" column.
feature_columns = ["age", "education-num", "capital-gain", "capital-loss", "hours-per-week"]
X_train = data_train[feature_columns]
y_train = data_train["income"]

In [None]:
# Fit a 2-cluster K-Means on the selected features and keep the cluster id of every row.
# NOTE(review): the next code cell builds and fits a 2-cluster model on X_train again,
# reusing the names km_clf / y_pred — confirm whether both cells are needed.
km_clf = KMeans(n_clusters=2)
y_pred = km_clf.fit_predict(X_train)

##### make two clusters and interpret the results

In [None]:
# Fit a 2-cluster K-Means on the selected features and keep the cluster id of every row.
# Which cluster is called 0 and which 1 depends on the initialisation, so the
# model is seeded to keep the centroids, accuracy and confusion matrix computed
# below identical from run to run. n_init is spelled out because its default
# differs between scikit-learn releases.
km_clf = KMeans(n_clusters=2, n_init=10, random_state=0)
y_pred = km_clf.fit_predict(X_train)

In [None]:
# Display the fitted centroids (indexed below as [cluster, feature]).
km_clf.cluster_centers_

In [None]:
# One line per feature: the column name followed by its coordinate in the
# centroid of cluster 0 and of cluster 1.
row_template = "{:15}:{:.2f}\t {:.2f}"
for col, (center_0, center_1) in zip(X_train, km_clf.cluster_centers_.T):
    print(row_template.format(col, center_0, center_1))

In [None]:
# Fraction of rows whose K-Means cluster id equals the income label.
# NOTE(review): cluster ids are assigned arbitrarily, so cluster 0/1 need not
# correspond to income 0/1 — presumably income is encoded as 0/1; TODO confirm.
accuracy_score(y_train, y_pred)

In [None]:
# Flatten the confusion matrix (income label vs. cluster id) and unpack its four cells.
# Assumes the matrix is 2x2, i.e. exactly two distinct labels overall — TODO confirm.
tn, fp, fn, tp = confusion_matrix(y_train, y_pred).ravel()

In [None]:
#Precision = 
#Recall = 

### When does K-Means not work?

#### K means relies on several strong assumptions: 

- that the clusters are circular
- that the clusters have the same variance
- that the clusters have the same size

In [None]:
random_state = 170
n_samples = 1500

# Base data set: blobs drawn with make_blobs' default centres and spread.
X, y = make_blobs(n_samples=n_samples, random_state=random_state)

# Non-circular clusters: stretch and shear the base blobs with a linear map.
transformation = [[0.608, -0.636], [-0.4088, 0.8525]]
X_ellipses = X.dot(transformation)

# Unequal variance: each blob gets its own standard deviation.
X_varied, y_varied = make_blobs(
    n_samples=n_samples,
    cluster_std=[1.0, 2.5, 0.5],
    random_state=random_state,
)

# Unevenly sized blobs: keep the first 500 / 100 / 10 points of blobs 0 / 1 / 2.
kept_per_blob = (500, 100, 10)
X_filtered = np.vstack(
    [X[y == blob][:keep] for blob, keep in enumerate(kept_per_blob)]
)
# Matching labels: 500 zeros, then 100 ones, then 10 twos (as floats).
y_filtered = np.repeat([0.0, 1.0, 2.0], kept_per_blob)

In [None]:
# Re-fit one seeded 3-cluster K-Means on each data set in turn and keep the
# labels from every fit; km_clf ends up fitted to the last data set.
km_clf = KMeans(n_clusters=3, random_state=random_state)
y_pred_ellipse, y_pred_uneq_var, y_pred_uneq_size = [
    km_clf.fit_predict(dataset)
    for dataset in (X_ellipses, X_varied, X_filtered)
]

In [None]:
# 2 x 3 grid: the top row shows each data set coloured by its true labels,
# the bottom row shows the same points coloured by the K-Means labels.
fig, (true_row, found_row) = plt.subplots(2, 3, figsize=(8, 8))

true_row[0].scatter(X_ellipses[:, 0], X_ellipses[:, 1], c=y)
true_row[0].set_title("Anisotropicly Distributed Blobs")

true_row[1].scatter(X_varied[:, 0], X_varied[:, 1], c=y_varied)
true_row[1].set_title("Unequal Variance")

true_row[2].scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_filtered)
true_row[2].set_title("Unevenly Sized Blobs")

found_row[0].scatter(X_ellipses[:, 0], X_ellipses[:, 1], c=y_pred_ellipse)

found_row[1].scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred_uneq_var)

found_row[2].scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred_uneq_size)


# Hierarchical clustering

In [None]:
# Render the agglomerative-clustering illustration, then print where it came from.
display(Image(filename='figs/AggClust.PNG'))
print("image is from https://www.displayr.com/what-is-hierarchical-clustering/")

In [None]:
# Build two noisy concentric rings (radius 1 and radius 0.5) for comparing
# K-Means with agglomerative clustering.
# A seeded generator replaces the global np.random state so the rings are
# reproducible, and the coordinates are kept in ring-specific names so the
# blob labels `y` used by the earlier plots are not overwritten.
rng = np.random.RandomState(random_state)

# Angles along the rings; both rings share the same angles.
t = 1.5 * np.pi * (1 + 3 * rng.rand(n_samples))

# Each coordinate gets its own uniform noise in [0, 0.1).
outer_ring = np.column_stack((
    np.cos(t) + 0.1 * rng.rand(n_samples),
    np.sin(t) + 0.1 * rng.rand(n_samples),
))
inner_ring = np.column_stack((
    0.5 * np.cos(t) + 0.1 * rng.rand(n_samples),
    0.5 * np.sin(t) + 0.1 * rng.rand(n_samples),
))

# Rows 0..n_samples-1 are the outer ring, the remaining rows the inner ring.
X_circles = np.vstack((outer_ring, inner_ring))

# True labels: 1 for the outer ring, 0 for the inner ring.
y_circles = np.zeros(2 * n_samples)
y_circles[:n_samples] = 1

# Names read by the clustering and plotting cells that follow.
data = X_circles
labels = y_circles
N_clusters = 2

In [None]:
# K-Means on the ring data. Seeded with the notebook's random_state, like the
# preceding 3-cluster fits, so the labels in the comparison figure are
# reproducible; n_init is spelled out because its default differs between
# scikit-learn releases.
km_clf = KMeans(n_clusters=N_clusters, n_init=10, random_state=random_state)
y_pred_kMeans = km_clf.fit_predict(data)

In [None]:
# Single-linkage agglomerative clustering on the ring data; keep the label of every point.
ac_clf = AgglomerativeClustering(linkage='single', n_clusters=N_clusters)
y_pred_Agg = ac_clf.fit(data).labels_

In [None]:
# Three side-by-side scatters of the same points, coloured by the true labels,
# the K-Means labels and the agglomerative-clustering labels.
fig, axes = plt.subplots(1, 3, figsize=(10, 5))

panels = (
    ("True Labels", labels),
    ("KMeans Labels", y_pred_kMeans),
    ("AggClustering Labels", y_pred_Agg),
)
for ax, (panel_title, colours) in zip(axes, panels):
    ax.scatter(data[:, 0], data[:, 1], c=colours)
    ax.set_title(panel_title, size=16)

plt.show()

### Try agglomerative clustering on the charity dataset

In [None]:
# Re-read the charity CSV for the agglomerative-clustering section.
# Same working-directory-relative path as the earlier load of this file, so
# both loads resolve from the same working directory.
data_train = pd.read_csv("clean_charity_ml.csv")

In [None]:
# Feature matrix: the five selected columns. Target: the "income" column.
selected_columns = ["age", "education-num", "capital-gain", "capital-loss", "hours-per-week"]
X_train = data_train[selected_columns]
y_train = data_train["income"]

In [None]:
# Agglomerative clustering of the charity features into two groups;
# y_pred holds the cluster id of every row.
model = AgglomerativeClustering(n_clusters=2)
y_pred = model.fit(X_train).labels_

In [None]:
# Fraction of rows whose cluster id equals the income label.
# Arguments follow accuracy_score's (y_true, y_pred) order, matching the
# K-Means evaluation earlier in the notebook.
accuracy_score(y_train, y_pred)

#### An illustration of Agglomerative Clustering

In [None]:
import scipy.cluster.hierarchy as shc

In [None]:
# Draw 100 rows for the dendrogram. Seeded with the notebook's random_state so
# the same rows, and therefore the same tree, are produced on every run.
X_small = X_train.sample(100, random_state=random_state)

In [None]:
# Income labels of the sampled rows, looked up by the sample's index labels.
labels = y_train.loc[X_small.index]

In [None]:
# Dendrogram of the sampled rows using Ward linkage; every leaf is annotated
# with the income label of its row. The figure is also written to disk.
plt.figure(figsize=(15, 10))
plt.title("Dendrogram of Charity Data", size=16)
linkage_matrix = shc.linkage(X_small, method='ward')
# No `p` argument: scipy only consults it when truncate_mode is set, and the
# full (untruncated) tree is drawn here.
dend = shc.dendrogram(linkage_matrix, labels=list(labels))
plt.savefig('dendogram.png', bbox_inches='tight', dpi=300)

### resources

- A good overview of scikit-learn's clustering algorithms: https://scikit-learn.org/stable/modules/clustering.html#k-means
- When k-means breaks down https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_assumptions.html#sphx-glr-auto-examples-cluster-plot-kmeans-assumptions-py
- https://towardsdatascience.com/k-means-clustering-algorithm-applications-evaluation-methods-and-drawbacks-aa03e644b48a