<a href="https://colab.research.google.com/github/AjayNRG/Machine_Learning_Algorithms_Giuseppe_Bonnacorso/blob/master/ClusteringFundamentals.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Clustering Basics

#A. K means

In [0]:
from sklearn.datasets import make_blobs
nb_samples = 1000
X, _ = make_blobs(n_samples=nb_samples, n_features=2, centers=3,
cluster_std=1.5)

In [2]:
from sklearn.cluster import KMeans

km = KMeans(n_clusters=3)
km.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [3]:
print(km.cluster_centers_)

[[-1.34771499 -7.03948581]
 [ 6.89624861  1.76430318]
 [ 2.8894315   0.50068243]]


In [0]:
from sklearn.datasets import make_circles

nb_samples = 1000
X, Y = make_circles(n_samples=nb_samples, noise=0.05)

In [5]:
km = KMeans(n_clusters=2)
km.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

####Finding the optimal number of clusters

######1.Optimizing the inertia

In [0]:
nb_clusters = [2, 3, 5, 6, 7, 8, 9, 10]
inertias = []

for n in nb_clusters:
  km = KMeans(n_clusters=n)
  km.fit(X)
  inertias.append(km.inertia_)

######2.Silhouette score

In [0]:
from sklearn.metrics import silhouette_score
nb_clusters = [2, 3, 5, 6, 7, 8, 9, 10]
avg_silhouettes = []
for n in nb_clusters:
  km = KMeans(n_clusters=n)
  Y = km.fit_predict(X)
  avg_silhouettes.append(silhouette_score(X, Y))

######3.Calinski-Harabasz index

In [8]:
from sklearn.metrics import calinski_harabaz_score
nb_clusters = [2, 3, 5, 6, 7, 8, 9, 10]
ch_scores = []
km = KMeans(n_clusters=n)
Y = km.fit_predict(X)
for n in nb_clusters:
 km = KMeans(n_clusters=n)
 Y = km.fit_predict(X)
 ch_scores.append(calinski_harabaz_score(X, Y))



######4. Cluster instability

In [0]:
import numpy as np
nb_noisy_datasets = 4
X_noise = []
for _ in range(nb_noisy_datasets):
  Xn = np.ndarray(shape=(1000, 2))
  for i, x in enumerate(X):
    if np.random.uniform(0, 1) < 0.25:
      Xn[i] = X[i] + np.random.uniform(-2.0, 2.0)
    else:
      Xn[i] = X[i]
  X_noise.append(Xn)

#B. DBSCAN

In [0]:
from sklearn.datasets import make_moons
>>> nb_samples = 1000
>>> X, Y = make_moons(n_samples=nb_samples, noise=0.05)

In [0]:
from sklearn.cluster import DBSCAN
>>> dbs = DBSCAN(eps=0.1)
>>> Y = dbs.fit_predict(X)

#C. Spectral clustering

In [0]:
from sklearn.cluster import SpectralClustering
>>> Yss = []
>>> gammas = np.linspace(0, 12, 4)

for gamma in gammas:
  sc = SpectralClustering(n_clusters=2, affinity='rbf', gamma=gamma)
  Yss.append(sc.fit_predict(X))

In [14]:
sc = SpectralClustering(n_clusters=2, affinity='nearest_neighbors')
Ys = sc.fit_predict(X)



#Evaluation methods based 
#on the ground truth

#A. Homogeneity

In [15]:
from sklearn.metrics import homogeneity_score
km = KMeans(n_clusters=4)
Yp = km.fit_predict(X)
print(homogeneity_score(Y, Yp))

0.5107459767162363


#B.Completeness

In [16]:
from sklearn.metrics import completeness_score
km = KMeans(n_clusters=4)
Yp = km.fit_predict(X)
print(completeness_score(Y, Yp))

0.26132922801915814


#C. Adjusted rand index

In [17]:
from sklearn.metrics import adjusted_rand_score
km = KMeans(n_clusters=4)
Yp = km.fit_predict(X)
print(adjusted_rand_score(Y, Yp))

0.24611425203592058
