In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn import metrics

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import MeanShift
from sklearn.cluster import Birch
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import MiniBatchKMeans

import warnings
warnings.filterwarnings("ignore")

In [5]:
iris_df = pd.read_csv('datasets/iris/iris.data', skiprows=1,
                     names = ['sepal-length',
                             'sepal-width',
                             'petal-length',
                             'petal-width',
                             'class'])

iris_df.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


In [7]:
#shuffling the 150 records in this dataset, why? - to make sure that the Iris flowers are scattered in a random 
#                                                  order

iris_df = iris_df.sample(frac=1).reset_index(drop=True)

iris_df.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,6.4,2.9,4.3,1.3,Iris-versicolor
1,5.0,2.3,3.3,1.0,Iris-versicolor
2,4.9,3.1,1.5,0.1,Iris-setosa
3,5.4,3.4,1.5,0.4,Iris-setosa
4,5.5,4.2,1.4,0.2,Iris-setosa


In [8]:
iris_df.shape

(149, 5)

In [14]:
from sklearn import preprocessing

label_encoding = preprocessing.LabelEncoder()

# converting the class column from string to integer type for processing in ML
iris_df['class'] = label_encoding.fit_transform(iris_df['class']).astype(str)

iris_df.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,6.4,2.9,4.3,1.3,1
1,5.0,2.3,3.3,1.0,1
2,4.9,3.1,1.5,0.1,0
3,5.4,3.4,1.5,0.4,0
4,5.5,4.2,1.4,0.2,0


In [15]:
# Let's store the Iris features in a separate dataframe, without the class column

iris_features = iris_df.drop('class', axis=1)

iris_features.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width
0,6.4,2.9,4.3,1.3
1,5.0,2.3,3.3,1.0
2,4.9,3.1,1.5,0.1
3,5.4,3.4,1.5,0.4
4,5.5,4.2,1.4,0.2


In [13]:
# Let's store the labels separately as iris_labels in a pandas series object

iris_labels = iris_df['class']

iris_labels.sample(5)

77     1
24     2
115    2
41     0
39     0
Name: class, dtype: object

In [20]:
# building a model

def build_model(clustering_model, data, labels):
    
    model = clustering_model(data)
    
    print('homo\tcompl\tv-means\tARI\tAMI\tsilhouette')
    
    print(50 * '-')
    
    print('%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          %(metrics.homogeneity_score(labels, model.labels_),
           metrics.completeness_score(labels, model.labels_),
           metrics.v_measure_score(labels, model.labels_),
           metrics.adjusted_rand_score(labels, model.labels_),
           metrics.adjusted_mutual_info_score(labels, model.labels_),
           metrics.silhouette_score(data, model.labels_)))

In [21]:
# Now, let's instantiate KMeans clustering model

# here there will be 3 clusters - because it is that many types of Iris flowers we have in the dataset
# All this function does is to instantiate the KMeans estimator object, takes in the data, cluster count and 
# maximum number of iterations and call fit on the underlying data

def k_means(data, n_clusters=3, max_iter=1000):
    model = KMeans(n_clusters, max_iter=max_iter).fit(data)
    
    return model

In [22]:
# Now we are calling the build_model function we wrote earlier and passing the model type, features and labels

build_model(k_means, iris_features, iris_labels)

homo	compl	v-means	ARI	AMI	silhouette
--------------------------------------------------
0.750	0.763	0.757	0.727	0.753	0.550


In [23]:
# Agglomerative clustering - for notes refer the Notability note on iPad

def agglomerative_fn(data, n_clusters=3):
    model = AgglomerativeClustering(n_clusters = n_clusters).fit(data)
    
    return model

In [24]:
build_model(agglomerative_fn, iris_features, iris_labels)

homo	compl	v-means	ARI	AMI	silhouette
--------------------------------------------------
0.759	0.778	0.769	0.728	0.766	0.552


In [25]:
# here you can note from the above comparisons, that KMeans and Agglomerative did about the same

In [41]:
# Implementing DBSCAN Clustering (DBSCAN - Density Based Spatial Clustering of Applications with Noise)
# eps determines what we consider a dense region - smaller values are usually preferred
def dbscan_fn(data, eps=0.45, min_samples=4):
    model = DBSCAN(eps=eps, min_samples=min_samples).fit(data)
    return model

In [42]:
build_model(dbscan_fn, iris_features, iris_labels)

homo	compl	v-means	ARI	AMI	silhouette
--------------------------------------------------
0.574	0.607	0.590	0.503	0.581	0.369


In [43]:
# from the above output, you can see that DBSCAN didn't come close to the 2 other models before

In [44]:
# Mean-shift clustering

# bandwidth is a hyperparameter

def mean_shift_fn(data, bandwidth=0.85):
    model = MeanShift(bandwidth=bandwidth).fit(data)
    
    return model

In [45]:
build_model(mean_shift_fn, iris_features, iris_labels)

homo	compl	v-means	ARI	AMI	silhouette
--------------------------------------------------
0.759	0.770	0.764	0.740	0.761	0.549


In [46]:
# Mean shift cluster will try to discover blobs in a smooth cluster of data points
# Under the hood, this technique will try to discover the original seeds of the cluster by a binning technique

In [47]:
# BIRCH clustering implementation

def birch_fn(data, n_clusters=3):
    model = Birch(n_clusters=n_clusters).fit(data)
    
    return model;

In [48]:
build_model(birch_fn, iris_features, iris_labels)

homo	compl	v-means	ARI	AMI	silhouette
--------------------------------------------------
0.712	0.750	0.730	0.657	0.727	0.512


In [53]:
# Affinity Propagation Implementation

# damping is a hyperparameter here. It defines the extent to which the current value of a particular data point is
# maintained relative to incoming values. It is a learning rate for this algorithm.
def affinity_propagation_fn(data, damping=0.6, max_iter=1000):
    model = AffinityPropagation(damping=damping, max_iter=max_iter).fit(data)
    
    return model

In [54]:
build_model(affinity_propagation_fn, iris_features, iris_labels)

homo	compl	v-means	ARI	AMI	silhouette
--------------------------------------------------
0.850	0.491	0.622	0.435	0.611	0.349


In [56]:
# As per above socres, high homogeneity score means that every cluster contains members of all the classes,
# but all members of a class do not lie in the same cluster, that is why completeness score is low

In [60]:
# Mini-batch KMeans clustering function implementation

# This function/algorithm performs K-means clustering on random subsets of data rather than the entire dataset

# This runs much faster than the K-means algorithm on very large datasets. Here we provide one additional param
# than the regular K-means algorithm, which is batch_size
def mini_batch_kmeans_fn(data, n_clusters=3, max_iter=1000):
    model = MiniBatchKMeans(n_clusters=n_clusters, max_iter=max_iter, batch_size=20).fit(data)
    
    return model

In [58]:
build_model(mini_batch_kmeans_fn, iris_features, iris_labels)

homo	compl	v-means	ARI	AMI	silhouette
--------------------------------------------------
0.750	0.763	0.757	0.727	0.753	0.550


In [61]:
# Performance if you compare with K-means clustering is only slightly different. Overall mini batch performs 
# almost as good as k-means