# Multiple Clustering Models

## Setting up helper functions

In [20]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 

In [21]:
from sklearn import metrics

from sklearn.cluster import KMeans, AgglomerativeClustering,DBSCAN,MeanShift,Birch,AffinityPropagation, MiniBatchKMeans

import warnings

# Ignore every warning category from this point on.
# NOTE(review): this also hides any warnings the clustering estimators used
# below might emit — confirm that silencing all of them is intended.
warnings.filterwarnings('ignore')

In [22]:
# Load the iris measurements, replacing the file's first line with our own
# column names.
iris_df = pd.read_csv(
    'datasets/iris.csv',
    names=['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class'],
    skiprows=1,  # skip the file's first line; `names` supplies the headers
)

iris_df.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [23]:
# Shuffle the rows. A fixed random_state makes the shuffled order identical
# on every fresh "Restart & Run All"; without it each run yields a new order.
iris_df= iris_df.sample(frac=1, random_state=42).reset_index(drop=True)
iris_df.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,6.4,2.8,5.6,2.1,Iris-virginica
1,5.2,3.5,1.5,0.2,Iris-setosa
2,7.2,3.0,5.8,1.6,Iris-virginica
3,5.2,4.1,1.5,0.1,Iris-setosa
4,4.6,3.2,1.4,0.2,Iris-setosa


In [24]:
# Sanity check: (number of rows, number of columns) of the frame.
iris_df.shape

(150, 5)

In [25]:
from sklearn import preprocessing

# Replace the string class names with integer codes, in place on `iris_df`.
lable_encoding = preprocessing.LabelEncoder()
lable_encoding.fit(iris_df['class'])
iris_df['class'] = lable_encoding.transform(iris_df['class']).astype(int)

iris_df.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,6.4,2.8,5.6,2.1,2
1,5.2,3.5,1.5,0.2,0
2,7.2,3.0,5.8,1.6,2
3,5.2,4.1,1.5,0.1,0
4,4.6,3.2,1.4,0.2,0


In [26]:
# Feature matrix: every column except 'class'.
iris_df_features = iris_df.drop(columns=['class'])

iris_df_features.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width
0,6.4,2.8,5.6,2.1
1,5.2,3.5,1.5,0.2
2,7.2,3.0,5.8,1.6
3,5.2,4.1,1.5,0.1
4,4.6,3.2,1.4,0.2


In [27]:
# The 'class' column on its own, used as the reference labels when scoring.
iris_df_labels = iris_df.loc[:, 'class']
iris_df_labels.head()

0    2
1    0
2    2
3    0
4    0
Name: class, dtype: int64

In [28]:
def build_model(clustering_model,data,labels):
    """Fit a clustering model and print a one-row table of quality scores.

    Parameters
    ----------
    clustering_model : callable
        Called as ``clustering_model(data)``; must return a fitted object
        that exposes its cluster assignments as ``labels_``.
    data : array-like
        Feature matrix passed to ``clustering_model`` and to the silhouette
        computation.
    labels : array-like
        Reference labels that the cluster assignments are scored against.

    Returns
    -------
    None
        The scores are printed, not returned.

    Notes
    -----
    ``metrics.silhouette_score`` raises ``ValueError`` unless the number of
    distinct cluster labels lies between 2 and ``n_samples - 1``. Models
    whose cluster count is data-driven can fall outside that range (for
    example when every point receives the same label), so ``nan`` is printed
    in the silhouette column in that case instead of aborting the report.
    """
    model= clustering_model(data)
    assigned = model.labels_

    # Guard the only metric that is undefined for degenerate clusterings.
    n_distinct = len(set(assigned))
    if 1 < n_distinct < len(assigned):
        silhouette = metrics.silhouette_score(data, assigned)
    else:
        silhouette = float('nan')

    print("homo\tcomp1\tv-meas\tARI\tAMI\tsilhouette")

    print(50 * "-")

    print('%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'%(
        metrics.homogeneity_score(labels, assigned),
        metrics.completeness_score(labels, assigned),
        metrics.v_measure_score(labels, assigned),
        metrics.adjusted_rand_score(labels, assigned),
        metrics.adjusted_mutual_info_score(labels, assigned),
        silhouette
    ))

In [29]:
def k_mean(data, n_clusters=3,max_iter=1000,random_state=42):
    """Fit K-Means on ``data`` and return the fitted estimator.

    Parameters
    ----------
    data : array-like
        Samples to cluster.
    n_clusters : int, default 3
        Number of clusters to form.
    max_iter : int, default 1000
        Iteration cap passed to ``KMeans``.
    random_state : int or None, default 42
        Seed for the centroid initialisation. A fixed default keeps the
        reported scores identical across runs; pass ``None`` to get a fresh
        random initialisation each time.

    Returns
    -------
    KMeans
        The estimator after ``fit(data)``.
    """
    return KMeans(
        n_clusters=n_clusters,
        max_iter=max_iter,
        random_state=random_state,
    ).fit(data)

In [30]:
# Score K-Means against the reference labels.
build_model(
    clustering_model=k_mean,
    data=iris_df_features,
    labels=iris_df_labels,
)

homo	comp1	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.751	0.765	0.758	0.730	0.755	0.553


## Agglomerative Clustering 

Bottom-up hierarchical clustering approach which recursively merges pairs of clusters, starting with single point clusters

In [31]:
def agglomeratie_fn(data, n_clusters=3):
    """Run agglomerative clustering on ``data`` and return the fitted estimator.

    ``n_clusters`` is forwarded unchanged to ``AgglomerativeClustering``.
    """
    clusterer = AgglomerativeClustering(n_clusters=n_clusters)
    fitted = clusterer.fit(data)
    return fitted

In [32]:
# Score agglomerative clustering against the reference labels.
build_model(
    clustering_model=agglomeratie_fn,
    data=iris_df_features,
    labels=iris_df_labels,
)

homo	comp1	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.761	0.780	0.770	0.731	0.767	0.554


## DBSCAN

`D`ensity-`B`ased `S`patial `C`lustering of `A`pplications with `N`oise

**Two Parameters of DBSCAN**:

1. *eps* : Maximum distance between two points for them to be considered neighbors (points closer than this are neighbors)
    - if too small, most of the data will not be clustered
    - if too large, the clustering will be too coarse
2. *min_samples* : Minimum number of points to form a dense region
    - Generally this should be greater than number of dimensions in the data
    - Large values better for noisy data points



In [33]:
def dbscan_fn(data, eps=0.45,min_samples=4):
    """Run DBSCAN on ``data`` and return the fitted estimator.

    ``eps`` and ``min_samples`` are forwarded unchanged to ``DBSCAN``.
    """
    settings = {'eps': eps, 'min_samples': min_samples}
    return DBSCAN(**settings).fit(data)

In [34]:
# Score DBSCAN against the reference labels.
build_model(
    clustering_model=dbscan_fn,
    data=iris_df_features,
    labels=iris_df_labels,
)

homo	comp1	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.577	0.609	0.593	0.508	0.584	0.372


## Mean Shift clustering

In [35]:
def mean_shift_fn(data,bandwidth=0.85):
    """Run Mean Shift on ``data`` and return the fitted estimator.

    ``bandwidth`` is forwarded unchanged to ``MeanShift``.
    """
    estimator = MeanShift(bandwidth=bandwidth)
    return estimator.fit(data)

In [36]:
# Score Mean Shift against the reference labels.
build_model(
    clustering_model=mean_shift_fn,
    data=iris_df_features,
    labels=iris_df_labels,
)

homo	comp1	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.760	0.772	0.766	0.744	0.763	0.551


## BIRCH Clustering

`B`alanced `I`terative `R`educing and `C`lustering using `H`ierarchies

In [37]:
def brich_fn(data,n_clusters=3):
    """Run BIRCH on ``data`` and return the fitted estimator.

    ``n_clusters`` is forwarded unchanged to ``Birch``.
    """
    birch = Birch(n_clusters=n_clusters)
    birch = birch.fit(data)
    return birch

In [38]:
# Score BIRCH against the reference labels.
build_model(
    clustering_model=brich_fn,
    data=iris_df_features,
    labels=iris_df_labels,
)

homo	comp1	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.630	0.798	0.704	0.564	0.700	0.534


## Affinity Propagation Clustering



In [39]:
def affinity_propagation_fn(data,damping=0.6,max_iter=1000):
    """Run Affinity Propagation on ``data`` and return the fitted estimator.

    ``damping`` and ``max_iter`` are forwarded unchanged to
    ``AffinityPropagation``.
    """
    propagation = AffinityPropagation(damping=damping, max_iter=max_iter)
    return propagation.fit(data)

# Score Affinity Propagation against the reference labels.
build_model(
    clustering_model=affinity_propagation_fn,
    data=iris_df_features,
    labels=iris_df_labels,
)

homo	comp1	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.851	0.492	0.623	0.437	0.612	0.349


## Mini-Batch K-Means

In [43]:
def mini_batch_kmeans_fn(data,n_clusters=3,max_iter=1000,random_state=42):
    """Fit Mini-Batch K-Means on ``data`` and return the fitted estimator.

    Parameters
    ----------
    data : array-like
        Samples to cluster.
    n_clusters : int, default 3
        Number of clusters to form.
    max_iter : int, default 1000
        Iteration cap passed to ``MiniBatchKMeans``.
    random_state : int or None, default 42
        Seed for centroid initialisation and mini-batch sampling. A fixed
        default keeps the reported scores identical across runs; pass
        ``None`` for a fresh random draw each time.

    Returns
    -------
    MiniBatchKMeans
        The estimator after ``fit(data)``. The batch size is fixed at 20.
    """
    return MiniBatchKMeans(
        n_clusters=n_clusters,
        max_iter=max_iter,
        batch_size=20,
        random_state=random_state,
    ).fit(data)

# Score Mini-Batch K-Means against the reference labels.
build_model(
    clustering_model=mini_batch_kmeans_fn,
    data=iris_df_features,
    labels=iris_df_labels,
)

homo	comp1	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.751	0.765	0.758	0.730	0.755	0.553


## Spectral Clustering

In [50]:
from sklearn.cluster import SpectralClustering

# Self-similarity: the similarity of a data point with itself
# (used on the diagonal of the similarity matrix defined in the next cell).
SS=1000

# Intra-cluster similarity: between two different points of the same cluster.
IS=10
# Low similarity: between points that belong to different clusters.
LS=0.01

In [51]:
# 9 points arranged as 3 consecutive groups of 3:
#   SS on the diagonal, IS between distinct points of the same group,
#   LS between points of different groups.
similarity_mat = [
    [SS if row == col else IS if row // 3 == col // 3 else LS
     for col in range(9)]
    for row in range(9)
]

In [52]:
# Cluster straight from the hand-built similarity matrix: affinity='precomputed'
# means the input is used as the affinity matrix rather than as raw features.
# random_state seeds the randomised steps of the algorithm so the label
# numbering does not change between runs.
spectural_model= SpectralClustering(
    n_clusters=3,
    affinity='precomputed',
    random_state=42,
).fit(similarity_mat)

In [53]:
# Cluster index assigned to each point, in the row order of `similarity_mat`.
spectural_model.labels_

array([0, 0, 0, 1, 1, 1, 2, 2, 2], dtype=int32)