In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn import metrics

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import MeanShift
from sklearn.cluster import Birch
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import MiniBatchKMeans

import warnings
warnings.filterwarnings("ignore")

In [5]:
iris_df = pd.read_csv('datasets/iris/iris.data', skiprows=1,
                     names = ['sepal-length',
                             'sepal-width',
                             'petal-length',
                             'petal-width',
                             'class'])

iris_df.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


In [7]:
#shuffling the 150 records in this dataset, why? - to make sure that the Iris flowers are scattered in a random 
#                                                  order

iris_df = iris_df.sample(frac=1).reset_index(drop=True)

iris_df.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,6.4,2.9,4.3,1.3,Iris-versicolor
1,5.0,2.3,3.3,1.0,Iris-versicolor
2,4.9,3.1,1.5,0.1,Iris-setosa
3,5.4,3.4,1.5,0.4,Iris-setosa
4,5.5,4.2,1.4,0.2,Iris-setosa


In [8]:
iris_df.shape

(149, 5)

In [14]:
from sklearn import preprocessing

label_encoding = preprocessing.LabelEncoder()

# converting the class column from string to integer type for processing in ML
iris_df['class'] = label_encoding.fit_transform(iris_df['class']).astype(str)

iris_df.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,6.4,2.9,4.3,1.3,1
1,5.0,2.3,3.3,1.0,1
2,4.9,3.1,1.5,0.1,0
3,5.4,3.4,1.5,0.4,0
4,5.5,4.2,1.4,0.2,0


In [15]:
# Let's store the Iris features in a separate dataframe, without the class column

iris_features = iris_df.drop('class', axis=1)

iris_features.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width
0,6.4,2.9,4.3,1.3
1,5.0,2.3,3.3,1.0
2,4.9,3.1,1.5,0.1
3,5.4,3.4,1.5,0.4
4,5.5,4.2,1.4,0.2


In [13]:
# Let's store the labels separately as iris_labels in a pandas series object

iris_labels = iris_df['class']

iris_labels.sample(5)

77     1
24     2
115    2
41     0
39     0
Name: class, dtype: object

In [20]:
# building a model

def build_model(clustering_model, data, labels):
    
    model = clustering_model(data)
    
    print('homo\tcompl\tv-means\tARI\tAMI\tsilhouette')
    
    print(50 * '-')
    
    print('%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          %(metrics.homogeneity_score(labels, model.labels_),
           metrics.completeness_score(labels, model.labels_),
           metrics.v_measure_score(labels, model.labels_),
           metrics.adjusted_rand_score(labels, model.labels_),
           metrics.adjusted_mutual_info_score(labels, model.labels_),
           metrics.silhouette_score(data, model.labels_)))

In [21]:
# Now, let's instantiate KMeans clustering model

# here there will be 3 clusters - because it is that many types of Iris flowers we have in the dataset
# All this function does is to instantiate the KMeans estimator object, takes in the data, cluster count and 
# maximum number of iterations and call fit on the underlying data

def k_means(data, n_clusters=3, max_iter=1000):
    model = KMeans(n_clusters, max_iter=max_iter).fit(data)
    
    return model

In [22]:
# Now we are calling the build_model function we wrote earlier and passing the model type, features and labels

build_model(k_means, iris_features, iris_labels)

homo	compl	v-means	ARI	AMI	silhouette
--------------------------------------------------
0.750	0.763	0.757	0.727	0.753	0.550
