# 🌸 Clustering Analysis on Iris Dataset
We will explore clustering techniques (K-Means, Hierarchical, Mean Shift) on the Iris dataset using different preprocessing techniques. We’ll evaluate the clusters using Silhouette Score, Calinski-Harabasz Index, and Davies-Bouldin Index.

In [22]:
# 📦 Step 1: Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from scipy.cluster.hierarchy import linkage, fcluster
import warnings
warnings.filterwarnings('ignore')

In [23]:
# 🌼 Step 2: Load the Iris dataset
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = iris.target
X.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [24]:
# ⚙️ Step 3: Define preprocessing functions
def normalize_data(data):
    return StandardScaler().fit_transform(data)

def transform_data(data):
    transformer = FunctionTransformer(np.log1p)
    return transformer.fit_transform(data)

def apply_pca(data, n=2):
    return PCA(n_components=n).fit_transform(data)

In [None]:
# 🔍 Step 4: Evaluation Function
def evaluate_clustering(X, labels):
    s = silhouette_score(X, labels)
    ch = calinski_harabasz_score(X, labels)
    db = davies_bouldin_score(X, labels)
    return s, ch, db

In [None]:
# 🔁 Step 5: Perform clustering with different preprocessing methods
results = []
methods = ["Raw", "Normalized", "Transformed", "PCA", "T+N", "T+N+PCA"]

for method in methods:
    if method == "Raw":
        data = X
    elif method == "Normalized":
        data = normalize_data(X)
    elif method == "Transformed":
        data = transform_data(X)
    elif method == "PCA":
        data = apply_pca(X)
    elif method == "T+N":
        data = normalize_data(transform_data(X))
    elif method == "T+N+PCA":
        data = apply_pca(normalize_data(transform_data(X)))

    for c in [3, 4, 5]:
        try:
            # KMeans
            km = KMeans(n_clusters=c, random_state=42).fit(data)
            s, ch, db = evaluate_clustering(data, km.labels_)
            results.append(["KMeans", method, c, s, ch, db])
        except:
            continue

results_df = pd.DataFrame(results, columns=["Algorithm", "Preprocessing", "Clusters", "Silhouette", "Calinski-Harabasz", "Davies-Bouldin"])
results_df.head()

Unnamed: 0,Algorithm,Preprocessing,Clusters,Silhouette,Calinski-Harabasz,Davies-Bouldin
0,KMeans,Raw,3,0.551192,561.593732,0.666039
1,KMeans,Raw,4,0.497643,529.529095,0.75414
2,KMeans,Raw,5,0.49308,495.243414,0.819384
3,KMeans,Normalized,3,0.479881,157.360153,0.789363
4,KMeans,Normalized,4,0.385045,206.680603,0.869779


In [None]:
# 📊 Step 6: Plot results
pivot_table = results_df.pivot_table(index=['Algorithm', 'Preprocessing'], 
                                     columns='Clusters', 
                                     values=['Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin'])
pivot_table

Unnamed: 0_level_0,Unnamed: 1_level_0,Calinski-Harabasz,Calinski-Harabasz,Calinski-Harabasz,Davies-Bouldin,Davies-Bouldin,Davies-Bouldin,Silhouette,Silhouette,Silhouette
Unnamed: 0_level_1,Clusters,3,4,5,3,4,5,3,4,5
Algorithm,Preprocessing,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
KMeans,Normalized,157.360153,206.680603,202.63585,0.789363,0.869779,0.943894,0.479881,0.385045,0.345033
KMeans,PCA,693.708433,719.123544,642.060666,0.564816,0.615069,0.7525,0.597676,0.557741,0.510041
KMeans,Raw,561.593732,529.529095,495.243414,0.666039,0.75414,0.819384,0.551192,0.497643,0.49308
KMeans,T+N,181.170654,237.557272,201.491051,0.700796,0.879726,1.152163,0.516079,0.389411,0.299018
KMeans,T+N+PCA,207.244374,301.714865,319.083335,0.599665,0.746197,0.769124,0.560168,0.450322,0.430887
KMeans,Transformed,502.48743,721.754299,715.167999,0.937543,1.028285,1.066897,0.571878,0.392813,0.32671


In [33]:
# --- Clustering Evaluation Setup ---
from sklearn.cluster import KMeans, AgglomerativeClustering, MeanShift
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.decomposition import PCA
import pandas as pd

# Ensure X is defined as your input data matrix before this block

def evaluate_clustering(X, labels):
    return {
        'Silhouette': silhouette_score(X, labels),
        'Calinski-Harabasz': calinski_harabasz_score(X, labels),
        'Davies-Bouldin': davies_bouldin_score(X, labels)
    }

def preprocess_data(X, method):
    scaler = StandardScaler()
    transformer = PowerTransformer()
    pca = PCA(n_components=2)
    
    if method == 'none':
        return X
    elif method == 'normalize':
        return scaler.fit_transform(X)
    elif method == 'transform':
        return transformer.fit_transform(X)
    elif method == 'pca':
        return pca.fit_transform(X)
    elif method == 't+n':
        return scaler.fit_transform(transformer.fit_transform(X))
    elif method == 't+n+pca':
        return pca.fit_transform(scaler.fit_transform(transformer.fit_transform(X)))

def run_all_evaluations(X):
    preprocessings = ['none', 'normalize', 'transform', 'pca', 't+n', 't+n+pca']
    cluster_range = [3, 4, 5]
    
    results = {
        'KMeans': [],
        'Hierarchical': [],
        'MeanShift': []
    }

    for method in preprocessings:
        X_proc = preprocess_data(X, method)

        for k in cluster_range:
            try:
                km = KMeans(n_clusters=k, random_state=0).fit(X_proc)
                km_metrics = evaluate_clustering(X_proc, km.labels_)
                results['KMeans'].append((method, k, *km_metrics.values()))
            except:
                results['KMeans'].append((method, k, 'NA', 'NA', 'NA'))
            
            try:
                hc = AgglomerativeClustering(n_clusters=k).fit(X_proc)
                hc_metrics = evaluate_clustering(X_proc, hc.labels_)
                results['Hierarchical'].append((method, k, *hc_metrics.values()))
            except:
                results['Hierarchical'].append((method, k, 'NA', 'NA', 'NA'))

        # Mean Shift (only once per method since it determines clusters automatically)
        try:
            ms = MeanShift().fit(X_proc)
            ms_metrics = evaluate_clustering(X_proc, ms.labels_)
            results['MeanShift'].append((method, len(set(ms.labels_)), *ms_metrics.values()))
        except:
            results['MeanShift'].append((method, 'NA', 'NA', 'NA', 'NA'))

    # Convert to DataFrames
    km_df = pd.DataFrame(results['KMeans'], columns=['Preprocessing', 'Clusters', 'Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin'])
    hc_df = pd.DataFrame(results['Hierarchical'], columns=['Preprocessing', 'Clusters', 'Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin'])
    ms_df = pd.DataFrame(results['MeanShift'], columns=['Preprocessing', 'Estimated Clusters', 'Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin'])

    display(km_df.pivot(index='Preprocessing', columns='Clusters', values='Silhouette'))
    display(hc_df.pivot(index='Preprocessing', columns='Clusters', values='Silhouette'))
    display(ms_df)

    return km_df, hc_df, ms_df

# Run the evaluations
kmeans_df, hierarchical_df, meanshift_df = run_all_evaluations(X)


Clusters,3,4,5
Preprocessing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
none,0.551192,0.498051,0.46095
normalize,0.459948,0.386941,0.345511
pca,0.597676,0.560957,0.545975
t+n,0.457197,0.426178,0.40063
t+n+pca,0.50763,0.441299,0.421849
transform,0.457197,0.426178,0.40063


Clusters,3,4,5
Preprocessing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
none,0.554324,0.488967,0.484383
normalize,0.446689,0.400636,0.330587
pca,0.598475,0.540977,0.548784
t+n,0.478043,0.427143,0.356892
t+n+pca,0.521362,0.483672,0.423184
transform,0.478043,0.427143,0.356892


Unnamed: 0,Preprocessing,Estimated Clusters,Silhouette,Calinski-Harabasz,Davies-Bouldin
0,none,2,0.685788,509.703427,0.388552
1,normalize,2,0.58175,251.349339,0.593313
2,transform,2,0.587311,259.895044,0.585648
3,pca,2,0.710311,565.734052,0.355059
4,t+n,2,0.587311,259.895044,0.585648
5,t+n+pca,3,0.528367,187.466727,0.568517
