In [1]:
!pip install pycaret
from pycaret.datasets import get_data
from pycaret.clustering import *
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

Collecting pycaret
  Downloading pycaret-3.2.0-py3-none-any.whl (484 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.7/484.7 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting category-encoders>=2.4.0 (from pycaret)
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Collecting deprecation>=2.1.0 (from pycaret)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl (11 kB)
Collecting kaleido>=0.2.1 (from pycaret)
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Collecting matplotlib<=3.6,>=3.3.0 (from pycaret)
  Downloading matplotlib-3.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/1

In [2]:
data = get_data('iris')

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
data = data.drop(columns=['species'])

In [4]:
clustering_setup = setup(data)

Unnamed: 0,Description,Value
0,Session id,1291
1,Original data shape,"(150, 4)"
2,Transformed data shape,"(150, 4)"
3,Numeric features,4
4,Preprocess,True
5,Imputation type,simple
6,Numeric imputation,mean
7,Categorical imputation,mode
8,CPU Jobs,-1
9,Use GPU,False


In [5]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
preprocessing_techniques = {
    'None': None,
    'normalize': StandardScaler(),
    'pca': PCA(),
    'transform': None,
    'scale': MinMaxScaler()
}

In [6]:
num_clusters = [3, 4, 5]

In [7]:
clustering_algorithms = ['kmeans', 'hclust', 'dbscan']

In [8]:
results_dict = {}

In [9]:
for algorithm in clustering_algorithms:
    algorithm_results = pd.DataFrame(index=['Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin'])
    for technique, preprocess_method in preprocessing_techniques.items():

        if preprocess_method is not None:
            preprocessed_data = preprocess_method.fit_transform(data)
        else:
            preprocessed_data = data


        model = create_model(algorithm)


        model.fit(preprocessed_data)


        if algorithm in ['dbscan', 'hclust']:
            labels = model.labels_
        else:
            labels = model.predict(preprocessed_data)


        if len(set(labels)) <= 1:
            continue


        silhouette = silhouette_score(preprocessed_data, labels)
        calinski_harabasz = calinski_harabasz_score(preprocessed_data, labels)
        davies_bouldin = davies_bouldin_score(preprocessed_data, labels)


        col_name = f'{technique}_c='
        for n in num_clusters:
            col_name_n = col_name + str(n)
            algorithm_results[col_name_n] = [silhouette, calinski_harabasz, davies_bouldin]


    results_dict[algorithm] = algorithm_results

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4978,529.3983,0.7806,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4978,529.3983,0.7806,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4978,529.3983,0.7806,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4978,529.3983,0.7806,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4978,529.3983,0.7806,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4887,513.7721,0.7956,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4887,513.7721,0.7956,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4887,513.7721,0.7956,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4887,513.7721,0.7956,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4887,513.7721,0.7956,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4858,219.8703,7.2228,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4858,219.8703,7.2228,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4858,219.8703,7.2228,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4858,219.8703,7.2228,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4858,219.8703,7.2228,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
for algorithm, results in results_dict.items():
    results.to_csv(f'{algorithm}_results.csv')

In [None]:
hclust_result=pd.read_csv('/kaggle/working/hclust_results.csv')

In [None]:
hclust_result

In [None]:
dbscan_result=pd.read_csv('/kaggle/working/dbscan_results.csv')

In [None]:
dbscan_result

In [None]:
kmeans_result=pd.read_csv('/kaggle/working/kmeans_results.csv')

In [None]:
kmeans_result

In [None]:
import matplotlib.pyplot as plt

evaluation_metrics = ['Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin']

for algorithm, results in results_dict.items():
    plt.figure(figsize=(10, 6))
    plt.title(f'{algorithm} Evaluation Metrics')
    for metric in evaluation_metrics:
        plt.bar(results.columns, results.loc[metric], label=metric)
    plt.xlabel('Number of Clusters')
    plt.ylabel('Score')
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.show()