<a href="https://colab.research.google.com/github/Bhawana102/Clustering/blob/main/Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
!pip install pycaret &> /dev/null


In [None]:
import numpy as np
from pycaret.datasets import get_data
from pycaret.clustering import *
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# Load the 'wholesale' sample dataset through PyCaret's dataset helper.
data = get_data('wholesale')

# Preprocessing variants to compare; '+' joins several steps into one variant.
preprocessing_methods = [
    'None',
    'Normalize',
    'PCA',
    'Transform',
    'PCA+Transform',
    'PCA+Transform+Normalize',
]

# Clustering algorithms to benchmark (PyCaret model identifiers).
clustering_techniques = [
    'kmeans',
    'hclust',
    'meanshift',
]

# Cluster counts tried for each algorithm / preprocessing combination.
num_clusters = list(range(3, 6))

# Maps each clustering technique to its matrix of averaged scores.
results_matrices = dict()

In [None]:
# Fixed PyCaret session seed so repeated runs produce the same clusters/scores.
RANDOM_SEED = 42


def _build_setup_kwargs(prep_method):
    """Translate a preprocessing label into keyword arguments for ``setup``.

    The label is a '+'-separated list of steps (e.g. 'PCA+Transform'). Every
    step named in the label is enabled, so combined labels really apply all of
    their steps instead of only the first one that matches.

    Parameters
    ----------
    prep_method : str
        One entry of ``preprocessing_methods`` ('None' enables nothing).

    Returns
    -------
    dict
        Keyword arguments to pass to ``setup`` after the data argument.
    """
    steps = set(prep_method.split('+'))
    kwargs = {'verbose': False, 'session_id': RANDOM_SEED}
    if 'Normalize' in steps:
        kwargs['normalize'] = True
    if 'PCA' in steps:
        kwargs.update(pca=True, pca_components=2)
    if 'Transform' in steps:
        kwargs.update(transformation=True, transformation_method='yeo-johnson')
    return kwargs


def _get_preprocessed_features():
    """Return the feature matrix produced by the current PyCaret setup.

    NOTE(review): PyCaret 3.x is understood to expose the transformed features
    as 'X_transformed' while 'X' holds the raw features; older releases only
    know 'X'. Confirm against the installed PyCaret version.
    """
    try:
        return get_config('X_transformed')
    except (KeyError, ValueError):
        return get_config('X')


for cluster_method in clustering_techniques:
    # Score matrix for this technique: rows are (silhouette, Calinski-Harabasz,
    # Davies-Bouldin), columns follow the order of preprocessing_methods.
    results_matrix = np.zeros((3, len(preprocessing_methods)), dtype=float)

    # Iterate over each preprocessing method
    for i, prep_method in enumerate(preprocessing_methods):
        # Configure the PyCaret experiment with every step named in the label
        exp_clf_setup = setup(data, **_build_setup_kwargs(prep_method))
        preprocessed_data = _get_preprocessed_features()

        # Scores collected across the different cluster counts
        silhouette_scores = []
        calinski_scores = []
        davies_scores = []

        # Iterate over each number of clusters
        for n_clusters in num_clusters:
            # Initialize the clustering model
            if cluster_method == 'kmeans':
                model = create_model('kmeans', num_clusters=n_clusters)
                model_name = f'{cluster_method}_{prep_method}_k{n_clusters}_model'
            elif cluster_method == 'hclust':
                model = create_model('hclust', linkage='ward', num_clusters=n_clusters)
                model_name = f'{cluster_method}_{prep_method}_k{n_clusters}_model'
            elif cluster_method == 'meanshift':
                # Mean shift takes no cluster count here, so the same model
                # (and the same file name) is produced for every n_clusters.
                model = create_model('meanshift')
                model_name = f'{cluster_method}_{prep_method}_model'
            else:
                raise ValueError(f'Unsupported clustering technique: {cluster_method!r}')

            # Fit on the exact matrix that is scored below so labels and rows align
            model.fit(preprocessed_data)
            # The cluster count is part of the file name so that models for
            # different k values no longer overwrite one another.
            save_model(model, model_name)

            # Get cluster labels
            labels = model.labels_

            # The three metrics are only defined for 2 <= clusters < samples;
            # record NaN instead of aborting the whole benchmark otherwise.
            n_found = len(set(labels))
            if not 2 <= n_found < len(labels):
                silhouette_scores.append(np.nan)
                calinski_scores.append(np.nan)
                davies_scores.append(np.nan)
                continue

            # Calculate silhouette score
            silhouette = silhouette_score(preprocessed_data, labels)
            silhouette_scores.append(silhouette)

            # Calculate calinski harabasz score
            calinski = calinski_harabasz_score(preprocessed_data, labels)
            calinski_scores.append(calinski)

            # Calculate davies bouldin score
            davies = davies_bouldin_score(preprocessed_data, labels)
            davies_scores.append(davies)

        # Store the scores averaged over all cluster counts
        results_matrix[0, i] = np.mean(silhouette_scores)
        results_matrix[1, i] = np.mean(calinski_scores)
        results_matrix[2, i] = np.mean(davies_scores)

    # Store the results matrix in the dictionary
    results_matrices[cluster_method] = results_matrix

# Write one '<technique>_results.csv' per clustering technique; the header row
# lists the preprocessing methods and each following row holds one metric.
csv_header = ','.join(preprocessing_methods)
for algorithm, matrix in results_matrices.items():
    np.savetxt(
        f'{algorithm}_results.csv',
        matrix,
        delimiter=',',
        header=csv_header,
        comments='',
    )