<a href="https://colab.research.google.com/github/EmptyAd/Clustering-Assignment/blob/main/Clustering_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, MeanShift
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

In [2]:
import pandas as pd

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
X = pd.read_csv(url, sep=';')

X = X.drop(columns='quality')

print(X.columns)


Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol'],
      dtype='object')


In [3]:
def normalize(data):
    return MinMaxScaler().fit_transform(data)

def standardize(data):
    return StandardScaler().fit_transform(data)

def transform_and_normalize(data):
    data_std = standardize(data)
    return normalize(data_std)

def apply_pca(data, n_components=2):
    return PCA(n_components=n_components).fit_transform(data)


In [4]:
def evaluate_clustering(X, method_name, cluster_func, c_values=[3, 4, 5]):
    results = []

    for c in c_values:
        try:
            if method_name == 'MeanShift':
                model = cluster_func()
                labels = model.fit_predict(X)
            else:
                model = cluster_func(n_clusters=c)
                labels = model.fit_predict(X)

            sil = silhouette_score(X, labels)
            ch = calinski_harabasz_score(X, labels)
            db = davies_bouldin_score(X, labels)
            results.append((sil, ch, db))
        except Exception:
            results.append(('NA', 'NA', 'NA'))

    return results

In [5]:
preprocessing_methods = {
    "No Processing": X,
    "Normalization": normalize(X),
    "Transform": standardize(X),
    "PCA": apply_pca(X),
    "T+N": transform_and_normalize(X),
    "T+N+PCA": apply_pca(transform_and_normalize(X)),
}

c_values = [3, 4, 5]
columns = ['c=3', 'c=4', 'c=5']
index = pd.MultiIndex.from_product([['Silhouette', 'Calinski-Harabasz', 'Davies-Bouldins'], columns])

def run_all(method_name, cluster_func):
    all_results = []
    for method, data in preprocessing_methods.items():
        metrics = evaluate_clustering(data, method_name, cluster_func, c_values)
        # Flatten results row-wise
        row = []
        for result in metrics:
            row.extend(result)
        all_results.append(pd.Series(row, index=index, name=method))
    return pd.DataFrame(all_results)


In [6]:
# KMeans
kmeans_table = run_all("KMeans", lambda n_clusters: KMeans(n_clusters=n_clusters, n_init=10, random_state=42))

# Hierarchical Clustering
hierarchical_table = run_all("Hierarchical", lambda n_clusters: AgglomerativeClustering(n_clusters=n_clusters))

# Mean Shift Clustering
meanshift_table = run_all("MeanShift", lambda: MeanShift())

In [14]:
print("=== K-Means Clustering ===")
print(kmeans_table)

print("\n=== Hierarchical Clustering ===")
print(hierarchical_table)

print("\n=== K-Means Shift Clustering ===")
print(meanshift_table)

# Optionally export to CSV
kmeans_table.to_csv("kmeans_results.csv")
hierarchical_table.to_csv("hierarchical_results.csv")
meanshift_table.to_csv("meanshift_results.csv")


=== K-Means Clustering ===
              Silhouette                        Calinski-Harabasz  \
                     c=3          c=4       c=5               c=3   
No Processing   0.521346  3065.965093  0.667043          0.485921   
Normalization   0.210986   414.756084  1.709923          0.207707   
Transform       0.189204   313.326226  1.767265          0.171636   
PCA             0.531346  3142.429076  0.650960          0.496786   
T+N             0.210986   414.756084  1.709923          0.207707   
T+N+PCA         0.398742  1215.762932  0.952948          0.386057   

                                     Davies-Bouldins                         
                       c=4       c=5             c=3          c=4       c=5  
No Processing  3062.435662  0.714334        0.445863  3049.996421  0.754584  
Normalization   386.037629  1.540367        0.198950   357.401322  1.502546  
Transform       284.405220  1.657378        0.190139   291.688556  1.462505  
PCA            3163.140540  0.

In [20]:
def print_formatted_results(title, df):
    print(f"\n=== {title} ===\n")

    metrics = ['Silhouette', 'Calinski-Harabasz', 'Davies-Bouldins']
    cluster_counts = ['c=3', 'c=4', 'c=5']

    for metric in metrics:
        print(metric)
        print(f"{'Parameters':<20} {'c=3':<12} {'c=4':<12} {'c=5':<12}")
        print("-" * 60)
        for index, row in df.iterrows():
            values = []
            for c in cluster_counts:
                val = row.get((metric, c), 'NA')
                if isinstance(val, (int, float)):
                    values.append(f"{val:<12.3f}")
                else:
                    values.append(f"{str(val):<12}")
            print(f"{index:<20} {values[0]} {values[1]} {values[2]}")
        print()


# Example usage:
print_formatted_results("K-Means Clustering", kmeans_table)
print_formatted_results("Hierarchical Clustering", hierarchical_table)
print_formatted_results("K-Means Shift Clustering", meanshift_table)


=== K-Means Clustering ===

Silhouette
Parameters           c=3          c=4          c=5         
------------------------------------------------------------
No Processing        0.521        3065.965     0.667       
Normalization        0.211        414.756      1.710       
Transform            0.189        313.326      1.767       
PCA                  0.531        3142.429     0.651       
T+N                  0.211        414.756      1.710       
T+N+PCA              0.399        1215.763     0.953       

Calinski-Harabasz
Parameters           c=3          c=4          c=5         
------------------------------------------------------------
No Processing        0.486        3062.436     0.714       
Normalization        0.208        386.038      1.540       
Transform            0.172        284.405      1.657       
PCA                  0.497        3163.141     0.699       
T+N                  0.208        386.038      1.540       
T+N+PCA              0.386        1310.