In [1]:
pip install openml

Note: you may need to restart the kernel to use updated packages.


In [None]:
# clustering_analysis.py

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def load_and_normalize_data(filepath, features):
    """Read a CSV file and standardize the selected feature columns.

    Args:
        filepath: Location of the CSV file, passed straight to
            ``pandas.read_csv``.
        features: Names of the columns to standardize.

    Returns:
        A ``(raw, scaled)`` tuple. ``raw`` is the DataFrame exactly as read
        from ``filepath``. ``scaled`` is a new DataFrame that holds only the
        ``features`` columns after ``StandardScaler.fit_transform``; it is
        built from the transformed array, so it carries a default integer
        index.
    """
    raw = pd.read_csv(filepath)
    scaled_values = StandardScaler().fit_transform(raw[features])
    scaled = pd.DataFrame(scaled_values, columns=features)
    return raw, scaled

def apply_pca(df_norm, n_components=2):
    """Project the data onto its leading principal components.

    Args:
        df_norm: Feature matrix to decompose; handed to ``PCA.fit_transform``.
        n_components: Number of principal components to keep.

    Returns:
        A DataFrame with one column per component, named ``PC1``, ``PC2``, ...
        up to ``PC<n_components>``.
    """
    # Fit first, then build the labels, mirroring the evaluation order of a
    # single DataFrame(...) call.
    projected = PCA(n_components=n_components).fit_transform(df_norm)
    component_names = [f"PC{number + 1}" for number in range(n_components)]
    return pd.DataFrame(projected, columns=component_names)

def compare_kmeans_clusters(df_pca, cluster_options=(2, 3, 4), n_iterations=10,
                            ellipse_color="yellow", ellipse_alpha=0.3,
                            suptitle="BankNotes Analysis - K-Means with Different Cluster Sizes",
                            save_path="images/visual_quantitative_comparison.png"):
    """Run K-Means repeatedly for several cluster counts and plot the results.

    One row of panels is drawn per entry of ``cluster_options`` and one column
    per iteration. Each iteration ``i`` fits ``KMeans`` with ``random_state=i``
    on all columns of ``df_pca`` and plots the clusters in the ``PC1``/``PC2``
    plane. Every cluster gets an ellipse centred on its centroid whose full
    width and height are two standard deviations along ``PC1`` and ``PC2``.
    The figure is saved to ``save_path``, shown, and the mean silhouette
    score per cluster count is printed.

    Args:
        df_pca: DataFrame containing at least the columns ``PC1`` and ``PC2``.
        cluster_options: Iterable of cluster counts (K) to compare.
        n_iterations: Number of differently seeded K-Means runs per K.
        ellipse_color: Face colour of the spread ellipses.
        ellipse_alpha: Opacity of the spread ellipses.
        suptitle: Title displayed above the whole figure.
        save_path: Output image path; missing parent directories are created.

    Returns:
        Dict mapping each K to the list of silhouette scores of its runs.

    Raises:
        ValueError: If ``cluster_options`` is empty or ``n_iterations`` is
            smaller than 1 (an empty subplot grid cannot be created).
    """
    # Materialize once so any iterable (tuple, list, generator) can be
    # traversed several times below.
    cluster_options = list(cluster_options)
    if not cluster_options:
        raise ValueError("cluster_options must contain at least one cluster count")
    if n_iterations < 1:
        raise ValueError("n_iterations must be at least 1")

    # The first four entries match the historical colours; the remaining ones
    # keep K > 4 from raising IndexError. Colours repeat beyond ten clusters.
    palette = ["red", "blue", "green", "purple", "orange",
               "brown", "pink", "gray", "olive", "cyan"]

    # K-Means is fitted on every column of df_pca, so centroids may have more
    # than two coordinates; pick the two that are actually plotted.
    plot_dims = [df_pca.columns.get_loc("PC1"), df_pca.columns.get_loc("PC2")]

    silhouette_scores = {k: [] for k in cluster_options}

    # squeeze=False always yields a 2-D array of axes, even for a 1x1, 1xN or
    # Nx1 grid, so indexing by [row, column] is safe.
    fig, axes = plt.subplots(len(cluster_options), n_iterations,
                             figsize=(15, len(cluster_options) * 3),
                             squeeze=False)
    fig.suptitle(suptitle, fontsize=16, fontweight="bold")

    for row, n_clusters in enumerate(cluster_options):
        for i in range(n_iterations):
            kmeans = KMeans(n_clusters=n_clusters, random_state=i, n_init=10)
            labels = kmeans.fit_predict(df_pca)
            centers = kmeans.cluster_centers_
            sil_score = silhouette_score(df_pca, labels)
            silhouette_scores[n_clusters].append(sil_score)

            panel = axes[row, i]
            members = [df_pca[labels == j] for j in range(n_clusters)]

            # Draw all point clouds before any ellipse so the artist order
            # (and therefore the layering) stays as it always was.
            for j, cluster_points in enumerate(members):
                panel.scatter(cluster_points["PC1"], cluster_points["PC2"],
                              c=palette[j % len(palette)], alpha=0.5)

            for j, cluster_points in enumerate(members):
                std_x = np.std(cluster_points["PC1"])
                std_y = np.std(cluster_points["PC2"])
                ellipse = Ellipse(xy=centers[j][plot_dims],
                                  width=2 * std_x, height=2 * std_y,
                                  edgecolor="gold", facecolor=ellipse_color,
                                  alpha=ellipse_alpha)
                panel.add_patch(ellipse)

            panel.set_title(f"K={n_clusters}, Iter {i+1}\n(Sil={sil_score:.2f})", fontsize=10)
            panel.grid(True, linestyle="--", alpha=0.6)
            panel.set_xticks([])
            panel.set_yticks([])

    # Leave the top 5% of the canvas free for the suptitle.
    fig.tight_layout(rect=[0, 0, 1, 0.95])
    # parents=True lets nested output directories be created in one go.
    Path(save_path).parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(save_path, dpi=300, bbox_inches="tight")
    plt.show()

    for k in cluster_options:
        print(f"Average Silhouette Score for K={k}: {np.mean(silhouette_scores[k]):.2f}")

    return silhouette_scores

# Main execution block: load the data, reduce it with PCA, and compare K-Means
# runs for several cluster counts. Runs only when executed as a script.
if __name__ == "__main__":
    # Columns of the CSV that are standardized and used for clustering.
    features = ["V1", "V2", "V3", "V4"]
    # Input CSV and output image locations, relative to the working directory.
    data_path = "data/banknote_data.csv"
    image_output = "images/visual_quantitative_comparison.png"

    # df keeps the data as read from disk; df_norm holds the standardized features.
    df, df_norm = load_and_normalize_data(data_path, features)
    # Uses apply_pca's default of two components (PC1, PC2).
    df_pca = apply_pca(df_norm)
    # n_iterations and the styling options are left at their defaults; the
    # returned silhouette scores are not kept.
    compare_kmeans_clusters(df_pca, cluster_options=[2, 3, 4], save_path=image_output)