# In [None]:
# Lab Title: Clustering Analysis using K-Means and K-Medoids

# Step 1: Load and Prepare the Dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, adjusted_rand_score

# Fetch the Wine dataset and pull out the pieces used further down:
# the feature matrix, the class labels and the column names.
wine = load_wine()
X, y = wine.data, wine.target
feature_names = wine.feature_names

# Quick look at what was loaded: columns, label counts and matrix size.
for caption, value in (
    ("Feature Names:", feature_names),
    ("Class Distribution:", np.bincount(y)),
    ("Dataset shape:", X.shape),
):
    print(caption, value)

# Standardize each feature (zero mean, unit variance) so that no single
# column dominates the distance computations in the clustering steps.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Step 2: K-Means Clustering
from sklearn.cluster import KMeans

k = 3  # Number of clusters requested from the clustering algorithms

# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# "auto" (a single run with k-means++ init) in version 1.4, and versions
# 1.2/1.3 emit a FutureWarning when it is left unset. Fixing it keeps the
# reported scores the same regardless of the installed version.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
kmeans_labels = kmeans.fit_predict(X_scaled)

# Performance Metrics
# Silhouette is computed from the scaled features and the predicted labels
# only; ARI compares the predicted labels against the true classes in y.
kmeans_silhouette = silhouette_score(X_scaled, kmeans_labels)
kmeans_ari = adjusted_rand_score(y, kmeans_labels)

print(f"K-Means Silhouette Score: {kmeans_silhouette:.4f}")
print(f"K-Means Adjusted Rand Index (ARI): {kmeans_ari:.4f}")

# Step 3: K-Medoids Clustering
from sklearn_extra.cluster import KMedoids

# Fit K-Medoids on the standardized features with the same cluster count
# and seed as above, then read the per-sample assignments off the model.
kmedoids = KMedoids(n_clusters=k, random_state=42)
kmedoids.fit(X_scaled)
kmedoids_labels = kmedoids.labels_

# Performance Metrics: silhouette uses the features and predicted labels,
# ARI compares the predicted labels against the true classes in y.
kmedoids_silhouette, kmedoids_ari = (
    silhouette_score(X_scaled, kmedoids_labels),
    adjusted_rand_score(y, kmedoids_labels),
)

print(f"K-Medoids Silhouette Score: {kmedoids_silhouette:.4f}")
print(f"K-Medoids Adjusted Rand Index (ARI): {kmedoids_ari:.4f}")

# Step 4: Visualize and Compare Results
from sklearn.decomposition import PCA

# Project the standardized data onto its first two principal components
# so the cluster assignments can be drawn on a flat scatter plot.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(12, 5))

# One entry per subplot: (title, point labels, cluster centers, legend text).
panels = (
    ("K-Means Clusters", kmeans_labels, kmeans.cluster_centers_, 'Centroids'),
    ("K-Medoids Clusters", kmedoids_labels, kmedoids.cluster_centers_, 'Medoids'),
)

for position, (heading, assignments, centers, center_name) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)

    # Samples, coloured by the cluster they were assigned to
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=assignments, cmap='viridis', s=50)

    # Cluster centers, mapped into the same 2D PCA space as the samples
    projected = pca.transform(centers)
    plt.scatter(projected[:, 0], projected[:, 1],
                c='red', marker='X', s=200, label=center_name)

    plt.title(heading)
    plt.xlabel("PCA Component 1")
    plt.ylabel("PCA Component 2")
    plt.legend()

plt.tight_layout()
plt.show()

# Analysis (Markdown cell content suggestion)
"""
Analysis:
- Both K-Means and K-Medoids were able to separate the wine classes reasonably well.
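- K-Medoids is more robust to outliers because each cluster center is an actual data point (a medoid) rather than a mean; compare the silhouette scores printed above to see whether this helps on this dataset.
- Cluster shapes: K-Means assumes roughly spherical clusters, while K-Medoids accepts arbitrary distance metrics and can therefore adapt better to irregular cluster shapes when paired with a suitable metric.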
- K-Means is computationally faster for large datasets, whereas K-Medoids can handle noise/outliers better.
"""