# In [None]:
# wine_kmeans_clustering.py

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score


# In [None]:
# ---------------------------------------------------------------------------
# Wine clustering walkthrough: load -> standardize -> KMeans -> PCA (2-D)
# -> plot predicted clusters and reference labels -> score with ARI.
# ---------------------------------------------------------------------------

# Step 1: load the Wine dataset (features plus the reference class labels).
data = load_wine()
X, y_true = data.data, data.target  # X: (178, 13); y_true: labels 0, 1, 2

# Step 2: standardize the features before clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 3: run KMeans with one cluster per reference class in Wine.
k = 3
kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto').fit(X_scaled)
clusters = kmeans.labels_

# Step 4: project the standardized data onto two principal components so the
# samples can be drawn on a plane.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)


def _show_projection(points, labels, title, cmap):
    """Scatter-plot a 2-D projection colored by ``labels`` and display it.

    Args:
        points: 2-D array whose first two columns are the PC1/PC2 coordinates.
        labels: One value per row of ``points``; used as the point colors.
        title: Text shown above the axes.
        cmap: Name of the matplotlib colormap applied to ``labels``.
    """
    pc1, pc2 = points[:, 0], points[:, 1]
    plt.figure(figsize=(8, 6))
    plt.scatter(pc1, pc2, c=labels, cmap=cmap, s=50)
    plt.title(title)
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.grid(True)
    plt.tight_layout()
    plt.show()


# Step 5: visualize the cluster assignments found by KMeans.
_show_projection(
    X_pca,
    clusters,
    f"KMeans Clustering trên Wine (k = {k})",
    'tab10',
)

# Step 6: visualize the reference labels on the same projection.
_show_projection(X_pca, y_true, "Ground Truth Labels (Wine)", 'Set1')

# Step 7: compare the clustering with the reference labels using the
# Adjusted Rand Index.
ari = adjusted_rand_score(y_true, clusters)
print(f"Adjusted Rand Index (ARI): {ari:.4f}")
