<a href="https://colab.research.google.com/github/CoreTheGreat/HBPU-Machine-Learning-Course/blob/main/ML_Chapter4_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 第四章：聚类
湖北理工学院《机器学习》课程资料

作者：李辉楚吴

笔记内容概述: K均值聚类 k-Means


### 一维k-Means聚类的应用: 二值分割（Binary Segmentation）

Step 1: 加载数据

In [None]:
import cv2
import matplotlib.pyplot as plt

# Font sizes shared by the plots in this notebook
label_size = 18  # Axis label / title size
ticklabel_size = 14  # Tick label size

# Load the car number image (OpenCV returns the channels in BGR order)
carno_path = './Data/car_num.jpg'
image_carno = cv2.imread(carno_path)

# cv2.imread reports a missing or unreadable file by returning None instead of
# raising, so fail here with a clear message rather than inside cvtColor
if image_carno is None:
    raise FileNotFoundError(f'Cannot read image: {carno_path}')

# Convert from BGR to RGB so that matplotlib shows the true colors
image_carno = cv2.cvtColor(image_carno, cv2.COLOR_BGR2RGB)

# Display the image
fig, ax = plt.subplots(figsize=(7, 7))
img = ax.imshow(image_carno)
plt.axis('off')  # Hide axes
plt.savefig('carno_base.png', dpi=300)  # High dpi makes the saved figure sharper
plt.show()

Step 2: 灰度化

In [None]:
# Convert image_carno into a single-channel grey image.
# image_carno was already converted to RGB order right after loading, so the
# RGB->grey code is required; COLOR_BGR2GRAY would apply the red and blue
# luminance weights to the wrong channels.
image_carno_grey = cv2.cvtColor(image_carno, cv2.COLOR_RGB2GRAY)

# Display the image
fig, ax = plt.subplots(figsize=(7, 7))
img = ax.imshow(image_carno_grey, cmap='gray')
plt.axis('off')  # Hide axes
plt.savefig('carno_grey.png', dpi=300)  # High dpi makes the saved figure sharper
plt.show()

# Create a histogram of grey values (one bin per intensity level 0..255)
plt.figure(figsize=(8, 4))
plt.hist(image_carno_grey.ravel(), bins=256, range=(0, 256))
plt.ylim(0, 5000)  # Clip tall bars so the lower part of the histogram stays readable
plt.title('Histogram of Grey Values', fontsize=label_size)
plt.xlabel('Pixel Value', fontsize=label_size)
plt.ylabel('Frequency', fontsize=label_size)
plt.xticks(fontsize=ticklabel_size)
plt.yticks(fontsize=ticklabel_size)
plt.savefig('carno_grey_histogram.png', dpi=300, bbox_inches='tight')
plt.show()




Step 3: 根据灰度分为两类，生成掩码


In [None]:
import time
import numpy as np

# Reshape image_carno_grey into a 1-D vector of pixel intensities
x = image_carno_grey.reshape(-1)

# Using k-Means to separate background and foreground by pixels
k = 2

# Initialize cluster centers uniformly at random in the grey range [0, 255)
cluster_centers = np.random.rand(k) * 255
start_centers = cluster_centers.copy()  # Keep the initial centers for reference

# Initialize distance array (one row per pixel, one column per cluster)
distance = np.zeros((len(x), k))

# Iteration
start_time = time.time()
max_iter = 1000
for iter_id in range(max_iter):
    # Distance from every pixel to every center, computed in a single broadcast:
    # (n_pixels, 1) - (k,) -> (n_pixels, k)
    distance = np.abs(x[:, np.newaxis] - cluster_centers)

    # Assign to closest centroid
    cluster_idx = np.argmin(distance, axis=1)

    # Update cluster centers
    cluster_centers_prior = cluster_centers.copy()
    for i in range(k):
        members = x[cluster_idx == i]
        if members.size > 0:
            cluster_centers[i] = np.mean(members)
        else:
            # An empty cluster has no mean: np.mean would yield NaN, and that NaN
            # would corrupt every later distance and argmin. Restart the center
            # from a randomly chosen pixel value instead.
            cluster_centers[i] = x[np.random.randint(len(x))]

    # Check if cluster_centers are stable enough to stop training
    print(f'Iteration {iter_id}: Updated centers {cluster_centers}, Prior centers {cluster_centers_prior}')
    if np.array_equal(cluster_centers, cluster_centers_prior):
        break

end_time = time.time()
print(f'Stop after iteration {iter_id}, time consumption is {end_time-start_time}')

# Generate segmentation mask: 0 for the darker cluster, 1 for the brighter one
carno_mask_pixel = np.zeros_like(cluster_idx)
low_value_cluster = np.argmin(cluster_centers)
carno_mask_pixel[cluster_idx != low_value_cluster] = 1  # Set pixels with higher grey value to 1
carno_mask_pixel = carno_mask_pixel.reshape(image_carno_grey.shape)

# Display the mask
fig, ax = plt.subplots(figsize=(7, 7))
img = ax.imshow(carno_mask_pixel, cmap='gray')
ax.set_title(f'Final Centers: {cluster_centers}, Iteration: {iter_id}', fontsize=label_size)
plt.axis('off')  # Hide axes
plt.savefig('carno_mask_pixel_2.png', dpi=300)  # High dpi makes the saved figure sharper
plt.show()

### 多维k-Means聚类应用：图像压缩 (Image Compression)

Step 1: 加载数据

In [None]:
# Load the image that will be compressed (OpenCV returns BGR channel order)
bmwk_path = './Data/bmwk.png'
image_bmwk = cv2.imread(bmwk_path)

# cv2.imread reports a missing or unreadable file by returning None instead of
# raising, so fail here with a clear message rather than inside cvtColor
if image_bmwk is None:
    raise FileNotFoundError(f'Cannot read image: {bmwk_path}')

# Convert from BGR to RGB
image_bmwk_rgb = cv2.cvtColor(image_bmwk, cv2.COLOR_BGR2RGB)

x_r = image_bmwk_rgb[:, :, 0].reshape(-1)  # Store colors in red channel
x_g = image_bmwk_rgb[:, :, 1].reshape(-1)  # Store colors in green channel
x_b = image_bmwk_rgb[:, :, 2].reshape(-1)  # Store colors in blue channel

# Display the image with no margin
plt.figure(figsize=(image_bmwk_rgb.shape[1]/100, image_bmwk_rgb.shape[0]/100))  # Convert pixels to inches
plt.imshow(image_bmwk_rgb)
plt.axis('off')  # Hide axes
plt.subplots_adjust(left=0, right=1, top=1, bottom=0)  # Remove margins
plt.savefig('bmwk.png', format='png', bbox_inches='tight', pad_inches=0)
plt.show()

# Compute image storing buffer
# NOTE: counts one byte per channel value, i.e. assumes 8-bit channels
image_buffer = x_r.size * 3  # 3 channels (R, G, B) for each pixel
print(f"Image buffer size: {image_buffer} bytes")
# Calculate image size in megabytes (MB)
image_size_mb = image_buffer / (1024 * 1024)
print(f"Image size: {image_size_mb:.2f} MB")

# Calculate image dimensions (array shape is rows x columns, i.e. height x width)
height, width = image_bmwk_rgb.shape[:2]
print(f"Image dimensions: {width}x{height} pixels")

# Calculate total number of pixels
total_pixels = height * width
print(f"Total number of pixels: {total_pixels}")

# Calculate bits per pixel
bits_per_pixel = (image_buffer * 8) / total_pixels
print(f"Bits per pixel: {bits_per_pixel:.2f}")

Step 2: 使用k-Means进行聚类，观察不同k值的聚类结果

In [None]:
from sklearn.cluster import KMeans

def KMeansImage(img, k, random_state=None):
    """Compress an image by replacing every pixel with its k-Means center color.

    Args:
        img: Image array of shape (height, width, channels). Values are
            assumed to be on a 0-255 scale, since the centers are divided
            by 255 before being returned.
        k: Number of clusters, i.e. number of distinct colors kept.
        random_state: Optional seed forwarded to KMeans to make the result
            reproducible. The default (None) keeps the original unseeded
            behavior.

    Returns:
        A tuple (img_comp, center_colors) where img_comp has the same shape
        as img with each pixel replaced by its cluster center divided by 255,
        and center_colors is the (k, channels) array of those scaled centers.
    """
    # Array shape is (rows, columns, channels) = (height, width, channels)
    height, width, channels = img.shape

    # Flatten the spatial dimensions: one row per pixel, one column per channel
    x = np.reshape(img, (height * width, channels))

    # Train the k-Means model and get the label of each pixel in one call
    mdl_km = KMeans(n_clusters=k, n_init='auto', random_state=random_state)
    labels = mdl_km.fit_predict(x).reshape(height, width)

    # Get centers, scaled by 1/255 so matplotlib can display them as float colors
    center_colors = mdl_km.cluster_centers_ / 255.0

    # Look up each pixel's center color with a single fancy-index operation
    # instead of a per-pixel Python loop
    img_comp = center_colors[labels]
    return img_comp, center_colors

# Numbers of colors (clusters) to try, from very coarse to fairly fine
claster_num = [2, 4, 8, 16, 32, 64]
for k in claster_num:
    compressed = KMeansImage(image_bmwk_rgb, k)
    img_comp, center_colors = compressed

    # Show the palette: the k center colors laid out as a one-row image
    fig, ax = plt.subplots(figsize=(16, 1))
    ax.imshow([center_colors])
    ax.axis('off')
    # plt.savefig(f'bmwk_center_{k}.png', format='png', compress_level=9)
    plt.show()

    # Show the compressed image at the original pixel size (100 pixels per inch)
    size_inches = (image_bmwk_rgb.shape[1]/100, image_bmwk_rgb.shape[0]/100)
    plt.figure(figsize=size_inches)
    plt.imshow(img_comp)
    plt.axis('off')
    plt.subplots_adjust(left=0, right=1, top=1, bottom=0)  # No margins around the image
    plt.savefig(f'bmwk_comp_{k}.png', format='png', bbox_inches='tight', pad_inches=0)
    plt.show()

### 确定k值——手肘法

Step 1: 生成数据

In [None]:
from sklearn.datasets import make_blobs

# Generate 500 two-dimensional samples drawn from six blobs (fixed seed)
X_mb, y_mb = make_blobs(n_samples=500, n_features=2, centers=6, random_state=42)

# Scatter the samples, colored by the blob label returned by make_blobs
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(X_mb[:, 0], X_mb[:, 1], c=y_mb, s=100, marker="o", edgecolor="k")
ax.axis('off')
# plt.savefig(f'make_blobs_base.png', dpi=300)
plt.show()

In [None]:
# Candidate cluster counts: 2, 3, ..., 19
k_list = np.arange(2, 20, 1)
sse_list = np.zeros(len(k_list))

# Fit one model per candidate k and record its SSE (KMeans.inertia_)
mdl_km_list = []
for i, n_clusters in enumerate(k_list):
    mdl_km = KMeans(n_clusters=n_clusters, n_init='auto').fit(X_mb)
    mdl_km_list.append(mdl_km)
    sse_list[i] = mdl_km.inertia_

# Draw the elbow curve: SSE against the number of clusters
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(k_list, sse_list, color='tab:blue', linestyle='-', marker='o')
ax.set_xticks(k_list)
ax.set_xlabel('Number of clusters (k)', fontsize=label_size)
ax.set_ylabel('SSE', fontsize=label_size)
ax.tick_params(axis='both', which='major', labelsize=ticklabel_size)  # Tick label size
# plt.savefig(f'make_blobs_sse.png', dpi=300)
plt.show()


Step 2: 观察不同k值的聚类结果

In [None]:
# Display the clustering result for each k in k_disp
k_disp = [2, 3, 4, 5, 6, 7]

# Look models up by their k instead of a hard-coded list offset, so the plots
# stay correct if the range stored in k_list changes; an unfitted k raises KeyError
mdl_km_by_k = {int(n_clusters): mdl for n_clusters, mdl in zip(k_list, mdl_km_list)}

for k in k_disp:
    mdl_km = mdl_km_by_k[k]

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.scatter(X_mb[:, 0], X_mb[:, 1], marker="o", c=mdl_km.labels_, s=10**2, edgecolor="k")
    ax.set_title(f'Number of clusters (k): {k}', fontsize=label_size)
    plt.axis('off')

    plt.savefig(f'make_blobs_{k}.png', dpi=300)
    plt.show()

在Moon data上进行聚类

In [None]:
from sklearn.datasets import make_moons

# Generate synthetic two-moons data (400 samples, small noise, fixed seed)
X_mm, y_mm = make_moons(n_samples=400, noise=0.05, random_state=42)

# Min-max scale every feature of X_mm into the range [0, 1]
feature_min = X_mm.min(axis=0)
feature_max = X_mm.max(axis=0)
X_mm = (X_mm - feature_min) / (feature_max - feature_min)

# Scatter the samples, colored by the moon label returned by make_moons
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(X_mm[:, 0], X_mm[:, 1], c=y_mm, s=100, marker="o", edgecolor="k")
ax.axis('off')
# plt.savefig(f'make_moon_base.png', dpi=300)
plt.show()

用手肘法确定k值


In [None]:
# Candidate cluster counts: 2, 3, ..., 19
k_list = np.arange(2, 20, 1)
sse_list = np.zeros(len(k_list))

# Fit one model per candidate k on the moon data and record its SSE (inertia_)
mdl_km_list = []
for i, n_clusters in enumerate(k_list):
    mdl_km = KMeans(n_clusters=n_clusters, n_init='auto').fit(X_mm)
    mdl_km_list.append(mdl_km)
    sse_list[i] = mdl_km.inertia_

# Draw the elbow curve: SSE against the number of clusters
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(k_list, sse_list, color='tab:blue', linestyle='-', marker='o')
ax.set_xticks(k_list)
ax.set_xlabel('Number of clusters (k)', fontsize=label_size)
ax.set_ylabel('SSE', fontsize=label_size)
ax.tick_params(axis='both', which='major', labelsize=ticklabel_size)  # Tick label size
# plt.savefig(f'make_moon_sse.png', dpi=300)
plt.show()

观察聚类结果，思考K-means方法存在的问题

In [None]:
# Display the clustering result for each k in k_disp
k_disp = [2, 11, 19]

# Look models up by their k instead of a hard-coded list offset, so the plots
# stay correct if the range stored in k_list changes; an unfitted k raises KeyError
mdl_km_by_k = {int(n_clusters): mdl for n_clusters, mdl in zip(k_list, mdl_km_list)}

for k in k_disp:
    mdl_km = mdl_km_by_k[k]

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.scatter(X_mm[:, 0], X_mm[:, 1], marker="o", c=mdl_km.labels_, s=10**2, edgecolor="k")
    ax.set_title(f'Number of clusters (k): {k}', fontsize=label_size)
    plt.axis('off')

    # plt.savefig(f'make_moon_{k}.png', dpi=300)
    plt.show()