In [7]:
!pip install scikit-learn
!pip install -U scikit-fuzzy

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "E:\Scripts\pip.exe\__main__.py", line 4, in <module>
ModuleNotFoundError: No module named 'pip'
Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "E:\Scripts\pip.exe\__main__.py", line 4, in <module>
ModuleNotFoundError: No module named 'pip'


**K-means clustering**

K-means clustering is a widely used unsupervised learning technique that aims to partition n observations into k clusters, where each observation belongs to the cluster with the nearest mean value.

The K-means algorithm works by first randomly selecting k points from the dataset, which serve as the initial centers of the clusters (the "means"). For each pixel in the image, the algorithm calculates the Euclidean distance to each of the k means and assigns the pixel to the cluster of the closest mean.

Once all pixels have been assigned to clusters, the algorithm recalculates the means of the clusters, which now represent the "center" of the pixels in each cluster. The pixel assignment step is then repeated with the updated means.

These two steps are iterated until the means no longer move significantly or a predefined number of iterations is reached. The result is k clusters of pixels, where each pixel in a cluster is closer to its own cluster's mean than to any other cluster's mean.

The output of the K-means clustering algorithm, in the context of image processing, is typically a segmented image where each pixel is labeled according to the cluster it belongs to. This segmented image highlights regions of the original image that share similar intensity values, providing a simplified representation that can be useful for various subsequent image processing tasks.

**Imports**

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import os
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans


**Setup Directory Paths**

`path` - location of the preprocessed datasets that are used as input

`path_output` - where the clustered images will be saved

In [9]:
# Path to the directory containing the input datasets (one sub-folder per dataset)
path = "../Data Sets/Processed Datasets"

# Path to the output directory where the clustered images are written
path_output = "../Data Sets/Clustered Datasets"

**Import data to dictionary**

Imports the images into a dictionary of the format `dataset[folder] -> images`

**Reshape Images**

Resize the images to a consistent shape by scaling them (preserving the aspect ratio) and padding them with zeros.

Clustering expects images to have a standard size.

In [10]:
def resize_image(image, target_size):
    """Fit an image inside ``target_size`` and zero-pad it to exactly that size.

    The image is scaled so that it fits within the target while keeping its
    aspect ratio, then centred on a zero-valued canvas of the target size.

    Args:
        image: Array whose first two axes are (height, width). Trailing axes
            (e.g. colour channels) are kept and never padded.
        target_size: ``(target_height, target_width)`` of the returned array.

    Returns:
        The resized image, zero-padded along its first two axes so that they
        equal ``target_size``.
    """
    height, width = image.shape[:2]
    target_height, target_width = target_size

    # Calculate the aspect ratio and the new dimensions
    aspect_ratio = width / height
    new_width = target_width
    new_height = int(target_width / aspect_ratio)

    if new_height > target_height:
        new_height = target_height
        new_width = int(target_height * aspect_ratio)

    # int() truncation can produce 0 for extreme aspect ratios, and cv2.resize
    # rejects an empty destination size, so keep at least one pixel per axis.
    new_width = max(1, new_width)
    new_height = max(1, new_height)

    # Resize the image (cv2 expects the size as (width, height))
    resized_image = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_AREA)

    # Add padding to the resized image to match the target size, splitting the
    # padding as evenly as possible between both sides of each axis.
    pad_height = target_height - new_height
    pad_width = target_width - new_width
    padding = [(pad_height // 2, pad_height - pad_height // 2), (pad_width // 2, pad_width - pad_width // 2)]
    # np.pad needs one (before, after) pair per axis; leave any trailing
    # (channel) axes of the resized image unpadded.
    padding += [(0, 0)] * (resized_image.ndim - 2)

    padded_image = np.pad(resized_image, padding, mode='constant', constant_values=0)

    return padded_image

In [11]:
# Empty dictionary to store the resized image data
resized_dataset = {}
target_size = (256, 256)  # (height, width) every image is resized and padded to

# Loop through all folders in the input directory (`path`) and import images
# Images will be stored as resized_dataset[folder] -> array of images
for folder in sorted(os.listdir(path)):
    folder_path = os.path.join(path, folder)
    # Skip stray files next to the dataset folders; listing them as a
    # directory would raise an error.
    if not os.path.isdir(folder_path):
        continue

    data = []
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        # Only regular files can be loaded; ignore nested directories.
        if not os.path.isfile(file_path):
            continue

        # Import data. np.load raises on failure instead of returning None,
        # so no None check is needed here.
        image = np.load(file_path)
        image = resize_image(image, target_size)
        data.append(image)

    # Convert data to numpy arrays
    data = np.array(data)
    # Add the data to the resized_dataset dictionary
    resized_dataset[folder] = data

**Cluster Images**

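If `n_init` is not passed to `KMeans` explicitly, some scikit-learn versions emit a large number of warnings here because the default value of `n_init` is being changed (deprecation of the old default); however, this is not an issue.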

In [12]:
# Flat list of label images, in the iteration order of resized_dataset
# (folder by folder, image by image)
kmeans_results = []
# K-means
num_clusters = 4

# n_init is passed explicitly (10 is the long-standing default) so that no
# warning about the changing default is emitted on every fit, and
# random_state is fixed so that re-running the notebook reproduces the same
# clusters.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)

for label, images in resized_dataset.items():
    for img in images:
        # Flatten the image into a column of single-feature samples,
        # one sample per array element
        img_flattened = img.reshape(-1, 1)

        # Perform K-means clustering; the estimator is fitted anew for each image
        kmeans_labels = kmeans.fit_predict(img_flattened)

        # Restore the image layout so every element carries its cluster index
        kmeans_results.append(kmeans_labels.reshape(img.shape))



**Save the cluster images**

Uses the cluster labels generated by the clustering to create 4 binary cluster images (masks) per image in the dataset: pixels that belong to the cluster are set to 255, all other pixels to 0.

These cluster images are saved to the output directory in a folder named in the "Kmeans_<timestamp>" format.

In [13]:
def save_individual_clusters(image, cluster_labels, num_clusters, label, image_index, base_path):
    """Write one binary mask PNG per cluster for a single image.

    For every cluster index ``i`` in ``range(num_clusters)`` a mask with the
    shape of ``image`` is created in which elements labelled ``i`` are 255 and
    all others are 0. Each mask is saved as
    ``<base_path>/<label>/image<image_index>_cluster<i + 1>.png``.

    Args:
        image: Source image; only its shape is used for the masks.
        cluster_labels: Array of cluster indices, used as a boolean index
            into a mask shaped like ``image``.
        num_clusters: Number of clusters, i.e. masks to write.
        label: Name of the sub-folder (below ``base_path``) to write into.
        image_index: Index used in the output file names.
        base_path: Root output directory.

    Raises:
        OSError: If OpenCV reports that a mask could not be written.
    """
    # The target folder is the same for every cluster, so create it once.
    cluster_folder = os.path.join(base_path, label)
    os.makedirs(cluster_folder, exist_ok=True)

    for i in range(num_clusters):
        # Masks only hold 0 and 255, so use 8-bit unsigned, a depth that the
        # PNG encoder supports natively.
        single_cluster_image = np.zeros_like(image, dtype=np.uint8)
        single_cluster_image[cluster_labels == i] = 255

        # Save the single cluster image (file names use 1-based cluster numbers)
        file_name = f"image{image_index}_cluster{i + 1}.png"
        file_path = os.path.join(cluster_folder, file_name)

        # cv2.imwrite can signal failure by returning False instead of
        # raising, so check the result rather than failing silently.
        if not cv2.imwrite(file_path, single_cluster_image):
            raise OSError(f"Failed to write cluster image: '{file_path}'")

In [14]:
# Every run writes into its own time-stamped folder below the output directory
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
base_path = os.path.join(path_output, f"Kmeans_{timestamp}")

# kmeans_results is a flat list that is matched to the images purely by
# position. Verify both are in sync so that stale kernel state (e.g. cells run
# out of order) cannot silently pair masks with the wrong images.
expected_count = sum(len(images) for images in resized_dataset.values())
if len(kmeans_results) != expected_count:
    raise RuntimeError(
        f"Found {len(kmeans_results)} clustering results for {expected_count} images; "
        "re-run the clustering cell before saving."
    )

global_image_index = 0  # Global image index for position in the flat results list
for label, images in resized_dataset.items():
    # Local image index for naming, restarts at 1 for each folder
    for local_image_index, img in enumerate(images, start=1):
        cluster_labels = kmeans_results[global_image_index]
        save_individual_clusters(img, cluster_labels, num_clusters, label, local_image_index, base_path)
        global_image_index += 1
print(f"All images were successfully saved in the directory: '{base_path}'")

All images were successfully saved in the directory: '../Data Sets/Clustered Datasets\Kmeans_20230518_012955'
