In [None]:
import cv2
import time
import math
from numba import cuda

# CUDA kernel to calculate distances between each point and centroids
@cuda.jit
def calculate_distances(
    X_r, X_g, X_b, centroids_r, centroids_g, centroids_b, distance_matrix
):
    """Write the Euclidean RGB distance of every (point, centroid) pair.

    Each thread of the 2-D grid handles one pair: the first grid coordinate
    selects the point, the second selects the centroid, and the result is
    stored in ``distance_matrix[point, centroid]``.
    """
    point, centroid = cuda.grid(2)

    # Threads whose coordinates fall outside the inputs have nothing to do.
    if point >= len(X_r) or centroid >= len(centroids_r):
        return

    delta_r = X_r[point] - centroids_r[centroid]
    delta_g = X_g[point] - centroids_g[centroid]
    delta_b = X_b[point] - centroids_b[centroid]

    squared = delta_r**2 + delta_g**2 + delta_b**2
    distance_matrix[point, centroid] = math.sqrt(squared)

# CUDA kernel to find the closest centroid for each point
@cuda.jit
def find_closest_centroid(distance_matrix, closest_cluster_ids):
    """Store, for each row of ``distance_matrix``, the column of its minimum.

    One thread handles one row (one point). The index of the smallest
    distance in that row is written to ``closest_cluster_ids[row]``; on ties
    the lowest column index wins because the comparison is strict.

    ``-1`` is written when no entry compares below infinity, i.e. the row has
    no columns or contains only NaN / infinite values.
    """
    idx = cuda.grid(1)
    if idx >= distance_matrix.shape[0]:
        return

    # Start from +inf rather than an arbitrary large constant so that any
    # finite distance, however large, is accepted as a candidate.
    min_dist = math.inf
    min_idx = -1
    for i in range(distance_matrix.shape[1]):
        dist = distance_matrix[idx, i]
        if dist < min_dist:
            min_dist = dist
            min_idx = i
    closest_cluster_ids[idx] = min_idx

# CUDA kernel to update centroids based on assigned points
@cuda.jit
def update_centroids(
    X_r, X_g, X_b, closest_cluster_ids,
    centroids_r, centroids_g, centroids_b, counts,
):
    """Recompute each centroid as the mean of the points assigned to it.

    One thread handles one centroid: it scans every point, accumulates the
    channel sums of those whose ``closest_cluster_ids`` entry equals this
    centroid's index, and stores the member count in ``counts``. A centroid
    with no members keeps its previous position.
    """
    cluster = cuda.grid(1)
    if cluster >= len(centroids_r):
        return

    total_r = 0.0
    total_g = 0.0
    total_b = 0.0
    members = 0
    for point in range(len(X_r)):
        if closest_cluster_ids[point] != cluster:
            continue
        total_r += X_r[point]
        total_g += X_g[point]
        total_b += X_b[point]
        members += 1

    counts[cluster] = members

    # Only move the centroid when it actually owns points (avoids 0 / 0).
    if members > 0:
        centroids_r[cluster] = total_r / members
        centroids_g[cluster] = total_g / members
        centroids_b[cluster] = total_b / members

# Function to perform k-means clustering using CUDA without NumPy
def perform_k_means_cuda(X_r, X_g, X_b, k, num_iterations=10):
    """Run k-means on RGB points using the CUDA kernels in this file.

    Initial centroids are ``k`` points sampled at an even stride
    (``len(X_r) // k``) from the input. Each iteration computes the full
    point-to-centroid distance matrix, assigns every point to its nearest
    centroid and then moves each centroid to the mean of its members.

    Args:
        X_r, X_g, X_b: Equal-length sequences with the red, green and blue
            value of every point.
        k: Number of clusters; must satisfy ``1 <= k <= len(X_r)``.
        num_iterations: Number of assign/update rounds to run.

    Returns:
        A list of ``k`` ``(r, g, b)`` tuples with the final centroids.

    Raises:
        ValueError: If the channels differ in length, or ``k`` is not in
            ``1..len(X_r)`` (which includes empty input).
    """
    num_points = len(X_r)
    if len(X_g) != num_points or len(X_b) != num_points:
        raise ValueError("X_r, X_g and X_b must have the same length")
    if k < 1 or k > num_points:
        raise ValueError(
            f"k must be between 1 and the number of points ({num_points}), got {k}"
        )

    # 2-D launch for the distance kernel: x covers points, y covers centroids.
    threads_per_block = (16, 16)
    blocks_per_grid_x = (num_points + threads_per_block[0] - 1) // threads_per_block[0]
    blocks_per_grid_y = (k + threads_per_block[1] - 1) // threads_per_block[1]
    blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)

    # 1-D launches for the per-point and per-centroid kernels. Each is sized
    # to its own problem dimension so no surplus threads are started.
    threads_1d = 256
    point_blocks = (num_points + threads_1d - 1) // threads_1d
    centroid_blocks = (k + threads_1d - 1) // threads_1d

    X_r_device = cuda.to_device(X_r)
    X_g_device = cuda.to_device(X_g)
    X_b_device = cuda.to_device(X_b)

    # Evenly strided sample of the input as starting centroids. Values are
    # coerced to float so the centroid buffers can hold fractional means even
    # when the caller passes integer channel values.
    step = num_points // k
    centroids_indices = [i * step for i in range(k)]
    centroids_r = [float(X_r[i]) for i in centroids_indices]
    centroids_g = [float(X_g[i]) for i in centroids_indices]
    centroids_b = [float(X_b[i]) for i in centroids_indices]

    centroids_r_device = cuda.to_device(centroids_r)
    centroids_g_device = cuda.to_device(centroids_g)
    centroids_b_device = cuda.to_device(centroids_b)

    distance_matrix = cuda.device_array((num_points, k), dtype=float)
    closest_cluster_ids = cuda.device_array(num_points, dtype=int)
    # update_centroids overwrites every entry on each call, so one buffer can
    # be reused across iterations.
    counts = cuda.device_array(k, dtype=int)

    for _ in range(num_iterations):
        calculate_distances[blocks_per_grid, threads_per_block](
            X_r_device, X_g_device, X_b_device,
            centroids_r_device, centroids_g_device, centroids_b_device,
            distance_matrix,
        )
        find_closest_centroid[point_blocks, threads_1d](
            distance_matrix, closest_cluster_ids
        )
        update_centroids[centroid_blocks, threads_1d](
            X_r_device, X_g_device, X_b_device, closest_cluster_ids,
            centroids_r_device, centroids_g_device, centroids_b_device,
            counts,
        )

    final_centroids_r = centroids_r_device.copy_to_host()
    final_centroids_g = centroids_g_device.copy_to_host()
    final_centroids_b = centroids_b_device.copy_to_host()

    return list(zip(final_centroids_r, final_centroids_g, final_centroids_b))

# Entry point
if __name__ == '__main__':
    image_path = "/kaggle/input/picture2/image.webp"
    img = cv2.imread(image_path)
    # cv2.imread reports an unreadable/missing file by returning None rather
    # than raising, so fail here with a clear message.
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    h, w, d = img.shape

    # One row per pixel in row-major order (rows first, then columns), then
    # one plain list of Python floats per colour channel.
    pixels = img.reshape(h * w, d).astype(float)
    X_r, X_g, X_b = pixels.T.tolist()

    k_values = [2, 4, 8, 16, 32, 64, 128, 256]

    # perf_counter is a monotonic clock intended for measuring intervals.
    # NOTE: the first call presumably also includes Numba's kernel
    # compilation, so the k=2 timing is not directly comparable to the rest.
    total_start_time = time.perf_counter()
    for k in k_values:
        start_time = time.perf_counter()
        centroids = perform_k_means_cuda(X_r, X_g, X_b, k)
        print(f"Time taken for k={k}: {time.perf_counter() - start_time:.4f} seconds")

    print(f"Total time taken: {time.perf_counter() - total_start_time:.4f} seconds")
