In [1]:
import os
import cv2
import joblib
import numpy as np
from sklearn.cluster import KMeans

In [2]:
def extract_sift_descriptors(image_path: str) -> np.ndarray:
  """Compute SIFT descriptors for the image stored at ``image_path``.

  The image is loaded as grayscale with OpenCV and handed to a freshly
  created SIFT detector.

  Args:
    image_path: Filesystem path of the image to read.

  Returns:
    A float32 array with one 128-dimensional descriptor per row. An image
    with no detectable keypoints yields an empty ``(0, 128)`` array instead
    of ``None``, so results can always be concatenated.

  Raises:
    OSError: If OpenCV cannot read or decode the file.
  """
  image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
  if image is None:
    # cv2.imread signals failure by returning None rather than raising.
    raise OSError(f"Could not read image: {image_path!r}")
  sift = cv2.SIFT_create()
  _, descriptors = sift.detectAndCompute(image, None)
  if descriptors is None:
    # OpenCV reports "no keypoints found" as None; normalise to an empty matrix.
    return np.empty((0, 128), dtype=np.float32)
  return descriptors

In [3]:
def getAllDescs(folder_path: str) -> list:
  """Collect SIFT descriptors for every image directly inside ``folder_path``.

  Files are visited in sorted name order so the result is reproducible
  (``os.listdir`` makes no ordering guarantee), and the extension check is
  case-insensitive so ``.JPG``/``.PNG`` files are picked up as well.

  Args:
    folder_path: Directory to scan; sub-directories are not descended into.

  Returns:
    A list with one descriptor array per image. Images for which
    ``extract_sift_descriptors`` returns ``None`` are skipped, so the list
    can be passed straight to ``np.concatenate``.
  """
  image_extensions = ('.jpg', '.jpeg', '.png')
  all_descriptors = []
  for filename in sorted(os.listdir(folder_path)):
    if not filename.lower().endswith(image_extensions):
      continue
    descriptors = extract_sift_descriptors(os.path.join(folder_path, filename))
    if descriptors is None:
      # Nothing usable for this image; a None entry would break concatenation.
      continue
    all_descriptors.append(descriptors)
  return all_descriptors

In [4]:
# Directory (relative to the working directory) that is scanned for images.
folder_path = "coco128/images/train2017"
# List of per-image SIFT descriptor arrays, one entry per image file found.
descs = getAllDescs(folder_path)

In [5]:
# Stack the per-image descriptor arrays into one matrix (one descriptor per row).
# None entries are dropped because np.concatenate cannot handle them, and the
# isinstance guard keeps this cell safe to re-run once `descs` is already an
# array (concatenating a 2-D array again would silently flatten it).
if isinstance(descs, list):
  descs = np.concatenate([d for d in descs if d is not None], axis=0)

In [6]:
descs_size = 1000 # 1000+
# Never request more rows than exist: choice(..., replace=False) raises otherwise.
# Re-binding descs_size keeps the saved model's filename equal to the real sample size.
descs_size = min(descs_size, descs.shape[0])
# Seeded generator so the same subset of descriptors is drawn on every run.
descs = descs[np.random.default_rng(0).choice(descs.shape[0], size=descs_size, replace=False)]

In [7]:
# Display the sampled descriptor matrix for a quick visual check.
descs

array([[114.,  13.,   0., ...,   0.,   2.,  66.],
       [  2.,   3.,   8., ...,   9.,   0.,   2.],
       [  3.,   1.,   0., ...,  30.,   1.,   2.],
       ...,
       [  8.,   3.,   5., ...,   7.,  14.,  23.],
       [ 15.,  14.,   8., ...,   7.,   0.,   0.],
       [ 12.,  29.,  12., ...,   0.,   0.,   0.]], dtype=float32)

In [8]:
# Number of k-means clusters to learn from the sampled descriptors.
n_clusters = 512 # (512-2048)
# Configure the estimator first, then fit it; fit() hands back the same
# estimator, so `kmeans` ends up bound to the fitted model.
clusterer = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
kmeans = clusterer.fit(descs)

In [9]:
# Encode the descriptor sample size and the cluster count in the filename.
model_filename = f'kmeans_{descs_size}_{n_clusters}.joblib'
# Persist the fitted model; the value shown under the cell is joblib.dump's return.
joblib.dump(kmeans, model_filename)

['kmeans_1000_512.joblib']