In [49]:
import os
import cv2
import joblib
import numpy as np
from sklearn.cluster import KMeans


In [50]:
def extract_sift_descriptors(image_path: str, sift: cv2.SIFT) -> np.ndarray:
    """Compute SIFT descriptors for a single image file.

    Args:
        image_path: Path of the image on disk; it is loaded as grayscale.
        sift: Feature extractor whose ``detectAndCompute`` is invoked on the
            loaded image.

    Returns:
        The descriptor array produced by ``sift.detectAndCompute``. If the
        extractor yields no descriptors (``None``), an empty float32 array of
        shape ``(0, sift.descriptorSize())`` is returned instead, so callers
        can concatenate results without special-casing.

    Raises:
        ValueError: If ``cv2.imread`` could not load ``image_path``.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread reports a missing or undecodable file by returning None
        # rather than raising; fail here with the offending path.
        raise ValueError(f"Could not read image: {image_path!r}")

    _, descriptors = sift.detectAndCompute(image, None)
    if descriptors is None:
        # No descriptors were produced for this image; keep the return type
        # an ndarray so downstream stacking does not choke on None.
        return np.empty((0, sift.descriptorSize()), dtype=np.float32)
    return descriptors


In [51]:
def get_all_descriptors(folder_path: str, sift: cv2.SIFT) -> list:
    """Collect SIFT descriptors for every image file directly inside a folder.

    Files are selected by extension (``.jpg``, ``.jpeg``, ``.png``, matched
    case-insensitively) and visited in sorted name order so the result is
    reproducible across runs and platforms.

    Args:
        folder_path: Directory whose entries are scanned (not recursive).
        sift: Feature extractor forwarded to ``extract_sift_descriptors``.

    Returns:
        A list with one descriptor array per image that produced at least one
        descriptor. Images with no descriptors are skipped, so the list can be
        concatenated directly.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')
    all_descriptors = []
    # os.listdir returns entries in arbitrary order; sort for determinism.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(image_suffixes):
            continue
        image_path = os.path.join(folder_path, filename)
        descriptors = extract_sift_descriptors(image_path, sift)
        # Skip images that yielded nothing (None or an empty array); a None
        # entry would make a later np.concatenate over this list fail.
        if descriptors is None or len(descriptors) == 0:
            continue
        all_descriptors.append(descriptors)
    return all_descriptors


In [52]:
# Dataset folder and the SIFT extractor shared by every image in it.
folder_path = 'coco128/images/train2017'
sift = cv2.SIFT.create()

# Per-image descriptor results gathered from the folder.
descriptors = get_all_descriptors(folder_path=folder_path, sift=sift)


In [53]:
# Merge the per-image descriptor arrays into one 2-D array (one row per
# descriptor). Entries that are None (an image that produced no descriptors)
# are dropped first because np.concatenate cannot handle them.
# The isinstance guard makes the cell safe to re-run: concatenating the
# already-merged 2-D array again would flatten it into a 1-D array.
if not isinstance(descriptors, np.ndarray):
    descriptors = np.concatenate(
        [d for d in descriptors if d is not None],
        axis=0,
    )


In [54]:
# Number of descriptors sampled for clustering. KMeans requires at least as
# many samples as clusters (n_clusters is 1024 in this notebook), so the
# earlier value of 1000 could not be fitted; 20000 matches the recorded model
# name 'kmeans_20000_1024.joblib'.
descriptors_size = 20000
# Draw the subset without replacement from a seeded generator so the same
# rows are selected on every fresh run (the global np.random state is unseeded).
descriptors = descriptors[np.random.default_rng(0).choice(
    descriptors.shape[0],
    size=descriptors_size,
    replace=False)
]


In [55]:
# Display the sampled descriptor array; NumPy abbreviates large arrays with '...'.
descriptors


array([[ 14.,  11.,  17., ...,  11.,   8.,   4.],
       [  1.,  85.,  53., ...,   7.,  50.,  47.],
       [ 28.,  28.,  92., ...,   6.,  47.,  59.],
       ...,
       [  0.,   0.,   0., ...,   0.,  56., 127.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       [ 66.,   9.,   7., ...,   2.,   0.,   0.]], dtype=float32)

In [56]:
# Number of clusters to fit; the author's note gives 512-2048 as the range considered.
n_clusters = 1024  # (512-2048)

# Build the estimator first, then fit it on the sampled descriptors.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
kmeans.fit(descriptors);  # fit() returns the estimator itself; ';' keeps the cell output empty


In [57]:
# The file name records the sample size and cluster count used for this model.
model_filename = f'kmeans_{descriptors_size}_{n_clusters}.joblib'

# Persist the fitted estimator; joblib.dump returns the list of files written,
# which the notebook shows as this cell's output.
joblib.dump(value=kmeans, filename=model_filename)


['kmeans_20000_1024.joblib']