In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.applications import VGG16
from tensorflow.keras.preprocessing import image

from sklearn.cluster import KMeans 
from sklearn.decomposition import PCA
import cv2

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import os

In [None]:
# Seed every random number generator this notebook draws from, so that a
# fresh "Restart & Run All" reproduces the same clusters.
np.random.seed(42)  # NumPy's global RNG
# OpenCV keeps its own RNG (used by cv2.kmeans to pick initial centres);
# np.random.seed does not affect it, so it has to be seeded separately.
cv2.setRNGSeed(42)

In [None]:
# Root folder that list_image_paths() scans for images.
DATASET_PATH = "dataset"

# Target size every image is resized to when it is loaded for the network.
IMAGE_SIZE = (224, 224)

In [None]:
# ImageNet-pretrained VGG16 without its top classifier and with average
# pooling, used as a fixed feature extractor.
# The input shape is derived from IMAGE_SIZE (plus 3 colour channels) so the
# size images are loaded at and the size the network expects cannot drift apart.
model = VGG16(
    include_top=False,
    weights='imagenet',
    input_tensor=None,
    input_shape=tuple(IMAGE_SIZE) + (3,),
    pooling='avg',
)

In [None]:
# A plain nested loop is used here: the equivalent list comprehension was too long to be readable.
def list_image_paths(dataset_path):
    """Return the paths of all files stored one folder level below ``dataset_path``.

    The expected layout is ``dataset_path/<folder>/<file>``.

    Args:
        dataset_path: Directory whose sub-folders contain the files.

    Returns:
        A list of ``os.path.join(dataset_path, folder, file)`` strings, ordered
        by folder name and then by file name.  Entries of ``dataset_path`` that
        are not directories, and entries inside a folder that are not regular
        files, are skipped.
    """
    image_path_list = []
    # os.listdir returns entries in arbitrary order; sorting keeps the path
    # order (and everything indexed by it) stable between runs and machines.
    for folder_name in sorted(os.listdir(dataset_path)):
        folder_path = os.path.join(dataset_path, folder_name)
        if not os.path.isdir(folder_path):
            # A stray file in the dataset root would make the inner
            # os.listdir raise NotADirectoryError.
            continue
        for file_name in sorted(os.listdir(folder_path)):
            file_path = os.path.join(folder_path, file_name)
            if os.path.isfile(file_path):
                image_path_list.append(file_path)
    return image_path_list

In [None]:
def extract_vector(path, img_size, image_path_list):
    """Run every listed image through the global ``model`` and stack the results.

    Args:
        path: Not used by this function; kept so existing calls keep working.
        img_size: Target size passed to ``image.load_img`` for each image.
        image_path_list: Paths of the images to process, in output row order.

    Returns:
        A NumPy array with one flattened prediction per image.
    """
    def _embed(single_path):
        # Load -> array -> VGG16 preprocessing -> batch of one -> prediction.
        pixels = image.img_to_array(image.load_img(single_path, target_size=img_size))
        batch_of_one = np.expand_dims(preprocess_input(pixels), 0)
        return np.array(model.predict(batch_of_one)).flatten()

    return np.array([_embed(single_path) for single_path in image_path_list])

In [None]:
# Collect every image path under the dataset folder ...
image_path_list = list_image_paths(dataset_path=DATASET_PATH)

# ... and turn each image into one row of the feature matrix.
feature_vector = extract_vector(
    path=DATASET_PATH,
    img_size=IMAGE_SIZE,
    image_path_list=image_path_list,
)

In [None]:
# Clustering hyperparameters used by the KMeans cells.
# Written as plain integers: a trailing comma after the value would turn the
# assignment into a one-element tuple (e.g. ``(3,)``), which KMeans rejects.
n_clusters = 3
max_iter = 5
n_init = 5
# Seed passed to KMeans as ``random_state``; same value as the NumPy seed.
random_state = 42

In [None]:
# Cluster the feature vectors with OpenCV's k-means.
# These settings use the same values as the notebook's clustering
# hyperparameters (3 clusters, 5 iterations, 5 restarts) instead of millions
# of restarts, so the cell finishes quickly and is comparable to the
# scikit-learn runs.
CV2_N_CLUSTERS = 3
CV2_MAX_ITER = 5
CV2_ATTEMPTS = 5

temp, classified_points, means = cv2.kmeans(
    # cv2.kmeans only accepts float32 samples.
    np.asarray(feature_vector, dtype=np.float32),
    CV2_N_CLUSTERS,
    bestLabels=None,
    # Only the iteration cap is active: the third slot (epsilon) is ignored
    # unless cv2.TERM_CRITERIA_EPS is part of the criteria type.
    criteria=(cv2.TERM_CRITERIA_MAX_ITER, CV2_MAX_ITER, 0.9),
    attempts=CV2_ATTEMPTS,
    flags=cv2.KMEANS_PP_CENTERS)

In [None]:
# Map every image path to the cluster index OpenCV assigned to it
# (the first element of each row of classified_points).
{img_path: label_row[0] for img_path, label_row in zip(image_path_list, classified_points)}

In [None]:
# k-means on the full feature vectors.
# n_clusters, max_iter, n_init and random_state are expected to be defined by
# an earlier cell.
kmeans = KMeans(
    n_clusters=n_clusters,
    max_iter=max_iter,
    n_init=n_init,
    random_state=random_state,
)
kmeans.fit(feature_vector)

In [None]:
# One path per fitted label: a slice replaces the index-by-index copy.
paths = image_path_list[:len(kmeans.labels_)]
# Image path -> cluster label from the k-means fit above.
dict(zip(paths, kmeans.labels_))

In [None]:
# Project the feature vectors onto two principal components ...
reduced_data = PCA(n_components=2).fit_transform(feature_vector)

# ... and cluster again in that 2-D space.
# n_clusters, max_iter, n_init and random_state are expected to be defined by
# an earlier cell.
kmeans = KMeans(
    init='k-means++',
    n_clusters=n_clusters,
    max_iter=max_iter,
    n_init=n_init,
    random_state=random_state,
)
kmeans.fit(reduced_data)

In [None]:
# One path per fitted label: a slice replaces the index-by-index copy.
paths = image_path_list[:len(kmeans.labels_)]
# Image path -> cluster label from the k-means fit on the PCA-reduced data.
dict(zip(paths, kmeans.labels_))