In [51]:
from keras.utils import load_img
from keras.utils import img_to_array
from keras.applications.vgg16 import preprocess_input

# models
from keras.applications.vgg16 import VGG16
from keras.models import Model

# clustering and dimension reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# for everything else
import os
import numpy as np
import matplotlib.pyplot as plt
from random import randint
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler

In [40]:
path = r"/home/vlad/universitet/ProjektDA/Image-clustering-project/Image_Data/ZAsaa1YOBks/"
# change the working directory to the path where the images are located
os.chdir(path)

# this list holds the full path of every .png image found under `path`
image_paths = []
data_dir = "/Image_Data/ZAsaa1YOBks/"
# os.walk yields (dirpath, dirnames, filenames) for `path` and for every
# sub-directory below it. Each filename is joined to the directory it was
# actually found in, so images in nested folders also get a valid path
# (concatenating `path + sample` is only correct for the top-level folder).
for dirpath, _dirnames, files_in_dir in os.walk(path):
    for sample in files_in_dir:
        if sample.endswith('.png'):
            image_paths.append(os.path.join(dirpath, sample))

# directory listing order is OS/filesystem dependent; sort so that the
# sample order (and everything derived from it) is the same on every run
image_paths.sort()

In [41]:
# Build the feature extractor.
# First instantiate VGG16 with its default arguments ...
model = VGG16()
# ... then rebind `model` to a truncated network that takes the same inputs
# but stops at the second-to-last layer (layers[-2]), so predict() returns
# that layer's activations instead of the final layer's output.
# Later cells reshape these activations into 4096-wide feature vectors.
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)


def extract_features(file, model):
    """Return the feature vector `model` produces for a single image.

    Parameters
    ----------
    file : str or path-like
        Path of the image to load.
    model : keras Model
        Network whose ``predict`` output is used as the feature vector.
        It must accept a batch of shape (1, 224, 224, 3).

    Returns
    -------
    numpy.ndarray
        The raw ``model.predict`` output for a batch holding this one image
        (leading axis of size 1).
    """
    # load the image resized to 224x224
    img = load_img(file, target_size=(224, 224))
    # convert from 'PIL.Image.Image' to numpy array
    img = np.array(img)
    # add the batch axis: reshape(num_of_samples, dim 1, dim 2, channels)
    reshaped_img = img.reshape(1, 224, 224, 3)
    # apply the VGG16-specific input preprocessing
    imgx = preprocess_input(reshaped_img)
    # get the feature vector.
    # `use_multiprocessing` is intentionally not passed: it only affects
    # generator / Sequence inputs and newer Keras releases reject it.
    # verbose=0 avoids printing one progress bar per image.
    features = model.predict(imgx, verbose=0)
    return features

In [46]:
# maps image path -> feature array returned by extract_features
data = {}
p = r"/home/vlad/universitet/ProjektDA/Image-clustering-project/features/flower_features.pkl"

# loop through each image in the dataset
for flower in image_paths:
    # try to extract the features and update the dictionary
    try:
        data[flower] = extract_features(flower, model)
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still stop the loop.
    except Exception as exc:
        # report the failure instead of silently dropping the image
        print(f"Feature extraction failed for {flower}: {exc!r}")
        # checkpoint what has been extracted so far (best effort, optional)
        with open(p, 'wb') as checkpoint:
            pickle.dump(data, checkpoint)


# get a list of the filenames
filenames = np.array(list(data.keys()))

# stack the per-image feature arrays stored in `data`; each entry keeps the
# size-1 batch axis from predict(), so the result is 3-D
feat = np.array(list(data.values()))

# drop the batch axis so that there is one 4096-wide row per image
feat = feat.reshape(-1, 4096)

# Display the real shape. The hard-coded `(210, 1, 4096)` / `(210, 4096)`
# tuples that used to sit here were plain literals, so the cell echoed them
# regardless of how many images were actually processed.
feat.shape







(210, 4096)

In [52]:
# Inspect the raw (not yet standardized) feature matrix; numpy abbreviates
# large arrays with "..." when printing.
print(feat)

[[0.         0.5909953  1.1337146  ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.44416353 0.         0.        ]
 [0.         0.838642   0.         ... 0.23798046 0.         2.5965295 ]
 ...
 [1.4522324  0.         0.         ... 0.         0.         0.89516115]
 [0.         0.         0.36865452 ... 0.4514351  0.         0.        ]
 [0.7258918  0.         0.         ... 0.         0.         2.481297  ]]


In [53]:
# z-score standardization: fit the scaler on the feature matrix and replace
# `feat` with its standardized version in a single step
scaler = StandardScaler()
feat = scaler.fit_transform(feat)
print(feat)

[[-0.33709022  0.508146    0.569208   ... -0.5683452  -0.5880838
  -0.5276706 ]
 [-0.33709022 -0.43858364 -0.48764735 ... -0.01201404 -0.5880838
  -0.5276706 ]
 [-0.33709022  0.90485716 -0.48764735 ... -0.2702659  -0.5880838
   2.0185947 ]
 ...
 [ 2.1590397  -0.43858364 -0.48764735 ... -0.5683452  -0.5880838
   0.35016185]
 [-0.33709022 -0.43858364 -0.14398547 ... -0.00290614 -0.5880838
  -0.5276706 ]
 [ 0.91058916 -0.43858364 -0.48764735 ... -0.5683452  -0.5880838
   1.9055929 ]]


In [54]:
# Dimensionality reduction: a fractional n_components asks PCA to keep as
# many components as needed to explain that share of the variance.
pca = PCA(0.95, random_state=22).fit(feat)
x = pca.transform(feat)

print(x.shape)
print(x)

# Cluster the reduced features into 6 groups (seeded for repeatability).
kmeans = KMeans(n_clusters=6, random_state=22).fit(x)

# cluster label -> list of the filenames assigned to that cluster
groups = {}
for file, cluster in zip(filenames, kmeans.labels_):
    # setdefault creates the list the first time a label is seen
    groups.setdefault(cluster, []).append(file)


def view_cluster(cluster):
    """Plot the images that were assigned to one cluster.

    At most 30 images are drawn, on a 10x10 subplot grid, with the axes
    hidden. When the cluster is larger, a message is printed and only the
    first 30 files are shown.

    Parameters
    ----------
    cluster : hashable
        Key into the module-level ``groups`` dict (a k-means label).

    Raises
    ------
    KeyError
        If ``cluster`` is not a key of ``groups``.
    """
    max_images = 30
    plt.figure(figsize=(25, 25))
    # gets the list of filenames for a cluster
    files = groups[cluster]
    # only allow up to `max_images` images to be shown at a time
    if len(files) > max_images:
        print(f"Clipping cluster size from {len(files)} to {max_images}")
        # slice to exactly max_images ([:29] kept one image too few)
        files = files[:max_images]
    # plot each image in the cluster; subplot positions are 1-based
    for position, file in enumerate(files, start=1):
        plt.subplot(10, 10, position)
        img = np.array(load_img(file))
        plt.imshow(img)
        plt.axis('off')


(202, 135)
[[-15.19459     -1.2965397   -7.3013306  ...  -0.12145817  -4.451663
   -1.9141393 ]
 [-11.912085    -9.539061    -7.208286   ...   3.2372754    5.88469
   -1.664389  ]
 [  9.383682    31.053522   -11.57286    ...  -1.2817128    1.9661008
    5.629104  ]
 ...
 [ 25.311045     4.818861    -3.9584239  ...   0.6636342    0.529751
   -0.15682739]
 [ -0.06531703 -10.382985     2.248289   ...  -0.08514495  -1.6230881
   -0.03792426]
 [ 26.599028     9.4117365   -5.3938146  ...   0.14646342  -5.1211624
    2.2524421 ]]
