In [23]:
from keras.preprocessing import image
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import os, shutil, glob, os.path
from PIL import Image as pil_image
# Pillow's ImageFile module owns the LOAD_TRUNCATED_IMAGES switch. Setting an
# attribute of the same name on keras.preprocessing.image is not read by
# anything, so set the flag on Pillow itself to tolerate truncated files.
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

# VGG16 with ImageNet weights and without the top classification layers
# (include_top=False); used below as a feature extractor.
model = VGG16(weights='imagenet', include_top=False)

In [24]:
# Configuration
# imdir     : root folder that is scanned for images.
# targetdir : output folder that gets created for labelled results.
imdir = 'D:/{Samir}/Personal/Education/ADA/{{Study}}/Thesis/Server/Bot_data_all/imagesMerge'
targetdir = "D:/{Samir}/Personal/Education/ADA/{{Study}}/Thesis/Server/Bot_data_all/labels/"

# Number of clusters requested from K-Means.
number_clusters = 32

# Names that count as a known letter; the trailing 'NA' entry is the bucket
# for anything that is not in this list.
letters = [
    'A', 'B', 'C', 'Ç', 'D', 'E', 'Ə', 'F',
    'G', 'Ğ', 'H', 'I', 'İ', 'J', 'K', 'L',
    'M', 'N', 'O', 'Ö', 'P', 'Q', 'R', 'S',
    'Ş', 'T', 'U', 'Ü', 'V', 'X', 'Y', 'Z',
    'NA',
]

# Filled later with one {letter: count} dictionary per cluster id.
labelsList = {}

# Collect every .jpg that sits one folder level below imdir.
filelist = []

for folder in os.listdir(imdir):
    filelist.extend(glob.glob(os.path.join(imdir, folder, '*.jpg')))

# Sort so the order of filelist (and of featurelist, which mirrors it) is stable.
filelist.sort()

# Extract one flattened VGG16 feature vector per image, in filelist order.
featurelist = []
for i, imagepath in enumerate(filelist):
    # "\r" returns the cursor to the start of the line so the status is
    # rewritten in place; i + 1 makes the counter finish at N / N.
    print("    Status: %s / %s" %(i + 1, len(filelist)), end="\r")
    # Load at 224x224, add a leading batch axis and apply VGG16 preprocessing.
    img = image.load_img(imagepath, target_size=(224, 224))
    img_data = image.img_to_array(img)
    img_data = np.expand_dims(img_data, axis=0)
    img_data = preprocess_input(img_data)
    features = np.array(model.predict(img_data))
    featurelist.append(features.flatten())

# Clustering: fit K-Means with number_clusters clusters on the stacked feature
# vectors. The fixed random_state makes the centroid initialisation repeatable.
kmeans = KMeans(n_clusters=number_clusters, random_state=0).fit(np.array(featurelist))

# Create the output folder if it is missing. An already existing folder is
# fine (exist_ok=True); any other failure is reported instead of being
# swallowed silently, but it does not abort the run.
try:
    os.makedirs(targetdir, exist_ok=True)
except OSError as err:
    print("Could not create %s: %s" % (targetdir, err))

In [25]:
def parseLetterFromPath(path, root=None):
    """Return the name of the first folder below ``root`` in ``path``.

    Both "/" and "\\" are treated as path separators, so the result does not
    depend on which separator the operating system (or glob) produced.

    Parameters
    ----------
    path : str
        Path of an image file located somewhere below ``root``.
    root : str, optional
        Folder prefix to strip from ``path``. Defaults to the module-level
        ``imdir`` when omitted.

    Returns
    -------
    str
        The first path component that remains after removing ``root``, or an
        empty string when nothing remains.
    """
    if root is None:
        root = imdir

    # Normalise separators on both sides so the prefix matches even when the
    # path mixes "/" and "\\".
    normalisedPath = path.replace('\\', '/')
    normalisedRoot = root.replace('\\', '/')
    replacedPath = normalisedPath.replace(normalisedRoot, '')

    # Drop empty pieces caused by leading, trailing or doubled separators.
    components = [part for part in replacedPath.split('/') if part]
    return components[0] if components else ''

In [27]:
# Count, for every cluster id, how many of its images come from each letter
# folder. The tally is rebuilt from scratch so that re-running this cell does
# not add the same images to the counts a second time.
labelsList = dict()
for i, m in enumerate(kmeans.labels_):
    # kmeans.labels_[i] is the cluster assigned to filelist[i].
    letter = parseLetterFromPath(filelist[i])
    if letter not in letters:
        # Anything that is not a known letter goes into the 'NA' bucket.
        letter = 'NA'

    if m not in labelsList:
        # First image of this cluster: start with a zero count for every letter.
        labelsList[m] = dict(zip(letters, [0] * len(letters)))

    labelsList[m][letter] += 1

# print("    Copy: %s / %s" %(i, len(kmeans.labels_)), end="\r")
# shutil.copy(filelist[i], targetdir + str(m) + "_" + str(i) + ".jpg")

In [29]:
# One row per cluster: the row label is the cluster id and the row holds that
# cluster's counts in the order they are stored in its dictionary.
labelNames = list(labelsList.keys())
stats = [list(counts.values()) for counts in labelsList.values()]

# Build the table, append a per-cluster total and export it as CSV.
# The 'utf-8-sig' codec writes a BOM at the start of the file.
csv_file = 'result.csv'
df = pd.DataFrame(stats, columns=letters, index=labelNames)
df['Total'] = df.sum(axis=1)
df.to_csv(csv_file, encoding='utf-8-sig')