In [1]:
from tensorflow.keras.applications.vgg19 import preprocess_input
from sklearn.metrics import accuracy_score
from keras.preprocessing import image
from sklearn.cluster import KMeans
from sklearn import metrics
import tensorflow as tf
import numpy as np
import cv2
import os

In [2]:
data = []
label = []
path = "./archive/train/train1/"
c = 0
d = 0
IMG_SIZE = 32
for file in os.listdir(path):
    img=cv2.imread(path+file)
    img=cv2.resize(img,(IMG_SIZE,IMG_SIZE))
    img=img.astype('float32')
    if file[:3]=='cat':
        if c==4: continue
        c+=1
        label.append("cat")
    else:
        if d==4: continue
        d+=1
        label.append("dog")
    data.append(img)
data = np.array(data)

In [3]:
data_label = []
for i in label:
    if i=="cat": data_label.append(0)
    else: data_label.append(1)
data_label = np.array(data_label)

In [4]:
data_label

array([0, 0, 0, 0, 1, 1, 1, 1])

In [5]:
data = data/255.0
reshaped_data = data.reshape(len(data),-1)

In [6]:
reshaped_data.shape = (8,3072)

In [7]:
model = tf.keras.applications.vgg19.VGG19(weights='imagenet', include_top=False, input_shape=(IMG_SIZE, IMG_SIZE,3))

In [8]:
# 從vgg19提取特徵
def extract_vector(data_holder):
    resnet_feature_list = []

    for im in data_holder:
        im = image.img_to_array(im) 
        img = preprocess_input(np.expand_dims(im.copy(), axis=0))
        resnet_feature = model.predict(img)
        resnet_feature_np = np.array(resnet_feature)
        resnet_feature_list.append(resnet_feature_np.flatten())

    return np.array(resnet_feature_list)

In [9]:
# 提取向量並進行KMeans
array = extract_vector(data)
kmeans = KMeans(n_clusters=2, random_state=8)
clusters = kmeans.fit_predict(array)

In [10]:
clusters

array([1, 0, 0, 0, 0, 1, 1, 1])

In [11]:
# 因為K-Means不知道我們的標籤，所以必須根據我們的用法更改K-Means的標籤

# mapping labels from cluster to original labels
def get_reference_dict(clusters,data_label):
    reference_label = {}
    # For loop to run through each label of cluster label
    for i in range(len(np.unique(clusters))):
        index = np.where(clusters == i,1,0)
        num = np.bincount(data_label[index==1]).argmax()
        reference_label[i] = num
    return reference_label

# Mapping predictions to original labels
def get_labels(clusters,refernce_labels):
    temp_labels = np.random.rand(len(clusters))
    for i in range(len(clusters)):
        temp_labels[i] = refernce_labels[clusters[i]]
    return temp_labels

In [12]:
reference_labels = get_reference_dict(clusters,data_label)
predicted_labels = get_labels(clusters,reference_labels)

## 外部評估指標

In [13]:
print("準確度:",accuracy_score(data_label,predicted_labels))

準確度: 0.75


In [14]:
def purity_score(y_true,y_pred):
    y_voted_labels = np.zeros(y_true.shape)

    labels = np.unique(y_true)
    ordered_labels = np.arange(labels.shape[0])
    for k in range(labels.shape[0]):
        y_true[y_true==labels[k]] = ordered_labels[k]

    labels = np.unique(y_true)

    bins = np.concatenate((labels, [np.max(labels)+1]), axis=0)

    for cluster in np.unique(y_pred):
        hist, _ = np.histogram(y_true[y_pred==cluster], bins=bins)

        winner = np.argmax(hist)
        y_voted_labels[y_pred==cluster] = winner

    return accuracy_score(y_true, y_voted_labels)

In [15]:
print("純度為:",purity_score(data_label,predicted_labels))

純度為: 0.75


In [16]:
print("RI為:",metrics.rand_score(data_label,predicted_labels))

RI為: 0.5714285714285714


In [17]:
print("ARI為:",metrics.adjusted_rand_score(data_label,predicted_labels))

ARI為: 0.125


In [18]:
print("FMI為:",metrics.fowlkes_mallows_score(data_label,predicted_labels))

FMI為: 0.5000000000000001


In [19]:
print("同質性，完整性，V-measure為:",metrics.homogeneity_completeness_v_measure(data_label,predicted_labels))

同質性，完整性，V-measure為: (0.18872187554086706, 0.18872187554086706, 0.18872187554086706)


## 內部評估指標

In [20]:
print("CH為:",metrics.calinski_harabasz_score(array, clusters))

CH為: 7.111188843382713


In [21]:
print("輪廓分數:",metrics.silhouette_score(array,clusters,metric='euclidean'))

輪廓分數: 0.3646816
