### Initial Setting

In [2]:
# Mount Google Drive under /content/drive; every path configured in this
# notebook lives below that mount point. force_remount=True asks for a fresh
# mount on each run of this cell.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
!pip install efficientnet_pytorch



In [4]:
from efficientnet_pytorch import EfficientNet
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import os
import logging
import torch
import pickle
from tqdm import tqdm # progress bar
from torchvision import transforms
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt

In [5]:
# Locations on the mounted Google Drive, assembled from shared base directories.
DRIVE_ROOT = "/content/drive/MyDrive/Colab"
SKETCH_DIR = DRIVE_ROOT + "/Sketch_RNN/torch_sketch"
CLUSTERING_DIR = DRIVE_ROOT + "/skku_Clustering"

# dataset directory (the trailing slash is kept)
data_path = SKETCH_DIR + "/test/"
# text file holding the cluster label names
cluster_list_path = SKETCH_DIR + "/cluster_list.txt"
# saved feature array
feat_path = CLUSTERING_DIR + "/features_hw_2500.npy"
# saved names that belong to the feature rows
filename_path = CLUSTERING_DIR + "/filenames_hw_2500.npy"
# text file the results are appended to
textfile_path = SKETCH_DIR + "/result.txt"

In [6]:
# make data_list : names of the .npy files found directly inside data_path.
# The working directory is still switched to data_path because other cells may
# open these files by bare name.
os.chdir(data_path)
# os.scandir yields entries in arbitrary order, so sort them to make the
# processing order (and therefore the saved feature order) reproducible.
with os.scandir(data_path) as files:
    data_list = sorted(file.name for file in files if file.name.endswith('.npy'))

# make cluster_list : one cluster label name per line of the text file.
# The with-block closes the file handle, and blank lines (for example the empty
# string a trailing newline would otherwise produce) are dropped so that
# len(cluster_list) counts real labels only.
with open(cluster_list_path, 'r') as label_file:
    cluster_list = [line.strip() for line in label_file if line.strip()]

# make option_list : dict of the settings used in this run; other cells add
# entries to it and its values are written to the result text file.
option_list = {'feature': 'features_hw_2500'}

### Feature Extraction

In [None]:
# Preprocessing applied to every image before it reaches the network:
# resize, convert to a tensor, then normalize the three channels.
tfms = transforms.Compose(
    [
        transforms.Resize(224),
        transforms.ToTensor(),
        transforms.Normalize(
            [0.485, 0.456, 0.406],
            [0.229, 0.224, 0.225],
        ),
    ]
)

def ExtractFeature(image, model):
    """Run a single image through the model's feature extractor.

    Args:
        image: array that can be reshaped to (28, 28). It is converted to a
            PIL image, expanded to RGB and preprocessed with ``tfms``.
            (presumably uint8 bitmap data -- TODO confirm against the dataset)
        model: object exposing ``extract_features`` (an EfficientNet in this
            notebook).

    Returns:
        The tensor returned by ``model.extract_features`` for a batch holding
        this one image. No autograd graph is attached to it, because the
        forward pass runs under ``torch.no_grad()``.
    """
    # 28x28 array -> PIL image -> 3-channel RGB
    pil_image = Image.fromarray(image.reshape(28, 28)).convert("RGB")
    # unsqueeze(0) adds the leading batch dimension the model expects
    batch = tfms(pil_image).unsqueeze(0)
    # Pure inference: skip gradient tracking so no graph is built per image.
    with torch.no_grad():
        features = model.extract_features(batch)
    return features

In [None]:
# EfficientNet feature extraction
MAX_IMAGES_PER_FILE = 1000  # only the first 1000 images of every .npy file are used
CHECKPOINT_EVERY = 200      # partial results are re-saved every 200 images of a file


def _save_features(features_by_name):
    """Write the feature vectors to feat_path and their names to filename_path."""
    np.save(feat_path, np.array(list(features_by_name.values())))  # the features
    np.save(filename_path, np.array(list(features_by_name.keys())))  # the names


model = EfficientNet.from_pretrained('efficientnet-b0')
# A freshly built nn.Module is in training mode. Switch to eval mode so that
# normalization layers use their stored statistics and stochastic layers are
# disabled; otherwise the extracted features are not deterministic.
model.eval()

data = {}  # dict: "<file name before the first dot>_<image index>" -> flat feature vector
# Inference only, so gradient tracking is switched off for the whole loop.
with torch.no_grad():
    for npy in tqdm(data_list):
        # Load by full path so this cell does not depend on the working directory.
        image_list = np.load(os.path.join(data_path, npy))
        for i, image in enumerate(image_list[:MAX_IMAGES_PER_FILE]):
            feat = ExtractFeature(image, model)
            # detach: tensor without gradient history; reshape(-1) flattens it
            feat = feat.detach().numpy().reshape(-1)
            name = npy.split('.')[0] + '_' + str(i)
            data[name] = feat
            if i % CHECKPOINT_EVERY == 0:
                # periodic checkpoint of everything extracted so far
                _save_features(data)

# final save once every file has been processed
_save_features(data)

### Dimension Reduction

In [7]:
# Read the saved feature array back from feat_path.
feat = np.load(file=feat_path)

In [None]:
# PCA dimension reduction of the loaded features
from sklearn.decomposition import PCA

# record the reduction settings alongside the other run options
option_list.update(reduction='PCA', pcaNComponents=2)
# fit() returns the estimator itself, so construction and fitting are chained
pca = PCA(n_components=option_list['pcaNComponents'], random_state=0).fit(feat)
x = pca.transform(feat)

In [8]:
# t-SNE dimension reduction of the loaded features
from sklearn.manifold import TSNE

# record the reduction settings alongside the other run options
option_list.update(reduction='TSNE', tsneNComponents=3)
tsne = TSNE(
    n_components=option_list['tsneNComponents'],
    init='pca',
    random_state=0,
)
x = tsne.fit_transform(feat)

### Clustering

In [126]:
def Cluster(cmd, cluster_num, input):
    """Cluster ``input`` with the algorithm selected by ``cmd``.

    The chosen algorithm and the hyper-parameters set here are recorded in the
    module-level ``option_list`` dict as a side effect.

    Args:
        cmd: one of 'kmeans', 'affinity', 'dbscan' or 'birch'.
        cluster_num: requested number of clusters. Only the 'kmeans' and
            'birch' branches use it; the other two ignore it.
        input: data handed to the estimator's ``fit``.

    Returns:
        The ``labels_`` attribute of the fitted estimator.

    Raises:
        ValueError: if ``cmd`` is not one of the supported names (previously
            the function silently returned None in that case).
    """
    if cmd == 'kmeans':
        from sklearn.cluster import KMeans
        option_list['clustering'] = 'KMeans'
        # n_jobs is no longer accepted by KMeans in current scikit-learn
        # releases, so it is not passed; this form works on old and new versions.
        estimator = KMeans(n_clusters=cluster_num, random_state=0)
    elif cmd == 'affinity':
        from sklearn.cluster import AffinityPropagation
        option_list['clustering'] = 'AffinityPropagation'
        option_list['APDamping'] = 0.9
        option_list['APPreference'] = -50
        # Read back the same keys that were just stored (the old lookup used
        # 'AP_preference', which does not exist) and actually apply the
        # recorded damping so the logged options match what was run.
        estimator = AffinityPropagation(
            damping=option_list['APDamping'],
            preference=option_list['APPreference'],
        )
    elif cmd == 'dbscan':
        from sklearn.cluster import DBSCAN
        option_list['clustering'] = 'DBSCAN'
        option_list['DBEps'] = 0.6
        option_list['DBMinSamples'] = 1
        estimator = DBSCAN(
            eps=option_list['DBEps'],
            min_samples=option_list['DBMinSamples'],
        )
    elif cmd == 'birch':
        from sklearn.cluster import Birch
        option_list['clustering'] = 'BIRCH'
        estimator = Birch(n_clusters=cluster_num)
    else:
        raise ValueError(
            f"unknown clustering command {cmd!r}; "
            "expected 'kmeans', 'affinity', 'dbscan' or 'birch'"
        )

    estimator.fit(input)
    return estimator.labels_

In [127]:
# Cluster the reduced features with DBSCAN and keep the resulting labels.
labels = Cluster(cmd='dbscan', cluster_num=len(cluster_list), input=x)

In [128]:
# Scatter plot of columns 0 and 1 of the reduced features, colored by label.
import plotly.express as px
from plotly.offline import plot

fig = px.scatter(
    x,
    x=0,
    y=1,
    color=labels,
)
plot(fig)

'temp-plot.html'

### Testing

In [129]:
# Collect the saved names per cluster label: { cluster id: [names] }
filenames = np.load(filename_path)
groups = {}
for f, cluster in zip(filenames, labels):
    # create the list on first sight of a cluster id, then append to it
    groups.setdefault(cluster, []).append(f)

# make cluster_dict : the label assigned to each cluster, used for the accuracy
cluster_dict = {}
for cluster in groups:
    # the label of an item is the 4th '_'-separated field of its stored name
    image_name = [image.split('_')[3] for image in groups[cluster]]
    # how often each known label occurs in this cluster, in cluster_list order
    image_count = [image_name.count(name) for name in cluster_list]
    # the most frequent label is taken as the correct one (first wins on ties)
    cluster_dict[cluster] = cluster_list[image_count.index(max(image_count))]

# get accuracy
from sklearn.metrics import f1_score

# ground truth: the label parsed from every stored name, cluster by cluster
gt = [category.split('_')[3] for cluster in groups for category in groups[cluster]]
# prediction: the label chosen for the cluster that the item belongs to
pred = [cluster_dict[cluster] for cluster in groups for category in groups[cluster]]

# micro-averaged F1 as a percentage string; also recorded in the run options
acc = str(f1_score(gt, pred, average='micro') * 100) + '%'
option_list['accuracy'] = acc
print(acc)

# Append the values of option_list to the result text file as one line, each
# value followed by a space. The with-block closes the file even when a write
# raises, so the handle is never leaked.
with open(textfile_path, 'a', encoding='UTF-8') as f:
    for value in option_list.values():
        f.write(f'{value} ')
    f.write('\n')

89.32%
