라이브러리

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from sklearn.cluster import KMeans




데이터 불러오기

In [2]:
# Load the breed labels and point at the training images.
# DATA_DIR is the single place to edit if the dataset is moved.
DATA_DIR = "./dog_data"
breed_data = pd.read_csv(f"{DATA_DIR}/labels.csv")
# Keep the trailing slash: image paths are built later by plain string
# concatenation of image_folder + id + ".jpg".
image_folder = f"{DATA_DIR}/train/"

In [3]:
# Peek at the last five rows to confirm the table loaded with its `id` and `breed` columns.
breed_data.tail(n=5)

Unnamed: 0,id,breed
10217,ffd25009d635cfd16e793503ac5edef0,borzoi
10218,ffd3f636f7f379c51ba3648a9ff8254f,dandie_dinmont
10219,ffe2ca6c940cddfee68fa3cc6c63213f,airedale
10220,ffe5f6d8e2bff356e9482a80a6e29aac,miniature_pinscher
10221,fff43b07992508bc822f33d8ffd902ae,chesapeake_bay_retriever


In [4]:
# Number of images per breed, most frequent first; the smallest count is
# what the balancing step below down-samples every breed to.
breed_data['breed'].value_counts()

breed
scottish_deerhound      126
maltese_dog             117
afghan_hound            116
entlebucher             115
bernese_mountain_dog    114
                       ... 
golden_retriever         67
brabancon_griffon        67
komondor                 67
eskimo_dog               66
briard                   66
Name: count, Length: 120, dtype: int64

품종별 데이터 수 일치시키기

In [5]:
# Size of the smallest breed class. Computed from the data instead of being
# copied by hand from the value_counts() output, so the cell stays correct
# if labels.csv changes.
min_samples_per_breed = int(breed_data['breed'].value_counts().min())

# Randomly down-sample every breed to that size.
# groupby(sort=False) visits breeds in order of first appearance (the same
# order as .unique()), and each breed is sampled with the same fixed seed,
# so the selection is reproducible.
selected_samples = [
    breed_rows.sample(min_samples_per_breed, random_state=42)
    for _, breed_rows in breed_data.groupby('breed', sort=False)
]

# Merge the per-breed samples into a single balanced DataFrame
selected_data = pd.concat(selected_samples, ignore_index=True)

# Sanity check: every breed contributes exactly min_samples_per_breed rows
assert len(selected_data) == min_samples_per_breed * breed_data['breed'].nunique()

클러스터링 (k=20)

In [None]:
# Feature extractor: ImageNet-pretrained ResNet50 without its classification
# head (include_top=False); pooling='avg' makes the model emit one pooled
# feature vector per image.
base_model = ResNet50(
    include_top=False,
    pooling='avg',
    weights='imagenet',
)

# Convert an image path into a feature vector
def path_to_features(image_path):
    """Return the ``base_model`` embedding of the image at ``image_path``.

    The image is loaded and resized to 224x224, wrapped into a batch of one,
    normalised with the ResNet50 ``preprocess_input`` and passed through the
    module-level ``base_model``.

    Args:
        image_path: Path of the image file to load.

    Returns:
        A 1-D NumPy array: the model output for this image, flattened.
    """
    img = tf.keras.preprocessing.image.load_img(image_path, target_size=(224, 224))
    img_array = tf.keras.preprocessing.image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)  # add the leading batch axis
    img_array = preprocess_input(img_array)
    # verbose=0: this function is called once per image, so the default
    # per-call progress bar would flood the cell output.
    features = base_model.predict(img_array, verbose=0)
    return features.flatten()

# Size of the subset used for the experiment (lower it for a quick trial run)
sample_size = len(selected_data)  # currently the whole balanced set
breed_data_sample = selected_data.sample(n=sample_size, random_state=42)

# Feature extraction: one vector per image, stacked row-wise
image_paths_sample = [f"{image_folder}{img}.jpg" for img in breed_data_sample['id']]
features_sample = np.array([path_to_features(img_path) for img_path in image_paths_sample])

# K-means clustering
num_clusters = 20  # number of clusters; one representative breed is picked per cluster
# n_init is pinned explicitly: its default differs between scikit-learn
# releases (10 in older versions, 'auto' in newer ones), so relying on the
# default makes the clustering depend on the installed version even with a
# fixed random_state.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
breed_data_sample['cluster'] = kmeans.fit_predict(features_sample)


클러스터별 대표종 선택

In [9]:
# Frequency-based choice: for every cluster, take the breed that has the most
# images assigned to it.
breed_counts = breed_data_sample.groupby('cluster')['breed'].value_counts()
representative_breeds_sample = (
    breed_counts.groupby(level='cluster')
    .idxmax()  # (cluster, breed) index tuple of the largest count in each cluster
    .apply(lambda cluster_breed: cluster_breed[1])  # keep only the breed name
    .rename('breed')  # the values are breed names, not counts
)

# Print the representative breeds; the number in the title is derived from
# the result instead of being hard-coded.
print(f"Representative {len(representative_breeds_sample)} Breeds")
print(representative_breeds_sample)

Representative 20 Breeds
cluster
0                  airedale
1                     dhole
2              bull_mastiff
3                   redbone
4             silky_terrier
5     flat-coated_retriever
6                      chow
7         shetland_sheepdog
8               toy_terrier
9               entlebucher
10                   borzoi
11                 malamute
12      irish_water_spaniel
13         sealyham_terrier
14         english_foxhound
15         brittany_spaniel
16                 komondor
17           sussex_spaniel
18       kerry_blue_terrier
19         japanese_spaniel
Name: count, dtype: object
