<a href="https://colab.research.google.com/github/2025-02-FML-team/WV-Team/blob/main/notebooks/05_class_balance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import shutil
from pathlib import Path

# Detect whether the notebook is running on Google Colab: the `google.colab`
# package can only be imported there.
try:
    import google.colab
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    # On Colab the dataset is a zip archive stored on Google Drive. Mount the
    # drive, copy the archive next to the runtime, then unpack it into DATA_DIR.
    from google.colab import drive
    drive.mount('/content/drive')
    DATA_DIR = Path('/content/unpacked/')
    PACK_DIR = Path('/content/drive/My Drive/colab_drive/prepacked.zip')
    shutil.copy(PACK_DIR, '/content/')
    # -o: overwrite existing files without prompting, -q: quiet,
    # -d: extraction directory (interpolated from the Python variable).
    !unzip -o -q /content/prepacked.zip -d {DATA_DIR}
else:
    # Outside Colab, use the `data` directory one level above the current
    # working directory, resolved to an absolute path.
    DATA_DIR= Path(os.path.join(os.getcwd(), "../data/")).resolve()
# Last expression of the cell: display the resolved data directory.
DATA_DIR

Mounted at /content/drive


PosixPath('/content/unpacked')

In [2]:
import re
import glob
import pandas as pd
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Location of the relabelled CSV inside DATA_DIR; adjust to match your own layout.
CSV_PATH = DATA_DIR / 'whiskies_relabel.csv'
# Target size handed to PIL's Image.resize when loading images, i.e. (width, height).
IMAGE_SIZE = (256, 256)
# Single seed shared by every split and by the framework-level RNGs below.
RANDOM_STATE = 42

# Seed Python's `random`, NumPy and TensorFlow in one call. Seeding only
# TensorFlow (tf.random.set_seed) leaves the NumPy / Python generators
# unseeded, so anything that draws from them would differ between runs.
keras.utils.set_random_seed(RANDOM_STATE)

In [3]:
# Read the label table. Ids are read as strings, stripped, and any trailing
# ".0" (left over from a float round-trip) is removed; category labels are
# stripped of surrounding whitespace.
df = pd.read_csv(CSV_PATH, dtype={"id": str})
df["id"] = df["id"].astype(str).str.strip().str.replace(r"\.0$", "", regex=True)
df["category"] = df["category"].astype(str).str.strip()
paths = [DATA_DIR / p for p in df["local_full_path"]]

bar = tqdm(paths, desc="Processing Images", unit="img")

# Load images straight into one preallocated uint8 array. Collecting the
# images in a Python list and calling np.stack afterwards keeps two full
# copies of the dataset in memory (the list stays alive in the kernel).
# PIL sizes are (width, height) while the array layout is (height, width, channel).
img_width, img_height = IMAGE_SIZE
X = np.empty((len(paths), img_height, img_width, 3), dtype=np.uint8)
for i, p in enumerate(bar):
    with Image.open(p) as im:
        # Force 3 channels, then resize to the common target size.
        X[i] = np.asarray(im.convert("RGB").resize(IMAGE_SIZE), dtype=np.uint8)

Processing Images:   0%|          | 0/2943 [00:00<?, ?img/s]

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Encode the string categories as integer class ids.
le = LabelEncoder()
labels = df["category"].values
y_int = le.fit_transform(labels)

# Hold out 20% of all samples as the test set (stratified by class).
X_rest, X_test, y_rest, y_test = train_test_split(
    X, y_int, test_size=0.2, random_state=RANDOM_STATE, stratify=y_int
)

# Split the remainder 80/20 into train / validation (stratified again).
X_train, X_valid, y_train, y_valid = train_test_split(
    X_rest, y_rest, test_size=0.2, random_state=RANDOM_STATE, stratify=y_rest
)

# Report the shape of every image split.
for _tag, _arr in (("X_train:", X_train), ("X_valid:", X_valid), ("X_test :", X_test)):
    print(_tag, _arr.shape)

# Report the per-class sample counts of every label split.
for _tag, _arr in (("y_train 분포:", y_train), ("y_valid 분포:", y_valid), ("y_test  분포:", y_test)):
    print(_tag, np.bincount(_arr))

# Class name -> integer id, in the encoder's order.
print("class mapping:", {cls: idx for idx, cls in enumerate(le.classes_)})

X_train: (1883, 256, 256, 3)
X_valid: (471, 256, 256, 3)
X_test : (589, 256, 256, 3)
y_train 분포: [316 148  97 468  49 147 155 503]
y_valid 분포: [ 79  37  24 117  12  37  39 126]
y_test  분포: [ 99  46  30 146  15  46  49 158]
class mapping: {'Blended': 0, 'Bourbon': 1, 'Brandy': 2, 'Other': 3, 'Rye': 4, 'SM_40_43': 5, 'SM_43_46': 6, 'SM_G46': 7}


In [7]:
from sklearn.utils.class_weight import compute_class_weight

# "balanced" weights are inversely proportional to the class frequencies in
# y_train, so rare classes contribute more to the loss.
# The class ids come from the fitted LabelEncoder instead of a hard-coded
# [0..7] list, so this cell keeps working if the label set changes.
class_ids = np.arange(len(le.classes_))
class_weights_array = compute_class_weight(
    class_weight="balanced",
    classes=class_ids,
    y=y_train,
)

# Keras' `class_weight` argument takes {class index: weight}. Cast to plain
# Python int/float so the dict prints cleanly and holds no NumPy scalars.
class_weight_dict = {
    int(class_id): float(weight)
    for class_id, weight in zip(class_ids, class_weights_array)
}

print(class_weight_dict)
# e.g. {0: 0.8, 1: 1.2, 2: 3.4, ...}

{0: np.float64(0.7448575949367089), 1: np.float64(1.5903716216216217), 2: np.float64(2.426546391752577), 3: np.float64(0.5029380341880342), 4: np.float64(4.803571428571429), 5: np.float64(1.6011904761904763), 6: np.float64(1.5185483870967742), 7: np.float64(0.4679423459244533)}


# 이전의 교훈
1. model의 dense layer activation으로 gelu 이용
2. batch normalization 적용

# 바꿔야할 것
1. 불균형 해소
1) other class 분해
결과 : 여러개의 작은 subclass가 생겨남 노이즈가 줄었을 것이라고 추측
2) 부족했던 rye, tequila(라이, 테킬라) 클래스의 샘플을 각각 50개씩 추가(증강)
결과 : 일단 소수 클래스는 균등해짐 130개 가량...
3) class별 weight 부과
결과 : -- 다시 실험 해봐야함 

2. layer 탐색
1) dense layer
2) conv layer
3) input layer
- 아마도 input의 경우는 세로가 긴게 연산수를 크게 늘리지 않고서도 좋은 방법이라 사료됨...

In [5]:
#03 노트북 코드랑 동일함
from sklearn.metrics import f1_score
from tensorflow.keras.callbacks import Callback

class F1ScoreCallback(Callback):
    """Track validation macro-F1 after every epoch and stop training on repeated declines.

    At the end of each epoch the model predicts on the held validation data,
    macro-F1 is computed with scikit-learn, appended to ``f1_scores`` and
    written into the epoch ``logs`` under ``'val_macro_f1'``.

    Stopping rule: for epochs with index greater than both 1 and
    ``start_from_epoch``, an epoch whose F1 is lower than the previous
    epoch's F1 counts as one decline. Once ``patient`` declines have been
    counted, ``self.model.stop_training`` is set to True.

    NOTE(review): the decline counter ``out`` is never reset, so declines are
    cumulative over the run rather than consecutive, and each epoch is
    compared with the previous epoch, not with the best score so far.

    Args:
        X_val: Validation inputs passed to ``self.model.predict``.
        y_val: Validation labels. A 2-D array is reduced with argmax over
            axis 1; any other array is used as-is as class ids.
        start_from_epoch: Declines are only counted for epoch indices
            strictly greater than this value.
        patient: Number of counted declines that triggers the stop.

    Attributes:
        f1_scores: Macro-F1 of every finished epoch, in order.
        best_f1: Highest macro-F1 seen so far (-1 before the first epoch ends).
        out: Number of declines counted so far.
    """

    def __init__(self, X_val, y_val, start_from_epoch=12, patient=3):
        super().__init__()
        self.X_val = X_val
        self.y_val = y_val
        self.f1_scores = []  # macro-F1 accumulated per epoch
        self.start_from_epoch = start_from_epoch
        self.patient = patient
        self.out = 0
        self.best_f1 = -1

    # Originally only the F1-logging part existed. Because EarlyStopping did
    # not behave as desired, this callback computes and monitors F1 itself
    # and halts training, in a way similar to EarlyStopping, by setting
    # self.model.stop_training = True as described in the Callback spec.
    def on_epoch_end(self, epoch, logs=None):
        """Compute validation macro-F1 for this epoch and apply the stopping rule."""
        y_pred = self.model.predict(self.X_val, verbose=0)
        y_pred = np.argmax(y_pred, axis=1)

        # Accept both one-hot (2-D) and integer (1-D) labels.
        if self.y_val.ndim == 2:
            y_true = np.argmax(self.y_val, axis=1)
        else:
            y_true = self.y_val

        f1 = f1_score(y_true, y_pred, average='macro')
        self.f1_scores.append(f1)
        # `logs` defaults to None in the signature; only publish the metric
        # when a dict was actually supplied instead of raising TypeError.
        if logs is not None:
            logs['val_macro_f1'] = f1

        if f1 > self.best_f1:
            self.best_f1 = f1

        # `epoch > 1` guarantees that f1_scores[-2] exists.
        counting_active = epoch > 1 and epoch > self.start_from_epoch
        if counting_active and f1 < self.f1_scores[-2]:
            print(f"\nNon Improvement detected at EP : {epoch}, f1 : {f1}")
            self.out += 1

        if self.out >= self.patient:
            print(f"\nStopping at EP : {epoch}, f1 : {f1}")
            self.model.stop_training = True

In [8]:
from tensorflow.keras import layers, models
from tensorflow.keras.activations import gelu

def build_model(
    hidden=(300, 300),
    conv=(32, 64, 128),
    conv_double=False,
    input_dim=(256, 256),
    name="",
    num_classes=8,
):
    """Build and compile a small CNN classifier.

    Architecture: for every entry in ``conv`` a 3x3 same-padded ReLU Conv2D
    (applied twice when ``conv_double`` is True) followed by 2x2 max pooling;
    then Flatten; then for every entry in ``hidden`` a Dense layer followed by
    BatchNormalization and a GELU activation; finally a softmax Dense layer.

    NOTE(review): no rescaling layer is applied, so pixel values reach the
    first convolution in whatever range the caller supplies (the loading cell
    of this notebook produces uint8 arrays) - confirm this is intended.

    Args:
        hidden: Iterable of unit counts, one Dense block per entry.
        conv: Iterable of filter counts, one convolution block per entry.
        conv_double: If True, each convolution block uses two Conv2D layers.
        input_dim: First two dimensions of the input; the model input shape
            is ``(input_dim[0], input_dim[1], 3)``.
        name: Ignored. Accepted only so a config dict that carries a
            ``"name"`` key can be unpacked with ``build_model(**cfg)``.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A ``keras.Model`` compiled with Adam (learning rate 1e-3), sparse
        categorical cross-entropy loss and the accuracy metric.
    """
    inputs = keras.Input(shape=(input_dim[0], input_dim[1], 3))

    x = inputs
    for cl in conv:
        x = layers.Conv2D(cl, (3,3), activation='relu', padding='same')(x)
        if conv_double:
            x = layers.Conv2D(cl, (3,3), activation='relu', padding='same')(x)
        x = layers.MaxPooling2D((2,2))(x)

    x = layers.Flatten()(x)

    for hl in hidden:
        # Dense -> BatchNorm -> GELU: the activation is applied after normalisation.
        x = layers.Dense(hl)(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('gelu')(x)

    outputs = layers.Dense(num_classes, activation='softmax')(x)
    model = keras.Model(inputs, outputs)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        # Sparse loss: labels are expected as integer class ids, not one-hot.
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

In [10]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Experiment grid. Each entry is a dict of keyword arguments; the "name" key
# labels the run and the remaining keys select the architecture variant.

# Variants of the dense (hidden) layer stack.
hidden_layer_configs = [
    dict(name="hl300x2_100x2_50x2", hidden=[300, 300, 100, 100, 50, 50]),
    dict(name="hl100x4", hidden=[100, 100, 100, 100]),
    dict(name="hl300x3", hidden=[300, 300, 300]),
    dict(name="hl300x2", hidden=[300, 300]),
]

# Variants of the convolution stack (single or doubled Conv2D per block).
conv_layer_configs = [
    dict(name="cl16_32_64", conv=[16, 32, 64]),
    dict(name="cl48_96_192", conv=[48, 96, 192]),
    dict(name="cld16_32_64", conv=[16, 32, 64], conv_double=True),
    dict(name="cld48_96_192", conv=[48, 96, 192], conv_double=True),
]

# Variants of the input resolution.
input_layer_configs = [
    dict(name="id320x192", input_dim=(320, 192)),
    dict(name="id288x216", input_dim=(288, 216)),
]

configs = [*hidden_layer_configs, *conv_layer_configs, *input_layer_configs]

# One summary row per evaluated config is appended here.
cv_results = []

In [None]:
import gc
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, precision_score

import inspect

# ----- Settings -----
N_SPLITS   = 5      # number of CV folds
EPOCHS     = 30     # maximum number of epochs per fold
BATCH_SIZE = 64
CONFIG_INDEX = 0    # which entry of `configs` this run evaluates

skf = StratifiedKFold(
    n_splits=N_SPLITS,
    shuffle=True,
    random_state=RANDOM_STATE,
)

cfg = configs[CONFIG_INDEX]
name = cfg["name"]
print(f"\n===== K-Fold CV for config: {name} =====")

# Fail fast when the config's input size does not match the loaded images.
# build_model sizes its Input layer from `input_dim`, so a mismatch would
# otherwise only surface as a Keras shape error once fitting starts.
expected_dim = tuple(
    cfg.get("input_dim", inspect.signature(build_model).parameters["input_dim"].default)
)
actual_dim = tuple(X_rest.shape[1:3])
if actual_dim != expected_dim:
    raise ValueError(
        f"Config '{name}' expects input_dim={expected_dim}, but X_rest holds "
        f"images of size {actual_dim}. Reload or resize the images to match "
        "before running this config."
    )

fold_accuracies = []
fold_precisions = []
fold_f1s        = []
fold_last_f1s   = []

# K-fold loop over the non-test data.
for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_rest, y_rest), start=1):
    print(f"\n[{name}] Fold {fold_idx}/{N_SPLITS}")

    # Drop the previous fold's model/callback/history and reset Keras' global
    # state before building a new model, so memory does not pile up across
    # folds. The last fold's objects stay available after the loop.
    model = history = f1_cb = None
    keras.backend.clear_session()
    gc.collect()

    X_tr, X_val = X_rest[train_idx], X_rest[val_idx]
    y_tr, y_val = y_rest[train_idx], y_rest[val_idx]

    # Build the model (build_model also compiles it).
    model = build_model(**cfg)

    # Macro-F1 tracking plus F1-based early stopping.
    f1_cb = F1ScoreCallback(X_val, y_val)

    history = model.fit(
        X_tr, y_tr,
        validation_data=(X_val, y_val),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=[f1_cb],
        class_weight=class_weight_dict,  # per-class loss weights
        verbose=1,   # set to 0 to silence the per-epoch log
    )

    # ---- Metrics for this fold ----
    # 1) loss / accuracy of the model as it stands after the final epoch
    loss, acc = model.evaluate(X_val, y_val, verbose=0)

    # 2) predictions for macro precision
    y_prob = model.predict(X_val, verbose=0)
    y_pred = np.argmax(y_prob, axis=1)

    y_true = y_val

    precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
    # `f1` is the best macro-F1 over all epochs of this fold, whereas
    # loss/acc/precision above come from the final-epoch weights;
    # `last_f1` is the final-epoch macro-F1 for a like-for-like comparison.
    f1        = f1_cb.best_f1
    last_f1   = f1_cb.f1_scores[-1]

    fold_accuracies.append(acc)
    fold_precisions.append(precision)
    fold_f1s.append(f1)
    fold_last_f1s.append(last_f1)

    print(f"[{name}] Fold {fold_idx}: "
          f"loss={loss:.4f}, acc={acc:.4f}, "
          f"prec_macro={precision:.4f}, f1_macro={f1:.4f}")

# ----- Mean / standard deviation across folds for this config -----
cfg_row = {
    "name": name,
    "acc_mean":  float(np.mean(fold_accuracies)),
    "acc_std":   float(np.std(fold_accuracies)),
    "prec_macro_mean": float(np.mean(fold_precisions)),
    "prec_macro_std":  float(np.std(fold_precisions)),
    "best_f1_macro_mean":   float(np.mean(fold_f1s)),
    "best_f1_macro_std":    float(np.std(fold_f1s)),
    "last_f1_macro_mean":   float(np.mean(fold_last_f1s)),
    "last_f1_macro_std":    float(np.std(fold_last_f1s)),
}

print(f"\n>>> [CV Summary] {name}: "
      f"f1_macro={cfg_row['best_f1_macro_mean']:.4f} ± {cfg_row['best_f1_macro_std']:.4f}, "
      f"last_f1_macro={cfg_row['last_f1_macro_mean']:.4f} ± {cfg_row['last_f1_macro_std']:.4f}, "
      f"acc={cfg_row['acc_mean']:.4f} ± {cfg_row['acc_std']:.4f}, ")

cv_results.append(cfg_row)


===== K-Fold CV for config: 300x2_100x2_50x2 =====

[300x2_100x2_50x2] Fold 1/5
Epoch 1/30
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 756ms/step - accuracy: 0.2469 - loss: 2.1059 - val_accuracy: 0.0870 - val_loss: 6.6902 - val_macro_f1: 0.0292
Epoch 2/30
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 146ms/step - accuracy: 0.3456 - loss: 1.7595 - val_accuracy: 0.0913 - val_loss: 3.0588 - val_macro_f1: 0.0346
Epoch 3/30
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 144ms/step - accuracy: 0.4290 - loss: 1.5149 - val_accuracy: 0.2293 - val_loss: 2.1687 - val_macro_f1: 0.0985
Epoch 4/30
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 151ms/step - accuracy: 0.5200 - loss: 1.2642 - val_accuracy: 0.0870 - val_loss: 2.0681 - val_macro_f1: 0.0477
Epoch 5/30
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 147ms/step - accuracy: 0.6251 - loss: 1.0118 - val_accuracy: 0.0913 - val_loss: 2.2582 - val_macro_f1: 