<a href="https://colab.research.google.com/github/2025-02-FML-team/WV-Team/blob/main/notebooks/05_class_balance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import os
import shutil
from pathlib import Path

try:
    import google.colab
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    DATA_DIR = Path('/content/unpacked/')
    PACK_DIR = Path('/content/drive/My Drive/colab_drive/prepacked.zip')
    shutil.copy(PACK_DIR, '/content/')
    !unzip -o -q /content/prepacked.zip -d {DATA_DIR}
else:
    DATA_DIR= Path(os.path.join(os.getcwd(), "../data/")).resolve()
DATA_DIR

PosixPath('/Volumes/Backup/Workspace/ML/WV-Team/data')

In [9]:
import pandas as pd
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Location of the label CSV; adjust to match your own environment.
CSV_PATH = DATA_DIR / 'whiskies_relabel.csv'
# Target size every image is resized to when it is loaded.
IMAGE_SIZE = (256, 256)
# Single seed shared by the data splits and by model training.
RANDOM_STATE = 42

# Seed Python's `random`, NumPy and TensorFlow in one call. Seeding only
# TensorFlow (tf.random.set_seed) leaves the other two generators unseeded,
# so runs would not be repeatable wherever those generators are consulted.
keras.utils.set_random_seed(RANDOM_STATE)

In [10]:
# Read the label table; ids are kept as strings and stripped of whitespace
# and of a trailing ".0" suffix.
df = pd.read_csv(CSV_PATH, dtype={"id": str})
df["id"] = (
    df["id"]
    .astype(str)
    .str.strip()
    .str.replace(r"\.0$", "", regex=True)
)
df["category"] = df["category"].astype(str).str.strip()
paths = [DATA_DIR / rel_path for rel_path in df["local_full_path"]]

bar = tqdm(paths, desc="Processing Images", unit="img")

# Decode every image as RGB, resize it to IMAGE_SIZE and collect it as uint8.
X_list = []
for image_path in bar:
    with Image.open(image_path) as raw_image:
        rgb_image = raw_image.convert("RGB").resize(IMAGE_SIZE)
        X_list.append(np.asarray(rgb_image, dtype=np.uint8))

# One array holding all images, stacked along a new leading axis.
X = np.stack(X_list, axis=0)

Processing Images:   0%|          | 0/3042 [00:00<?, ?img/s]

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Encode the category strings as integer class ids.
labels = df["category"].values
le = LabelEncoder()
y_int = le.fit_transform(labels)

# Hold out 15% of the data as the test set, stratified by class.
X_rest, X_test, y_rest, y_test = train_test_split(
    X, y_int, test_size=0.15, stratify=y_int, random_state=RANDOM_STATE
)

# Split the remainder 80/20 into train and validation, again stratified.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_rest, y_rest, test_size=0.2, stratify=y_rest, random_state=RANDOM_STATE
)

# Report the shape of each image split.
for split_tag, split_X in (
    ("X_train:", X_train),
    ("X_valid:", X_valid),
    ("X_test :", X_test),
):
    print(split_tag, split_X.shape)

# Report the per-class sample counts of each label split.
for split_tag, split_y in (
    ("y_train 분포:", y_train),
    ("y_valid 분포:", y_valid),
    ("y_test  분포:", y_test),
):
    print(split_tag, np.bincount(split_y))

CLASS_NUM = len(le.classes_)
print("class mapping:", {cls_name: cls_idx for cls_idx, cls_name in enumerate(le.classes_)})


X_train: (2068, 256, 256, 3)
X_valid: (517, 256, 256, 3)
X_test : (457, 256, 256, 3)
y_train 분포: [352 177 104  88  98  93  86 158 174 544  91 103]
y_valid 분포: [ 88  44  26  22  24  23  22  39  44 136  23  26]
y_test  분포: [ 78  39  23  20  21  21  19  35  38 120  20  23]
class mapping: {'Blended': 0, 'Bourbon': 1, 'Brandy': 2, 'Gin': 3, 'Liqueur': 4, 'Rum': 5, 'Rye': 6, 'SM_40_43': 7, 'SM_43_46': 8, 'SM_G46': 9, 'Tequila': 10, 'Vodka': 11}


In [12]:
from sklearn.utils.class_weight import compute_class_weight

# "balanced" weights are n_samples / (n_classes * count(class)), so the rarer a
# class is in y_train, the larger its weight.
class_weights_array = compute_class_weight(
    class_weight="balanced",
    classes=np.arange(CLASS_NUM),
    y=y_train,
)

# Map each class index to its weight, stored as a plain Python float.
class_weight_dict = {
    class_idx: float(weight)
    for class_idx, weight in enumerate(class_weights_array)
}

print(class_weight_dict)
# e.g. {0: 0.8, 1: 1.2, 2: 3.4, ...}

{0: 0.4895833333333333, 1: 0.9736346516007532, 2: 1.6570512820512822, 3: 1.9583333333333333, 4: 1.7585034013605443, 5: 1.853046594982079, 6: 2.003875968992248, 7: 1.090717299578059, 8: 0.9904214559386973, 9: 0.3167892156862745, 10: 1.8937728937728937, 11: 1.6731391585760518}


# 이전의 교훈
1. model의 dense layer activation으로 gelu 이용
2. batch normalization 적용

# 바꿔야할 것
1. 불균형 해소
1) other class 분해\
결과 : 여러개의 작은 subclass가 생겨남. 노이즈가 줄었을 것이라고 추측
2) 부족했던 rye, tequila(라이, 테킬라) 클래스의 샘플을 각각 50개씩 추가(증강)\
결과 : 일단 소수 클래스는 균등해짐 130개 가량...
3) class별 weight 부과\
결과 : metric 차이가 있는지는 잘 관찰이 안됨.

2. layer 탐색
- Dense Layer\
일단 f1 score의 경우 경향성에 있어 차이는 크게 없었지만, score 자체는 0.05+정도 올라온 느낌
다른 accuracy나 precision등에 있어서는 경향성의 차이도 생겼는데, 데이터 품질이 개선되어서 그런 것인지는 정확히 모르겠음. 둘 다 균형있게 보는 f1 score를 중심으로 보아 여전히 hl300x2가 좋은 것으로 보임. 다만 같은 2층짜리 구조에서 실험하거나 더 낮은 1층 구조에서 실험하는 것도 가능성이 있어보임.

- Conv layer
- Input layer
- 아마도 input의 경우는 세로가 긴게 연산수를 크게 늘리지 않고서도 좋은 방법이라 사료됨...

In [16]:
#03 노트북 코드랑 동일함
from sklearn.metrics import f1_score
from tensorflow.keras.callbacks import Callback

class F1ScoreCallback(Callback):
    """Track validation macro-F1 after every epoch and stop training on repeated drops.

    After each epoch the model predicts on ``X_val``; the macro-averaged F1
    against ``y_val`` is appended to ``f1_scores``, written to the Keras logs
    as ``val_macro_f1`` and used to update ``best_f1``.

    Stopping rule: once ``epoch`` is greater than both 1 and
    ``start_from_epoch``, every epoch whose F1 is lower than the *previous*
    epoch's F1 increments ``out``. Training is stopped when ``out`` reaches
    ``patient``.

    NOTE(review): ``out`` is never reset when the score recovers and the
    comparison is against the previous epoch, not the best one, so
    ``patient`` counts total drops rather than consecutive epochs without
    improvement. Confirm this is the intended behaviour.

    Args:
        X_val: Validation inputs passed to ``model.predict``.
        y_val: Validation labels, either integer class ids (1-D) or one-hot
            rows (2-D).
        start_from_epoch: Drops are only counted for epochs after this index.
        patient: Number of counted drops that triggers the stop.
    """

    def __init__(self, X_val, y_val, start_from_epoch=12, patient=3):
        super().__init__()
        self.X_val = X_val
        self.y_val = y_val
        self.f1_scores = []  # macro-F1 of every finished epoch, in order
        self.start_from_epoch = start_from_epoch
        self.patient = patient
        self.out = 0  # number of epoch-over-epoch drops counted so far
        self.best_f1 = -1

    # The stock EarlyStopping callback did not behave as wanted with this
    # metric, so the F1 score is computed here and training is halted through
    # the documented `self.model.stop_training = True` switch instead.
    def on_epoch_end(self, epoch, logs=None):
        """Compute macro-F1 for this epoch, log it, and apply the stopping rule."""
        y_pred = self.model.predict(self.X_val, verbose=0)
        y_pred = np.argmax(y_pred, axis=1)

        # Accept plain sequences as well as arrays; collapse one-hot labels.
        y_val = np.asarray(self.y_val)
        if y_val.ndim == 2:
            y_true = np.argmax(y_val, axis=1)
        else:
            y_true = y_val

        f1 = f1_score(y_true, y_pred, average='macro')
        self.f1_scores.append(f1)
        # `logs` defaults to None in the signature, so only write when a dict was given.
        if logs is not None:
            logs['val_macro_f1'] = f1

        if f1 > self.best_f1:
            self.best_f1 = f1

        # A previous score may be missing if training did not start at epoch 0.
        has_previous = len(self.f1_scores) >= 2
        past_warmup = epoch > 1 and epoch > self.start_from_epoch
        if past_warmup and has_previous and f1 < self.f1_scores[-2]:
            print(f"\nNon Improvement detected at EP : {epoch}, f1 : {f1}")
            self.out += 1

        if self.out >= self.patient:
            print(f"\nStopping at EP : {epoch}, f1 : {f1}")
            self.model.stop_training = True

In [17]:
from tensorflow.keras import layers, models
from tensorflow.keras.activations import gelu

# `name` exists only so a whole config dict can be passed via **kwargs; it is not used.
def build_model(
    hidden=(300, 300),
    conv=(32, 64, 128),
    conv_double=False,
    input_dim=IMAGE_SIZE,
    name=""
):
    """Build and compile a CNN classifier with a softmax over CLASS_NUM classes.

    Architecture: for every entry in ``conv`` one (or two, when
    ``conv_double``) 3x3 same-padded ReLU convolutions followed by 2x2 max
    pooling; then Flatten; then for every entry in ``hidden`` a
    Dense -> BatchNormalization -> GELU stack; finally the softmax layer.
    The model is compiled with Adam (learning rate 3e-4), sparse categorical
    cross-entropy and an accuracy metric.

    Args:
        hidden: Sizes of the dense layers, in order.
        conv: Filter counts of the convolution stages, in order.
        conv_double: If True, each stage uses two convolutions instead of one.
        input_dim: First two dimensions of the input image; three channels
            are always appended.
        name: Ignored.

    Returns:
        The compiled ``keras.Model``.
    """
    # NOTE(review): the image array is resized to IMAGE_SIZE when it is
    # loaded, so passing a different `input_dim` also requires feeding images
    # of that size - confirm the data is re-loaded/resized accordingly.
    inputs = keras.Input(shape=(input_dim[0], input_dim[1], 3))

    x = inputs
    for filters in conv:
        x = layers.Conv2D(filters, (3, 3), activation='relu', padding='same')(x)
        if conv_double:
            x = layers.Conv2D(filters, (3, 3), activation='relu', padding='same')(x)
        x = layers.MaxPooling2D((2, 2))(x)

    x = layers.Flatten()(x)

    # The activation is applied after batch normalization, hence the separate layers.
    for units in hidden:
        x = layers.Dense(units)(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('gelu')(x)

    outputs = layers.Dense(CLASS_NUM, activation='softmax')(x)
    model = keras.Model(inputs, outputs)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=3e-4),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

In [18]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Experiment grid: each dict is splatted into build_model(**cfg), so every key
# must be a build_model parameter; any parameter left out keeps its default.
configs = [
    # Dense (hidden) layer variations
    {"name": "hl300x2_100x2_50x2", "hidden": [300, 300, 100, 100, 50, 50]},
    {"name": "hl100x4", "hidden": [100, 100, 100, 100]},
    {"name": "hl300x3", "hidden": [300, 300, 300]},
    {"name": "hl300x2", "hidden": [300, 300]},
    # Added later: shallower dense stacks appeared to work better
    {"name": "hl400x2", "hidden": [400, 400]},
    {"name": "hl200x2", "hidden": [200, 200]},
    {"name": "hl400", "hidden": [400]},
    {"name": "hl300", "hidden": [300]},
    {"name": "hl200", "hidden": [200]},

    # Convolution layer variations ("cld" = two convolutions per stage)
    {"name": "cl16_32_64", "conv": [16, 32, 64]},
    {"name": "cl48_96_192", "conv": [48, 96, 192]},
    {"name": "cld16_32_64", "conv": [16, 32, 64], "conv_double": True},
    {"name": "cld48_96_192", "conv": [48, 96, 192], "conv_double": True},

    # Input size variations
    {"name": "id320x192", "input_dim": (320, 192)},
    {"name": "id288x216", "input_dim": (288, 216)},
]

# Accumulates one summary row per evaluated config. Re-running this cell
# empties the list.
cv_results = []

In [None]:
import gc
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, precision_score

# ----- Settings -----
N_SPLITS   = 5      # number of cross-validation folds
EPOCHS     = 50     # upper bound on epochs per fold
BATCH_SIZE = 32

skf = StratifiedKFold(
    n_splits=N_SPLITS,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# Only the configs from index 4 onward are evaluated by this cell.
# NOTE(review): this slice includes the configs that override `input_dim`,
# while X_rest holds images already resized to IMAGE_SIZE - confirm those
# configs receive data of the matching size.
test_configs = configs[4:]

for cfg in test_configs:
    name = cfg["name"]
    print(f"\n===== K-Fold CV for config: {name} =====")

    fold_accuracies = []
    fold_precisions = []
    fold_f1s        = []
    fold_last_f1s   = []

    # Cross-validate on the non-test portion of the data only.
    for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_rest, y_rest), start=1):
        print(f"\n[{name}] Fold {fold_idx}/{N_SPLITS}")

        X_tr, X_val = X_rest[train_idx], X_rest[val_idx]
        y_tr, y_val = y_rest[train_idx], y_rest[val_idx]

        # Fresh model per fold (build_model also compiles it).
        model = build_model(**cfg)

        # Macro-F1 tracking plus F1-based early stopping.
        f1_cb = F1ScoreCallback(
            X_val, y_val,
            start_from_epoch=15,
            patient=5
        )

        history = model.fit(
            X_tr, y_tr,
            validation_data=(X_val, y_val),
            epochs=EPOCHS,
            batch_size=BATCH_SIZE,
            callbacks=[f1_cb],
            class_weight=class_weight_dict,  # per-class loss weights to counter class imbalance
            verbose=1,   # per-epoch progress output; set to 0 to silence
        )

        # ---- Metrics for this fold ----
        # 1) loss / accuracy of the model as it stands after the last epoch
        loss, acc = model.evaluate(X_val, y_val, verbose=0)

        # 2) macro precision from the final model's predictions
        y_prob = model.predict(X_val, verbose=0)
        y_pred = np.argmax(y_prob, axis=1)

        y_true = y_val

        precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
        # `f1` is the best epoch's score while loss/acc/precision come from the
        # last epoch; `last_f1` is recorded so both views are available.
        f1        = f1_cb.best_f1
        last_f1   = f1_cb.f1_scores[-1]

        fold_accuracies.append(acc)
        fold_precisions.append(precision)
        fold_f1s.append(f1)
        fold_last_f1s.append(last_f1)

        print(f"[{name}] Fold {fold_idx}: "
              f"loss={loss:.4f}, acc={acc:.4f}, "
              f"prec_macro={precision:.4f}, f1_macro={f1:.4f}")

        # Release the finished model before the next fold is built. The callback
        # and the History returned by fit() also reference the model, so all
        # three names are cleared; clear_session() then resets the global state
        # Keras accumulates when many models are created in a loop.
        model = history = f1_cb = None
        keras.backend.clear_session()
        gc.collect()

    # ----- Mean / standard deviation across folds for this config -----
    cfg_row = {
        "name": name,
        "acc_mean":  float(np.mean(fold_accuracies)),
        "acc_std":   float(np.std(fold_accuracies)),
        "prec_macro_mean": float(np.mean(fold_precisions)),
        "prec_macro_std":  float(np.std(fold_precisions)),
        "best_f1_macro_mean":   float(np.mean(fold_f1s)),
        "best_f1_macro_std":    float(np.std(fold_f1s)),
        "last_f1_macro_mean":   float(np.mean(fold_last_f1s)),
        "last_f1_macro_std":    float(np.std(fold_last_f1s)),
    }

    print(f"\n>>> [CV Summary] {name}: "
          f"f1_macro={cfg_row['best_f1_macro_mean']:.4f} ± {cfg_row['best_f1_macro_std']:.4f}, "
          f"last_f1_macro={cfg_row['last_f1_macro_mean']:.4f} ± {cfg_row['last_f1_macro_std']:.4f}, "
          f"acc={cfg_row['acc_mean']:.4f} ± {cfg_row['acc_std']:.4f}, ")

    cv_results.append(cfg_row)
    print(cv_results)


===== K-Fold CV for config: hl400x2 =====

[hl400x2] Fold 1/5


2025-11-23 13:54:49.267494: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3
2025-11-23 13:54:49.269319: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2025-11-23 13:54:49.269323: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2025-11-23 13:54:49.271593: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-11-23 13:54:49.272569: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/50


2025-11-23 13:54:51.081337: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 280ms/step - accuracy: 0.2679 - loss: 2.0913 - val_accuracy: 0.2573 - val_loss: 4.0172 - val_macro_f1: 0.0416
Epoch 2/50
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 259ms/step - accuracy: 0.5846 - loss: 1.1295 - val_accuracy: 0.2108 - val_loss: 3.3981 - val_macro_f1: 0.1122
Epoch 3/50
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 268ms/step - accuracy: 0.8366 - loss: 0.4914 - val_accuracy: 0.1064 - val_loss: 3.5468 - val_macro_f1: 0.0536
Epoch 4/50
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 256ms/step - accuracy: 0.9328 - loss: 0.2435 - val_accuracy: 0.2379 - val_loss: 2.6385 - val_macro_f1: 0.1516
Epoch 5/50
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 250ms/step - accuracy: 0.9729 - loss: 0.1296 - val_accuracy: 0.2108 - val_loss: 3.2318 - val_macro_f1: 0.1431
Epoch 6/50
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 253ms

In [7]:
# Rank the evaluated configurations by mean best macro-F1, highest first.
cv_df = pd.DataFrame(cv_results)
cv_df = cv_df.sort_values(by="best_f1_macro_mean", ascending=False)

print("\nK-Fold 결과 best_f1_macro_mean DESC")
print(cv_df.to_string(index=False))

# Save the ranked table in the data directory. to_csv keeps its default
# index=True here, so the row index is written as the first column.
kfold_result_csv_path = DATA_DIR / "kfold_result_class.csv"
cv_df.to_csv(kfold_result_csv_path)


K-Fold 결과 best_f1_macro_mean DESC
              name  acc_mean  acc_std  prec_macro_mean  prec_macro_std  best_f1_macro_mean  best_f1_macro_std  last_f1_macro_mean  last_f1_macro_std
           hl300x2  0.539265 0.014751         0.540555        0.015302            0.522504           0.013446            0.513433           0.015328
           hl300x3  0.545068 0.010901         0.554231        0.019216            0.511060           0.014264            0.498991           0.014582
        cl16_32_64  0.535397 0.010832         0.536960        0.016697            0.506638           0.011692            0.502457           0.010623
hl300x2_100x2_50x2  0.510251 0.033667         0.566498        0.045067            0.490764           0.017120            0.464768           0.031682
           hl100x4  0.517602 0.006309         0.546708        0.021414            0.488260           0.012076            0.478842           0.018194
