<a href="https://colab.research.google.com/github/2025-02-FML-team/WV-Team/blob/main/notebooks/05_class_balance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import shutil
from pathlib import Path

try:
    import google.colab
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    DATA_DIR = Path('/content/unpacked/')
    PACK_DIR = Path('/content/drive/My Drive/colab_drive/prepacked.zip')
    shutil.copy(PACK_DIR, '/content/')
    !unzip -o -q /content/prepacked.zip -d {DATA_DIR}
else:
    DATA_DIR= Path(os.path.join(os.getcwd(), "../data/")).resolve()
DATA_DIR

PosixPath('/workspace/WV-Team/data')

In [2]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Candidate input resolutions as (height, width) pairs. Each becomes one
# configuration whose name encodes its dimensions, e.g. "id384x256".
_INPUT_DIMS = [(384, 256), (336, 224), (256, 384), (224, 336)]

configs = [
    {"name": f"id{rows}x{cols}", "input_dim": (rows, cols)}
    for rows, cols in _INPUT_DIMS
]

# One summary row per evaluated configuration is appended here later on.
cv_results = []

2025-11-23 17:39:51.342090: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
import pandas as pd
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# CSV to load and clean up; adjust the path to match your own environment.
CSV_PATH = DATA_DIR / 'whiskies_relabel.csv'
# Fixed load-time image size, currently disabled.
#IMAGE_SIZE = (256, 256)
# Shared seed: also passed as `random_state` to the train/test split and to
# the stratified k-fold splitter further down.
RANDOM_STATE = 42

# Seed TensorFlow's global random generator.
# NOTE(review): only TensorFlow is seeded here; NumPy / Python RNGs are not.
tf.random.set_seed(RANDOM_STATE)

In [4]:
def _read_rgb_uint8(image_path):
    """Open one image file, convert it to RGB and return it as a uint8 array.

    The image keeps its original size; resizing happens later per config.
    """
    with Image.open(image_path) as handle:
        return np.asarray(handle.convert("RGB"), dtype=np.uint8)


# Read the metadata table, forcing `id` to string, then normalise the two
# text columns: strip whitespace everywhere and drop a trailing ".0" from ids.
df = pd.read_csv(CSV_PATH, dtype={"id": str}).assign(
    id=lambda frame: frame["id"].astype(str).str.strip().str.replace(r"\.0$", "", regex=True),
    category=lambda frame: frame["category"].astype(str).str.strip(),
)

# Image locations are stored relative to DATA_DIR.
paths = [DATA_DIR / relative for relative in df["local_full_path"]]

bar = tqdm(paths, desc="Processing Images", unit="img")

# Load every image, in CSV row order.
X_list = [_read_rgb_uint8(image_path) for image_path in bar]

Processing Images:   0%|          | 0/3042 [00:00<?, ?img/s]

In [5]:
def resize_batch(images, size):
    """Resize every image to a common size and stack them into one array.

    Args:
        images: iterable of image arrays accepted by ``Image.fromarray``.
        size: indexable whose first element is the target height and whose
            second element is the target width.

    Returns:
        A single uint8 array produced by ``np.stack`` over the resized images.
    """
    target_height, target_width = size[0], size[1]
    # PIL's resize takes (width, height), hence the swapped order.
    resized = [
        np.asarray(
            Image.fromarray(picture).resize((target_width, target_height)),
            dtype=np.uint8,
        )
        for picture in images
    ]
    return np.stack(resized)

In [6]:
# One resized copy of the whole image set per configuration, in the same
# order as `configs`.
config_batches = [resize_batch(X_list, spec["input_dim"]) for spec in configs]

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Label encoding: map category strings to integer class ids.
labels = df["category"].values
le = LabelEncoder()
y_int = le.fit_transform(labels)
CLASS_NUM = len(le.classes_)  # number of distinct categories
# One dict per config (same order as `configs`) holding its train/test split.
datas = []

for index in range(len(configs)):
    name = configs[index]["name"]
    X_adjusted = config_batches[index]
    # Split off the test set. The same seed and stratification labels are
    # used for every config; only the image array differs between iterations.
    # X_rest / X_test / y_rest / y_test stay bound at module level after the
    # loop, holding the values from the last config.
    X_rest, X_test, y_rest, y_test = train_test_split(
        X_adjusted, y_int,
        test_size=0.2,
        random_state=RANDOM_STATE,
        stratify=y_int
    )

    data = { "X_rest": X_rest, "X_test": X_test, "y_rest": y_rest, "y_test": y_test }
    datas.append(data)

In [8]:
from sklearn.utils.class_weight import compute_class_weight

def get_class_weight(y_train):
    """Build a ``{class_id: weight}`` mapping for the given training labels.

    Weights come from scikit-learn's ``compute_class_weight`` in "balanced"
    mode, evaluated for every class id in ``range(CLASS_NUM)``.

    Args:
        y_train: integer class labels of the training samples.

    Returns:
        dict mapping each class id (0 .. CLASS_NUM - 1) to its weight.
    """
    every_class_id = np.array(range(CLASS_NUM))
    balanced_weights = compute_class_weight(
        class_weight="balanced",
        classes=every_class_id,
        y=y_train,
    )
    # The position in the returned array is the class id.
    return dict(enumerate(balanced_weights))

In [9]:
#03 노트북 코드++
from sklearn.metrics import f1_score
from tensorflow.keras.callbacks import Callback

class ControllerCallback(Callback):
    """Track macro-F1 on a validation set each epoch and stop training early.

    After every epoch the model predicts on ``X_val`` and the macro-averaged
    F1 score is recorded. Once ``epoch`` is past ``start_from_epoch``, every
    epoch whose F1 is lower than the previous epoch's counts as a strike.
    Strikes accumulate (they are never reset), and training stops when the
    strike count reaches ``patient``.

    Args:
        X_val: validation inputs passed to ``model.predict``.
        y_val: validation labels, either integer class ids (1-D) or one-hot
            rows (2-D, reduced with argmax).
        start_from_epoch: strikes are only counted for epochs strictly
            greater than this (and greater than 1).
        patient: number of strikes that triggers the stop.
        tqdm: optional progress bar; when given, status goes to its postfix
            instead of being printed.

    Attributes:
        f1_scores: macro-F1 of every finished epoch, in order.
        best_f1: highest macro-F1 seen so far (-1 before the first epoch).
        out: current strike count.
        epochs: number of epochs finished so far.
    """

    def __init__(self, X_val, y_val, start_from_epoch=12, patient=3, tqdm=None):
        super().__init__()
        self.X_val = X_val
        self.y_val = y_val
        self.f1_scores = []  # accumulates the macro-F1 of each epoch
        self.start_from_epoch = start_from_epoch
        self.patient = patient
        self.out = 0
        self.best_f1 = -1
        self.epochs = 0
        self.tqdm = tqdm

    def on_epoch_end(self, epoch, logs=None):
        """Score the epoch, update the strike count and stop when exhausted."""
        self.epochs += 1
        y_pred = self.model.predict(self.X_val, verbose=0)
        y_pred = np.argmax(y_pred, axis=1)

        # Accept both one-hot and integer-encoded labels.
        if self.y_val.ndim == 2:
            y_true = np.argmax(self.y_val, axis=1)
        else:
            y_true = self.y_val

        f1 = f1_score(y_true, y_pred, average='macro')
        self.f1_scores.append(f1)
        # `logs` defaults to None, so only publish the metric when a dict exists.
        if logs is not None:
            logs['val_macro_f1'] = f1

        if f1 > self.best_f1:
            self.best_f1 = f1

        # Check the instance's own bar, not the module-level `tqdm` import;
        # `is not None` avoids depending on the bar object's truthiness.
        has_bar = self.tqdm is not None

        if epoch > 1 and epoch > self.start_from_epoch and f1 < self.f1_scores[-2]:
            if not has_bar:
                print(f"\nNon Improvement detected at EP : {epoch}, f1 : {f1}")
            self.out += 1

        if has_bar:
            self.tqdm.set_postfix(epochs=self.epochs, curr_f1=f1, best_f1=self.best_f1, strikes=self.out)

        if self.out >= self.patient:
            if not has_bar:
                print(f"\nStopping at EP : {epoch}, f1 : {f1}")
            self.model.stop_training = True

In [10]:
from tensorflow.keras import layers, models
from tensorflow.keras.activations import gelu

#the name keyword is just there to use kwargs, it's not actually used.
def build_model(
    hidden=[200, 200],
    conv=[16, 32, 48],
    conv_double=True,
    input_dim=(256, 256),
    name=""
):
    """Build and compile a small CNN classifier with ``CLASS_NUM`` outputs.

    Args:
        hidden: units of each dense layer; every dense layer is followed by
            batch normalization and a GELU activation.
        conv: filter count of each convolution stage; every stage ends with
            a 2x2 max-pool.
        conv_double: when truthy, each stage applies two 3x3 convolutions
            instead of one.
        input_dim: the first two dimensions of the input shape; a third
            dimension of 3 is appended.
        name: ignored; accepted only so a config dict can be passed as
            ``**kwargs``.

    Returns:
        A compiled ``keras.Model`` (Adam at 3e-4, sparse categorical
        cross-entropy, accuracy metric).
    """
    convs_per_stage = 2 if conv_double else 1
    image_in = keras.Input(shape=(input_dim[0], input_dim[1], 3))

    # Convolutional feature extractor.
    features = image_in
    for filters in conv:
        for _ in range(convs_per_stage):
            features = layers.Conv2D(filters, (3,3), activation='relu', padding='same')(features)
        features = layers.MaxPooling2D((2,2))(features)

    # Dense head: Dense -> BatchNorm -> GELU per entry in `hidden`.
    features = layers.Flatten()(features)
    for units in hidden:
        features = layers.Dense(units)(features)
        features = layers.BatchNormalization()(features)
        features = layers.Activation('gelu')(features)

    class_probs = layers.Dense(CLASS_NUM, activation='softmax')(features)

    network = keras.Model(inputs=image_in, outputs=class_probs)
    network.compile(
        optimizer=keras.optimizers.Adam(learning_rate=3e-4),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return network

In [11]:
# Ask TensorFlow's native logger to show only the most severe messages.
# NOTE(review): this is set after `tensorflow` has already been imported in an
# earlier cell, and INFO-level ("I ...") log lines still show up in later cell
# outputs, so it may not be taking effect here. Presumably it needs to be set
# before the first tensorflow import; confirm.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

# List the devices TensorFlow can see (to check that a GPU is available).
print(tf.config.list_physical_devices())

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [13]:
import gc
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, precision_score

# ----- Settings -----
N_SPLITS   = 5      # number of folds
EPOCHS     = 50     # maximum number of epochs per fold
BATCH_SIZE = 32

skf = StratifiedKFold(
    n_splits=N_SPLITS,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# TODO: a previous run was cut short by a bug, so the first configuration is
# skipped for now. Set this back to 0 to evaluate every configuration.
START_CONFIG_INDEX = 1

bar_cfg = tqdm(configs[START_CONFIG_INDEX:], desc="Model Configurations", unit="config")

# `data_index` is the position of `cfg` in `configs`, and therefore also the
# position of its split in `datas`. Deriving it with enumerate keeps the two
# in step without a hand-maintained counter.
for data_index, cfg in enumerate(bar_cfg, start=START_CONFIG_INDEX):
    name = cfg["name"]
    bar_cfg.set_postfix(name=name)

    # Use this configuration's own split rather than whatever X_rest / y_rest
    # happen to be left over from an earlier cell.
    X_cfg = datas[data_index]["X_rest"]
    y_cfg = datas[data_index]["y_rest"]

    fold_accuracies = []
    fold_precisions = []
    fold_f1s        = []
    fold_last_f1s   = []

    # k-fold loop
    bar_fold = tqdm(enumerate(skf.split(X_cfg, y_cfg), start=1), desc="st K-fold", unit="fold", total=N_SPLITS)
    for fold_idx, (train_idx, val_idx) in bar_fold:
        bar_fold.desc = f'st K-fold, fold:{fold_idx}'

        X_tr, X_val = X_cfg[train_idx], X_cfg[val_idx]
        y_tr, y_val = y_cfg[train_idx], y_cfg[val_idx]
        # Class weights are derived from the training part of the fold only.
        class_weight = get_class_weight(y_tr)

        # Build the model (it is compiled inside build_model as well).
        model = build_model(**cfg)
        #model.summary() #debug

        # macro-F1 tracking + early stopping + progress reporting
        controller = ControllerCallback(
            X_val, y_val,
            start_from_epoch=15,
            patient=5,
            tqdm=bar_fold
        )

        history = model.fit(
            X_tr, y_tr,
            validation_data=(X_val, y_val),
            epochs=EPOCHS,
            batch_size=BATCH_SIZE,
            callbacks=[controller],
            class_weight=class_weight,  # per-class loss weights to counter class imbalance
            verbose=0,
        )

        # ---- Metrics for this fold ----
        # 1) loss / accuracy of the final model state (evaluate)
        loss, acc = model.evaluate(X_val, y_val, verbose=0)

        # 2) predictions for macro precision
        y_prob = model.predict(X_val, verbose=0)
        y_pred = np.argmax(y_prob, axis=1)

        y_true = y_val

        precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
        # best_f1 is the best epoch seen by the callback; last_f1 belongs to
        # the final epoch, i.e. the same model state as acc / precision.
        f1        = controller.best_f1
        last_f1   = controller.f1_scores[-1]

        fold_accuracies.append(acc)
        fold_precisions.append(precision)
        fold_f1s.append(f1)
        fold_last_f1s.append(last_f1)

        # Release the model before the next fold: drop the reference, reset
        # Keras' global state, then force a garbage-collection pass.
        del model
        model = None
        keras.backend.clear_session()
        gc.collect()

    # ----- Mean / standard deviation per configuration -----
    cfg_row = {
        "name": name,
        "acc_mean":  float(np.mean(fold_accuracies)),
        "acc_std":   float(np.std(fold_accuracies)),
        "prec_macro_mean": float(np.mean(fold_precisions)),
        "prec_macro_std":  float(np.std(fold_precisions)),
        "best_f1_macro_mean":   float(np.mean(fold_f1s)),
        "best_f1_macro_std":    float(np.std(fold_f1s)),
        "last_f1_macro_mean":   float(np.mean(fold_last_f1s)),
        "last_f1_macro_std":    float(np.std(fold_last_f1s)),
    }

    print(f"\n>>> [CV Summary] {name}: "
          f"f1_macro={cfg_row['best_f1_macro_mean']:.4f} ± {cfg_row['best_f1_macro_std']:.4f}, "
          f"last_f1_macro={cfg_row['last_f1_macro_mean']:.4f} ± {cfg_row['last_f1_macro_std']:.4f}, "
          f"acc={cfg_row['acc_mean']:.4f} ± {cfg_row['acc_std']:.4f}, ")

    cv_results.append(cfg_row)
    #print(cv_results)

Model Configurations:   0%|          | 0/3 [00:00<?, ?config/s]

st K-fold:   0%|          | 0/5 [00:00<?, ?fold/s]

2025-11-23 18:02:25.456394: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.







>>> [CV Summary] id336x224: f1_macro=0.4922 ± 0.0125, last_f1_macro=0.4256 ± 0.1286, acc=0.4612 ± 0.1208, 


st K-fold:   0%|          | 0/5 [00:00<?, ?fold/s]

2025-11-23 18:09:12.086129: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.






>>> [CV Summary] id256x384: f1_macro=0.4926 ± 0.0249, last_f1_macro=0.4849 ± 0.0243, acc=0.5154 ± 0.0211, 


st K-fold:   0%|          | 0/5 [00:00<?, ?fold/s]

2025-11-23 18:16:39.954146: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.




>>> [CV Summary] id224x336: f1_macro=0.5153 ± 0.0166, last_f1_macro=0.5033 ± 0.0123, acc=0.5302 ± 0.0153, 


In [15]:
# Destination of the ranked cross-validation summary.
kfold_result_csv_path = DATA_DIR / "kfold_result_input.csv"

# One row per configuration, best mean macro-F1 first.
cv_df = pd.DataFrame(cv_results)
cv_df = cv_df.sort_values("best_f1_macro_mean", ascending=False)

print("\nK-Fold 결과 best_f1_macro_mean DESC")
print(cv_df.to_string(index=False))

# The DataFrame index is written to the file as its first column.
cv_df.to_csv(kfold_result_csv_path)


K-Fold 결과 best_f1_macro_mean DESC
     name  acc_mean  acc_std  prec_macro_mean  prec_macro_std  best_f1_macro_mean  best_f1_macro_std  last_f1_macro_mean  last_f1_macro_std
id224x336  0.530204 0.015296         0.525065        0.018379            0.515298           0.016560            0.503272           0.012275
id256x384  0.515416 0.021094         0.504169        0.022116            0.492634           0.024869            0.484855           0.024319
id336x224  0.461210 0.120845         0.468711        0.081118            0.492225           0.012477            0.425649           0.128623
id384x256  0.519512 0.019123         0.509456        0.034301            0.490849           0.023655            0.482758           0.027319
