# ※ 현재는 1차 모델링 코드로, 아직 확정된 코드가 아닙니다.
## 1차 코드는 모델 설계까지만 구현되어 있습니다. 모델 평가와 성능 확인은 추후에 추가할 예정입니다.

In [1]:
import os, json, math, random
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import tensorflow as tf

In [None]:
# ------------------------------
# 유틸
# ------------------------------
def load_csv_features_labels(csv_path: str):
    """Read a per-frame CSV and split it into features, labels and frame ids.

    Every numeric column other than ``frame``, ``drowsiness_level`` and
    ``drowsiness_label`` is used as an input feature; ``label_name`` is
    excluded by name as well.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A tuple ``(X, y, frames, meta)``: ``X`` holds the feature columns as
        float32, ``y`` is ``drowsiness_level`` cast to int, ``frames`` is the
        ``frame`` column cast to int (or ``0..len-1`` when the CSV has no such
        column) and ``meta`` is ``{"feature_names": [...]}`` in column order.
    """
    table = pd.read_csv(csv_path)

    # Label columns and the frame counter are never fed to the model.
    excluded = {"drowsiness_level", "drowsiness_label", "label_name", "frame"}
    feature_names = []
    for column in table.columns:
        if column in excluded:
            continue
        if pd.api.types.is_numeric_dtype(table[column]):
            feature_names.append(column)

    features = table[feature_names].to_numpy(dtype=np.float32)
    labels = table["drowsiness_level"].astype(int).to_numpy()

    if "frame" in table.columns:
        frame_ids = table["frame"].astype(int).to_numpy()
    else:
        # No explicit frame column: fall back to the row position.
        frame_ids = np.arange(len(table))

    return features, labels, frame_ids, {"feature_names": feature_names}

def load_segments_json(json_path: str):
    """Load the frame rate and the labelled segments from an annotation JSON.

    The file must provide a ``dataset`` object and a ``segments`` entry.
    ``dataset.fps`` defaults to 30.0 when it is absent.

    Args:
        json_path: Path of the UTF-8 encoded JSON file.

    Returns:
        ``(fps, segments)``. Per the original note, segment bounds are
        half-open ``[start, end)`` frame ranges.
    """
    with open(json_path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)
    return payload["dataset"].get("fps", 30.0), payload["segments"]

def make_windows_from_segments(
    X: np.ndarray, y: np.ndarray, frames: np.ndarray, segments: List[Dict],
    win_len: int, stride: int
) -> Tuple[np.ndarray, np.ndarray]:
    """Stage 1 (weak labels): slide fixed-length windows inside each segment.

    Windows never cross a segment boundary, and every window inherits the
    label of the segment it was cut from. Segments labelled ``-1``
    (transition / ignore) are skipped, as are segments shorter than one window.

    Args:
        X: Per-frame feature matrix, indexed in the same order as ``frames``.
        y: Per-frame labels. Unused here (the segment label is used instead);
            kept so both window builders share a call signature.
        frames: Frame number of every row of ``X``.
        segments: Dicts with ``label``, ``start`` and ``end`` keys, where
            ``[start, end)`` is a half-open frame range.
        win_len: Window length in frames; must be positive.
        stride: Step between window starts in frames; must be positive.

    Returns:
        ``(Xw, yw)`` where ``Xw`` stacks the windows along axis 0 and ``yw``
        holds one int32 label per window. When no window can be built the
        result is an empty ``(0, win_len, n_features)`` float32 array and an
        empty int32 label array.

    Raises:
        ValueError: If ``win_len`` or ``stride`` is not positive. A
            non-positive stride would otherwise never leave the segment.
    """
    if win_len <= 0:
        raise ValueError(f"win_len must be positive, got {win_len}")
    if stride <= 0:
        raise ValueError(f"stride must be positive, got {stride}")

    frame_to_index = {int(fr): i for i, fr in enumerate(frames)}
    windows, labels = [], []

    for seg in segments:
        label = int(seg["label"])
        if label == -1:  # transition / ignore
            continue
        seg_start, seg_end = int(seg["start"]), int(seg["end"])  # [start, end)

        # The range is empty when the segment is shorter than one window.
        for first_frame in range(seg_start, seg_end - win_len + 1, stride):
            start_idx = frame_to_index.get(first_frame)
            last_idx = frame_to_index.get(first_frame + win_len - 1)
            if start_idx is None or last_idx is None:
                # The first or last frame of this window is missing from the CSV.
                continue
            window = X[start_idx:last_idx + 1]
            # A row count other than win_len means frames are missing inside
            # the window (assumes frame numbers increase row by row — TODO confirm).
            if window.shape[0] == win_len:
                windows.append(window)
                labels.append(label)

    if not windows:
        return (
            np.empty((0, win_len, X.shape[1]), dtype=np.float32),
            np.empty((0,), dtype=np.int32),
        )
    return np.stack(windows), np.array(labels, dtype=np.int32)

def make_windows_from_frames_majority(
    X: np.ndarray, y: np.ndarray, frames: np.ndarray, 
    win_len: int, stride: int, majority_thr: float = 0.5
) -> Tuple[np.ndarray, np.ndarray]:
    """Stage 2 (fine labels): window all rows and label each window by majority vote.

    Windows are cut over consecutive rows of ``X``/``y``. A window is kept
    only when its most frequent frame label covers at least ``majority_thr``
    of the window; ties resolve to the smallest label value.

    Args:
        X: Per-frame feature matrix.
        y: Per-frame labels, aligned with ``X``.
        frames: Frame numbers; only their count is used to bound the windows.
        win_len: Window length in frames; must be positive.
        stride: Step between window starts in rows; must be positive.
        majority_thr: Minimum share of the majority label (the original note
            recommends 0.5 to 0.7).

    Returns:
        ``(Xw, yw)`` where ``Xw`` stacks the kept windows and ``yw`` holds one
        int32 label per window. When nothing is kept the result is an empty
        ``(0, win_len, n_features)`` float32 array and an empty int32 array.

    Raises:
        ValueError: If ``win_len`` or ``stride`` is not positive. A zero
            stride would otherwise loop forever.
    """
    if win_len <= 0:
        raise ValueError(f"win_len must be positive, got {win_len}")
    if stride <= 0:
        raise ValueError(f"stride must be positive, got {stride}")

    windows, labels = [], []
    n_rows = len(frames)

    # The range is empty when there are fewer rows than one window.
    for start_idx in range(0, n_rows - win_len + 1, stride):
        end_idx = start_idx + win_len
        values, counts = np.unique(y[start_idx:end_idx], return_counts=True)
        # argmax returns the first maximum, i.e. the smallest label on a tie.
        top = int(np.argmax(counts))
        majority_ratio = float(counts[top]) / win_len
        if majority_ratio >= majority_thr:
            windows.append(X[start_idx:end_idx])
            labels.append(int(values[top]))

    if not windows:
        return (
            np.empty((0, win_len, X.shape[1]), dtype=np.float32),
            np.empty((0,), dtype=np.int32),
        )
    return np.stack(windows), np.array(labels, dtype=np.int32)

def split_by_time(Xw: np.ndarray, yw: np.ndarray, train=0.7, val=0.15):
    """Split windows into train/val/test chunks without reordering them.

    The first ``train`` fraction becomes the training set, the next ``val``
    fraction the validation set and whatever remains the test set. Fractions
    are truncated to whole windows.

    Returns:
        ``((X_tr, y_tr), (X_va, y_va), (X_te, y_te))``.
    """
    total = len(Xw)
    train_end = int(total * train)
    val_end = train_end + int(total * val)

    train_part = (Xw[:train_end], yw[:train_end])
    val_part = (Xw[train_end:val_end], yw[train_end:val_end])
    test_part = (Xw[val_end:], yw[val_end:])
    return train_part, val_part, test_part

def compute_class_weights(y: np.ndarray, n_classes=4):
    """Return inverse-frequency class weights for labels ``0..n_classes-1``.

    Each weight is ``total / (n_classes * count)``, where ``total`` is the
    number of labels in ``y``. A class that never occurs is treated as if it
    had one sample, so the division never hits zero.

    Returns:
        Dict mapping each class index to its float weight.
    """
    from collections import Counter

    counts = Counter(y.tolist())
    n_samples = sum(counts.values())

    weights = {}
    for cls in range(n_classes):
        occurrences = counts.get(cls, 1)
        weights[cls] = n_samples / (n_classes * occurrences)
    return weights

def standardize_fit(X: np.ndarray):
    """Compute per-feature mean and std over the first two axes of ``X``.

    Both statistics keep the reduced axes (``keepdims=True``) so they
    broadcast against ``X``. ``1e-8`` is added to the std so that constant
    features do not cause a division by zero later.

    Returns:
        ``(mean, std)``.
    """
    reduce_axes = (0, 1)
    mean = np.mean(X, axis=reduce_axes, keepdims=True)
    std = np.std(X, axis=reduce_axes, keepdims=True) + 1e-8
    return mean, std

def standardize_apply(X: np.ndarray, mean, std):
    """Standardize ``X`` with precomputed statistics: ``(X - mean) / std``."""
    centered = X - mean
    return centered / std

def make_tf_dataset(X, y, batch=64, shuffle=False):
    """Wrap ``(X, y)`` in a batched, prefetching ``tf.data.Dataset``.

    Args:
        X: Samples, sliced along the first axis.
        y: Labels aligned with ``X``.
        batch: Batch size.
        shuffle: When true, shuffle with a buffer of ``len(X)`` elements and
            reshuffle on every iteration.

    Returns:
        The resulting dataset of ``(X_batch, y_batch)`` pairs.
    """
    dataset = tf.data.Dataset.from_tensor_slices((X, y))
    if shuffle:
        dataset = dataset.shuffle(len(X), reshuffle_each_iteration=True)
    return dataset.batch(batch).prefetch(tf.data.AUTOTUNE)

def build_lstm_model(input_shape, n_classes=4):
    """Build the (uncompiled) bidirectional-LSTM window classifier.

    Architecture: Masking(0.0) -> Bidirectional(LSTM(128)) -> Dropout(0.3)
    -> Dense(n_classes, softmax).

    Args:
        input_shape: ``(win_len, n_features)`` of one window.
        n_classes: Number of output classes.

    Returns:
        A ``tf.keras.Model`` mapping a window to class probabilities.
    """
    layers = tf.keras.layers

    inputs = tf.keras.Input(shape=input_shape)  # (win_len, n_features)
    masked = layers.Masking(mask_value=0.0)(inputs)
    # Only the final state of each direction is kept (return_sequences=False).
    recurrent = layers.LSTM(128, return_sequences=False)
    encoded = layers.Bidirectional(recurrent)(masked)
    regularized = layers.Dropout(0.3)(encoded)
    probabilities = layers.Dense(n_classes, activation="softmax")(regularized)
    return tf.keras.Model(inputs, probabilities)

In [None]:
# ------------------------------
# Path configuration
# ------------------------------
# CSV passed to load_csv_features_labels and JSON passed to load_segments_json;
# both paths share one recording file stem.
PATH_CSV  = "/mnt/data/gC_15_s5_2019-03-12T11;03;23+01;00_rgb_face-labeled.csv"
PATH_JSON = "/mnt/data/gC_15_s5_2019-03-12T11;03;23+01;00_rgb_face-labeled.json"

In [None]:
# ------------------------------
# Run
# ------------------------------

# 0) Load data
X_all, y_all, frames_all, meta = load_csv_features_labels(PATH_CSV)
fps, segments = load_segments_json(PATH_JSON)   # fps comes from the JSON (e.g. 29.76); segments are half-open [start, end)

# 1) Window parameters (10-second windows)
win_sec = 10.0
stride_sec_stage1 = 2.0   # a fairly coarse stride is enough for the weak labels
stride_sec_stage2 = 1.0   # the fine-label stage uses a denser stride
# Convert seconds to whole frames; strides are clamped to at least one frame.
win_len   = int(round(fps * win_sec))
stride_w1 = max(1, int(round(fps * stride_sec_stage1)))
stride_w2 = max(1, int(round(fps * stride_sec_stage2)))

print(f"fps={fps:.2f}, win_len(frames)={win_len}, stride_stage1={stride_w1}, stride_stage2={stride_w2}")

# 2) Stage 1: weak-label windows built from the JSON segments
Xw_weak, yw_weak = make_windows_from_segments(
    X_all, y_all, frames_all, segments, win_len=win_len, stride=stride_w1
)
print("Stage1 weak windows:", Xw_weak.shape, yw_weak.shape, "labels:", np.unique(yw_weak, return_counts=True))

# 3) Stage 2: windows built from the per-frame CSV labels (majority vote)
Xw_strong, yw_strong = make_windows_from_frames_majority(
    X_all, y_all, frames_all, win_len=win_len, stride=stride_w2, majority_thr=0.6
)
print("Stage2 strong windows:", Xw_strong.shape, yw_strong.shape, "labels:", np.unique(yw_strong, return_counts=True))

# 4) Standardization (fit on training-set statistics only)
#   - Stage 1: fit on the weak-label training split.
#   - Stage 2: re-fitting on the strong-label training split for fine-tuning is optional.
#     Here the Stage 1 statistics are reused to keep both stages consistent.
# NOTE(review): consecutive windows overlap (10 s window, 1-2 s stride), so windows
# on either side of a split boundary share frames, and both stages window the same
# recording — confirm this leakage is acceptable for a first pass.
(Xw1_tr, yw1_tr), (Xw1_va, yw1_va), (Xw1_te, yw1_te) = split_by_time(Xw_weak, yw_weak, train=0.8, val=0.1)
mean1, std1 = standardize_fit(Xw1_tr)
Xw1_tr = standardize_apply(Xw1_tr, mean1, std1)
Xw1_va = standardize_apply(Xw1_va, mean1, std1)
Xw1_te = standardize_apply(Xw1_te, mean1, std1)

# Stage 2 uses the same statistics (recommended). To re-fit instead, swap in mean2/std2.
(Xw2_tr, yw2_tr), (Xw2_va, yw2_va), (Xw2_te, yw2_te) = split_by_time(Xw_strong, yw_strong, train=0.8, val=0.1)
Xw2_tr = standardize_apply(Xw2_tr, mean1, std1)
Xw2_va = standardize_apply(Xw2_va, mean1, std1)
Xw2_te = standardize_apply(Xw2_te, mean1, std1)

# 5) tf.data pipelines: only the training sets are shuffled
train1 = make_tf_dataset(Xw1_tr, yw1_tr, batch=64, shuffle=True)
valid1 = make_tf_dataset(Xw1_va, yw1_va, batch=64, shuffle=False)
test1  = make_tf_dataset(Xw1_te, yw1_te, batch=64, shuffle=False)

train2 = make_tf_dataset(Xw2_tr, yw2_tr, batch=64, shuffle=True)
valid2 = make_tf_dataset(Xw2_va, yw2_va, batch=64, shuffle=False)
test2  = make_tf_dataset(Xw2_te, yw2_te, batch=64, shuffle=False)

# 6) Model
# NOTE(review): n_classes is fixed at 4, which presumes labels 0..3 — confirm against the data.
n_classes = 4
input_shape = (win_len, X_all.shape[1])
model = build_lstm_model(input_shape, n_classes=n_classes)
model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

# Callbacks: keep the best checkpoint and stop early, both driven by validation accuracy
ckpt1 = tf.keras.callbacks.ModelCheckpoint("stage1_pretrained.keras", save_best_only=True, monitor="val_accuracy", mode="max")
es1   = tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True, monitor="val_accuracy", mode="max")

# Class-imbalance correction (based on the weak-label training distribution)
class_weights1 = compute_class_weights(yw1_tr, n_classes=n_classes)

# 7) Stage 1 training (weak labels)
hist1 = model.fit(train1, validation_data=valid1, epochs=50, callbacks=[ckpt1, es1],
                  class_weight=class_weights1, verbose=2)

print("Stage1 (weak) eval:", model.evaluate(test1, verbose=0))

# 8) Stage 2 training: fine-tune the same model on the fine labels
#    - lower learning rate, separate checkpoint file
model.compile(optimizer=tf.keras.optimizers.Adam(3e-4),
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])
ckpt2 = tf.keras.callbacks.ModelCheckpoint("stage2_finetuned.keras", save_best_only=True, monitor="val_accuracy", mode="max")
es2   = tf.keras.callbacks.EarlyStopping(patience=7, restore_best_weights=True, monitor="val_accuracy", mode="max")
class_weights2 = compute_class_weights(yw2_tr, n_classes=n_classes)

hist2 = model.fit(train2, validation_data=valid2, epochs=50, callbacks=[ckpt2, es2],
                  class_weight=class_weights2, verbose=2)

print("Stage2 (strong) eval:", model.evaluate(test2, verbose=0))

# 9) Save the final model together with the Stage 1 standardization statistics and window length
model.save("drowsiness_lstm_final.keras")
np.savez("standardizer_stage1_stats.npz", mean=mean1, std=std1, win_len=win_len)
print("Done.")
