# 셀 1 : 환경 설정 & 라이브러리 로드

In [1]:
import os, random, json, pickle, math
import numpy as np
import pandas as pd
from tqdm import tqdm

# Pin the GPU environment to the same settings as Stage-1.
os.environ["CUDA_DEVICE_ORDER"]    = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"      # NOTE(review): the original comment said "use physical GPU 3 only", but the value exposes device index 1 — confirm which is intended

# Reproducibility: seed the stdlib and NumPy global RNGs with the same value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

print("Stage‑2 Bag‑Generator 초기화 완료")

Stage‑2 Bag‑Generator 초기화 완료


# 셀 2 : 전역 하이퍼파라미터 선언

In [2]:
CONFIG = {
    # ---------- File paths ----------
    "embedding_root": "/workspace/MIL/data/processed/embeddings",
    "embedding_tag" : "arcface_margin_0.4",          # prefix of the Stage-1 result files
    "output_dir"    : "/workspace/MIL/data/processed/bags",

    # ---------- MIL sliding window ----------
    "window_size" : 5,        # 5 words = 1 instance
    "stride"      : 1,

    # ---------- Bag synthesis ----------
    "bag_block"   : 10,       # 1 block = 10 windows
    # Mix ratio per forgery difficulty level.
    # NOTE(review): synthesize_bags replaces max(1, int(bag_block * r)) instances,
    # so with bag_block = 10 both 0.05 and 0.10 replace exactly one instance.
    "neg_mix_ratio": [0.05, 0.10, 0.20, 0.30],
    "bags_per_doc": 8,        # upper bound on blocks taken from each document

    # ---------- Saving ----------
    "pickle_protocol": 4
}

# Create the output directory up front and echo the effective configuration.
os.makedirs(CONFIG["output_dir"], exist_ok=True)
print(json.dumps(CONFIG, indent=2))


{
  "embedding_root": "/workspace/MIL/data/processed/embeddings",
  "embedding_tag": "arcface_margin_0.4",
  "output_dir": "/workspace/MIL/data/processed/bags",
  "window_size": 5,
  "stride": 1,
  "bag_block": 10,
  "neg_mix_ratio": [
    0.05,
    0.1,
    0.2,
    0.3
  ],
  "bags_per_doc": 8,
  "pickle_protocol": 4
}


# 셀 3 : 임베딩 CSV 로드 함수

In [3]:
def load_split(split: str) -> pd.DataFrame:
    """Load the Stage-1 embedding CSV of one data split.

    The file is read from
    ``{CONFIG['embedding_root']}/mil_{CONFIG['embedding_tag']}_{split}_data.csv``.

    Args:
        split: Split name inserted into the file name; the notebook uses
            'train', 'val' and 'test'.

    Returns:
        The CSV contents as a DataFrame.

    Raises:
        ValueError: If the CSV has fewer than 3 columns.
    """
    csv_path = f"{CONFIG['embedding_root']}/mil_{CONFIG['embedding_tag']}_{split}_data.csv"
    df = pd.read_csv(csv_path)
    # Explicit check rather than ``assert``: assertions are stripped when Python
    # runs with -O, which would silently disable this validation.
    if df.shape[1] < 3:
        raise ValueError(f"컬럼 수 이상함: {csv_path}")
    return df

# Load the three splits in order. The original notes tag them as
# train 0-179, val 180-239, test 240-299 (presumably writer-ID ranges — confirm).
df_train, df_val, df_test = [
    load_split(split_name) for split_name in ("train", "val", "test")
]
print("CSV 로드 완료:", [len(frame) for frame in (df_train, df_val, df_test)])


CSV 로드 완료: [208233, 70533, 72457]


# 셀 4 : 슬라이딩‑윈도우 Instance 생성 유틸

In [4]:
def build_instances(df: pd.DataFrame, win: int, stride: int):
    """Cut each document's rows into sliding-window instances.

    Rows are grouped into documents keyed ``"{label}/session_{k}"``.  ``k`` is
    ``repetition // 9``, where ``repetition`` is the integer found between the
    last ``_`` and the following ``.`` of ``path`` (paths are expected to look
    like ``{label}/{word}_{repetition}.png``).  Within a document, every run of
    ``win`` consecutive rows, advanced by ``stride`` rows, becomes one instance.
    Documents with fewer than ``win`` rows are skipped.

    The input frame is left untouched: document keys are computed on the side
    instead of being written into a ``doc_key`` column of ``df``.

    Args:
        df: Frame with ``label`` and ``path`` columns plus the embedding
            columns (every column whose name contains ``embedding_``).
        win: Number of consecutive rows per window.
        stride: Step between the start rows of successive windows.

    Returns:
        A list of dicts with keys ``writer`` (int label of the document's
        first row), ``doc`` (document key) and ``emb`` (float32 array of shape
        ``(win, D)``, D being the number of embedding columns).
    """
    def get_session(path):
        rep_num = int(path.split('_')[-1].split('.')[0])
        return rep_num // 9  # nine consecutive repetition numbers per session

    # Build the keys from the two relevant columns only; a row-wise
    # DataFrame.apply would materialise every embedding column per row.
    doc_keys = np.array(
        [
            f"{label}/session_{get_session(path)}"
            for label, path in zip(df["label"], df["path"])
        ],
        dtype=object,
    )
    n_docs = len(set(doc_keys))

    # Debug aid: number of documents found
    print(f"총 문서 수: {n_docs}")

    inst_list = []
    # Grouping by an array of the same length as df keeps df itself unmodified;
    # groups come out in sorted key order with the original row order inside.
    for doc_key, g in tqdm(df.groupby(doc_keys), total=n_docs, desc="docs"):
        emb = g.filter(like="embedding_").to_numpy()
        lab = g["label"].iloc[0]

        # Skip documents too short to hold a single window
        if len(emb) < win:
            continue

        for s in range(0, len(emb) - win + 1, stride):
            inst_list.append(
                {
                    "writer": int(lab),
                    "doc": doc_key,
                    "emb": emb[s: s + win].astype(np.float32),  # (win, D)
                }
            )
    return inst_list

# Build the window instances of every split with the configured window/stride.
inst_train, inst_val, inst_test = [
    build_instances(split_df, CONFIG["window_size"], CONFIG["stride"])
    for split_df in (df_train, df_val, df_test)
]

print("Instance 개수:", [len(pool) for pool in (inst_train, inst_val, inst_test)])

총 문서 수: 720


docs: 100%|██████████| 720/720 [00:01<00:00, 398.50it/s]


총 문서 수: 240


docs: 100%|██████████| 240/240 [00:00<00:00, 408.28it/s]


총 문서 수: 240


docs: 100%|██████████| 240/240 [00:00<00:00, 396.12it/s]

Instance 개수: [205356, 69573, 71500]





# 셀 5 : Positive / Negative Bag 합성

In [5]:
def synthesize_bags(inst_list, neg_source, mode:str):
    """Build single-writer and mixed-writer bags from window instances.

    ``inst_list`` is shuffled IN PLACE, then grouped by document.  Each
    document contributes up to ``CONFIG['bags_per_doc']`` blocks of
    ``CONFIG['bag_block']`` instances.  Every block yields:

    * one bag labelled 0 (called "Positive" in the logs, single writer), and
    * one bag labelled 1 (called "Negative", multi writer) per entry ``r`` of
      ``CONFIG['neg_mix_ratio']``, in which ``max(1, int(block * r))`` randomly
      chosen instances are replaced by instances of other writers drawn from
      ``neg_source``.

    If there are more than twice as many label-1 bags as label-0 bags, the
    label-1 bags are down-sampled to a 1:2 ratio, spread evenly over the mix
    ratios, and the final list is shuffled.

    Args:
        inst_list: Instance dicts with keys ``writer``, ``doc`` and ``emb``.
            Mutated: the list is shuffled in place.
        neg_source: Instance pool from which replacement instances are drawn.
        mode: Split name; only used as a prefix in the printed messages.

    Returns:
        A list of dicts with keys ``bag_emb`` (array of shape
        ``(block, win, D)``), ``bag_label`` (0 or 1), ``writer``, ``doc``,
        ``neg_ratio`` (0.0 for label-0 bags) and ``bag_id``.

    Raises:
        IndexError: From ``random.choice`` when ``neg_source`` holds no
            instance written by a writer other than the block's writer.
    """
    from collections import defaultdict

    random.shuffle(inst_list)           # in-place
    bags = []
    block = CONFIG["bag_block"]

    # Group instances by document
    doc_dic = defaultdict(list)
    for inst in inst_list:
        doc_dic[inst["doc"]].append(inst)

    # The other-writer pool depends only on the writer, so build it once per
    # writer instead of rescanning neg_source for every block and every ratio.
    # Pool contents and order are unchanged, so the random draws are identical.
    # This trades memory (one list of references per writer) for that time.
    neg_pool_by_writer = {}

    # Bag creation
    for doc_key, insts in doc_dic.items():
        total_blocks = len(insts) // block
        for b in range(min(total_blocks, CONFIG["bags_per_doc"])):
            start = b * block
            block_insts = insts[start: start + block]

            base_writer = block_insts[0]["writer"]
            base_emb    = np.stack([i["emb"] for i in block_insts])  # (block, win, D)

            # Label-0 ("Positive") bag: the untouched block
            bags.append({
                "bag_emb": base_emb,               # (block, win, D)
                "bag_label": 0,                    # 0 = single-writer
                "writer":   base_writer,
                "doc":      doc_key,
                "neg_ratio": 0.0,                 # nothing replaced
                "bag_id": f"{doc_key}_block{b}_pos"  # unique identifier
            })

            neg_pool = neg_pool_by_writer.get(base_writer)
            if neg_pool is None:
                neg_pool = [inst for inst in neg_source if inst["writer"] != base_writer]
                neg_pool_by_writer[base_writer] = neg_pool

            # Label-1 ("Negative") bags, one per mix ratio
            for r in CONFIG["neg_mix_ratio"]:
                # At least one instance is always replaced, so every ratio
                # below 2/block collapses to a single replacement.
                k_replace = max(1, int(block * r))
                new_emb = base_emb.copy()

                repl_idxs = random.sample(range(block), k_replace)
                for idx in repl_idxs:
                    new_emb[idx] = random.choice(neg_pool)["emb"]

                bags.append({
                    "bag_emb": new_emb,
                    "bag_label": 1,                # 1 = multi-writer
                    "writer":   base_writer,
                    "doc":      doc_key,
                    "neg_ratio": r,                # mix ratio used for this bag
                    "bag_id": f"{doc_key}_block{b}_neg{r}"  # unique identifier
                })

    print(f"{mode.upper()} Bag 생성 완료: {len(bags)} bags (불균형 상태)")

    # Class-imbalance mitigation (applied to every split)
    pos_bags = [b for b in bags if b["bag_label"] == 0]
    neg_bags = [b for b in bags if b["bag_label"] == 1]

    # Report the current ratio; guard the division so an input that yields no
    # bag at all does not crash here.
    ratio_txt = f"{len(neg_bags)/len(pos_bags):.1f}" if pos_bags else "n/a"
    print(f"  - 원본: Positive {len(pos_bags)}, Negative {len(neg_bags)} (1:{ratio_txt})")

    # Down-sample label-1 bags to a 1:2 ratio
    target_neg = len(pos_bags) * 2
    if len(neg_bags) > target_neg:
        # Sample evenly across the mix ratios
        neg_by_ratio = defaultdict(list)
        for b in neg_bags:
            neg_by_ratio[b["neg_ratio"]].append(b)

        sampled_neg = []
        per_ratio = target_neg // len(CONFIG["neg_mix_ratio"])
        for ratio in CONFIG["neg_mix_ratio"]:
            ratio_bags = neg_by_ratio[ratio]
            sampled_neg.extend(random.sample(ratio_bags, min(per_ratio, len(ratio_bags))))

        # Top up with random leftovers when the even split falls short
        if len(sampled_neg) < target_neg:
            remaining = target_neg - len(sampled_neg)
            # Compare by bag_id: the dicts hold arrays and cannot be compared directly
            sampled_ids = {b["bag_id"] for b in sampled_neg}
            pool = [b for b in neg_bags if b["bag_id"] not in sampled_ids]
            sampled_neg.extend(random.sample(pool, min(remaining, len(pool))))

        kept_neg = sampled_neg[:target_neg]
        bags = pos_bags + kept_neg
        random.shuffle(bags)
        print(f"  - 균형화: Positive {len(pos_bags)}, Negative {len(kept_neg)} (1:2)")

    print(f"{mode.upper()} 최종 Bag 개수: {len(bags)}")
    return bags

# Each split draws its replacement instances from its own instance pool.
bags_train, bags_val, bags_test = [
    synthesize_bags(pool, pool, split_name)
    for pool, split_name in (
        (inst_train, "train"),
        (inst_val, "val"),
        (inst_test, "test"),
    )
]

TRAIN Bag 생성 완료: 21605 bags (불균형 상태)
  - 원본: Positive 4321, Negative 17284 (1:4.0)
  - 균형화: Positive 4321, Negative 8642 (1:2)
TRAIN 최종 Bag 개수: 12963
VAL Bag 생성 완료: 7200 bags (불균형 상태)
  - 원본: Positive 1440, Negative 5760 (1:4.0)
  - 균형화: Positive 1440, Negative 2880 (1:2)
VAL 최종 Bag 개수: 4320
TEST Bag 생성 완료: 7200 bags (불균형 상태)
  - 원본: Positive 1440, Negative 5760 (1:4.0)
  - 균형화: Positive 1440, Negative 2880 (1:2)
TEST 최종 Bag 개수: 4320


# 셀 6 : 저장 (Pickle + 메타 JSON)

In [6]:
def save_bags(bags, split:str):
    """Pickle ``bags`` for one split and print a confirmation line.

    The file is written to
    ``{CONFIG['output_dir']}/bags_{CONFIG['embedding_tag']}_{split}.pkl``
    using ``CONFIG['pickle_protocol']``.
    """
    target = f"{CONFIG['output_dir']}/bags_{CONFIG['embedding_tag']}_{split}.pkl"
    with open(target, "wb") as handle:
        pickle.dump(bags, handle, protocol=CONFIG["pickle_protocol"])
    n_bags = len(bags)
    print(f"✔ {split.upper()} Pickle 저장: {target} ({n_bags} bags)")

# Write one pickle per split.
for split, obj in (("train", bags_train), ("val", bags_val), ("test", bags_test)):
    save_bags(obj, split)

# Count bags per neg_ratio value
def get_ratio_distribution(bags):
    """Return ``{neg_ratio: number of bags}`` for ``bags``.

    Bags without a ``neg_ratio`` key are counted under ``0.0``.  Keys appear
    in the order in which each ratio is first encountered.
    """
    tally = {}
    for bag in bags:
        key = bag.get("neg_ratio", 0.0)
        tally[key] = tally.get(key, 0) + 1
    return tally

# Record run metadata: configuration, seed and per-split bag statistics.
meta = {
    "config": CONFIG,
    "seed": SEED,
    "stats": {
        "train_bags": len(bags_train),
        "val_bags":   len(bags_val),
        "test_bags":  len(bags_test),
        "train_pos_neg": {
            "positive": sum(1 for b in bags_train if b["bag_label"] == 0),
            "negative": sum(1 for b in bags_train if b["bag_label"] == 1),
        },
        "val_pos_neg": {
            "positive": sum(1 for b in bags_val if b["bag_label"] == 0),
            "negative": sum(1 for b in bags_val if b["bag_label"] == 1),
        },
        "test_pos_neg": {
            "positive": sum(1 for b in bags_test if b["bag_label"] == 0),
            "negative": sum(1 for b in bags_test if b["bag_label"] == 1),
        },
        # Per-split count of bags for every neg_ratio value (0.0 = single-writer).
        "neg_ratio_distribution": {
            split_name: get_ratio_distribution(split_bags)
            for split_name, split_bags in (
                ("train", bags_train),
                ("val", bags_val),
                ("test", bags_test),
            )
        },
    },
}
with open(f"{CONFIG['output_dir']}/bags_{CONFIG['embedding_tag']}_meta.json","w") as f:
    f.write(json.dumps(meta, indent=2))
print("메타 정보 저장 완료 (neg_ratio 분포 포함)")

✔ TRAIN Pickle 저장: /workspace/MIL/data/processed/bags/bags_arcface_margin_0.4_train.pkl (12963 bags)
✔ VAL Pickle 저장: /workspace/MIL/data/processed/bags/bags_arcface_margin_0.4_val.pkl (4320 bags)
✔ TEST Pickle 저장: /workspace/MIL/data/processed/bags/bags_arcface_margin_0.4_test.pkl (4320 bags)
메타 정보 저장 완료 (neg_ratio 분포 포함)


# 셀 7 : 빠른 검증 — Bag 크기 & 라벨 분포

In [7]:
import collections

# Verify the class balance and the neg_ratio distribution of every split
print("===== 클래스 균형 검증 =====")
for split, bags in [("train", bags_train), ("val", bags_val), ("test", bags_test)]:
    pos_cnt = sum(1 for b in bags if b["bag_label"] == 0)
    neg_cnt = sum(1 for b in bags if b["bag_label"] == 1)
    ratio = neg_cnt / pos_cnt if pos_cnt > 0 else 0

    print(f"\n{split.upper()} 세트:")
    print(f"  - Positive (single-writer): {pos_cnt}")
    print(f"  - Negative (multi-writer): {neg_cnt}")
    print(f"  - 비율 (Neg:Pos): {ratio:.2f}:1")

    # Distribution over neg_ratio among the multi-writer bags
    if neg_cnt > 0:
        neg_ratios = [b["neg_ratio"] for b in bags if b["bag_label"] == 1]
        ratio_dist = collections.Counter(neg_ratios)
        print(f"  - Negative ratio 분포:")
        for r in sorted(ratio_dist.keys()):
            print(f"    * {r}: {ratio_dist[r]} bags ({ratio_dist[r]/neg_cnt*100:.1f}%)")

# Show one sample bag per class.  A default is passed to next() so a split
# lacking one of the classes skips its line instead of raising StopIteration.
print("\n===== 샘플 Bag 정보 =====")
sample_pos = next((b for b in bags_train if b["bag_label"] == 0), None)
sample_neg = next((b for b in bags_train if b["bag_label"] == 1), None)

if sample_pos is not None:
    print(f"Positive Bag: shape={sample_pos['bag_emb'].shape}, label={sample_pos['bag_label']}, neg_ratio={sample_pos.get('neg_ratio', 'N/A')}")
if sample_neg is not None:
    print(f"Negative Bag: shape={sample_neg['bag_emb'].shape}, label={sample_neg['bag_label']}, neg_ratio={sample_neg.get('neg_ratio', 'N/A')}")

# Estimate memory usage from the array payloads.  ndarray.nbytes counts the
# element bytes directly, whereas sys.getsizeof is shallow and only includes
# the data buffer for arrays that own it.
import sys  # no longer used in this cell; kept in case later code relies on it
total_size = sum(b['bag_emb'].nbytes for b in bags_train)
print(f"\n훈련 데이터 메모리 사용량 (대략): {total_size / 1024**3:.2f} GB")

===== 클래스 균형 검증 =====

TRAIN 세트:
  - Positive (single-writer): 4321
  - Negative (multi-writer): 8642
  - 비율 (Neg:Pos): 2.00:1
  - Negative ratio 분포:
    * 0.05: 2160 bags (25.0%)
    * 0.1: 2160 bags (25.0%)
    * 0.2: 2161 bags (25.0%)
    * 0.3: 2161 bags (25.0%)

VAL 세트:
  - Positive (single-writer): 1440
  - Negative (multi-writer): 2880
  - 비율 (Neg:Pos): 2.00:1
  - Negative ratio 분포:
    * 0.05: 720 bags (25.0%)
    * 0.1: 720 bags (25.0%)
    * 0.2: 720 bags (25.0%)
    * 0.3: 720 bags (25.0%)

TEST 세트:
  - Positive (single-writer): 1440
  - Negative (multi-writer): 2880
  - 비율 (Neg:Pos): 2.00:1
  - Negative ratio 분포:
    * 0.05: 720 bags (25.0%)
    * 0.1: 720 bags (25.0%)
    * 0.2: 720 bags (25.0%)
    * 0.3: 720 bags (25.0%)

===== 샘플 Bag 정보 =====
Positive Bag: shape=(10, 5, 256), label=0, neg_ratio=0.0
Negative Bag: shape=(10, 5, 256), label=1, neg_ratio=0.1

훈련 데이터 메모리 사용량 (대략): 0.62 GB


# 셀 8 : 요약 출력

In [8]:
# Final summary: one argument per output line, joined by newlines.
print(
    "\n===== Stage‑2 Bag Generation 완료 =====",
    f"저장 경로: {CONFIG['output_dir']}",
    "- Pickle 3 종 + meta.json",
    f"윈도우: {CONFIG['window_size']} / stride {CONFIG['stride']}",
    f"Block:  {CONFIG['bag_block']}  | Negative ratios: {CONFIG['neg_mix_ratio']}",
    "Stage‑3 Notebook(AB_MIL_arcface_256d.ipynb)에서 바로 로드 가능합니다.",
    sep="\n",
)



===== Stage‑2 Bag Generation 완료 =====
저장 경로: /workspace/MIL/data/processed/bags
- Pickle 3 종 + meta.json
윈도우: 5 / stride 1
Block:  10  | Negative ratios: [0.05, 0.1, 0.2, 0.3]
Stage‑3 Notebook(AB_MIL_arcface_256d.ipynb)에서 바로 로드 가능합니다.
