In [1]:
# -*- coding: utf-8 -*-

import os
import json
import random
from pathlib import Path
from typing import Dict

from mmengine.config import Config
from mmengine.runner import Runner
from mmdet.utils import register_all_modules

# -----------------------
# Fixed paths
# -----------------------

# Root of the local MMDetection checkout; the base config file is looked up under it.
MMD_ROOT = Path("/data/ephemeral/home/model/baseline/mmdetection")

# Dataset root and the full (unsplit) train / test annotation JSON files.
FULL_DATA_ROOT = Path("/data/ephemeral/home/model/dataset")
TRAIN_JSON_FULL = FULL_DATA_ROOT / "train.json"
TEST_JSON_FULL  = FULL_DATA_ROOT / "test.json"

# Pin the location where training artifacts are stored.
WORK_DIR = Path("/data/ephemeral/home/model/work_dirs_single")
WORK_DIR.mkdir(parents=True, exist_ok=True)  # created eagerly as soon as this cell runs

# Image scale applied to every resize transform of the pipelines.
IMAGE_SCALE = (1024, 1024)

# Class names; the number of entries also determines the model's num_classes.
CLASSES = (
    "General trash",
    "Paper",
    "Paper pack",
    "Metal",
    "Glass",
    "Plastic",
    "Styrofoam",
    "Plastic bag",
    "Battery",
    "Clothing",
)

# Default experiment name (used as the sub-directory of WORK_DIR).
EXP_NAME = "cascade_rcnn_r50_1024_single"

# Train/val split ratio and the seed shared by the split and the training run.
TRAIN_RATIO = 0.85  # puts slightly more weight on training data than a 0.8/0.2 split
RANDOM_SEED = 42

# Echo the resolved paths so a wrong mount is noticed before training starts.
print("MMD_ROOT:", MMD_ROOT)
print("TRAIN_JSON_FULL:", TRAIN_JSON_FULL)
print("TEST_JSON_FULL :", TEST_JSON_FULL)
print("WORK_DIR:", WORK_DIR)


MMD_ROOT: /data/ephemeral/home/model/baseline/mmdetection
TRAIN_JSON_FULL: /data/ephemeral/home/model/dataset/train.json
TEST_JSON_FULL : /data/ephemeral/home/model/dataset/test.json
WORK_DIR: /data/ephemeral/home/model/work_dirs_single


In [2]:
def load_coco(json_path: Path) -> Dict:
    """Read a JSON annotation file and return its parsed content.

    Args:
        json_path: Location of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (a dict for a COCO
        annotation file).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Pin the encoding: without it ``open`` falls back to the locale default,
    # which can fail on or mis-decode non-ASCII text in the annotations.
    with open(json_path, "r", encoding="utf-8") as f:
        return json.load(f)

def save_coco(data: Dict, json_path: Path) -> None:
    """Serialize ``data`` as JSON into ``json_path``.

    Missing parent directories of ``json_path`` are created first; an
    existing file at that location is overwritten.
    """
    target_dir = json_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    with json_path.open("w") as out_file:
        json.dump(data, out_file)

def make_train_val_split(
    src_json: Path,
    out_dir: Path,
    train_ratio: float = 0.85,
    seed: int = 42
) -> Dict[str, Path]:
    """
    Randomly split a COCO-style annotation file into train/val files by image id.

    Splitting happens per image, so all annotations of one image end up in
    the same split.

    Args:
        src_json: Annotation file to split; must contain "images" and
            "annotations" lists.
        out_dir: Directory that receives "train_split.json" and
            "val_split.json" (created if missing, files overwritten).
        train_ratio: Fraction of images assigned to the train split,
            between 0 and 1 inclusive.
        seed: Seed for the shuffle, so the same input yields the same split.

    Returns:
        ``{"train": <train json path>, "val": <val json path>}``.

    Raises:
        ValueError: If ``train_ratio`` is outside [0, 1].
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be within [0, 1], got {train_ratio}")

    data = load_coco(src_json)
    images = data["images"]
    anns = data["annotations"]

    # Use a private generator instead of reseeding the module-level one, so
    # calling this function does not disturb other users of ``random``.
    rng = random.Random(seed)
    img_ids = [img["id"] for img in images]
    rng.shuffle(img_ids)

    split_idx = int(len(img_ids) * train_ratio)
    train_ids = set(img_ids[:split_idx])
    val_ids = set(img_ids[split_idx:])

    def _subset(ids: set) -> Dict:
        # Keep every other top-level key of the source file untouched and
        # narrow only the image and annotation lists.
        return {
            **data,
            "images": [img for img in images if img["id"] in ids],
            "annotations": [ann for ann in anns if ann["image_id"] in ids],
        }

    out_dir.mkdir(parents=True, exist_ok=True)
    train_json = out_dir / "train_split.json"
    val_json = out_dir / "val_split.json"

    save_coco(_subset(train_ids), train_json)
    save_coco(_subset(val_ids), val_json)

    return {"train": train_json, "val": val_json}


In [3]:
def set_img_scale(pipeline, scale):
    """
    Force every Resize / RandomResize / RandomChoiceResize entry to one scale.

    The pipeline is modified in place. Nested lists and the "transforms"
    entry of any dict are walked recursively; entries that are neither a
    list nor a dict are ignored. Only keys already present on a resize
    entry are overwritten ("scale", "img_scale", and "scales" as a
    one-element list).
    """
    resize_types = ("Resize", "RandomResize", "RandomChoiceResize")

    for step in pipeline:
        if isinstance(step, list):
            # A list of transforms nested inside the pipeline.
            set_img_scale(step, scale)
        elif isinstance(step, dict):
            if step.get("type") in resize_types:
                overrides = (
                    ("scale", scale),
                    ("img_scale", scale),
                    ("scales", [scale]),
                )
                for key, value in overrides:
                    if key in step:
                        step[key] = value

            # Wrapper transforms carry their children under "transforms".
            if "transforms" in step:
                set_img_scale(step["transforms"], scale)

def set_num_classes(model_cfg, num_classes: int):
    """
    Overwrite every "num_classes" entry found anywhere inside ``model_cfg``.

    Dicts and lists are traversed to arbitrary depth and modified in place;
    any other value (including tuples) is left untouched.
    """
    pending = [model_cfg]
    while pending:
        node = pending.pop()
        if isinstance(node, dict):
            if "num_classes" in node:
                node["num_classes"] = num_classes
            # Queue the children after the overwrite so a replaced value is
            # never descended into.
            pending.extend(node.values())
        elif isinstance(node, list):
            pending.extend(node)


In [4]:
def build_cascade_cfg(split_json: Dict[str, Path]) -> Config:
    """
    Build the config used to train a single Cascade R-CNN model.

    Loads the Cascade R-CNN R50-FPN base config from ``MMD_ROOT`` and
    overrides datasets, evaluators, image scale, class count, loader
    settings, schedule, learning rate, hooks, work dir and seed.

    Args:
        split_json: Mapping with "train" and "val" keys holding the
            annotation file of each split.

    Returns:
        The customized config, ready for ``Runner.from_cfg``.

    Raises:
        FileNotFoundError: If the base config file does not exist.
    """
    register_all_modules(init_default_scope=True)

    base_cfg_rel = "configs/cascade_rcnn/cascade-rcnn_r50_fpn_1x_coco.py"
    base_cfg_path = MMD_ROOT / base_cfg_rel

    if not base_cfg_path.exists():
        raise FileNotFoundError(f"base config 없음: {base_cfg_path}")

    cfg = Config.fromfile(str(base_cfg_path))
    cfg.default_scope = "mmdet"

    # Dataset root
    cfg.data_root = str(FULL_DATA_ROOT)

    # Point the train/val datasets at the split annotation files.
    for key, ann_path in [
        ("train_dataloader", split_json["train"]),
        ("val_dataloader", split_json["val"]),
    ]:
        loader = cfg[key]
        ds_cfg = loader["dataset"] if "dataset" in loader else loader

        ds_cfg.metainfo = dict(classes=CLASSES)
        ds_cfg.data_root = str(FULL_DATA_ROOT)

        # The annotation file is passed as an absolute path.
        ds_cfg.ann_file = str(ann_path)
        ds_cfg.data_prefix = dict(img="")

    def _val_bbox_evaluator() -> dict:
        # Fresh dict per call so the val and test evaluators share no state.
        return dict(
            type="CocoMetric",
            ann_file=str(split_json["val"]),
            metric="bbox",
            iou_thrs=[0.5],
            format_only=False,
        )

    # Both evaluators must be replaced; otherwise the base config's default
    # COCO annotation path is still referenced.
    cfg.val_evaluator = _val_bbox_evaluator()
    cfg.test_evaluator = _val_bbox_evaluator()

    # Fix the image scale of the train and val pipelines.
    set_img_scale(cfg.train_dataloader.dataset.pipeline, IMAGE_SCALE)
    set_img_scale(cfg.val_dataloader.dataset.pipeline, IMAGE_SCALE)

    # Number of classes
    set_num_classes(cfg.model, len(CLASSES))

    # Batch size / workers
    batch_size = 2
    cfg.train_dataloader.batch_size = batch_size
    cfg.train_dataloader.num_workers = 4
    cfg.val_dataloader.batch_size = 1
    cfg.val_dataloader.num_workers = 2

    # The test loader reuses the val loader so runner.test() reports val mAP.
    # It is assigned only after the val loader is fully configured, so the
    # result is correct whether the assignment copies or aliases the config.
    cfg.test_dataloader = cfg.val_dataloader
    cfg.test_dataloader.batch_size = 1
    cfg.test_dataloader.num_workers = 2

    # Epoch budget.
    # NOTE(review): the original note called this "slightly longer than 1x",
    # but the base file is itself a 1x config - confirm 12 really extends it.
    cfg.train_cfg.max_epochs = 12
    cfg.train_cfg.val_interval = 1
    cfg.val_cfg = dict(type="ValLoop")

    # Linear LR scaling for a single GPU, derived from the batch size set
    # above so the two cannot drift apart.
    base_lr = cfg.optim_wrapper.optimizer.get("lr", 0.02)
    base_batch_size = cfg.get("auto_scale_lr", {}).get("base_batch_size", 16)
    cfg.optim_wrapper.optimizer["lr"] = base_lr * batch_size / base_batch_size

    # Checkpoint / logging intervals
    cfg.default_hooks["checkpoint"]["interval"] = 1
    cfg.default_hooks["checkpoint"]["max_keep_ckpts"] = 2
    cfg.default_hooks["logger"]["interval"] = 50

    # Fixed work dir
    cfg.work_dir = str(WORK_DIR / EXP_NAME)

    # Seed
    cfg.randomness = dict(seed=RANDOM_SEED, deterministic=False)

    return cfg


In [5]:
# Create the train/val split files under the dataset root.
split_dir = FULL_DATA_ROOT / "splits_single"
split_json = make_train_val_split(
    src_json=TRAIN_JSON_FULL,
    out_dir=split_dir,
    train_ratio=TRAIN_RATIO,
    seed=RANDOM_SEED
)

print("train split:", split_json["train"])
print("val split  :", split_json["val"])

# Build the training config from the two split files.
cfg = build_cascade_cfg(split_json)

# Train
runner = Runner.from_cfg(cfg)
runner.train()

# Evaluate on the val split (the test loader/evaluator point at it, IoU threshold 0.5).
metrics = runner.test()
print("val metrics:", metrics)

print("학습 완료. work_dir:", cfg.work_dir)


train split: /data/ephemeral/home/model/dataset/splits_single/train_split.json
val split  : /data/ephemeral/home/model/dataset/splits_single/val_split.json
12/06 06:58:15 - mmengine - INFO - 
------------------------------------------------------------
System environment:
    sys.platform: linux
    Python: 3.10.13 (main, Sep 11 2023, 13:44:35) [GCC 11.2.0]
    CUDA available: True
    MUSA available: False
    numpy_random_seed: 42
    GPU 0: Tesla V100-SXM2-32GB
    CUDA_HOME: None
    GCC: n/a
    PyTorch: 2.1.2+cu121
    PyTorch compiling details: PyTorch built with:
  - GCC 9.3
  - C++ Version: 201703
  - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v3.1.1 (Git Hash 64f6bcbcbab628e96f33a62c3e975f8535a7bde4)
  - OpenMP 201511 (a.k.a. OpenMP 4.5)
  - LAPACK is enabled (usually provided by MKL)
  - NNPACK is enabled
  - CPU capability usage: AVX512
  - CUDA Runtime 12.1
  - NVCC a

/bin/sh: 1: gcc: not found


12/06 06:58:15 - mmengine - INFO - Config:
auto_scale_lr = dict(base_batch_size=16, enable=False)
backend_args = None
data_root = '/data/ephemeral/home/model/dataset'
dataset_type = 'CocoDataset'
default_hooks = dict(
    checkpoint=dict(interval=1, max_keep_ckpts=2, type='CheckpointHook'),
    logger=dict(interval=50, type='LoggerHook'),
    param_scheduler=dict(type='ParamSchedulerHook'),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    timer=dict(type='IterTimerHook'),
    visualization=dict(type='DetVisualizationHook'))
default_scope = 'mmdet'
env_cfg = dict(
    cudnn_benchmark=False,
    dist_cfg=dict(backend='nccl'),
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0))
load_from = None
log_level = 'INFO'
log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50)
model = dict(
    backbone=dict(
        depth=50,
        frozen_stages=1,
        init_cfg=dict(checkpoint='torchvision://resnet50', type='Pretrained'),
        norm_cfg=dict(

: 

: 

: 

: 