# Custom Dataset Training
Dataset 1, 3, 4를 합쳐서 학습하는 노트북

In [None]:
from pathlib import Path
import os
import sys
import pandas as pd
from IPython.display import display

# The project root is the parent of the current working directory, resolved to
# an absolute path. It is added to sys.path (once) so that the project-local
# imports that follow can be found.
PROJECT_ROOT = Path("..").resolve()
if not any(entry == str(PROJECT_ROOT) for entry in sys.path):
    sys.path.append(str(PROJECT_ROOT))

from core.config import load_training_config
from core.data import available_datasets, build_datasets
from core.train_eval import train_and_evaluate
from core.utils import setup_logging
from model import MODEL_REGISTRY

# Call the project's logging setup helper before anything else runs.
# NOTE(review): presumably configures the logging handlers/format -- confirm
# in core.utils.
setup_logging()

# Baseline training configuration loaded from <project root>/configs/default.yaml.
CONFIG_PATH = PROJECT_ROOT.joinpath("configs", "default.yaml")
BASE_CONFIG = load_training_config(CONFIG_PATH)

# Starts empty; nothing in the visible part of this notebook appends to it.
RUN_HISTORY: list = []

## 1. 하이퍼파라미터 설정

In [None]:
# Presumably the number of samples in Dataset 3 and Dataset 4 -- TODO confirm.
# NOTE(review): both values are identical and neither is referenced in the
# visible cells; verify the Dataset 4 figure is not a copy-paste of Dataset 3.
DATASET_3_LEN = 232_003
DATASET_4_LEN = 232_003


In [None]:
# Name under which the prepared dataset is written and later loaded.
OUTPUT_DATASET_NAME = "custom-dataset"

# Random seed handed to the dataset preparation script.
RANDOM_SEED = 42

# Train/validation split: fraction of samples assigned to the training split.
TRAIN_VAL_RATIO = 0.8  # 80% train, 20% validation

# Sampling ratios applied to Dataset 3 and Dataset 4.
DATASET_3_RATIO = 0.1  # keep 10% of Dataset 3
DATASET_4_RATIO = 0.01  # keep 1% of Dataset 4

# Echo the chosen settings, one per line.
print(
    f"Dataset 3 sampling ratio: {DATASET_3_RATIO:.1%}",
    f"Dataset 4 sampling ratio: {DATASET_4_RATIO:.1%}",
    f"Train/Val ratio: {TRAIN_VAL_RATIO:.0%} / {(1-TRAIN_VAL_RATIO):.0%}",
    f"Random seed: {RANDOM_SEED}",
    sep="\n",
)

## 2. 데이터셋 준비

In [None]:
# Run the dataset preparation script from the project root.
#
# This replaces the `!cd .. && python ...` shell escape for three reasons:
#   * sys.executable guarantees the script runs under the same interpreter
#     (and environment) as this kernel, not whatever `python` is on PATH;
#   * passing the arguments as a list avoids any shell quoting of the values;
#   * a non-zero exit status raises, so the training cell below never runs on
#     a missing or stale dataset.
import subprocess

prepare_cmd = [
    sys.executable,
    "-u",  # unbuffered child output so progress shows up while the script runs
    "prepare_custom_dataset.py",
    "--dataset3-ratio", str(DATASET_3_RATIO),
    "--dataset4-ratio", str(DATASET_4_RATIO),
    "--train-val-ratio", str(TRAIN_VAL_RATIO),
    "--seed", str(RANDOM_SEED),
    "--output-name", str(OUTPUT_DATASET_NAME),
]

# PROJECT_ROOT is the resolved parent directory, i.e. the same directory the
# original `cd ..` switched into.
with subprocess.Popen(
    prepare_cmd,
    cwd=PROJECT_ROOT,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,  # interleave errors with normal output
    text=True,
    errors="replace",  # never abort on undecodable bytes in the script output
) as prepare_proc:
    # Forward the script's output to the cell line by line.
    for output_line in prepare_proc.stdout:
        print(output_line, end="")

# Leaving the `with` block waits for the process, so returncode is set here.
if prepare_proc.returncode != 0:
    raise subprocess.CalledProcessError(prepare_proc.returncode, prepare_cmd)

## 3. 데이터셋 로드 및 학습

In [None]:
# Run settings: train on the dataset named in the hyperparameter cell.
DATASET_NAME = OUTPUT_DATASET_NAME
MODEL_NAME = "bilstm"
EPOCHS = 5

# Load the training config from CONFIG_PATH, overriding only the epoch count.
config = load_training_config(CONFIG_PATH, overrides=dict(epochs=EPOCHS))

# Build the data loaders and vocabulary; batch size, sequence length and
# worker count come from the config, the vocabulary cap is fixed here.
loaders, vocab, tokenizer, info = build_datasets(
    name=DATASET_NAME,
    max_vocab_size=20_000,
    batch_size=config.batch_size,
    max_len=config.max_len,
    num_workers=config.num_workers,
)

# Look up the model class by name and size it to the vocabulary.
# NOTE(review): num_classes=2 is hard-coded -- confirm the combined dataset
# is a two-class task.
model_cls = MODEL_REGISTRY[MODEL_NAME]
model = model_cls(num_classes=2, vocab_size=len(vocab))

# Train and evaluate; the run root is <project root>/runs.
results, run_dir = train_and_evaluate(
    model,
    loaders,
    config,
    run_root=PROJECT_ROOT / "runs",
    model_name=MODEL_NAME,
    dataset_name=DATASET_NAME,
)

# Last expression of the cell: the notebook displays the results.
results