# Custom Dataset Training
Dataset 1 전체와 Dataset 3, 4의 샘플을 합쳐서 학습하는 노트북

## 0. GPU 확인


In [None]:
import torch

# Report whether CUDA is usable and, when it is, describe device 0.
rule = "=" * 60

print(rule)
print("🖥️  GPU 확인")
print(rule)

if not torch.cuda.is_available():
    # No CUDA device: print the installation checklist line by line.
    for hint in (
        "❌ CUDA 사용 불가 (CPU로 학습됩니다)",
        "   GPU를 사용하려면:",
        "   1. NVIDIA GPU가 설치되어 있는지 확인",
        "   2. CUDA Toolkit 설치",
        "   3. PyTorch CUDA 버전 설치",
        "      pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118",
    ):
        print(hint)
else:
    print("✅ CUDA 사용 가능!")
    device_name = torch.cuda.get_device_name(0)
    print(f"   GPU 이름: {device_name}")
    device_count = torch.cuda.device_count()
    print(f"   GPU 개수: {device_count}")
    active_index = torch.cuda.current_device()
    print(f"   현재 GPU: cuda:{active_index}")
    # total_memory is reported in bytes; convert to GiB for display.
    memory_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"   메모리: {memory_gib:.2f} GB")
    print("\n✨ 학습 시 GPU가 자동으로 사용됩니다!")

print(rule)


In [1]:
from pathlib import Path
import os
import sys
import pandas as pd
from IPython.display import display, Image

# Treat the parent of the current working directory as the project root.
PROJECT_ROOT = Path('..').resolve()
# Put the project root on sys.path (once) before the project imports below.
# NOTE(review): assumes the `core` and `model` packages live directly under
# PROJECT_ROOT — confirm against the repository layout.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

from core.config import load_training_config
from core.data import available_datasets, build_datasets
from core.train_eval import train_and_evaluate
from core.utils import setup_logging
from model import MODEL_REGISTRY

# Project logging helper; its exact configuration is defined in core.utils.
setup_logging()
# Default training configuration file and the config object loaded from it.
CONFIG_PATH = PROJECT_ROOT / 'configs' / 'default.yaml'
BASE_CONFIG = load_training_config(CONFIG_PATH)
# One dict per completed training run; the training cell appends to this list
# and the result cells read from it.
RUN_HISTORY = []

## 1. 하이퍼파라미터 설정

In [3]:
# Hyperparameters shared by the dataset-preparation and training cells.

# Fraction of each source dataset to sample.
DATASET_3_RATIO = 0.1   # Dataset 3 sampling ratio (10%)
DATASET_4_RATIO = 0.01  # Dataset 4 sampling ratio (1%)

# Row counts of the full source datasets and the expected sample sizes.
# The sample sizes are derived from the *_LEN constants so that the two can
# never drift apart; they stay floats and are rounded only for display.
DATASET_3_LEN = 26938
DATASET_4_LEN = 232003
DATASET_3_SAMPLE_LEN = DATASET_3_LEN * DATASET_3_RATIO
DATASET_4_SAMPLE_LEN = DATASET_4_LEN * DATASET_4_RATIO

# Train/validation split ratio.
TRAIN_VAL_RATIO = 0.8   # Train 80%, Val 20%

# Random seed.
RANDOM_SEED = 42

# Name of the generated dataset.
OUTPUT_DATASET_NAME = "custom-dataset"

# Model and training settings.
MODEL_NAME = 'bilstm'  # 'bow_mlp', 'cnn_text', 'bilstm', 'tiny_transformer'
EPOCHS = 5

# Echo the chosen settings so they are captured in the notebook output.
print("=" * 60)
print("하이퍼파라미터 설정")
print("=" * 60)
print(f"Dataset 3 sampling : {DATASET_3_RATIO:.1%} -> {round(DATASET_3_SAMPLE_LEN)} rows")
print(f"Dataset 4 sampling : {DATASET_4_RATIO:.1%} -> {round(DATASET_4_SAMPLE_LEN)} rows")
print(f"Train/Val ratio: {TRAIN_VAL_RATIO:.0%} / {(1-TRAIN_VAL_RATIO):.0%}")
print(f"Random seed: {RANDOM_SEED}")
print(f"Output dataset name: {OUTPUT_DATASET_NAME}")
print(f"Model: {MODEL_NAME}")
print(f"Epochs: {EPOCHS}")
print("=" * 60)

하이퍼파라미터 설정
Dataset 3 sampling : 10.0% -> 2694 rows
Dataset 4 sampling : 1.0% -> 2320 rows
Train/Val ratio: 80% / 20%
Random seed: 42
Output dataset name: custom-dataset
Model: bilstm
Epochs: 5


## 2. 데이터셋 준비

**주의**: 데이터셋이 이미 생성되어 있으면 아래 셀을 건너뛰세요.

In [4]:
# Run the dataset preparation script from the parent directory.
# IPython expands each {NAME} placeholder to the value of the Python variable
# of that name (set in the hyperparameter cell) before invoking the shell.
# NOTE(review): `cd ..` is relative to the notebook's working directory, which
# is also what PROJECT_ROOT is resolved from — confirm the kernel cwd is the
# notebook folder.
!cd .. && python prepare_custom_dataset.py \
    --dataset3-ratio {DATASET_3_RATIO} \
    --dataset4-ratio {DATASET_4_RATIO} \
    --train-val-ratio {TRAIN_VAL_RATIO} \
    --seed {RANDOM_SEED} \
    --output-name {OUTPUT_DATASET_NAME}

2025-10-28 17:43:22,836 - INFO - 
=== Preparing Train/Val Data ===
2025-10-28 17:43:22,836 - INFO - Dataset 3 sample ratio: 10.0%
2025-10-28 17:43:22,836 - INFO - Dataset 4 sample ratio: 1.0%
2025-10-28 17:43:22,836 - INFO - Train/Val split ratio: 0.80/0.20
2025-10-28 17:43:22,836 - INFO - Loading dataset 1 - train: train.csv
2025-10-28 17:43:23,273 - INFO -   - Loaded 24353 rows
2025-10-28 17:43:23,273 - INFO - Loading dataset 1 - val: val.csv
2025-10-28 17:43:23,404 - INFO -   - Loaded 8117 rows
2025-10-28 17:43:23,405 - INFO - Dataset 1 total: 32470 rows
2025-10-28 17:43:23,405 - INFO - Loading dataset 3 - train: train.csv
2025-10-28 17:43:23,822 - INFO -   - Sampled 2694/26938 rows (10.0%)
2025-10-28 17:43:23,822 - INFO - Loading dataset 4 - train: train.csv
2025-10-28 17:43:24,975 - INFO -   - Sampled 2320/232003 rows (1.0%)
2025-10-28 17:43:24,977 - INFO - 
Combined total: 37484 rows
2025-10-28 17:43:24,978 - INFO - Label distribution:
2025-10-28 17:43:24,978 - INFO -   - 1: 1971

## 3. 데이터셋 확인

In [5]:
# Inspect the generated dataset: row count and label distribution per split.
dataset_path = PROJECT_ROOT / 'dataset' / OUTPUT_DATASET_NAME

if not dataset_path.exists():
    print(f"❌ Dataset not found: {dataset_path}")
    print("Please run the data preparation cell above.")
else:
    print(f"✅ Dataset found: {dataset_path}\n")

    for split in ('train', 'val', 'test'):
        csv_file = dataset_path / f"{split}.csv"
        # Splits without a CSV file are silently skipped.
        if not csv_file.exists():
            continue

        df = pd.read_csv(csv_file)
        print(f"{split.upper()}:")
        print(f"  - Total: {len(df)} rows")
        # value_counts() yields labels ordered by descending frequency.
        for label, count in df['label'].value_counts().items():
            print(f"  - Label {label}: {count} ({count/len(df)*100:.1f}%)")
        print()

✅ Dataset found: /Users/dorong/Desktop/hackerthonTA/fake-news-detection/dataset/custom-dataset

TRAIN:
  - Total: 29986 rows
  - Label 1: 15774 (52.6%)
  - Label 0: 14212 (47.4%)

VAL:
  - Total: 7498 rows
  - Label 1: 3944 (52.6%)
  - Label 0: 3554 (47.4%)

TEST:
  - Total: 55764 rows
  - Label 0: 35495 (63.7%)
  - Label 1: 20269 (36.3%)



## 4. 모델 학습

In [None]:
# Train MODEL_NAME on the prepared dataset and append the outcome to RUN_HISTORY.
DATASET_NAME = OUTPUT_DATASET_NAME

print(f"Loading dataset: {DATASET_NAME}")
print(f"Model: {MODEL_NAME}")
print(f"Epochs: {EPOCHS}\n")

# Reload the config from CONFIG_PATH, overriding only the epoch count.
config = load_training_config(CONFIG_PATH, overrides={'epochs': EPOCHS})
# NOTE(review): `tokenizer` and `info` are returned but not used in this cell.
loaders, vocab, tokenizer, info = build_datasets(
    name=DATASET_NAME,
    batch_size=config.batch_size,
    max_len=config.max_len,
    num_workers=config.num_workers,
    max_vocab_size=20000,  # hard-coded here rather than taken from config
)

print(f"\nVocab size: {len(vocab)}")
print(f"Batch size: {config.batch_size}")
print(f"Max length: {config.max_len}\n")

# Look up the model class by name and size its vocabulary from the built vocab.
# num_classes=2 matches the 0/1 labels shown by the dataset check cell.
model_cls = MODEL_REGISTRY[MODEL_NAME]
model = model_cls(vocab_size=len(vocab), num_classes=2)

# Run artifacts are written under PROJECT_ROOT / 'runs'; the returned run_dir
# is what the result cells use to locate the confusion matrix image.
results, run_dir = train_and_evaluate(
    model,
    loaders,
    config,
    dataset_name=DATASET_NAME,
    model_name=MODEL_NAME,
    run_root=PROJECT_ROOT / 'runs',
)

# Save the run record.
# NOTE(review): RANDOM_SEED is recorded here and passed to the preparation
# script, but this cell does not pass it to train_and_evaluate — confirm
# whether training seeding is handled through the config.
RUN_HISTORY.append({
    'dataset': DATASET_NAME,
    'model': MODEL_NAME,
    'results': results,
    'run_dir': str(run_dir),
    'hyperparameters': {
        'dataset_3_ratio': DATASET_3_RATIO,
        'dataset_4_ratio': DATASET_4_RATIO,
        'train_val_ratio': TRAIN_VAL_RATIO,
        'seed': RANDOM_SEED,
        'epochs': EPOCHS
    }
})

print("\n" + "=" * 60)
print("학습 완료!")
print("=" * 60)

Loading dataset: custom-dataset
Model: bilstm
Epochs: 5


Vocab size: 19996
Batch size: 64
Max length: 256



[2025-10-28 17:45:47] INFO fake_news: Starting training | dataset=custom-dataset model=bilstm epochs=5 batch_size=64
[2025-10-28 17:56:54] INFO fake_news: Epoch 1 | train_loss=0.3743 val_loss=0.3318 val_f1=0.8850
[2025-10-28 18:11:19] INFO fake_news: Epoch 2 | train_loss=0.3131 val_loss=0.3268 val_f1=0.8869
[2025-10-28 18:23:45] INFO fake_news: Epoch 3 | train_loss=0.3013 val_loss=0.3202 val_f1=0.8944
[2025-10-28 18:37:16] INFO fake_news: Epoch 4 | train_loss=0.2869 val_loss=0.3212 val_f1=0.8992


## 5. 결과 확인

In [None]:
# Show the most recent run: metrics table plus the confusion matrix image.
if not RUN_HISTORY:
    # Without a recorded run, RUN_HISTORY[-1] would raise IndexError; print
    # the same hint the experiment-summary cell uses instead.
    print('아직 실행된 실험이 없습니다. 위의 학습 셀을 먼저 실행하세요.')
else:
    latest = RUN_HISTORY[-1]

    print(f"Dataset: {latest['dataset']}")
    print(f"Model: {latest['model']}")
    print(f"Run directory: {latest['run_dir']}\n")

    # Metrics table: transposed so that each row is one entry of `results`.
    metrics_df = pd.DataFrame(latest['results']).T
    print("\n📊 성능 메트릭:")
    display(metrics_df)

    # Confusion matrix, if the run directory contains one.
    cm_path = Path(latest['run_dir']) / 'confusion_matrix.png'
    if cm_path.exists():
        print("\n📈 Confusion Matrix:")
        display(Image(filename=str(cm_path)))
    else:
        print('\n⚠️  Confusion matrix not available for this run.')

## 6. 상세 결과 분석

In [None]:
# Print every metric of the most recent run, split by split.
if not RUN_HISTORY:
    # Without a recorded run, RUN_HISTORY[-1] would raise IndexError; print
    # the same hint the experiment-summary cell uses instead.
    print('아직 실행된 실험이 없습니다. 위의 학습 셀을 먼저 실행하세요.')
else:
    latest = RUN_HISTORY[-1]

    print("=" * 60)
    print("상세 결과 분석")
    print("=" * 60)

    for split in ['train', 'val', 'test']:
        # Only splits present in the results are reported.
        if split in latest['results']:
            metrics = latest['results'][split]
            print(f"\n{split.upper()} 결과:")
            print(f"  - Accuracy:  {metrics['accuracy']:.4f} ({metrics['accuracy']*100:.2f}%)")
            print(f"  - Precision: {metrics['precision']:.4f}")
            print(f"  - Recall:    {metrics['recall']:.4f}")
            print(f"  - F1 Score:  {metrics['f1']:.4f}")
            print(f"  - AUROC:     {metrics['auroc']:.4f}")
            print(f"  - Loss:      {metrics['loss']:.6f}")

    print("\n" + "=" * 60)

## 7. 하이퍼파라미터 확인

In [None]:
# List the hyperparameters recorded for the most recent run.
if not RUN_HISTORY:
    # Without a recorded run, RUN_HISTORY[-1] would raise IndexError; print
    # the same hint the experiment-summary cell uses instead.
    print('아직 실행된 실험이 없습니다. 위의 학습 셀을 먼저 실행하세요.')
else:
    latest = RUN_HISTORY[-1]

    print("사용된 하이퍼파라미터:")
    print("=" * 60)
    for key, value in latest['hyperparameters'].items():
        # Ratios below 1 are additionally shown as a percentage.
        if 'ratio' in key and value < 1:
            print(f"{key}: {value} ({value*100:.1f}%)")
        else:
            print(f"{key}: {value}")
    print("=" * 60)

## 8. 전체 실험 기록 (선택사항)

여러 번 실험한 경우 모든 결과를 비교합니다.

In [None]:
# Compare every recorded run in one table and highlight the best val_f1.
if not RUN_HISTORY:
    print('아직 실행된 실험이 없습니다. 위의 학습 셀을 먼저 실행하세요.')
else:
    summary_rows = []

    for exp_id, record in enumerate(RUN_HISTORY, start=1):
        run_results = record['results']
        # Column order: identifiers, hyperparameters, val_* metrics, test_* metrics.
        summary_rows.append({
            'exp_id': exp_id,
            'dataset': record['dataset'],
            'model': record['model'],
            **record.get('hyperparameters', {}),
            **{f'val_{name}': score for name, score in run_results.get('val', {}).items()},
            **{f'test_{name}': score for name, score in run_results.get('test', {}).items()},
        })

    summary_df = pd.DataFrame(summary_rows)

    print(f"\n전체 실험 기록 ({len(RUN_HISTORY)}개):")
    display(summary_df)

    # Pick the row with the highest validation F1, when that column exists.
    if 'val_f1' in summary_df.columns:
        best_idx = summary_df['val_f1'].idxmax()
        print("\n🏆 Best result (by val_f1):")
        print(summary_df.loc[best_idx])