# 📄 Document type classification baseline code with WandB Integration



In [1]:

# =============================================================================
# 0. Prepare Environments & Install Libraries
# =============================================================================

# 필요한 라이브러리를 설치합니다.
#!pip install -r ../requirements.txt
#!pip install transformers==4.44.0
#!pip install easyocr
#!pip install datasets
#!pip install accelerate

In [2]:
# 현재 노트북에서 바로 실행하세요
import torch
import gc

def quick_cleanup():
    """즉시 사용 가능한 빠른 메모리 정리"""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    print("메모리 정리 완료")

# 바로 실행
quick_cleanup()

메모리 정리 완료


In [3]:
from transformers.utils import move_cache
move_cache()

0it [00:00, ?it/s]

In [4]:
import numpy as np
import os
import time
import random
import copy
import json

import optuna, math
import torch
import albumentations as A
import pandas as pd
#import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from albumentations.pytorch import ToTensorV2
from torch.optim import Adam
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.cuda.amp import autocast, GradScaler  # Mixed Precision용

from PIL import Image
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# WandB 관련 import 추가
import wandb
from datetime import datetime

# LayoutLMv3 관련 import
from transformers import LayoutLMv3Processor, LayoutLMv3ForSequenceClassification
import easyocr
from datasets import Dataset as HFDataset  # Hugging Face Dataset (옵션)

# OCR 초기화
reader = easyocr.Reader(['en'], gpu=True)  # 영어 문서 가정

# OCR 함수
def extract_ocr(image_path, max_words=512):
    """이미지에서 텍스트와 바운딩 박스 추출"""
    try:
        results = reader.readtext(image_path)
        words = []
        boxes = []
        for (bbox, text, conf) in results:
            if conf > 0.5:  # 신뢰도 필터
                words.append(text.strip())
                # 바운딩 박스: [x0, y0, x1, y1] → LayoutLMv3 형식 (0-1000 스케일)
                x0, y0 = bbox[0][0], bbox[0][1]
                x1, y1 = bbox[2][0], bbox[2][1]
                # 이미지 크기 기준 정규화 (0-1000)
                img = Image.open(image_path)
                w, h = img.size
                box = [int(1000 * (x0 / w)), int(1000 * (y0 / h)), 
                       int(1000 * (x1 / w)), int(1000 * (y1 / h))]
                boxes.append(box)
                if len(words) >= max_words:
                    break
        return words, boxes
    except Exception as e:
        print(f"OCR error for {image_path}: {e}")
        return [], []

# OCR 캐싱 (훈련 전에 실행)
def prepare_ocr_cache(df, img_path):
    """DataFrame에 OCR 결과를 추가하고 저장"""
    ocr_cache = {}
    for idx, row in df.iterrows():
        image_id = row['ID']
        img_full_path = os.path.join(img_path, image_id)
        words, boxes = extract_ocr(img_full_path)
        df.at[idx, 'words'] = json.dumps(words)
        df.at[idx, 'boxes'] = json.dumps(boxes)
        ocr_cache[image_id] = (words, boxes)
    df.to_csv(os.path.join(img_path, 'with_ocr.csv'), index=False)
    return ocr_cache

# 프로세서 로드
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")

In [5]:
 
"""
🚀 팀원 사용 가이드:

1. WandB 계정 생성: https://wandb.ai/signup
2. 이 셀 실행 시 로그인 프롬프트가 나타나면 개인 API 키 입력
3. EXPERIMENT_NAME을 다음과 같이 변경:
   - "member1-baseline"
   - "member2-augmentation-test"  
   - "member3-hyperparameter-tuning"
   등등 각자 다른 이름 사용

4. 팀 대시보드 URL: [여기에 당신의 프로젝트 URL 추가]

⚠️ 주의사항:
- 절대 API 키를 코드에 하드코딩하지 마세요
- EXPERIMENT_NAME만 변경하고 PROJECT_NAME은 그대로 두세요
- 각자 개인 계정으로 로그인해서 실험을 추가하세요
"""

# WandB 로그인 (각자 실행)
try:
    if wandb.api.api_key is None:
        print("WandB에 로그인이 필요합니다.")
        wandb.login()
    else:
        print(f"WandB 로그인 상태: {wandb.api.viewer()['username']}")
except:
    print("WandB 로그인을 진행합니다...")
    wandb.login()

# 프로젝트 설정 (각자 수정할 부분)
PROJECT_NAME = "document-classification-team-CV"  # 모든 팀원 동일
ENTITY = None  # 각자 개인 계정 사용
EXPERIMENT_NAME = "layoutlmv3-baseline"  # 팀원별로 변경 (예: "member1-hyperopt", "member2-augmentation")

print(f"프로젝트: {PROJECT_NAME}")
print(f"실험명: {EXPERIMENT_NAME}")

WandB 로그인 상태: kimsunmin0227
프로젝트: document-classification-team-CV
실험명: layoutlmv3-baseline


In [6]:
 
# 시드를 고정합니다.
SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.benchmark = True


In [7]:
import os
import json
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import LayoutLMv3Processor, LayoutLMv3ForSequenceClassification
from PIL import Image

# 환경 변수 설정 (MKL/OpenMP 충돌 방지)
os.environ["MKL_SERVICE_FORCE_INTEL"] = "1"
os.environ["MKL_THREADING_LAYER"] = "GNU"

# 데이터 경로
DATA_ROOT = "/root/computervisioncompetition-cv-1/mywork/data"

# 데이터셋 클래스
class LayoutLMv3Dataset(Dataset):
    def __init__(self, df, img_path, processor, max_length=512):
        self.df = df
        self.img_path = img_path
        self.processor = processor
        self.max_length = max_length
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_path = os.path.join(self.img_path, row['ID'])
        
        try:
            words = json.loads(row['words']) if isinstance(row['words'], str) else row['words']
            boxes = json.loads(row['boxes']) if isinstance(row['boxes'], str) else row['boxes']
        except:
            words, boxes = [], []
        
        # 빈 리스트 처리
        if not words or not boxes:
            words = [""]
            boxes = [[0, 0, 1, 1]]
        
        # 길이 맞추기
        if len(words) != len(boxes):
            min_len = min(len(words), len(boxes))
            words = words[:min_len]
            boxes = boxes[:min_len]
        
        label = int(row['target'])
        image = Image.open(image_path).convert("RGB")
        
        # Processor에서 padding과 truncation 명시적 설정
        encoding = self.processor(
            image, 
            text=words, 
            boxes=boxes, 
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length
        )
        
        # 모든 tensor를 squeeze하여 배치 차원 제거
        result = {}
        for key, value in encoding.items():
            if isinstance(value, torch.Tensor):
                result[key] = value.squeeze(0)
            else:
                result[key] = value
        
        return result, label

# Custom collate function
def collate_fn(batch):
    """
    Custom collate function for LayoutLMv3
    """
    inputs = {}
    labels = []
    
    # 첫 번째 샘플에서 키 확인
    sample_keys = batch[0][0].keys()
    
    for key in sample_keys:
        inputs[key] = torch.stack([item[0][key] for item in batch])
    
    labels = torch.tensor([item[1] for item in batch], dtype=torch.long)
    
    return inputs, labels

if __name__ == "__main__":
    # 프로세서 및 모델
    processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
    model = LayoutLMv3ForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=17)
    
    # 데이터셋
    train_df = pd.read_csv(os.path.join(DATA_ROOT, 'train/with_ocr.csv'))
    print("Unique labels:", sorted(train_df['target'].unique()))
    print("Number of unique labels:", len(train_df['target'].unique()))
    print("Dataset size:", len(train_df))
    
    # 몇 개 샘플로 먼저 테스트
    test_df = train_df.head(10).copy()  # 작은 샘플로 테스트
    
    train_dataset = LayoutLMv3Dataset(test_df, os.path.join(DATA_ROOT, 'train/'), processor)
    train_loader = DataLoader(
        train_dataset, 
        batch_size=2,  # 작은 배치 크기로 시작
        shuffle=True, 
        num_workers=0,  # 디버깅을 위해 0으로 설정
        collate_fn=collate_fn  # 커스텀 collate function 사용
    )
    
    # GPU 설정
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    model.to(device)
    
    # 학습 루프
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
    
    print("Starting training...")
    for epoch in range(2):  # 테스트를 위해 2 에포크만
        print(f"Starting epoch {epoch}")
        total_loss = 0
        num_batches = 0
        
        for batch_idx, (inputs, labels) in enumerate(train_loader):
            try:
                # 입력을 GPU로 이동
                inputs = {k: v.to(device) for k, v in inputs.items()}
                labels = labels.to(device)
                
                # Forward pass
                outputs = model(**inputs, labels=labels)
                loss = outputs.loss
                
                # Backward pass
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                
                total_loss += loss.item()
                num_batches += 1
                
                print(f"Epoch {epoch}, Batch {batch_idx}, Loss: {loss.item():.4f}")
                
                # 메모리 정리
                del inputs, labels, outputs, loss
                torch.cuda.empty_cache() if torch.cuda.is_available() else None
                
            except Exception as e:
                print(f"Error in batch {batch_idx}: {e}")
                print("Input shapes:")
                for k, v in inputs.items():
                    print(f"  {k}: {v.shape}")
                print(f"Labels shape: {labels.shape}")
                break
        
        avg_loss = total_loss / num_batches if num_batches > 0 else 0
        print(f"Epoch {epoch} completed. Average loss: {avg_loss:.4f}")
    
    print("Training completed successfully!")
    
    # 간단한 추론 테스트
    print("\nTesting inference...")
    model.eval()
    with torch.no_grad():
        for batch_idx, (inputs, labels) in enumerate(train_loader):
            inputs = {k: v.to(device) for k, v in inputs.items()}
            outputs = model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=-1)
            print(f"Batch {batch_idx} predictions: {predictions.cpu().tolist()}")
            print(f"Batch {batch_idx} actual labels: {labels.tolist()}")
            if batch_idx >= 2:  # 몇 개 배치만 테스트
                break
    
    print("Inference test completed!")

Some weights of LayoutLMv3ForSequenceClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unique labels: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
Number of unique labels: 17
Dataset size: 1570
Using device: cuda
Starting training...
Starting epoch 0
Epoch 0, Batch 0, Loss: 2.9736
Epoch 0, Batch 1, Loss: 2.7821
Epoch 0, Batch 2, Loss: 2.4954
Epoch 0, Batch 3, Loss: 2.5653
Epoch 0, Batch 4, Loss: 3.0934
Epoch 0 completed. Average loss: 2.7819
Starting epoch 1
Epoch 1, Batch 0, Loss: 2.2907
Epoch 1, Batch 1, Loss: 2.4921
Epoch 1, Batch 2, Loss: 2.2888
Epoch 1, Batch 3, Loss: 2.4479
Epoch 1, Batch 4, Loss: 2.3286
Epoch 1 completed. Average loss: 2.3696
Training completed successfully!

Testing inference...
Batch 0 predictions: [16, 16]
Batch 0 actual labels: [16, 4]
Batch 1 predictions: [16, 16]
Batch 1 actual labels: [15, 14]
Batch 2 predictions: [16, 16]
Batch 2 actual labels: [10, 15]
Inference test completed!


In [8]:
# Cutout (Random Erasing) 함수 정의
def random_erasing(image, p=0.5, scale=(0.02, 0.33), ratio=(0.3, 3.3)):
    if random.random() > p:
        return image
    img_c, img_h, img_w = image.shape[1], image.shape[2], image.shape[3]
    area = img_h * img_w
    
    target_area = torch.tensor(random.uniform(scale[0], scale[1]), dtype=torch.float32) * area
    aspect_ratio = torch.tensor(random.uniform(ratio[0], ratio[1]), dtype=torch.float32)
    h = int(round(math.sqrt(target_area * aspect_ratio)))
    w = int(round(math.sqrt(target_area / aspect_ratio)))
    
    # h, w가 이미지 크기 내에 있는지 확인
    if w < img_w and h < img_h:
        x = random.randint(0, img_w - w)
        y = random.randint(0, img_h - h)
        
        # float32 마스크 생성
        mask = torch.ones_like(image, dtype=torch.float32)
        mask[:, y:y+h, x:x+w] = 0.0  # 또는 랜덤 값: torch.rand(3, h, w, dtype=torch.float32)
        
        # erasing 적용
        erased = image * mask
        return erased.float()  # float32 출력 보장
    return image.float()

# RandomCrop 함수 정의
def random_crop(image, crop_size=0.7):
    img_c, img_h, img_w = image.shape[1], image.shape[2], image.shape[3]
    crop_h = int(img_h * crop_size)
    crop_w = int(img_w * crop_size)
    
    if crop_h >= img_h or crop_w >= img_w:
        return image
    
    x = random.randint(0, img_w - crop_w)
    y = random.randint(0, img_h - crop_h)
    cropped_image = image[:, :, y:y+crop_h, x:x+crop_w]
    
    # 패딩으로 원래 크기 복원
    padded_image = torch.zeros_like(image)
    padded_image[:, :, y:y+crop_h, x:x+crop_w] = cropped_image
    return padded_image

# Mixup 함수 정의
def mixup_data(x, y, alpha=1.0):
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1
    batch_size = x.size()[0]
    index = torch.randperm(batch_size).cuda()
    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam


In [9]:
# 훈련 데이터에 OCR 추가 (한 번만 실행)
#train_df = pd.read_csv('../data/train.csv')
#train_ocr_cache = prepare_ocr_cache(train_df, '../data/train/')
# 검증/테스트도 동일하게 처리
#test_df = pd.read_csv('../data/sample_submission.csv')
#test_ocr_cache = prepare_ocr_cache(test_df, '../data/test/')  # target은 -1로 설정

병렬처리 가능

In [10]:
import os
import json
import pandas as pd
from concurrent.futures import ProcessPoolExecutor
from tqdm.notebook import tqdm  # Jupyter/Kaggle용 tqdm
from PIL import Image
import easyocr
import multiprocessing as mp
import torch

# 데이터 경로 설정
DATA_ROOT = "/root/computervisioncompetition-cv-1/mywork/data"  # 실제 경로 확인

# 시작 방식을 'spawn'으로 설정
try:
    mp.set_start_method('spawn', force=True)
    print("Multiprocessing start method set to 'spawn'")
except RuntimeError:
    print("Multiprocessing start method already set")

# GPU 메모리 초기화
torch.cuda.empty_cache()
print("GPU memory cleared")

# EasyOCR 초기화
def init_easyocr():
    try:
        reader = easyocr.Reader(['en'], gpu=True)
        print("EasyOCR initialized with GPU")
        return reader
    except Exception as e:
        print(f"Failed to initialize EasyOCR with GPU: {e}. Falling back to CPU.")
        return easyocr.Reader(['en'], gpu=False)

def extract_ocr(image_path, max_words=512):
    try:
        reader = init_easyocr()
        results = reader.readtext(image_path)
        words = []
        boxes = []
        for (bbox, text, conf) in results:
            if conf > 0.5:
                words.append(text.strip())
                img = Image.open(image_path)
                w, h = img.size
                x0, y0 = bbox[0][0], bbox[0][1]
                x1, y1 = bbox[2][0], bbox[2][1]
                box = [int(1000 * (x0 / w)), int(1000 * (y0 / h)), 
                       int(1000 * (x1 / w)), int(1000 * (y1 / h))]
                boxes.append(box)
                if len(words) >= max_words:
                    break
        print(f"OCR processed for {image_path}: {len(words)} words extracted")
        return words, boxes
    except Exception as e:
        print(f"OCR error for {image_path}: {e}")
        return [], []
    finally:
        torch.cuda.empty_cache()

def process_image(args):
    image_id, img_path = args
    img_full_path = os.path.join(img_path, image_id)
    words, boxes = extract_ocr(img_full_path)
    return image_id, words, boxes

def prepare_ocr_cache_parallel(df, img_path, num_workers=2):
    print(f"Starting OCR processing for {len(df)} images in {img_path}")
    ocr_cache = {}
    df = df.copy()
    
    num_workers = min(num_workers, mp.cpu_count())
    print(f"Using {num_workers} workers for parallel processing")
    
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = [executor.submit(process_image, (row['ID'], img_path)) for _, row in df.iterrows()]
        for future in tqdm(futures, total=len(df), desc="OCR Processing"):
            try:
                image_id, words, boxes = future.result()
                ocr_cache[image_id] = (words, boxes)
                df.loc[df['ID'] == image_id, 'words'] = json.dumps(words)
                df.loc[df['ID'] == image_id, 'boxes'] = json.dumps(boxes)
            except Exception as e:
                print(f"Error processing image {image_id if 'image_id' in locals() else 'unknown'}: {e}")
                continue
    
    output_path = os.path.join(img_path, 'with_ocr.csv')
    df.to_csv(output_path, index=False)
    print(f"OCR results saved to {output_path}")
    return ocr_cache

def load_existing_cache(df, img_path):
    cache_path = os.path.join(img_path, 'with_ocr.csv')
    if os.path.exists(cache_path):
        print(f"Loading existing cache from {cache_path}")
        cached_df = pd.read_csv(cache_path)
        ocr_cache = {row['ID']: (json.loads(row['words']), json.loads(row['boxes'])) 
                     for _, row in cached_df.iterrows()}
        print(f"Loaded {len(ocr_cache)} cached OCR results")
        return cached_df, ocr_cache
    print(f"No cache found at {cache_path}")
    return df, {}

# 실행
print("Starting script execution")
train_df, train_ocr_cache = load_existing_cache(
    pd.read_csv(os.path.join(DATA_ROOT, 'train.csv')), 
    os.path.join(DATA_ROOT, 'train/')
)
if not train_ocr_cache:
    print("No train cache found, running OCR processing")
    train_ocr_cache = prepare_ocr_cache_parallel(
        train_df, os.path.join(DATA_ROOT, 'train/'), num_workers=2
    )
else:
    print("Train cache loaded, skipping OCR processing")

test_df, test_ocr_cache = load_existing_cache(
    pd.read_csv(os.path.join(DATA_ROOT, 'sample_submission.csv')), 
    os.path.join(DATA_ROOT, 'test/')
)
if not test_ocr_cache:
    print("No test cache found, running OCR processing")
    test_ocr_cache = prepare_ocr_cache_parallel(
        test_df, os.path.join(DATA_ROOT, 'test/'), num_workers=2
    )
else:
    print("Test cache loaded, skipping OCR processing")
print("Script execution completed")

Multiprocessing start method set to 'spawn'
GPU memory cleared
Starting script execution
Loading existing cache from /root/computervisioncompetition-cv-1/mywork/data/train/with_ocr.csv
Loaded 1570 cached OCR results
Train cache loaded, skipping OCR processing
Loading existing cache from /root/computervisioncompetition-cv-1/mywork/data/test/with_ocr.csv
Loaded 3140 cached OCR results
Test cache loaded, skipping OCR processing
Script execution completed


In [None]:
import os
import json
import numpy as np
import pandas as pd
import torch
import copy
import time
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR
from transformers import LayoutLMv3Processor, LayoutLMv3ForSequenceClassification
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold
import wandb
import torch.nn as nn

# 환경 변수 설정
os.environ["MKL_SERVICE_FORCE_INTEL"] = "1"
os.environ["MKL_THREADING_LAYER"] = "GNU"

# Dataset 클래스를 전역으로 정의 (multiprocessing 에러 방지)
class LayoutLMv3Dataset(Dataset):
    def __init__(self, df, img_path, processor, max_length=256):
        self.df = df
        self.img_path = img_path
        self.processor = processor
        self.max_length = max_length
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_path = os.path.join(self.img_path, row['ID'])
        
        try:
            words = json.loads(row['words']) if isinstance(row['words'], str) else row['words']
            boxes = json.loads(row['boxes']) if isinstance(row['boxes'], str) else row['boxes']  # 오타 수정: boxeås -> boxes
        except:
            words, boxes = [], []
        
        # 빈 리스트 처리
        if not words or not boxes:
            words = [""]
            boxes = [[0, 0, 1, 1]]
        
        # 길이 맞추기
        if len(words) != len(boxes):
            min_len = min(len(words), len(boxes))
            words = words[:min_len]
            boxes = boxes[:min_len]
        
        label = int(row['target'])
        
        # 이미지 크기 축소 (메모리 절약)
        image = Image.open(image_path).convert("RGB")
        image = image.resize((224, 224))
        
        # Processor에서 padding과 truncation 명시적 설정
        encoding = self.processor(
            image, 
            text=words, 
            boxes=boxes, 
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length
        )
        
        # 모든 tensor를 squeeze하여 배치 차원 제거
        result = {}
        for key, value in encoding.items():
            if isinstance(value, torch.Tensor):
                result[key] = value.squeeze(0)
            else:
                result[key] = value
        
        return result, label

# Collate function을 전역으로 정의
def collate_fn(batch):
    """Custom collate function for LayoutLMv3"""
    inputs = {}
    labels = []
    
    # 첫 번째 샘플에서 키 확인
    sample_keys = batch[0][0].keys()
    
    for key in sample_keys:
        inputs[key] = torch.stack([item[0][key] for item in batch])
    
    labels = torch.tensor([item[1] for item in batch], dtype=torch.long)
    
    return inputs, labels

# 메모리 정리 함수
def clear_memory():
    torch.cuda.empty_cache()
    import gc
    gc.collect()

def train_one_epoch_with_grad_accumulation(train_loader, model, optimizer, loss_fn, device, scaler, epoch, fold, gradient_accumulation_steps=1):
    model.train()
    total_loss = 0
    total_acc = 0
    total_f1 = 0
    num_batches = len(train_loader)
    
    optimizer.zero_grad()  # 시작 시 gradient 초기화
    
    pbar = tqdm(train_loader, desc=f"Fold {fold} Train Epoch {epoch+1}")
    for batch_idx, (inputs, targets) in enumerate(pbar):
        inputs = {k: v.to(device) for k, v in inputs.items()}
        targets = targets.to(device)
        
        with autocast():
            outputs = model(**inputs)
            logits = outputs.logits
            loss = loss_fn(logits, targets)
            # Gradient accumulation을 위해 loss를 나눔
            loss = loss / gradient_accumulation_steps
        
        scaler.scale(loss).backward()
        
        # Gradient accumulation
        if (batch_idx + 1) % gradient_accumulation_steps == 0 or (batch_idx + 1) == len(train_loader):
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
        
        preds = torch.argmax(logits, dim=1)
        acc = accuracy_score(targets.cpu(), preds.cpu())
        f1 = f1_score(targets.cpu(), preds.cpu(), average='macro')
        
        total_loss += loss.item() * gradient_accumulation_steps  # 원래 loss로 복원
        total_acc += acc
        total_f1 += f1
        
        pbar.set_postfix({'Loss': f"{loss.item() * gradient_accumulation_steps:.4f}", 'F1': f"{f1:.4f}"})
        
        # 메모리 정리
        del inputs, targets, outputs, logits, loss
        if batch_idx % 10 == 0:
            clear_memory()
    
    return {
        'train_loss': total_loss / num_batches,
        'train_acc': total_acc / num_batches,
        'train_f1': total_f1 / num_batches
    }

def validate_one_epoch(val_loader, model, loss_fn, device, epoch, fold, log_confusion=False):
    model.eval()
    total_loss = 0
    total_acc = 0
    total_f1 = 0
    all_preds = []
    all_targets = []
    num_batches = len(val_loader)
    
    with torch.no_grad():
        pbar = tqdm(val_loader, desc=f"Fold {fold} Val Epoch {epoch+1}")
        for batch_idx, (inputs, targets) in enumerate(pbar):
            inputs = {k: v.to(device) for k, v in inputs.items()}
            targets = targets.to(device)
            
            with autocast():
                outputs = model(**inputs)
                logits = outputs.logits
                loss = loss_fn(logits, targets)
            
            preds = torch.argmax(logits, dim=1)
            acc = accuracy_score(targets.cpu(), preds.cpu())
            f1 = f1_score(targets.cpu(), preds.cpu(), average='macro')
            
            total_loss += loss.item()
            total_acc += acc
            total_f1 += f1
            
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())
            
            pbar.set_postfix({'Loss': f"{loss.item():.4f}", 'F1': f"{f1:.4f}"})
            
            # 메모리 정리
            del inputs, targets, outputs, logits, loss
            if batch_idx % 5 == 0:
                clear_memory()
    
    # Problem classes (3,7,14) F1 계산
    problem_f1 = {}
    for cls in [3,7,14]:
        cls_mask = np.array(all_targets) == cls
        if np.sum(cls_mask) > 0:
            cls_preds = np.array(all_preds)[cls_mask]
            cls_targets = np.array(all_targets)[cls_mask]
            problem_f1[f'class_{cls}_f1'] = f1_score(cls_targets, cls_preds, average='macro')
    
    avg_problem_f1 = np.mean(list(problem_f1.values())) if problem_f1 else 0
    
    return {
        'val_loss': total_loss / num_batches,
        'val_acc': total_acc / num_batches,
        'val_f1': total_f1 / num_batches,
        'problem_class_f1': problem_f1,
        'avg_problem_f1': avg_problem_f1
    }

def main():
    # 설정값들
    DATA_ROOT = "/root/computervisioncompetition-cv-1/mywork/data"
    model_name = "microsoft/layoutlmv3-base"
    num_classes = 17
    EPOCHS = 3
    BATCH_SIZE = 4  # 메모리 고려
    GRADIENT_ACCUMULATION_STEPS = 8  # 실질적 배치 크기 = 2 * 16 = 32
    N_FOLDS = 2
    LR = 0.0001
    MAX_PATIENCE = 5
    NUM_WORKERS = 0
    SEED = 42
    max_length = 256  # 메모리 절약을 위해 줄임
    
    # WandB 설정
    PROJECT_NAME = "document-classification-team-CV"
    EXPERIMENT_NAME = "layoutlmv3-baseline"
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    print(f"Batch size: {BATCH_SIZE}")
    print(f"Gradient accumulation steps: {GRADIENT_ACCUMULATION_STEPS}")
    print(f"Effective batch size: {BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}")
    print(f"Max sequence length: {max_length}")
    
    # 프로세서 로드
    processor = LayoutLMv3Processor.from_pretrained(model_name, apply_ocr=False)
    
    # 데이터 로드
    train_df = pd.read_csv(os.path.join(DATA_ROOT, 'train/with_ocr.csv'))
    print(f"Dataset size: {len(train_df)}")
    print(f"Unique labels: {sorted(train_df['target'].unique())}")
    
    # 작은 샘플로 테스트
    #train_df = train_df.head(200).copy()  
    #print(f"Using {len(train_df)} samples for testing")
    
    fold_results = []
    fold_models = []
    
    # K-Fold 교차 검증
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
    
    for fold, (trn_idx, val_idx) in enumerate(skf.split(train_df['ID'], train_df['target'])):
        print(f"\n{'='*60}")
        print(f"Fold {fold+1}/{N_FOLDS}")
        print(f"{'='*60}")
        
        # 시작 전 메모리 정리
        clear_memory()
        
        # 데이터 분할
        trn_df = train_df.iloc[trn_idx].reset_index(drop=True)
        val_df = train_df.iloc[val_idx].reset_index(drop=True)
        
        print(f"Train samples: {len(trn_df)}, Validation samples: {len(val_df)}")
        
        # 데이터셋 생성
        trn_dataset = LayoutLMv3Dataset(trn_df, os.path.join(DATA_ROOT, 'train/'), processor, max_length=max_length)
        val_dataset = LayoutLMv3Dataset(val_df, os.path.join(DATA_ROOT, 'train/'), processor, max_length=max_length)
        
        # DataLoader 생성
        trn_loader = DataLoader(
            trn_dataset, 
            batch_size=BATCH_SIZE, 
            shuffle=True, 
            collate_fn=collate_fn, 
            num_workers=NUM_WORKERS,
            pin_memory=False
        )
        val_loader = DataLoader(
            val_dataset, 
            batch_size=BATCH_SIZE, 
            shuffle=False, 
            collate_fn=collate_fn, 
            num_workers=NUM_WORKERS,
            pin_memory=False
        )
        
        # 모델 생성
        model = LayoutLMv3ForSequenceClassification.from_pretrained(
            model_name, num_labels=num_classes, ignore_mismatched_sizes=True
        ).to(device)
        
        optimizer = Adam(model.parameters(), lr=LR)
        scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS)
        loss_fn = nn.CrossEntropyLoss()
        scaler = GradScaler()
        
        best_val_f1 = 0
        patience = 0
        best_model = None
        
        # WandB 초기화
        wandb.init(
            project=PROJECT_NAME, 
            name=f"{EXPERIMENT_NAME}-fold-{fold+1}", 
            config={
                'fold': fold+1,
                'model_name': model_name,
                'batch_size': BATCH_SIZE,
                'gradient_accumulation_steps': GRADIENT_ACCUMULATION_STEPS,
                'effective_batch_size': BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS,
                'learning_rate': LR,
                'epochs': EPOCHS,
                'max_length': max_length
            },
            reinit=True
        )
        
        print(f"모델 학습 시작 - Fold {fold+1}")
        
        for epoch in range(EPOCHS):
            print(f"\nEpoch {epoch+1}/{EPOCHS}")
            
            # 훈련 (gradient accumulation 포함)
            train_ret = train_one_epoch_with_grad_accumulation(
                trn_loader, model, optimizer, loss_fn, device, scaler, 
                epoch, fold, GRADIENT_ACCUMULATION_STEPS
            )
            
            # 검증
            val_ret = validate_one_epoch(val_loader, model, loss_fn, device, epoch, fold)
            
            current_lr = optimizer.param_groups[0]['lr']
            
            # WandB 로깅
            log_data = {
                "epoch": epoch + 1,
                "fold": fold + 1,
                "train/loss": train_ret['train_loss'],
                "train/accuracy": train_ret['train_acc'], 
                "train/f1": train_ret['train_f1'],
                "val/loss": val_ret['val_loss'],
                "val/accuracy": val_ret['val_acc'],
                "val/f1": val_ret['val_f1'],
                "learning_rate": current_lr,
            }
            
            if 'avg_problem_f1' in val_ret:
                log_data["val/problem_classes_avg_f1"] = val_ret['avg_problem_f1']
            
            for cls in [3, 7, 14]:
                if f"class_{cls}_f1" in val_ret['problem_class_f1']:
                    log_data[f"val/class_{cls}_f1"] = val_ret['problem_class_f1'][f"class_{cls}_f1"]
            
            wandb.log(log_data)
            
            scheduler.step()
            
            # 출력
            print(f"Train Loss: {train_ret['train_loss']:.4f} | Train F1: {train_ret['train_f1']:.4f}")
            print(f"Val Loss: {val_ret['val_loss']:.4f} | Val F1: {val_ret['val_f1']:.4f}")
            if 'avg_problem_f1' in val_ret:
                print(f"Problem Classes Avg F1: {val_ret['avg_problem_f1']:.4f}")
            
            # 최고 모델 저장
            if val_ret['val_f1'] > best_val_f1:
                best_val_f1 = val_ret['val_f1']
                best_model = copy.deepcopy(model.state_dict())
                patience = 0
                model_path = f'best_model_fold_{fold+1}.pth'
                torch.save(best_model, model_path)
                print(f"New best! F1: {best_val_f1:.4f}")
            else:
                patience += 1
            
            if patience >= MAX_PATIENCE and epoch > EPOCHS // 2:
                print(f"Early stopping at epoch {epoch+1}")
                break
            
            # 에포크 끝날 때마다 메모리 정리
            clear_memory()
        
        fold_results.append({
            'fold': fold+1, 
            'best_val_f1': best_val_f1, 
            'epochs_trained': epoch+1,
            'early_stopped': patience >= MAX_PATIENCE,
            'final_train_f1': train_ret['train_f1'],
            'train_samples': len(trn_df),
            'val_samples': len(val_df)
        })
        fold_models.append(best_model)
        
        wandb.finish()
        
        # 메모리 정리
        del model, optimizer, scheduler, trn_loader, val_loader
        clear_memory()
        
        print(f"Fold {fold+1} 완료!")
    
    # 앙상블 모델 준비 (모든 fold가 끝난 후)
    print(f"\n앙상블 모델 준비 중...")
    ensemble_models = []
    
    for i, state_dict in enumerate(fold_models):
        fold_model = LayoutLMv3ForSequenceClassification.from_pretrained(
            model_name, num_labels=num_classes, ignore_mismatched_sizes=True
        ).to(device)
        fold_model.load_state_dict(state_dict)
        fold_model.eval()
        ensemble_models.append(fold_model)
        print(f"Fold {i+1} 모델 로드 완료")
    
    print(f"총 {len(ensemble_models)}개 모델로 앙상블 구성")
    
    # 결과 요약
    val_f1_scores = [result['best_val_f1'] for result in fold_results]
    mean_f1 = np.mean(val_f1_scores)
    std_f1 = np.std(val_f1_scores)
    
    print(f"\n{'='*60}")
    print(" K-FOLD CROSS VALIDATION 최종 결과")
    print(f"{'='*60}")
    
    for result in fold_results:
        status = " Early Stopped" if result['early_stopped'] else " Completed"
        print(f"Fold {result['fold']}: {result['best_val_f1']:.4f} "
              f"({result['epochs_trained']} epochs){status}")
    
    print(f"\n평균 CV F1: {mean_f1:.4f} ± {std_f1:.4f}")
    print(f"최고 Fold: {max(val_f1_scores):.4f}")
    print(f"최악 Fold: {min(val_f1_scores):.4f}")

if __name__ == "__main__":
    main()

Using device: cuda
Batch size: 4
Gradient accumulation steps: 8
Effective batch size: 32
Max sequence length: 256
Dataset size: 1570
Unique labels: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]

Fold 1/2
Train samples: 785, Validation samples: 785


Some weights of LayoutLMv3ForSequenceClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
epoch,▁▅█
fold,▁▁▁
learning_rate,█▆▁
train/accuracy,▁█▁
train/f1,▂█▁
train/loss,█▃▁
val/accuracy,▁▁█
val/class_14_f1,▁▁▁
val/class_3_f1,▁▁▁
val/class_7_f1,▁▁▁

0,1
epoch,3
fold,1
learning_rate,0.00025
train/accuracy,0.05711
train/f1,0.03199
train/loss,2.8499
val/accuracy,0.06599
val/class_14_f1,0
val/class_3_f1,0
val/class_7_f1,0


모델 학습 시작 - Fold 1

Epoch 1/3


Fold 0 Train Epoch 1: 100%|██████████| 197/197 [00:24<00:00,  8.19it/s, Loss=2.7988, F1=0.0000]
Fold 0 Val Epoch 1: 100%|██████████| 197/197 [00:22<00:00,  8.88it/s, Loss=2.6777, F1=0.0000]


Train Loss: 2.8543 | Train F1: 0.0379
Val Loss: 2.8177 | Val F1: 0.0277
Problem Classes Avg F1: 0.0000
New best! F1: 0.0277

Epoch 2/3


Fold 0 Train Epoch 2: 100%|██████████| 197/197 [00:24<00:00,  8.18it/s, Loss=2.7305, F1=0.0000]
Fold 0 Val Epoch 2: 100%|██████████| 197/197 [00:22<00:00,  8.69it/s, Loss=2.7246, F1=0.0000]


Train Loss: 2.8272 | Train F1: 0.0322
Val Loss: 2.8148 | Val F1: 0.0280
Problem Classes Avg F1: 0.0000
New best! F1: 0.0280

Epoch 3/3


Fold 0 Train Epoch 3: 100%|██████████| 197/197 [00:24<00:00,  8.19it/s, Loss=2.7656, F1=0.0000]
Fold 0 Val Epoch 3: 100%|██████████| 197/197 [00:22<00:00,  8.86it/s, Loss=2.7773, F1=0.0000]


Train Loss: 2.8239 | Train F1: 0.0385
Val Loss: 2.8136 | Val F1: 0.0267
Problem Classes Avg F1: 0.3333


0,1
epoch,▁▅█
fold,▁▁▁
learning_rate,█▆▁
train/accuracy,█▁▇
train/f1,▇▁█
train/loss,█▂▁
val/accuracy,▁▁▁
val/class_14_f1,▁▁▁
val/class_3_f1,▁▁▁
val/class_7_f1,▁▁█

0,1
epoch,3
fold,1
learning_rate,3e-05
train/accuracy,0.06472
train/f1,0.03847
train/loss,2.82386
val/accuracy,0.06345
val/class_14_f1,0
val/class_3_f1,0
val/class_7_f1,1


Fold 1 완료!

Fold 2/2
Train samples: 785, Validation samples: 785


Some weights of LayoutLMv3ForSequenceClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


모델 학습 시작 - Fold 2

Epoch 1/3


Fold 1 Train Epoch 1: 100%|██████████| 197/197 [00:23<00:00,  8.21it/s, Loss=1.4160, F1=0.0000]
Fold 1 Val Epoch 1: 100%|██████████| 197/197 [00:22<00:00,  8.90it/s, Loss=0.7031, F1=1.0000]


Train Loss: 2.4838 | Train F1: 0.1220
Val Loss: 2.0999 | Val F1: 0.1704
Problem Classes Avg F1: 0.1395
New best! F1: 0.1704

Epoch 2/3


Fold 1 Train Epoch 2: 100%|██████████| 197/197 [00:24<00:00,  8.12it/s, Loss=1.8320, F1=0.0000]
Fold 1 Val Epoch 2: 100%|██████████| 197/197 [00:22<00:00,  8.57it/s, Loss=0.0996, F1=1.0000]


Train Loss: 1.8629 | Train F1: 0.3160
Val Loss: 1.6000 | Val F1: 0.3760
Problem Classes Avg F1: 0.0733
New best! F1: 0.3760

Epoch 3/3


Fold 1 Train Epoch 3: 100%|██████████| 197/197 [00:23<00:00,  8.22it/s, Loss=1.7852, F1=0.0000]
Fold 1 Val Epoch 3: 100%|██████████| 197/197 [00:22<00:00,  8.83it/s, Loss=0.0734, F1=1.0000]


Train Loss: 1.4298 | Train F1: 0.4467
Val Loss: 1.3589 | Val F1: 0.4462
Problem Classes Avg F1: 0.0380
New best! F1: 0.4462


0,1
epoch,▁▅█
fold,▁▁▁
learning_rate,█▆▁
train/accuracy,▁▅█
train/f1,▁▅█
train/loss,█▄▁
val/accuracy,▁▆█
val/class_14_f1,▁▁▁
val/class_3_f1,█▃▁
val/class_7_f1,▁██

0,1
epoch,3
fold,2
learning_rate,3e-05
train/accuracy,0.56091
train/f1,0.44675
train/loss,1.42977
val/accuracy,0.5736
val/class_14_f1,0
val/class_3_f1,0.0307
val/class_7_f1,0.08333


Fold 2 완료!

앙상블 모델 준비 중...


Some weights of LayoutLMv3ForSequenceClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 1 모델 로드 완료


Some weights of LayoutLMv3ForSequenceClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 2 모델 로드 완료
총 2개 모델로 앙상블 구성

 K-FOLD CROSS VALIDATION 최종 결과
Fold 1: 0.0280 (3 epochs) Completed
Fold 2: 0.4462 (3 epochs) Completed

평균 CV F1: 0.2371 ± 0.2091
최고 Fold: 0.4462
최악 Fold: 0.0280


: 

In [14]:
# 현재 노트북에서 바로 실행하세요
import torch
import gc

def quick_cleanup():
    """즉시 사용 가능한 빠른 메모리 정리"""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    print("메모리 정리 완료")

# 바로 실행
quick_cleanup()

메모리 정리 완료


In [None]:
# Essential TTA transforms (이미지 변형만)
essential_tta_transforms = [
    A.Compose([  # 원본
        A.LongestMaxSize(max_size=img_size),
        A.PadIfNeeded(min_height=img_size, min_width=img_size, border_mode=0, value=0),
        ToTensorV2(),
    ]),
     
    # 90도 회전들
    A.Compose([
        A.LongestMaxSize(max_size=img_size),
        A.PadIfNeeded(min_height=img_size, min_width=img_size, border_mode=0, value=0),
        A.Rotate(limit=[90, 90], p=1.0),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2(),
    ]),
    A.Compose([
        A.LongestMaxSize(max_size=img_size),
        A.PadIfNeeded(min_height=img_size, min_width=img_size, border_mode=0, value=0),
        A.Rotate(limit=[180, 180], p=1.0),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2(),
    ]),
    A.Compose([
        A.LongestMaxSize(max_size=img_size),
        A.PadIfNeeded(min_height=img_size, min_width=img_size, border_mode=0, value=0),
        A.Rotate(limit=[-90, -90], p=1.0),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2(),
    ]),
    # 밝기 개선
    A.Compose([
        A.LongestMaxSize(max_size=img_size),
        A.PadIfNeeded(min_height=img_size, min_width=img_size, border_mode=0, value=0),
        A.RandomBrightnessContrast(brightness_limit=[0.3, 0.3], contrast_limit=[0.3, 0.3], p=1.0),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2(),
    ]),
]

print(f"TTA 변환 {len(essential_tta_transforms)}개 준비 완료")



🔧 WandB 연결 문제 해결 중...
WandB 초기화 시도 1/3...


[34m[1mwandb[0m: Currently logged in as: [33mkimsunmin0227[0m ([33mkimsunmin0227-hufs[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


✅ WandB 초기화 성공!

🚀 WandB 실험 시작!
📊 대시보드: https://wandb.ai/kimsunmin0227-hufs/document-classification-team-CV/runs/g7stly7w
📋 실험명: efficientnet-b3-baseline-0909-0218


In [None]:
class TTALayoutLMv3Dataset(Dataset):
    def __init__(self, df, img_path, ocr_cache, transforms):
        super().__init__(df, img_path, ocr_cache, transform=None)  # 기본 transform None
        self.transforms = transforms  # TTA용

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_id = row['ID']
        target = row['target']
        img_full_path = os.path.join(self.img_path, image_id)
        
        image = Image.open(img_full_path).convert("RGB")
        words, boxes = self.ocr_cache[image_id] if self.ocr_cache else ([], [])
        
        # TTA: 여러 변형된 이미지 리스트 생성 (OCR 고정)
        augmented_images = []
        for transform in self.transforms:
            if transform is not None:  # 원본은 None 아님
                image_np = np.array(image)
                aug_img_np = transform(image=image_np)['image']
                aug_img = Image.fromarray(aug_img_np).convert("RGB")
            else:
                aug_img = image
            # 각 aug_img에 동일 OCR 적용해 encoding 생성
            encoding = processor(images=aug_img, words=words, boxes=boxes, return_tensors="pt", truncation=True, padding="max_length")
            inputs = {
                'input_ids': encoding['input_ids'].squeeze(),
                'attention_mask': encoding['attention_mask'].squeeze(),
                'bbox': encoding['bbox'].squeeze() if 'bbox' in encoding else None,
                'pixel_values': encoding['pixel_values'].squeeze(),
            }
            augmented_images.append(inputs)
        
        return augmented_images, target  # 리스트 반환

# TTA Dataset
tta_dataset = TTALayoutLMv3Dataset(test_df, '../data/test/', test_ocr_cache, essential_tta_transforms)
tta_loader = DataLoader(tta_dataset, batch_size=4, shuffle=False, collate_fn=lambda b: collate_tta(b), num_workers=NUM_WORKERS)  # 배치 크기 줄임

def collate_tta(batch):
    # TTA 배치 처리 (각 샘플의 augmented_images 리스트 스택)
    all_inputs = []
    for sample in batch:
        all_inputs.extend(sample[0])  # Flatten list of dicts
    # Stack as usual
    inputs = {}
    for key in all_inputs[0].keys():
        inputs[key] = torch.stack([inp[key] for inp in all_inputs])
    targets = torch.tensor([s[1] for s in batch])
    return inputs, targets

학습 데이터: 1570개 샘플
 클래스 분포: {0: 100, 1: 46, 2: 100, 3: 100, 4: 100, 5: 100, 6: 100, 7: 100, 8: 100, 9: 100, 10: 100, 11: 100, 12: 100, 13: 74, 14: 50, 15: 100, 16: 100}





🚀 WandB 실험 시작!
📊 대시보드: https://wandb.ai/kimsunmin0227-hufs/document-classification-team-CV/runs/qaeb6wly
📋 실험명: efficientnet-b3-baseline-0909-0218


In [None]:
def ensemble_tta_inference_with_logging(models, loader, confidence_threshold=0.9):
    all_predictions = []
    all_confidences = []
    
    print(f"앙상블 TTA 추론 시작...")
    print(f"{len(models)}개 모델 × {len(essential_tta_transforms)}개 TTA = {len(models)*len(essential_tta_transforms)} 예측 평균")
    
    start_time = time.time()
    
    for batch_idx, (aug_inputs_list, targets) in enumerate(tqdm(loader, desc="Ensemble TTA")):
        batch_size = len(targets)
        ensemble_probs = torch.zeros(batch_size * len(aug_inputs_list[0]), num_classes).to(device)  # TTA 확장
        
        for model in models:
            model.eval()
            with torch.no_grad():
                for aug_idx, inputs in enumerate(aug_inputs_list):  # TTA 루프
                    inputs = {k: v.to(device) for k, v in inputs.items()}
                    outputs = model(**inputs)
                    probs = torch.softmax(outputs.logits, dim=1)
                    ensemble_probs[aug_idx::len(aug_inputs_list)] += probs / len(models)
        
        # 평균 후 원본 배치 크기로 reshape
        ensemble_probs = ensemble_probs.view(batch_size, -1, num_classes).mean(dim=1)
        
        max_probs = torch.max(ensemble_probs, dim=1)[0]
        batch_confidences = max_probs.cpu().numpy()
        all_confidences.extend(batch_confidences)
        
        final_preds = torch.argmax(ensemble_probs, dim=1)
        all_predictions.extend(final_preds.cpu().numpy())
        
        # 로깅 (기존과 동일)
    
    total_time = time.time() - start_time
    print(f"\n 앙상블 TTA 추론 완료! 총 소요시간: {total_time/60:.1f}분")
    return all_predictions, all_confidences

tta_predictions, confidences = ensemble_tta_inference_with_logging(ensemble_models, tta_loader)


 FOLD 1/5


📊 Fold 1 Dashboard: https://wandb.ai/kimsunmin0227-hufs/document-classification-team-CV/runs/kyfo47f4
Train samples: 1256, Validation samples: 314
 모델 학습 시작 - Fold 1

📈 Epoch 1/5


Loss: 2.9053, Mixup: True, Cutout: False, RandomCrop: False: 100%|██████████| 20/20 [00:35<00:00,  1.80s/it]
Val Loss: 2.3544: 100%|██████████| 5/5 [00:02<00:00,  1.69it/s]


  Class 3 F1: 0.1980
  Class 7 F1: 0.2588
  Class 14 F1: 0.2000
  Problem Classes Avg F1: 0.2189
 Epoch  1 | Train Loss: 3.1569 | Train F1: 0.0707 | Val Loss: 2.2478 | Val F1: 0.2575 | LR: 2.00e-04
         Problem Classes (3,7,14) Avg F1: 0.2189 | ✅ Problem classes performing well
🎉 새로운 최고 성능! F1: 0.2575 (Problem Classes: 0.2189)

📈 Epoch 2/5


Loss: 2.6389, Mixup: False, Cutout: False, RandomCrop: True: 100%|██████████| 20/20 [00:16<00:00,  1.22it/s]
Val Loss: 2.0480: 100%|██████████| 5/5 [00:02<00:00,  2.33it/s]


  Class 3 F1: 0.2336
  Class 7 F1: 0.3103
  Class 14 F1: 0.1111
  Problem Classes Avg F1: 0.2183
 Epoch  2 | Train Loss: 2.5265 | Train F1: 0.1662 | Val Loss: 1.9551 | Val F1: 0.4072 | LR: 1.81e-04
         Problem Classes (3,7,14) Avg F1: 0.2183 | ⚠️ Problem classes need attention
🎉 새로운 최고 성능! F1: 0.4072 (Problem Classes: 0.2183)

📈 Epoch 3/5


Loss: 2.3842, Mixup: False, Cutout: True, RandomCrop: False: 100%|██████████| 20/20 [00:16<00:00,  1.20it/s]
Val Loss: 1.9502: 100%|██████████| 5/5 [00:01<00:00,  2.67it/s]


  Class 3 F1: 0.2368
  Class 7 F1: 0.0625
  Class 14 F1: 0.0000
  Problem Classes Avg F1: 0.0998
 Epoch  3 | Train Loss: 2.3387 | Train F1: 0.2710 | Val Loss: 1.8810 | Val F1: 0.4629 | LR: 1.31e-04
         Problem Classes (3,7,14) Avg F1: 0.0998 | ⚠️ Problem classes need attention
🎉 새로운 최고 성능! F1: 0.4629 (Problem Classes: 0.0998)

📈 Epoch 4/5


Loss: 2.3648, Mixup: False, Cutout: False, RandomCrop: True: 100%|██████████| 20/20 [00:17<00:00,  1.16it/s]
Val Loss: 1.7044: 100%|██████████| 5/5 [00:01<00:00,  2.81it/s]


  Class 3 F1: 0.2727
  Class 7 F1: 0.2500
  Class 14 F1: 0.1176
  Problem Classes Avg F1: 0.2135
 Epoch  4 | Train Loss: 2.2945 | Train F1: 0.2921 | Val Loss: 1.7210 | Val F1: 0.5273 | LR: 6.91e-05
         Problem Classes (3,7,14) Avg F1: 0.2135 | ⚠️ Problem classes need attention
🎉 새로운 최고 성능! F1: 0.5273 (Problem Classes: 0.2135)

📈 Epoch 5/5


Training Epoch 5:   0%|          | 0/20 [00:00<?, ?it/s]

In [None]:
# =============================================================================
# 13. K-Fold Cross Validation Results Summary
# =============================================================================

print(f"\n{'='*60}")
print(" K-FOLD CROSS VALIDATION 최종 결과")
print(f"{'='*60}")

val_f1_scores = [result['best_val_f1'] for result in fold_results]
mean_f1 = np.mean(val_f1_scores)
std_f1 = np.std(val_f1_scores)

try:
    # wandb.run이 현재 활성화된 run을 가리킴
    if wandb.run is None:
        print(" 활성화된 run이 없어 새로운 summary run을 생성합니다.")
        active_run = wandb.init(
            project=PROJECT_NAME,
            name=f"SUMMARY-{EXPERIMENT_NAME}-{datetime.now().strftime('%m%d-%H%M')}",
            config=config,
            tags=["summary", "cv-results", model_name],
            group="k-fold-experiment",
            job_type="summary",
            reinit=True
        )
    else:
        print(" 기존 run을 사용합니다.")
        active_run = wandb.run
        
except Exception as e:
    print(f" Run 상태 확인 중 에러: {e}")
    # 새로운 run 생성
    active_run = wandb.init(
        project=PROJECT_NAME,
        name=f"SUMMARY-{EXPERIMENT_NAME}-{datetime.now().strftime('%m%d-%H%M')}",
        config=config,
        tags=["summary", "cv-results", model_name],
        group="k-fold-experiment",
        job_type="summary",
        reinit=True
    )

# CV 요약 테이블 생성
fold_table = wandb.Table(columns=[
    "Fold", "Best_Val_F1", "Final_Train_F1", "Train_Samples", 
    "Val_Samples", "Epochs_Trained", "Early_Stopped"
])

for result in fold_results:
    fold_table.add_data(
        result['fold'], 
        result['best_val_f1'], 
        result['final_train_f1'],
        result['train_samples'], 
        result['val_samples'],
        result['epochs_trained'],
        result['early_stopped']
    )

# 안전한 로깅
try:
    active_run.log({
        "cv_results/mean_f1": mean_f1,
        "cv_results/std_f1": std_f1,
        "cv_results/best_fold_f1": max(val_f1_scores),
        "cv_results/worst_fold_f1": min(val_f1_scores),
        "cv_results/f1_range": max(val_f1_scores) - min(val_f1_scores),
        "cv_results/fold_results_table": fold_table,
        "cv_results/n_folds": N_FOLDS,
        "cv_results/total_epochs": sum([r['epochs_trained'] for r in fold_results]),
        "cv_results/avg_epochs_per_fold": np.mean([r['epochs_trained'] for r in fold_results]),
        "cv_results/early_stopped_folds": sum([r['early_stopped'] for r in fold_results])
    })
    
    # Fold별 성능 바차트 생성
    fold_performance_data = [[f"Fold {i+1}", score] for i, score in enumerate(val_f1_scores)]
    active_run.log({
        "cv_results/fold_performance_chart": wandb.plot.bar(
            wandb.Table(data=fold_performance_data, columns=["Fold", "F1_Score"]),
            "Fold", "F1_Score", 
            title="K-Fold Cross Validation Performance"
        )
    })
    
    print(" CV 결과 로깅 완료!")
    
except Exception as e:
    print(f" WandB 로깅 중 에러: {e}")
    print(" 결과를 콘솔에 출력합니다:")

# 어떤 경우든 콘솔에는 결과 출력
for result in fold_results:
    status = " Early Stopped" if result['early_stopped'] else " Completed"
    print(f"Fold {result['fold']}: {result['best_val_f1']:.4f} "
          f"({result['epochs_trained']} epochs) {status}")

print(f"\n 평균 CV F1: {mean_f1:.4f} ± {std_f1:.4f}")
print(f" 최고 Fold: {max(val_f1_scores):.4f}")
print(f" 최악 Fold: {min(val_f1_scores):.4f}")
print(f" 성능 범위: {max(val_f1_scores) - min(val_f1_scores):.4f}")



 K-FOLD CROSS VALIDATION 최종 결과
 기존 run을 사용합니다.
 CV 결과 로깅 완료!
Fold 1: 0.9390 (70 epochs)  Completed
Fold 2: 0.9319 (70 epochs)  Completed
Fold 3: 0.9420 (70 epochs)  Completed
Fold 4: 0.9376 (70 epochs)  Completed
Fold 5: 0.9402 (70 epochs)  Completed

 평균 CV F1: 0.9381 ± 0.0035
 최고 Fold: 0.9420
 최악 Fold: 0.9319
 성능 범위: 0.0102


In [None]:

# =============================================================================
# 14. Ensemble Models Preparation
# =============================================================================

# 5-Fold 앙상블 모델 준비
ensemble_models = []
print(f"\n🔧 앙상블 모델 준비 중...")

for i, state_dict in enumerate(fold_models):
    fold_model = timm.create_model(model_name, pretrained=True, num_classes=17).to(device)
    fold_model.load_state_dict(state_dict)
    fold_model.eval()
    ensemble_models.append(fold_model)
    print(f"Fold {i+1} 모델 로드 완료")

print(f" 총 {len(ensemble_models)}개 모델로 앙상블 구성")

try:
    if wandb.run is not None:
        wandb.run.log({
            "ensemble/num_models": len(ensemble_models),
            "ensemble/model_architecture": model_name,
            "ensemble/ensemble_type": "simple_average"
        })
    else:
        print("📊 앙상블 정보:")
        print(f"  - 모델 개수: {len(ensemble_models)}")
        print(f"  - 아키텍처: {model_name}")
        print(f"  - 앙상블 타입: simple_average")
except Exception as e:
    print(f"⚠️ 앙상블 정보 로깅 실패: {e}")



🔧 앙상블 모델 준비 중...
Fold 1 모델 로드 완료
Fold 2 모델 로드 완료
Fold 3 모델 로드 완료
Fold 4 모델 로드 완료
Fold 5 모델 로드 완료
 총 5개 모델로 앙상블 구성


In [None]:
tta_pred_df = pd.DataFrame(test_df, columns=['ID', 'target'])
tta_pred_df['target'] = tta_predictions
tta_pred_df.to_csv('../data/output/layoutlmv3_ensemble.csv', index=False)

# WandB 아티팩트 등 (기존)
print("최종 결과 저장 완료!")


 TTA (Test Time Augmentation) 설정...
TTA 변환 5개 준비 완료
