In [None]:
# EfficientNet-B5 + OCR 2-Track 앙상블 모델
#
# 수정 사항:
# 1. (Track 1) Vision: 기존 EfficientNet (Pseudo-Labeling + TTA)
# 2. (Track 2) OCR: EasyOCR + TF-IDF/LogisticRegression
# 3. Ensemble: 최종 예측 시 Track 1과 Track 2의 확률을 가중 평균

import os
import random
import re # NEW: OCR 텍스트 클리닝
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import timm
import albumentations as A
from albumentations.pytorch import ToTensorV2
import wandb
from tqdm import tqdm
import warnings

# NEW: OCR 및 텍스트 분류기 라이브러리
import easyocr
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import joblib # NEW: OCR 모델 저장을 위해

warnings.filterwarnings('ignore')

In [None]:
# ===============================
# 1. Fix random seeds (for reproducibility)
# ===============================
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Applies the same seed to Python's ``random`` module, NumPy's global RNG
    and PyTorch (``manual_seed`` plus both CUDA seeding calls), then sets
    cuDNN to deterministic mode with benchmarking turned off.

    Args:
        seed: Integer seed passed to every generator. Defaults to 42.
    """
    # Same call order as before: Python -> NumPy -> torch CPU -> torch CUDA.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


set_seed(42)

In [None]:
# ===============================
# 2. Settings and hyperparameters (revised)
# ===============================
class Config:
    """Paths, model choice and hyperparameters for the 2-track ensemble.

    All values are plain class attributes, so they can be read either from
    the class or from the module-level ``config`` instance created below.

    The data directory defaults to the original absolute location but can be
    redirected by setting the ``CV_PROJECT_DATA_DIR`` environment variable
    before this cell runs, so the notebook is usable on other machines
    without editing the code.
    """

    # Paths (all derived from a single, overridable data root)
    data_root = (
        os.environ.get('CV_PROJECT_DATA_DIR') or '/home1/dg5407/cv_project/data'
    ).rstrip('/')
    train_csv = f'{data_root}/train.csv'
    meta_csv = f'{data_root}/meta.csv'
    train_img_dir = f'{data_root}/train'
    test_img_dir = f'{data_root}/test'
    submission_csv = f'{data_root}/sample_submission.csv'

    # Model settings
    model_name = 'tf_efficientnet_b5'
    img_size = 456
    num_classes = 17

    # Training settings
    val_split_ratio = 0.2
    epochs = 100
    batch_size = 16
    # The original value of 48 is kept as the upper bound; it is capped at the
    # number of CPUs reported by the OS so smaller hosts are not oversubscribed.
    num_workers = min(48, os.cpu_count() or 1)

    # Learning-rate settings
    max_lr = 1e-3
    min_lr = 1e-6
    warmup_epochs = 5

    # Pseudo-labeling settings
    use_pseudo_labeling = True
    pseudo_label_threshold = 0.99

    # --- NEW: 2-track ensemble settings ---
    ocr_lang = ['ko', 'en']       # OCR languages (Korean, English)
    ensemble_vision_weight = 0.7  # weight of the vision model
    ensemble_ocr_weight = 0.3     # weight of the OCR model
    # --------------------------------------

    # Miscellaneous
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    use_wandb = True
    project_name = 'document-classification-2track'  # project name changed for this variant


config = Config()

In [None]:
# ===============================
# 3. WandB initialisation (revised)
# ===============================
if config.use_wandb:
    # The API key must never be written into the notebook. It is read from the
    # WANDB_API_KEY environment variable; when that is unset, key=None lets
    # wandb fall back to its own stored credentials or login prompt.
    wandb.login(key=os.environ.get('WANDB_API_KEY'))

    # Run name encodes the model and the two ensemble weights.
    run_name = (
        f'{config.model_name}_2track_pseudo_w_'
        f'v{config.ensemble_vision_weight}_o{config.ensemble_ocr_weight}'
    )

    wandb.init(
        project=config.project_name,
        name=run_name,
        config={
            'model': config.model_name,
            'img_size': config.img_size,
            'epochs': config.epochs,
            'batch_size': config.batch_size,
            'max_lr': config.max_lr,
            'min_lr': config.min_lr,
            'val_split_ratio': config.val_split_ratio,
            'use_pseudo_labeling': config.use_pseudo_labeling,
            'pseudo_label_threshold': config.pseudo_label_threshold,
            # NEW: log the ensemble hyperparameters as well
            'ensemble_vision_weight': config.ensemble_vision_weight,
            'ensemble_ocr_weight': config.ensemble_ocr_weight,
            'ocr_lang': config.ocr_lang,
        },
    )

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home1/dg5407/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdg818500[0m ([33mdg818500-university-of-seoul[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
