# **📄 Document Classification with Modular Architecture**
> 모듈화 CV 경진대회 코드
> 
> **Features**: K-Fold CV + Ensemble TTA + Professional Architecture

## Contents
- Environment Setup & Configuration
- K-Fold Cross Validation Training
- Ensemble TTA Inference
- Results & Submission

## 1. Environment Setup

In [1]:
# Install the required third-party libraries (uncomment on a fresh environment)
# !pip install timm optuna albumentations

import warnings

# Install a blanket "ignore" filter so library warnings do not clutter
# the notebook output from this point on.
warnings.simplefilter('ignore')

print("Starting Modular CV Competition Pipeline...")

Starting Modular CV Competition Pipeline...


In [2]:
# Import the modularized pipeline components.
# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'module.training.loss'", so one of the
# project imports below presumably pulls in a submodule that is missing —
# confirm the package layout before running the remaining cells.
import pandas as pd

from module.config import Config
from module.utils.seed import set_seed
from module.experiment.kfold import KFoldExperiment
from module.inference.tta import EnsembleTTAPredictor
from module.data.dataset import ImageDataset
from module.data.transforms import get_train_transforms, get_test_transforms

print("All modules imported successfully")

ModuleNotFoundError: No module named 'module.training.loss'

## 2. Configuration Setup

In [None]:
# Create the configuration object that the rest of the notebook shares
config = Config()

# Per-run overrides, applied in the order listed below
config_overrides = {
    # Data and output locations (edit to match the local layout if needed)
    'data_path': "../data/",
    'train_path': "../data/train/",
    'test_path': "../data/test/",
    'output_path': "../output/",
    # Model and training settings
    'model_name': 'efficientnet_b3',
    'img_size': 384,
    'batch_size': 32,
    'epochs': 10,
    'learning_rate': 5e-4,
    'n_folds': 5,
}
for attr_name, attr_value in config_overrides.items():
    setattr(config, attr_name, attr_value)

# Fix the random seed for reproducibility
set_seed(config.seed)

# Echo the effective settings in a single write
print("\n".join([
    "  Configuration completed",
    f"   - Model: {config.model_name}",
    f"   - Image Size: {config.img_size}",
    f"   - Batch Size: {config.batch_size}",
    f"   - Epochs: {config.epochs}",
    f"   - K-Folds: {config.n_folds}",
    f"   - Device: {config.device}",
]))

## 3. Data Loading

In [None]:
# Read the training CSV into a DataFrame
train_df = pd.read_csv(config.get_train_csv_path())

# Per-class sample counts, ordered by class label
class_counts = train_df['target'].value_counts().sort_index()

print("\n".join([
    " Dataset Info:",
    f"   - Train samples: {len(train_df)}",
    f"   - Classes: {config.num_classes}",
    "   - Class distribution:",
    class_counts.to_string(),
]))

## 4. K-Fold Cross Validation Training

In [None]:
# Build the K-Fold experiment from the shared configuration
experiment = KFoldExperiment(config)

# Run cross validation on the training DataFrame
print(" Starting K-Fold Cross Validation Training...")
cv_results = experiment.run_cross_validation(train_df)

# Save the cross-validation results through the experiment object
experiment.save_experiment_results(cv_results)

# Report the aggregate scores in a single write
print("\n".join([
    "\n Final CV Results:",
    f"   - Mean F1: {cv_results['mean_f1']:.4f} ± {cv_results['std_f1']:.4f}",
    f"   - Best Fold: {cv_results['best_fold']} (F1: {cv_results['max_f1']:.4f})",
    f"   - Models saved: {len(experiment.fold_models)}",
]))

## 5. Test Data Loading

In [None]:
import os

from torch.utils.data import DataLoader

# Read the test CSV; its 'ID' column supplies the image names to predict
test_df = pd.read_csv(config.get_test_csv_path())

# Build the test dataset.
# Paths are joined with os.path.join so the result does not depend on
# config.test_path ending with a separator.
test_dataset = ImageDataset(
    image_paths=[
        os.path.join(config.test_path, str(img_id)) for img_id in test_df['ID']
    ],
    targets=[0] * len(test_df),  # placeholder labels: the test set has none
    # The notebook only imports get_train_transforms / get_test_transforms;
    # the previously used get_val_transforms was never imported and raised
    # NameError. Assumes get_test_transforms accepts the config the same
    # way — TODO confirm against module.data.transforms.
    transform=get_test_transforms(config),
)

# Keep the sample order fixed (shuffle=False) so predictions line up with
# the rows of test_df when the submission is assembled.
test_loader = DataLoader(
    test_dataset,
    batch_size=config.tta_batch_size,
    shuffle=False,
    num_workers=config.num_workers,
    pin_memory=True,
)

print(f" Test Data Loaded:")
print(f"   - Test samples: {len(test_dataset)}")
print(f"   - Batch size: {config.tta_batch_size}")

## 6. Ensemble TTA Inference

In [None]:
import numpy as np

# Create the ensemble TTA predictor from the trained fold models
ensemble_predictor = EnsembleTTAPredictor(models=experiment.fold_models, config=config)

# Run ensemble TTA inference over the test loader
print(" Starting Ensemble TTA Inference...")
ensemble_probs = ensemble_predictor.predict_ensemble_tta(
    test_loader, use_adaptive_tta=True, show_progress=True
)

# Final prediction: argmax of the ensemble output along axis 1
final_predictions = np.argmax(ensemble_probs, axis=1)

# Report what was produced in a single write
print("\n".join([
    "\n Inference Completed:",
    f"   - Total predictions: {len(final_predictions)}",
    f"   - Models used: {len(experiment.fold_models)}",
    f"   - TTA transforms: {len(ensemble_predictor.tta_predictors[0].tta_transforms)}",
]))

## 7. Results & Submission

In [None]:
# Assemble the submission table: one row per test ID with its predicted class
submission_df = pd.DataFrame(
    {'ID': test_df['ID'], 'target': final_predictions.astype(int)}
)

# Write the submission CSV to the path provided by the config
submission_path = config.get_output_csv_path()
submission_df.to_csv(submission_path, index=False)

# Predicted-class counts, ordered by class label
predicted_class_counts = submission_df['target'].value_counts().sort_index()

print("\n".join([
    f" Submission saved: {submission_path}",
    "\n Prediction Summary:",
    f"   - Total samples: {len(submission_df)}",
    f"   - Unique classes predicted: {len(np.unique(final_predictions))}",
    "\n Class Distribution:",
    predicted_class_counts.to_string(),
]))

# Show the first rows as a sample
print("\n Sample Predictions:")
display(submission_df.head(10))

## 8. Performance Summary

In [None]:
# Final performance summary
print("\n".join([
    f"\n{'='*60}",
    " MODULAR CV PIPELINE COMPLETED",
    f"{'='*60}",
    " Cross Validation Results:",
]))

# One line per fold, numbered from 1
for fold_no, fold_result in enumerate(experiment.fold_results, start=1):
    print(f"   Fold {fold_no}: F1 = {fold_result['best_val_f1']:.4f}")

# The "expected public score" is simply the CV mean scaled by 0.85
print("\n".join([
    "\n Final Performance:",
    f"   - CV Mean F1: {cv_results['mean_f1']:.4f} ± {cv_results['std_f1']:.4f}",
    f"   - Expected Public Score: ~{cv_results['mean_f1']*0.85:.4f} (conservative estimate)",
    "\n Technical Highlights:",
    "   - Modular Architecture: ✅",
    "   - K-Fold Cross Validation: ✅",
    "   - Ensemble TTA: ✅",
    "   - Document-Specific Augmentation: ✅",
    "   - Mixed Precision Training: ✅",
    "   - Reproducible Results: ✅",
    f"\n Ready for submission: {submission_path}",
]))