In [1]:
import os
import torch
import pandas as pd
import numpy as np

from Load_Data import random_seed
from Train import Trainer
from Inference import Model_Ensemble, run_inference
from PseudoLabel import PseudoLabeler

In [2]:
# Fix the random seed before anything stochastic runs
random_seed(42)

# Compute device: use the GPU when one is visible, otherwise fall back to CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Backbone (timm-style model identifier)
model_name = "convnextv2_large.fcmae_ft_in22k_in1k_384"
# Alternatives that were tried:
# model_name = "convnextv2_huge.fcmae_ft_in22k_in1k_512"
# model_name = "convnext_large_in22ft1k" # batch_size=14~16
# model_name = "convnext_base_in22ft1k" # batch_size=32
# model_name = "maxvit_large_tf_384"

# Input / task shape
img_size = 384
num_classes = 17

# Optimisation hyperparameters
LR = 1e-5
weight_decay = 5e-4
Drop_out = 0.4

# Training-loop settings
EPOCHS = 60
BATCH_SIZE = 12
patience = 5      # forwarded to Trainer(patience=...)
n_splits = 3      # forwarded to Trainer(n_splits=...)
num_workers = 4

# Run identifier: every output path below is derived from it
run_name = "V18"

# Input locations
train_csv_path = "../data/train_update2.csv"
test_csv_path = "../data/sample_submission.csv"
original_train_path = "../data/train/"
test_path = "../data/test/"

# Output locations (one set per run)
model_save_path = f"../model/{run_name}/"
augmented_save_path = f"../data/augment_image/{run_name}-augmented"
augmented_csv_save_path = f"../data/augment_csv/{run_name}-augmented.csv"
submission_path = f"../data/submission/{run_name}-submission.csv"

In [None]:
# Load the training labels and the submission template
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

# Build the trainer from the notebook-level configuration
trainer = Trainer(
    # --- data and augmentation outputs ---
    df=train_df,
    original_data_path=original_train_path,
    augmented_save_path=augmented_save_path,
    augmented_csv_save_path=augmented_csv_save_path,
    augmentation_target_count=450,  # presumably the per-class sample target after augmentation; confirm in Trainer
    # --- model ---
    model_name=model_name,
    num_classes=num_classes,
    img_size=img_size,
    drop_out=Drop_out,
    # --- optimisation ---
    lr=LR,
    weight_decay=weight_decay,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    patience=patience,
    # --- cross-validation ---
    k_fold=True,
    n_splits=n_splits,
    # --- runtime and bookkeeping ---
    device=device,
    num_workers=num_workers,
    save_dir=model_save_path,
    run_name_prefix=run_name,
)

# Run the whole pipeline (K-fold split, augmentation, training).
# The returned frame is kept because the inference cell reads its "f1" column.
f1_df = trainer.run()

Calculating aspect ratios:   0%|          | 0/1570 [00:00<?, ?it/s]

Calculating aspect ratios: 100%|██████████| 1570/1570 [00:00<00:00, 18442.03it/s]






Augmented for class 0: 100%|██████████| 383/383 [00:09<00:00, 41.95it/s]
Augmented for class 1: 100%|██████████| 420/420 [00:10<00:00, 40.53it/s]
Augmented for class 2: 100%|██████████| 383/383 [00:09<00:00, 41.84it/s]
Augmented for class 3: 100%|██████████| 384/384 [00:09<00:00, 42.25it/s]
Augmented for class 4: 100%|██████████| 382/382 [00:08<00:00, 42.67it/s]
Augmented for class 5: 100%|██████████| 384/384 [00:09<00:00, 39.68it/s]
Augmented for class 6: 100%|██████████| 383/383 [00:08<00:00, 44.36it/s]
Augmented for class 7: 100%|██████████| 383/383 [00:08<00:00, 43.89it/s]
Augmented for class 8: 100%|██████████| 383/383 [00:09<00:00, 39.27it/s]
Augmented for class 9: 100%|██████████| 384/384 [00:09<00:00, 39.82it/s]
Augmented for class 10: 100%|██████████| 384/384 [00:09<00:00, 41.55it/s]
Augmented for class 11: 100%|██████████| 383/383 [00:09<00:00, 41.03it/s]
Augmented for class 12: 100%|██████████| 383/383 [00:09<00:00, 40.49it/s]
Augmented for class 13: 100%|██████████| 400/400


=== Fold 1: Train=7650, Val=523 ===


[34m[1mwandb[0m: Currently logged in as: [33mmoonstalker9010[0m ([33mmoonstalker9010-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[Fold 1][Epoch 1/60] Training: 100%|██████████| 638/638 [08:46<00:00,  1.21it/s, loss=0.906]
[Fold 1][Epoch 1/60] Validation: 100%|██████████| 44/44 [00:10<00:00,  4.39it/s]


[Fold 1] Ep1 - Train: 1.3000 | Val: 0.2515, Acc: 0.9254, F1: 0.9131


[Fold 1][Epoch 2/60] Training: 100%|██████████| 638/638 [08:34<00:00,  1.24it/s, loss=0.668] 
[Fold 1][Epoch 2/60] Validation: 100%|██████████| 44/44 [00:09<00:00,  4.67it/s]


[Fold 1] Ep2 - Train: 0.6284 | Val: 0.1626, Acc: 0.9465, F1: 0.9401


[Fold 1][Epoch 3/60] Training: 100%|██████████| 638/638 [08:33<00:00,  1.24it/s, loss=0.252] 
[Fold 1][Epoch 3/60] Validation: 100%|██████████| 44/44 [00:09<00:00,  4.62it/s]


[Fold 1] Ep3 - Train: 0.4591 | Val: 0.1497, Acc: 0.9541, F1: 0.9478


[Fold 1][Epoch 4/60] Training: 100%|██████████| 638/638 [08:34<00:00,  1.24it/s, loss=0.465] 
[Fold 1][Epoch 4/60] Validation: 100%|██████████| 44/44 [00:09<00:00,  4.66it/s]


[Fold 1] Ep4 - Train: 0.3753 | Val: 0.1391, Acc: 0.9560, F1: 0.9500


[Fold 1][Epoch 5/60] Training: 100%|██████████| 638/638 [08:34<00:00,  1.24it/s, loss=0.774] 
[Fold 1][Epoch 5/60] Validation: 100%|██████████| 44/44 [00:09<00:00,  4.64it/s]


[Fold 1] Ep5 - Train: 0.3110 | Val: 0.1515, Acc: 0.9618, F1: 0.9559


[Fold 1][Epoch 6/60] Training: 100%|██████████| 638/638 [08:35<00:00,  1.24it/s, loss=0.0389]
[Fold 1][Epoch 6/60] Validation: 100%|██████████| 44/44 [00:09<00:00,  4.66it/s]


[Fold 1] Ep6 - Train: 0.2582 | Val: 0.1803, Acc: 0.9484, F1: 0.9424


[Fold 1][Epoch 7/60] Training: 100%|██████████| 638/638 [08:35<00:00,  1.24it/s, loss=0.449]  
[Fold 1][Epoch 7/60] Validation: 100%|██████████| 44/44 [00:09<00:00,  4.64it/s]


[Fold 1] Ep7 - Train: 0.2298 | Val: 0.1683, Acc: 0.9579, F1: 0.9515


[Fold 1][Epoch 8/60] Training: 100%|██████████| 638/638 [08:35<00:00,  1.24it/s, loss=0.0486] 
[Fold 1][Epoch 8/60] Validation: 100%|██████████| 44/44 [00:09<00:00,  4.64it/s]


[Fold 1] Ep8 - Train: 0.1971 | Val: 0.1770, Acc: 0.9560, F1: 0.9493


[Fold 1][Epoch 9/60] Training: 100%|██████████| 638/638 [08:35<00:00,  1.24it/s, loss=1.35]   
[Fold 1][Epoch 9/60] Validation: 100%|██████████| 44/44 [00:09<00:00,  4.63it/s]


[Fold 1] Ep9 - Train: 0.1873 | Val: 0.1793, Acc: 0.9541, F1: 0.9476


[Fold 1][Epoch 10/60] Training:  89%|████████▉ | 571/638 [07:40<00:53,  1.24it/s, loss=0.0474]

In [None]:
# Reload the submission template so this cell does not depend on an earlier copy
test_df = pd.read_csv(test_csv_path)

# Ensemble weights: the "f1" column of the frame returned by trainer.run()
# Manual override example:
# fold_weights = np.array([0.9416, 0.94768, 0.93204])
fold_weights = f1_df["f1"].values

# Ensemble over the fold checkpoints stored under model_save_path
ensembler = Model_Ensemble(
    model_name=model_name,
    num_classes=num_classes,
    drop_out=Drop_out,
    fold_paths_dir=model_save_path,
    fold_weights=fold_weights,
    k_fold=True,
    device=device,
)

# Run inference and write the submission file.
# Test-time augmentation is switched OFF for this run (use_tta=False).
run_inference(
    ensembler=ensembler,
    submission_df=test_df.copy(),  # pass a copy so test_df itself is not handed over
    test_path=test_path,
    img_size=img_size,
    batch_size=BATCH_SIZE,
    num_workers=num_workers,
    use_tta=False,
    save_path=submission_path,     # where the submission CSV is saved
)

Inference: 100%|██████████| 449/449 [06:52<00:00,  1.09it/s]

[✓] Saved submission to: ../data/submission/V17-submission.csv





In [None]:
# Pseudo-labelling stage.
#
# This cell uses the objects built earlier in this notebook (`ensembler`,
# `trainer`, `augmented_save_path`, `test_df`) so that it works under
# Restart Kernel -> Run All. It previously referenced `initial_*`,
# `combined_data_save_dir` and `final_run_name`, which no cell defines.

# Name prefix for the pseudo-labelled dataset; follows the current run.
final_run_name = run_name

# NOTE(review): output location chosen to sit beside the other ../data outputs;
# confirm or change to the directory your pipeline expects.
combined_data_save_dir = "../data/pseudo_label/"
os.makedirs(combined_data_save_dir, exist_ok=True)

# Build the pseudo-labeller around the trained ensemble
labeler = PseudoLabeler(
    ensembler=ensembler,
    device=device,
    img_size=img_size,
    batch_size=BATCH_SIZE,
    num_workers=num_workers
)

# Generate pseudo labels, merge them with the training data, and save the result
final_df, final_image_path = labeler.run(
    original_aug_df=trainer.df,  # trainer's frame; per the original note it carries the strata info (confirm Trainer exposes .df)
    original_aug_path=augmented_save_path,
    test_df=test_df,
    test_path=test_path,
    confidence_threshold=0.97,   # passed to labeler.run as confidence_threshold
    save_base_dir=combined_data_save_dir,
    run_name=f"{final_run_name}-Pseudo-Data"
)