## Settings & Check

In [50]:
# 2. 필요한 라이브러리 설치
# %pip install -q kaggle

# 3. Kaggle API 설정
# import os

!which python
!pip show torch
import torch
print("cuda 버전:", torch.version.cuda)
!echo $PATH
!echo $LD_LIBRARY_PATH

/opt/anaconda3/bin/python
Name: torch
Version: 2.0.1
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3
Location: /opt/anaconda3/lib/python3.11/site-packages
Requires: filelock, jinja2, networkx, sympy, typing-extensions
Required-by: 
cuda 버전: 12.1
/usr/local/cuda-12.2/bin:/usr/local/cuda-12.2/bin:/usr/local/cuda-12.2/bin:/usr/local/cuda-12.2/bin:/home/gpu_04/.vscode-server/cli/servers/Stable-fabdb6a30b49f79a7aba0f2ad9df9b399473380f/server/bin/remote-cli:/opt/anaconda3/bin:/opt/anaconda3/bin:/usr/local/cuda-12.2/bin:/opt/anaconda3/condabin:/usr/local/cuda-12.2/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/home/gpu_04/.vscode-server/data/User/globalStorage/github.copilot-chat/debugCommand
/usr/local/cuda-12.2/lib64


In [1]:
import torch
print("PyTorch 버전:", torch.__version__)
print("CUDA 사용 가능 여부:", torch.cuda.is_available())
print("현재 디바이스:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
print("CUDA 버전:", torch.version.cuda if torch.cuda.is_available() else "None")

# GPU 메모리 단편화 문제 완화
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# GPU 캐시 비우기
import torch
import gc

# 1. 불필요한 변수 삭제
# del variable

# 2. 가비지 컬렉터 실행
gc.collect()

# 3. PyTorch 캐시 메모리 해제
torch.cuda.empty_cache()

# 4. 메모리 사용 상태 출력
print(f"Allocated memory: {torch.cuda.memory_allocated() / (1024 ** 2):.2f} MB")
print(f"Reserved memory: {torch.cuda.memory_reserved() / (1024 ** 2):.2f} MB")

PyTorch 버전: 2.5.1+cu121
CUDA 사용 가능 여부: True
현재 디바이스: NVIDIA RTX A6000
CUDA 버전: 12.1
Allocated memory: 0.00 MB
Reserved memory: 0.00 MB


In [2]:
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from transformers import ViTModel, RobertaModel, RobertaTokenizer
# swin 모델는 transformers에서 불러올 수 있음
from transformers import SwinModel

from datasets import load_dataset

# 시드 고정
pl.seed_everything(42)

  from .autonotebook import tqdm as notebook_tqdm
2025-02-26 23:36:24.737148: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740580584.759778 2777379 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740580584.766639 2777379 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-26 23:36:24.790609: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Seed set to 42


42

## DataSet Structure (Lightning)

### 모든 데이터셋 쌍 (이미지-캡션 5개) 사용

In [3]:
# =============================================================================
# 2. Flickr30k 데이터셋을 Multi-Caption Dataset으로 변환
#    - 원본 record 하나가 한 이미지와 5개의 캡션을 가지므로,
#      각 record에 대해 (record_idx, caption_idx)로 mapping하는 인덱스를 생성합니다.
# =============================================================================
class Flickr30KMultiCaptionDataset(Dataset):
    """
    한 이미지 record의 모든 캡션(총 5개)을 각각 하나의 예제로 반환.
    각 예제는 이미지, 텍스트 토큰, attention mask와 함께 원본 이미지의 인덱스(img_id)를 포함.
    """
    # def __init__(self, hf_dataset, tokenizer, image_transform, max_length=64):
    #     self.hf_dataset = hf_dataset
    #     self.tokenizer = tokenizer
    #     self.image_transform = image_transform
    #     self.max_length = max_length
    #     self.index_map = []  # (record_idx, caption_idx)
    #     for rec_idx, record in enumerate(self.hf_dataset):
    #         captions = record["caption"]
    #         for cap_idx in range(len(captions)):
    #             self.index_map.append((rec_idx, cap_idx))
    def __init__(self, hf_dataset, tokenizer, image_transform, max_length=64):
        """
        Dataset for Flickr30K with only one caption per image.
        
        Args:
            hf_dataset: HuggingFace dataset for Flickr30K
            tokenizer: Text tokenizer (RoBERTa)
            image_transform: Image transformation pipeline
            max_length: Maximum token length for captions
        """
        self.hf_dataset = hf_dataset
        self.tokenizer = tokenizer
        self.image_transform = image_transform
        self.max_length = max_length
        
        # 각 이미지마다 첫 번째 캡션만 사용
        # 또는 랜덤하게 하나의 캡션만 선택하려면 아래 주석 처리된 방식 사용
        self.index_map = [(idx, 0) for idx in range(len(self.hf_dataset))]

    def __len__(self):
        return len(self.index_map)

    def __getitem__(self, idx):
        record_idx, caption_idx = self.index_map[idx]
        record = self.hf_dataset[record_idx]
        pil_image = record["image"]
        caption = record["caption"][caption_idx]

        # 이미지 전처리
        image = self.image_transform(pil_image)

        # 텍스트 토큰화
        tokenized = self.tokenizer(
            caption,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        input_ids = tokenized["input_ids"].squeeze(0)
        attention_mask = tokenized["attention_mask"].squeeze(0)

        return {
            "image": image,
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "img_id": record_idx  # 동일 이미지면 동일한 id 부여
        }

## Data Module

In [4]:
# =============================================================================
# 3. DataModule 구성 (학습/검증/테스트 셋 분리)
# =============================================================================
class Flickr30KDataModule(pl.LightningDataModule):
    def __init__(self,
                 train_dataset_hf,
                 valid_dataset_hf,
                 test_dataset_hf,
                 batch_size=128,
                 num_workers=4,
                 max_length=64):
        super().__init__()
        self.train_dataset_hf = train_dataset_hf
        self.valid_dataset_hf = valid_dataset_hf
        self.test_dataset_hf = test_dataset_hf
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.max_length = max_length

        # 이미지 전처리: Resize, ToTensor, Normalize
        self.image_transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.5, 0.5, 0.5),
                                 std=(0.5, 0.5, 0.5))
        ])
        # 텍스트 토크나이저: roberta-large 사용
        self.tokenizer = RobertaTokenizer.from_pretrained("roberta-large")

    def setup(self, stage=None):
        if stage == "fit" or stage is None:
            self.train_dataset = Flickr30KMultiCaptionDataset(
                self.train_dataset_hf["test"],  # 수정: ["test"] 접근
                tokenizer=self.tokenizer,
                image_transform=self.image_transform,
                max_length=self.max_length
            )
            self.valid_dataset = Flickr30KMultiCaptionDataset(
                self.valid_dataset_hf["test"],  # 수정: ["test"] 접근
                tokenizer=self.tokenizer,
                image_transform=self.image_transform,
                max_length=self.max_length
            )
        if stage == "test" or stage is None:
            self.test_dataset = Flickr30KMultiCaptionDataset(
                self.test_dataset_hf["test"],  # 수정: ["test"] 접근
                tokenizer=self.tokenizer,
                image_transform=self.image_transform,
                max_length=self.max_length
            )


    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers
        )

    def val_dataloader(self):
        return DataLoader(
            self.valid_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers
        )

## Model Structure (Lightning)

In [8]:
# =============================================================================
# 4. 다중 Positive Contrastive Loss 함수 구현
#    - 각 anchor(텍스트 혹은 이미지)마다, 동일 img_id를 가진 모든 다른 샘플들을 positive로 취급.
#    - loss = logsumexp(전체) – log(sum(exp(similarity)) over positive)
# =============================================================================
def multi_positive_contrastive_loss(similarity_matrix):
    """
    표준 InfoNCE 손실 함수.
    similarity_matrix: (B, B)
    """
    # ground truth는 대각선 인덱스 (자기 자신과 매칭)
    labels = torch.arange(similarity_matrix.size(0), device=similarity_matrix.device)
    loss_i2t = F.cross_entropy(similarity_matrix, labels)
    loss_t2i = F.cross_entropy(similarity_matrix.t(), labels)
    return (loss_i2t + loss_t2i) / 2.0

# =============================================================================
# 5. Lightning Module 구성
#    - 이미지 인코더는 SOTA인 Swin (또는 ViT) 사용, 텍스트 인코더는 roberta-large 사용
#    - 인코딩 후 projection layer를 거쳐 L2 normalization 수행
#    - forward() 내부에서 이미지 인코더가 swin인 경우에는 평균 풀링으로 feature를 추출
#    - training/validation 시, 배치 내 img_id를 활용하여 positive mask를 생성한 후 loss를 계산함
# =============================================================================
class ImageTextLightningModel(pl.LightningModule):
    def __init__(self,
                 image_encoder_name="microsoft/swin-base-patch4-window7-224",
                 text_encoder_name="roberta-large",
                 embed_dim=256,
                 temperature=0.07,
                 learning_rate=1e-5,
                 vit_train_layers=12,        # 이미지 인코더에서 fine-tuning할 마지막 레이어 수
                 roberta_train_layers=12):   # 텍스트 인코더에서 fine-tuning할 마지막 레이어 수
        super().__init__()
        self.save_hyperparameters()

        # 1) 이미지 인코더: swin 또는 ViT 선택
        if "swin" in image_encoder_name:
            self.image_encoder = SwinModel.from_pretrained(image_encoder_name)
        else:
            self.image_encoder = ViTModel.from_pretrained(image_encoder_name)

        # 2) 텍스트 인코더: roberta-large 사용
        self.text_encoder = RobertaModel.from_pretrained(text_encoder_name)

        # 3) Projection layers (각 인코더의 hidden_size에 맞춰 구성)
        image_hidden_size = self.image_encoder.config.hidden_size
        text_hidden_size = self.text_encoder.config.hidden_size
        self.image_proj = nn.Linear(image_hidden_size, embed_dim)
        self.text_proj = nn.Linear(text_hidden_size, embed_dim)

        self.temperature = temperature
        self.learning_rate = learning_rate

        # 검증/테스트 시 중간 결과 저장용
        self._val_outputs = []
        self.test_image_embeds = []
        self.test_text_embeds = []

        # Freeze & Unfreeze: 전체 파라미터 동결 후, 마지막 몇 레이어만 unfreeze
        self.freeze_image_encoder_layers(train_layers=vit_train_layers)
        self.freeze_roberta_layers(train_layers=roberta_train_layers)

    def freeze_image_encoder_layers(self, train_layers):
        # 전체 파라미터 동결
        for param in self.image_encoder.parameters():
            param.requires_grad = False
        # ViT의 경우: self.image_encoder.encoder.layer, Swin의 경우: self.image_encoder.encoder.layers 로 구성됨
        if hasattr(self.image_encoder.encoder, "layer"):
            layers = self.image_encoder.encoder.layer
        elif hasattr(self.image_encoder.encoder, "layers"):
            layers = self.image_encoder.encoder.layers
        else:
            layers = []
        total_layers = len(layers)
        for layer_idx in range(max(0, total_layers - train_layers), total_layers):
            for param in layers[layer_idx].parameters():
                param.requires_grad = True
        # 추가적으로 layernorm이나 pooler가 있다면 unfreeze
        if hasattr(self.image_encoder, "layernorm"):
            for param in self.image_encoder.layernorm.parameters():
                param.requires_grad = True

    def freeze_roberta_layers(self, train_layers):
        for param in self.text_encoder.parameters():
            param.requires_grad = False
        total_layers = len(self.text_encoder.encoder.layer)
        for layer_idx in range(max(0, total_layers - train_layers), total_layers):
            for param in self.text_encoder.encoder.layer[layer_idx].parameters():
                param.requires_grad = True
        if hasattr(self.text_encoder, "pooler"):
            for param in self.text_encoder.pooler.parameters():
                param.requires_grad = True

    def forward(self, images, input_ids, attention_mask):
        # --- 이미지 인코더 ---
        image_outputs = self.image_encoder(pixel_values=images)
        # swin인 경우: 평균 풀링, 그 외에는 [CLS] 토큰 사용
        if self.hparams.image_encoder_name.startswith("microsoft/swin"):
            image_feat = image_outputs.last_hidden_state.mean(dim=1)
        else:
            image_feat = image_outputs.last_hidden_state[:, 0, :]
        image_embeds = self.image_proj(image_feat)
        image_embeds = F.normalize(image_embeds, p=2, dim=-1)

        # --- 텍스트 인코더 ---
        text_outputs = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        text_feat = text_outputs.last_hidden_state[:, 0, :]
        text_embeds = self.text_proj(text_feat)
        text_embeds = F.normalize(text_embeds, p=2, dim=-1)

        return image_embeds, text_embeds

    def compute_contrastive_loss(self, image_embeds, text_embeds, img_ids):
        """
        싱글 캡션 모델에서는 단순히 배치 내 쌍에 대한 대각선 요소를 positive로 취급합니다.
        img_ids는 더 이상 필요하지 않지만, 기존 호출 코드와 호환성을 위해 파라미터는 유지합니다.
        """
        # text→image 방향: (B, B) similarity matrix
        sim_matrix = torch.matmul(text_embeds, image_embeds.t()) / self.temperature
        # 싱글 캡션이므로 단순 InfoNCE 손실 사용
        return multi_positive_contrastive_loss(sim_matrix)

    def training_step(self, batch, batch_idx):
        images = batch["image"]
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        img_ids = batch["img_id"]
        image_embeds, text_embeds = self(images, input_ids, attention_mask)
        loss = self.compute_contrastive_loss(image_embeds, text_embeds, img_ids)
        self.log("train_loss", loss, prog_bar=True, on_step=True, on_epoch=True)
        return loss

    def on_train_epoch_start(self):
        optimizer = self.optimizers()
        if isinstance(optimizer, list):
            optimizer = optimizer[0]
        for i, param_group in enumerate(optimizer.param_groups):
            lr = param_group["lr"]
            self.log(f"lr_layer_{i}", lr, prog_bar=True)

    def validation_step(self, batch, batch_idx):
        images = batch["image"]
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        img_ids = batch["img_id"]
        image_embeds, text_embeds = self(images, input_ids, attention_mask)
        val_loss = self.compute_contrastive_loss(image_embeds, text_embeds, img_ids)
        self.log("val_loss_step", val_loss, prog_bar=False)
        return {
            "val_loss": val_loss,
            "image_embeds": image_embeds,
            "text_embeds": text_embeds,
            "img_ids": img_ids
        }

    def on_validation_batch_end(self, outputs, batch, batch_idx, dataloader_idx=0):
        self._val_outputs.append(outputs)

    def on_validation_epoch_end(self):
        # 평균 validation loss 로깅
        val_losses = torch.stack([o["val_loss"] for o in self._val_outputs])
        avg_val_loss = val_losses.mean()
        self.log("val_loss", avg_val_loss, prog_bar=True)

        # 배치별 결과를 모두 모아 recall@K 계산 (text→image retrieval)
        all_image_embeds = torch.cat([o["image_embeds"] for o in self._val_outputs], dim=0)
        all_text_embeds  = torch.cat([o["text_embeds"]  for o in self._val_outputs], dim=0)
        similarity_matrix = torch.matmul(all_text_embeds, all_image_embeds.t())
        recall_at_k = self.compute_recall(similarity_matrix, ks=[1,5,10])
        for k, v in recall_at_k.items():
            self.log(f"val_recall@{k}", v, prog_bar=True)
        self._val_outputs.clear()

    def test_step(self, batch, batch_idx):
        images = batch["image"]
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        image_embeds, text_embeds = self(images, input_ids, attention_mask)
        return {"image_embeds": image_embeds, "text_embeds": text_embeds}

    def on_test_batch_end(self, outputs, batch, batch_idx, dataloader_idx=0):
        self.test_image_embeds.append(outputs["image_embeds"])
        self.test_text_embeds.append(outputs["text_embeds"])

    def on_test_epoch_end(self):
        all_image_embeds = torch.cat(self.test_image_embeds, dim=0)
        all_text_embeds  = torch.cat(self.test_text_embeds, dim=0)
        similarity_matrix = torch.matmul(all_text_embeds, all_image_embeds.t())
        recall_at_k = self.compute_recall(similarity_matrix, ks=[1,5,10])
        for k, v in recall_at_k.items():
            self.log(f"test_recall@{k}", v, prog_bar=True)
        print(f"[on_test_epoch_end] Test Recall: {recall_at_k}")
        self.test_image_embeds.clear()
        self.test_text_embeds.clear()

    def compute_recall(self, similarity_matrix, ks=[1,5,10]):
        device = similarity_matrix.device
        n = similarity_matrix.size(0)
        # 대각선 요소가 정답 (각 텍스트는 같은 인덱스의 이미지에 매칭)
        ground_truth = torch.arange(n, device=device)
        sorted_indices = similarity_matrix.argsort(dim=1, descending=True)
        recall_scores = {}
        for k in ks:
            top_k = sorted_indices[:, :k]
            match = (top_k == ground_truth.unsqueeze(1)).any(dim=1)
            recall_scores[k] = match.float().mean().item()
        return recall_scores

    def configure_optimizers(self):
        base_lr = self.learning_rate
        layerwise_decay = 0.9
        optimizer_params = []

        # --- 이미지 인코더: layer-wise learning rate decay ---
        if hasattr(self.image_encoder.encoder, "layer"):
            image_layers = self.image_encoder.encoder.layer
        elif hasattr(self.image_encoder.encoder, "layers"):
            image_layers = self.image_encoder.encoder.layers
        else:
            image_layers = []
        total_image_layers = len(image_layers)
        train_image_layers = self.hparams.vit_train_layers
        for i, layer_idx in enumerate(range(max(0, total_image_layers - train_image_layers), total_image_layers)):
            lr = base_lr * (layerwise_decay ** i)
            optimizer_params.append({"params": image_layers[layer_idx].parameters(), "lr": lr})

        # --- 텍스트 인코더: layer-wise learning rate decay ---
        total_text_layers = len(self.text_encoder.encoder.layer)
        train_text_layers = self.hparams.roberta_train_layers
        for i, layer_idx in enumerate(range(max(0, total_text_layers - train_text_layers), total_text_layers)):
            lr = base_lr * (layerwise_decay ** i)
            optimizer_params.append({"params": self.text_encoder.encoder.layer[layer_idx].parameters(), "lr": lr})

        # --- Projection layers: 새로 학습되는 부분은 더 큰 lr 적용 ---
        optimizer_params.append({"params": self.image_proj.parameters(), "lr": base_lr * 5})
        optimizer_params.append({"params": self.text_proj.parameters(), "lr": base_lr * 5})

        optimizer = torch.optim.AdamW(optimizer_params, lr=base_lr, weight_decay=1e-4)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=self.trainer.max_epochs, eta_min=1e-7
        )
        return [optimizer], [scheduler]

## Train

In [9]:
# =============================================================================
# 6. 데이터셋 로드 및 DataModule 생성
# =============================================================================
# 허깅페이스 데이터셋 로드 및 split 별로 filtering (split 필드 기준)
dataset = load_dataset("nlphuji/flickr30k")
train_dataset = dataset.filter(lambda x: x["split"] == "train")
valid_dataset = dataset.filter(lambda x: x["split"] == "val")
test_dataset  = dataset.filter(lambda x: x["split"] == "test")

data_module = Flickr30KDataModule(
    train_dataset_hf=train_dataset,
    valid_dataset_hf=valid_dataset,
    test_dataset_hf=test_dataset,
    batch_size=128,
    num_workers=4,
    max_length=64
)
data_module.setup("fit")

# =============================================================================
# 7. 모델 초기화 및 학습 설정
# =============================================================================
model = ImageTextLightningModel(
    image_encoder_name="microsoft/swin-base-patch4-window7-224",
    text_encoder_name="roberta-large",
    embed_dim=256,
    temperature=0.07,
    learning_rate=1e-5,
    vit_train_layers=0,
    roberta_train_layers=0
)

logger = TensorBoardLogger(
    save_dir="ImageRetrieveLogs",
    name="ImageRetrieve_MultiPos"
)
# checkpoint_callback = ModelCheckpoint(
#     monitor="val_recall@5",
#     mode="max",
#     dirpath="checkpoints_multi",
#     filename="best-checkpoint",
#     save_top_k=3,
#     save_last=True
# )
early_stopping_callback = EarlyStopping(
    monitor="val_recall@5",
    patience=5,
    mode="max"
)

trainer = pl.Trainer(
    max_epochs=200,   # 에폭 수: 하드웨어 및 실험에 따라 조정 가능
    accelerator="gpu",
    devices=1,
    precision="16-mixed",
    logger=logger,
    callbacks=early_stopping_callback,
    enable_checkpointing=False
)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [10]:
# 모델 학습
trainer.fit(model, data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [4]



  | Name          | Type         | Params | Mode 
-------------------------------------------------------
0 | image_encoder | SwinModel    | 86.7 M | eval 
1 | text_encoder  | RobertaModel | 355 M  | eval 
2 | image_proj    | Linear       | 262 K  | train
3 | text_proj     | Linear       | 262 K  | train
-------------------------------------------------------
1.6 M     Trainable params
441 M     Non-trainable params
442 M     Total params
1,770.511 Total estimated model params size (MB)
2         Modules in train mode
927       Modules in eval mode


Epoch 8:  26%|▎| 60/227 [00:19<00:54,  3.06it/s, v_num=1, train_loss_step=1.350, val_loss=1.740, val_recall@1=0.294, val_recall@5=0.587, val_recall@10=


Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

## Test 및 체크포인트 수정

In [28]:
from pytorch_lightning import Trainer

# 저장된 체크포인트 파일 경로
checkpoint_path = "/home/gpu_04/jw2020/ImageRetrieving/checkpoints_7th/best-checkpoint.ckpt"

# 모델 로드
model = ImageTextLightningModel.load_from_checkpoint(checkpoint_path)

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# test
trainer.test(model, data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [4]


Testing DataLoader 0: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  5.48it/s][on_test_epoch_end] Test Recall: {1: 0.5160000324249268, 5: 0.8110000491142273, 10: 0.8860000371932983}
Testing DataLoader 0: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  5.42it/s]


[{'test_recall@1': 0.5160000324249268,
  'test_recall@5': 0.8110000491142273,
  'test_recall@10': 0.8860000371932983}]