## Settings & Check

In [4]:
# 2. 필요한 라이브러리 설치
# %pip install -q kaggle

# 3. Kaggle API 설정
# import os

!which python
!pip show torch
import torch
print("cuda 버전:", torch.version.cuda)
!echo $PATH
!echo $LD_LIBRARY_PATH

/opt/anaconda3/bin/python
Name: torch
Version: 2.0.1
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3
Location: /opt/anaconda3/lib/python3.11/site-packages
Requires: filelock, jinja2, networkx, sympy, typing-extensions
Required-by: 
cuda 버전: 12.1
/usr/local/cuda-12.2/bin:/usr/local/cuda-12.2/bin:/usr/local/cuda-12.2/bin:/usr/local/cuda-12.2/bin:/home/gpu_04/.vscode-server/cli/servers/Stable-fabdb6a30b49f79a7aba0f2ad9df9b399473380f/server/bin/remote-cli:/opt/anaconda3/bin:/opt/anaconda3/bin:/usr/local/cuda-12.2/bin:/opt/anaconda3/condabin:/usr/local/cuda-12.2/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/home/gpu_04/.vscode-server/data/User/globalStorage/github.copilot-chat/debugCommand
/usr/local/cuda-12.2/lib64


In [5]:
import torch
print("PyTorch 버전:", torch.__version__)
print("CUDA 사용 가능 여부:", torch.cuda.is_available())
print("현재 디바이스:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
print("CUDA 버전:", torch.version.cuda if torch.cuda.is_available() else "None")

# GPU 메모리 단편화 문제 완화
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# GPU 캐시 비우기
import torch
import gc

# 1. 불필요한 변수 삭제
# del variable

# 2. 가비지 컬렉터 실행
gc.collect()

# 3. PyTorch 캐시 메모리 해제
torch.cuda.empty_cache()

# 4. 메모리 사용 상태 출력
print(f"Allocated memory: {torch.cuda.memory_allocated() / (1024 ** 2):.2f} MB")
print(f"Reserved memory: {torch.cuda.memory_reserved() / (1024 ** 2):.2f} MB")

PyTorch 버전: 2.5.1+cu121
CUDA 사용 가능 여부: True
현재 디바이스: NVIDIA RTX A6000
CUDA 버전: 12.1
Allocated memory: 0.00 MB
Reserved memory: 0.00 MB


In [15]:
from datasets import load_dataset

# Flickr30k 데이터셋 다운로드
dataset = load_dataset("nlphuji/flickr30k")
train_dataset = dataset.filter(lambda x: x["split"] == "train")
valid_dataset = dataset.filter(lambda x: x["split"] == "val")
test_dataset = dataset.filter(lambda x: x["split"] == "test")

In [72]:
print(train_dataset)
print(valid_dataset)
print(test_dataset)

DatasetDict({
    test: Dataset({
        features: ['image', 'caption', 'sentids', 'split', 'img_id', 'filename'],
        num_rows: 29000
    })
})
DatasetDict({
    test: Dataset({
        features: ['image', 'caption', 'sentids', 'split', 'img_id', 'filename'],
        num_rows: 1014
    })
})
DatasetDict({
    test: Dataset({
        features: ['image', 'caption', 'sentids', 'split', 'img_id', 'filename'],
        num_rows: 1000
    })
})


In [99]:
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Sampler
import random

from torchvision import transforms
from transformers import ViTModel, RobertaModel, RobertaTokenizer

# 시드 고정
pl.seed_everything(42)

Seed set to 42


42

## DataSet Structure (Lightning)

### 모든 데이터셋 쌍 (이미지-캡션 5개) 사용

In [100]:
class Flickr30KCustomDataset(Dataset):
    """
    Flickr30K에서 한 이미지당 최대 5개 캡션을 모두 사용하여
    (이미지, 캡션) 쌍을 중복 생성해 총 5배의 데이터로 만든다.
    """
    def __init__(self, hf_dataset, tokenizer, image_transform, max_length=64):
        self.tokenizer = tokenizer
        self.image_transform = image_transform
        self.max_length = max_length

        self.pairs = []  # (PIL.Image, caption) 쌍 리스트
        for item in hf_dataset:
            pil_image = item["image"]
            captions = item["caption"]  # 최대 5개
            # 5개 캡션 각각에 대해 중복 샘플링
            for c in captions:
                self.pairs.append((pil_image, c))

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        pil_image, caption = self.pairs[idx]

        # 1) 이미지 변환
        image = self.image_transform(pil_image)

        # 2) 캡션 토큰화 (RoBERTa)
        tokenized = self.tokenizer(
            caption,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        input_ids = tokenized["input_ids"].squeeze(0)       # (seq_len,)
        attention_mask = tokenized["attention_mask"].squeeze(0)  # (seq_len,)

        return {
            "image": image,  # (3, 224, 224)
            "input_ids": input_ids,
            "attention_mask": attention_mask
        }

## Data Module

In [106]:
class UniqueImageBatchSampler(Sampler):
    """
    같은 이미지가 한 배치 내에 두 번 포함되지 않도록 하는 Batch Sampler
    """
    def __init__(self, dataset, batch_size):
        self.batch_size = batch_size
        self.image_to_indices = {}

        # 같은 이미지를 가진 인덱스를 그룹화
        for idx, (image, _) in enumerate(dataset.pairs):
            image_id = id(image)  # 이미지 객체의 ID로 그룹화
            if image_id not in self.image_to_indices:
                self.image_to_indices[image_id] = []
            self.image_to_indices[image_id].append(idx)

        # 이미지별 그룹 리스트
        self.image_groups = list(self.image_to_indices.values())

        # 모든 인덱스를 하나의 리스트로 모으고 섞음
        self.indices = [idx for group in self.image_groups for idx in group]
        self.num_batches = len(self.indices) // self.batch_size

    def __iter__(self):
        # 샘플 순서를 무작위로 섞은 후 배치를 만듦
        indices = self.indices.copy()
        random.shuffle(indices)
        for i in range(self.num_batches):
            batch = indices[i * self.batch_size : (i + 1) * self.batch_size]
            yield batch

    def __len__(self):
        return self.num_batches

In [107]:
from transformers import ViTModel, RobertaModel, RobertaTokenizer

class Flickr30KDataModule(pl.LightningDataModule):
    def __init__(self,
                 train_dataset_hf,
                 valid_dataset_hf,
                 test_dataset_hf,
                 batch_size=32,
                 num_workers=4):
        super().__init__()
        self.train_dataset_hf = train_dataset_hf
        self.valid_dataset_hf = valid_dataset_hf
        self.test_dataset_hf = test_dataset_hf
        self.batch_size = batch_size
        self.num_workers = num_workers

        # 이미지 전처리 (ViT를 위한 224x224)
        self.image_transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.5, 0.5, 0.5),
                                 std=(0.5, 0.5, 0.5))
        ])

        # RoBERTa 토크나이저
        self.tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

    def setup(self, stage=None):
        """
        - train_dataset_hf / valid_dataset_hf / test_dataset_hf 는
          이미 split 기준으로 필터링된 DatasetDict 형태일 것이라 가정.
        """
        if stage == "fit" or stage is None:
            self.train_dataset = Flickr30KCustomDataset(
                self.train_dataset_hf["test"],  # 실제 train split
                tokenizer=self.tokenizer,
                image_transform=self.image_transform
            )
            self.valid_dataset = Flickr30KCustomDataset(
                self.valid_dataset_hf["test"],
                tokenizer=self.tokenizer,
                image_transform=self.image_transform
            )
        if stage == "test" or stage is None:
            self.test_dataset = Flickr30KCustomDataset(
                self.test_dataset_hf["test"],
                tokenizer=self.tokenizer,
                image_transform=self.image_transform
            )

    def train_dataloader(self):
        # 한 batch에서 같은 이미지가 중복되지 않도록 UniqueImageBatchSampler 사용
        train_sampler = UniqueImageBatchSampler(self.train_dataset, batch_size=self.batch_size)
        return DataLoader(
            self.train_dataset,
            # batch_size=self.batch_size,   #! batch_sampler option is mutually exclusive with batch_size, shuffle, sampler, and drop_last
            # shuffle=True,
            batch_sampler=train_sampler,    # batch_sampler
            num_workers=self.num_workers
        )

    def val_dataloader(self):
        valid_sampler = UniqueImageBatchSampler(self.valid_dataset, batch_size=self.batch_size)
        return DataLoader(
            self.valid_dataset,
            # batch_size=self.batch_size,
            # shuffle=False,
            batch_sampler=valid_sampler,
            num_workers=self.num_workers
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers
        )

## Model Structure (Lightning)

In [108]:
class ImageTextLightningModel(pl.LightningModule):
    def __init__(self,
                 image_encoder_name="google/vit-base-patch16-224",
                 text_encoder_name="roberta-base",
                 embed_dim=256,
                 temperature=0.07,
                 learning_rate=5e-5):
        super().__init__()
        self.save_hyperparameters()

        # 1) Image Encoder (ViT)
        self.image_encoder = ViTModel.from_pretrained(image_encoder_name)

        # 2) Text Encoder (RoBERTa)
        self.text_encoder = RobertaModel.from_pretrained(text_encoder_name)

        # 3) Projection layers: 768 -> embed_dim
        self.image_proj = nn.Linear(768, embed_dim)
        self.text_proj = nn.Linear(768, embed_dim)

        self.temperature = temperature
        self.learning_rate = learning_rate

        # (테스트 시) 임베딩 저장 버퍼
        self.test_image_embeds = []
        self.test_text_embeds = []
        
        # Validation 결과 저장 버퍼
        self._val_outputs = []
        

    def forward(self, images, input_ids, attention_mask):
        # --- 이미지 임베딩 ---
        image_outputs = self.image_encoder(pixel_values=images)
        image_cls = image_outputs.last_hidden_state[:, 0, :]
        image_embeds = self.image_proj(image_cls)
        image_embeds = F.normalize(image_embeds, p=2, dim=-1)

        # --- 텍스트 임베딩 ---
        text_outputs = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        text_cls = text_outputs.last_hidden_state[:, 0, :]
        text_embeds = self.text_proj(text_cls)
        text_embeds = F.normalize(text_embeds, p=2, dim=-1)

        return image_embeds, text_embeds

    def compute_contrastive_loss(self, image_embeds, text_embeds):
        """
        Symmetric InfoNCE Loss
        """
        logits_per_image = image_embeds @ text_embeds.t() / self.temperature
        logits_per_text = logits_per_image.t()

        batch_size = image_embeds.size(0)
        labels = torch.arange(batch_size, device=self.device)

        loss_i = F.cross_entropy(logits_per_image, labels)
        loss_t = F.cross_entropy(logits_per_text, labels)
        loss = (loss_i + loss_t) / 2.0
        return loss

    # -----------------------
    # Training
    # -----------------------
    def training_step(self, batch, batch_idx):
        images = batch["image"]
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]

        image_embeds, text_embeds = self(images, input_ids, attention_mask)
        loss = self.compute_contrastive_loss(image_embeds, text_embeds)
        self.log("train_loss", loss, prog_bar=True, on_epoch=True, on_step=True)
        return loss

    # -----------------------
    # Validation
    # -----------------------
    def validation_step(self, batch, batch_idx):
        images = batch["image"]
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]

        image_embeds, text_embeds = self(images, input_ids, attention_mask)
        val_loss = self.compute_contrastive_loss(image_embeds, text_embeds)
        self.log("val_loss_step", val_loss, prog_bar=False, on_epoch=False, on_step=True)

        return {
            "val_loss": val_loss,
            "image_embeds": image_embeds,
            "text_embeds": text_embeds
        }
    
    def on_validation_batch_end(self, outputs, batch, batch_idx, dataloader_idx=0):
        """
        validation_step에서 반환된 outputs를 모아서
        에폭 종료 시점(on_validation_epoch_end)에서 사용.
        """
        self._val_outputs.append(outputs)

    def on_validation_epoch_end(self):
        """
        에폭 마지막에 축적한 self._val_outputs를 사용해
        (1) val_loss 평균
        (2) Recall@K
        를 계산
        """
        val_losses = torch.stack([o["val_loss"] for o in self._val_outputs])
        avg_val_loss = val_losses.mean()
        self.log("val_loss", avg_val_loss, prog_bar=True)

        # Recall@K 계산
        all_image_embeds = torch.cat([o["image_embeds"] for o in self._val_outputs], dim=0)
        all_text_embeds  = torch.cat([o["text_embeds"] for o in self._val_outputs], dim=0)

        similarity_matrix = all_text_embeds @ all_image_embeds.t()
        recall_at_k = self.compute_recall(similarity_matrix, ks=[1,5,10])
        for k, v in recall_at_k.items():
            self.log(f"val_recall@{k}", v, prog_bar=True)
        self.log("val_recall@5", recall_at_k[5], prog_bar=True)  # 체크포인트 모니터

        # 버퍼 비우기
        self._val_outputs.clear()


    # -----------------------
    # Test
    # -----------------------
    def test_step(self, batch, batch_idx):
        images = batch["image"]
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        image_embeds, text_embeds = self(images, input_ids, attention_mask)
        return {"image_embeds": image_embeds, "text_embeds": text_embeds}

    def on_test_batch_end(self, outputs, batch, batch_idx, dataloader_idx=0):
        self.test_image_embeds.append(outputs["image_embeds"])
        self.test_text_embeds.append(outputs["text_embeds"])

    def on_test_epoch_end(self):
        all_image_embeds = torch.cat(self.test_image_embeds, dim=0)
        all_text_embeds  = torch.cat(self.test_text_embeds, dim=0)
        similarity_matrix = all_text_embeds @ all_image_embeds.t()
        recall_at_k = self.compute_recall(similarity_matrix, ks=[1,5,10])
        for k, v in recall_at_k.items():
            self.log(f"test_recall@{k}", v, prog_bar=True)
        print(f"[on_test_epoch_end] Test Recall: {recall_at_k}")

        self.test_image_embeds.clear()
        self.test_text_embeds.clear()

    # -----------------------
    # 공통 함수
    # -----------------------
    def compute_recall(self, similarity_matrix, ks=[1,5,10]):
        """
        similarity_matrix: (N, N) => row i: text i, col j: image j
        대각선이 정답
        """
        device = similarity_matrix.device
        n = similarity_matrix.size(0)
        ground_truth = torch.arange(n, device=device)

        sorted_indices = similarity_matrix.argsort(dim=1, descending=True)
        recall_scores = {}
        for k in ks:
            top_k = sorted_indices[:, :k]
            match = (top_k == ground_truth.unsqueeze(1)).any(dim=1)
            recall_scores[k] = match.float().mean().item()
        return recall_scores

    # -----------------------
    # Optimizer
    # -----------------------
    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=self.trainer.max_epochs
        )
        return [optimizer], [scheduler]

## Train

In [109]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

# DataModule 생성
data_module = Flickr30KDataModule(
    train_dataset_hf=train_dataset,
    valid_dataset_hf=valid_dataset,
    test_dataset_hf=test_dataset,
    batch_size=128,  # contrastive learning 성능 위해 큰 배치 사용 가능
    num_workers=4
)
data_module.setup("fit")

# 모델 초기화
model = ImageTextLightningModel(
    image_encoder_name="google/vit-base-patch16-224",
    text_encoder_name="roberta-base",
    embed_dim=256,
    temperature=0.07,
    learning_rate=5e-5
)

# 로거와 콜백 설정
logger = TensorBoardLogger(
    save_dir="ImageRetrieveLogs",
    name="ImageRetrieve_ValLoss_Recall"
)

checkpoint_callback = ModelCheckpoint(
    monitor="val_recall@5",  # monitor: Recall@5
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k=3,
    mode="max",        # recall은 클수록 좋다
    save_last=True
)

early_stopping_callback = EarlyStopping(
    monitor="val_recall@5",
    patience=5,
    mode="max"
)

trainer = pl.Trainer(
    max_epochs=150,
    accelerator="gpu",
    devices=1,
    precision="16-mixed",
    logger=logger,
    callbacks=[checkpoint_callback, early_stopping_callback]
)

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [None]:
# 모델 학습
trainer.fit(model, data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [4]

  | Name          | Type         | Params | Mode 
-------------------------------------------------------
0 | image_encoder | ViTModel     | 86.4 M | eval 
1 | text_encoder  | RobertaModel | 124 M  | eval 
2 | image_proj    | Linear       | 196 K  | train
3 | text_proj     | Linear       | 196 K  | train
-------------------------------------------------------
211 M     Trainable params
0         Non-trainable params
211 M     Total params
845.714   Total estimated model params size (MB)
2         Modules in train mode
455       Modules in eval mode


Epoch 2:  26%|▎| 291/1132 [02:02<05:55,  2.37it/s, v_num=3, train_loss_step=0.194, val_loss=2.090, val_recall@1=0.115, val_recall@5=0.508, val_recall@10=0.654, train_loss_epo

## Test 및 체크포인트 수정

In [69]:
from pytorch_lightning import Trainer

# 저장된 체크포인트 파일 경로
checkpoint_path = "/home/gpu_04/jw2020/ImageRetrieving/checkpoints/last.ckpt"

# 모델 로드
model = ImageTextLightningModel.load_from_checkpoint(checkpoint_path)

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [111]:
# test
trainer.test(model, data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [4]


Testing DataLoader 0: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:05<00:00,  7.14it/s][on_test_epoch_end] Test Recall: {1: 0.11659999936819077, 5: 0.5389999747276306, 10: 0.6805999875068665}
Testing DataLoader 0: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:05<00:00,  7.12it/s]


[{'test_recall@1': 0.11659999936819077,
  'test_recall@5': 0.5389999747276306,
  'test_recall@10': 0.6805999875068665}]