## Settings & Check

In [36]:
# 2. 필요한 라이브러리 설치
# %pip install -q kaggle

# 3. Kaggle API 설정
# import os

!which python
!pip show torch
import torch
print("cuda 버전:", torch.version.cuda)
!echo $PATH
!echo $LD_LIBRARY_PATH

/opt/anaconda3/bin/python
Name: torch
Version: 2.0.1
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3
Location: /opt/anaconda3/lib/python3.11/site-packages
Requires: filelock, jinja2, networkx, sympy, typing-extensions
Required-by: 
cuda 버전: 12.1
/usr/local/cuda-12.2/bin:/usr/local/cuda-12.2/bin:/usr/local/cuda-12.2/bin:/usr/local/cuda-12.2/bin:/home/gpu_04/.vscode-server/cli/servers/Stable-fabdb6a30b49f79a7aba0f2ad9df9b399473380f/server/bin/remote-cli:/opt/anaconda3/bin:/opt/anaconda3/bin:/usr/local/cuda-12.2/bin:/opt/anaconda3/condabin:/usr/local/cuda-12.2/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/home/gpu_04/.vscode-server/data/User/globalStorage/github.copilot-chat/debugCommand
/usr/local/cuda-12.2/lib64


In [37]:
import torch
print("PyTorch 버전:", torch.__version__)
print("CUDA 사용 가능 여부:", torch.cuda.is_available())
print("현재 디바이스:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
print("CUDA 버전:", torch.version.cuda if torch.cuda.is_available() else "None")

# GPU 메모리 단편화 문제 완화
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# GPU 캐시 비우기
import torch
import gc

# 1. 불필요한 변수 삭제
# del variable

# 2. 가비지 컬렉터 실행
gc.collect()

# 3. PyTorch 캐시 메모리 해제
torch.cuda.empty_cache()

# 4. 메모리 사용 상태 출력
print(f"Allocated memory: {torch.cuda.memory_allocated() / (1024 ** 2):.2f} MB")
print(f"Reserved memory: {torch.cuda.memory_reserved() / (1024 ** 2):.2f} MB")

PyTorch 버전: 2.5.1+cu121
CUDA 사용 가능 여부: True
현재 디바이스: NVIDIA RTX A6000
CUDA 버전: 12.1
Epoch 6: 100%|█| 227/227 [1:18:46<00:00,  0.05it/s, v_num=0, train_loss_step=0.524, val_loss=1.030, val_recall@1=0.414, val_recall@5=0.745, val_recall@10=0.849, lr_layer_0=9.98e-6, lr_laye
Allocated memory: 0.00 MB
Reserved memory: 6.00 MB


In [38]:
from datasets import load_dataset

# Flickr30k 데이터셋 다운로드
dataset = load_dataset("nlphuji/flickr30k")
train_dataset = dataset.filter(lambda x: x["split"] == "train")
valid_dataset = dataset.filter(lambda x: x["split"] == "val")
test_dataset = dataset.filter(lambda x: x["split"] == "test")

In [39]:
print(train_dataset)
print(valid_dataset)
print(test_dataset)

DatasetDict({
    test: Dataset({
        features: ['image', 'caption', 'sentids', 'split', 'img_id', 'filename'],
        num_rows: 29000
    })
})
DatasetDict({
    test: Dataset({
        features: ['image', 'caption', 'sentids', 'split', 'img_id', 'filename'],
        num_rows: 1014
    })
})
DatasetDict({
    test: Dataset({
        features: ['image', 'caption', 'sentids', 'split', 'img_id', 'filename'],
        num_rows: 1000
    })
})


In [40]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import random

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from torchvision import transforms
from transformers import ViTModel, RobertaModel, RobertaTokenizer
from datasets import load_dataset

# 시드 고정
pl.seed_everything(42)

Seed set to 42


42

## DataSet Structure (Lightning)

### 모든 데이터셋 쌍 (이미지-캡션 5개) 사용

In [41]:
class Flickr30KSingleCaptionDataset(Dataset):
    """
    한 이미지당 캡션 1개만(무작위) 사용
    """
    def __init__(self, hf_dataset, tokenizer, image_transform, max_length=64):
        self.dataset = hf_dataset
        self.tokenizer = tokenizer
        self.image_transform = image_transform
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        data = self.dataset[idx]
        pil_image = data["image"]
        captions = data["caption"]
        caption = random.choice(captions)  # 5개 중 1개 무작위 선택

        # 이미지 전처리
        image = self.image_transform(pil_image)

        # 텍스트 토큰화
        tokenized = self.tokenizer(
            caption,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        input_ids = tokenized["input_ids"].squeeze(0)
        attention_mask = tokenized["attention_mask"].squeeze(0)

        return {
            "image": image,
            "input_ids": input_ids,
            "attention_mask": attention_mask
        }


## Data Module

In [42]:
from transformers import ViTModel, RobertaModel, RobertaTokenizer

class Flickr30KDataModule(pl.LightningDataModule):
    def __init__(self,
                 train_dataset_hf,
                 valid_dataset_hf,
                 test_dataset_hf,
                 batch_size=32,
                 num_workers=4):
        super().__init__()
        self.train_dataset_hf = train_dataset_hf
        self.valid_dataset_hf = valid_dataset_hf
        self.test_dataset_hf = test_dataset_hf
        self.batch_size = batch_size
        self.num_workers = num_workers

        self.image_transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.5, 0.5, 0.5),
                                 std=(0.5, 0.5, 0.5))
        ])
        self.tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

    def setup(self, stage=None):
        if stage == "fit" or stage is None:
            self.train_dataset = Flickr30KSingleCaptionDataset(
                self.train_dataset_hf["test"],
                tokenizer=self.tokenizer,
                image_transform=self.image_transform
            )
            self.valid_dataset = Flickr30KSingleCaptionDataset(
                self.valid_dataset_hf["test"],
                tokenizer=self.tokenizer,
                image_transform=self.image_transform
            )
        if stage == "test" or stage is None:
            self.test_dataset = Flickr30KSingleCaptionDataset(
                self.test_dataset_hf["test"],
                tokenizer=self.tokenizer,
                image_transform=self.image_transform
            )

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers
        )

    def val_dataloader(self):
        return DataLoader(
            self.valid_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers
        )


## Model Structure (Lightning)

In [43]:
class ImageTextLightningModel(pl.LightningModule):
    def __init__(self,
                 image_encoder_name="google/vit-base-patch16-224",
                 text_encoder_name="roberta-base",
                 embed_dim=256,
                 temperature=0.07,
                 learning_rate=5e-5,
                 vit_train_layers=2,        # 학습할 비전 트랜스포머 레이어 수
                 roberta_train_layers=2):   # 학습할 로버타 레이어 수
        super().__init__()
        self.save_hyperparameters()

        # 1) Image Encoder (ViT)
        self.image_encoder = ViTModel.from_pretrained(image_encoder_name)
        # 2) Text Encoder (RoBERTa)
        self.text_encoder = RobertaModel.from_pretrained(text_encoder_name)

        # Projection layers
        self.image_proj = nn.Linear(768, embed_dim)
        self.text_proj = nn.Linear(768, embed_dim)

        self.temperature = temperature
        self.learning_rate = learning_rate

        # Validation 시 batch별 결과 임시 저장
        self._val_outputs = []
        # Test 시 batch별 결과 임시 저장
        self.test_image_embeds = []
        self.test_text_embeds = []

        # ------------------
        # Freeze + Unfreeze
        # ------------------
        self.freeze_vit_layers(train_layers=vit_train_layers)
        self.freeze_roberta_layers(train_layers=roberta_train_layers)

    def freeze_vit_layers(self, train_layers=12):
        """
        ViT의 마지막 train_layers개 레이어만 Fine-tuning하고, 나머지는 동결
        """
        # (1) 전체 파라미터 동결
        for param in self.image_encoder.parameters():
            param.requires_grad = False

        # (2) total 레이어 수 확인 (예: vit-base-patch16-224는 12층)
        total_layers = len(self.image_encoder.encoder.layer)
        # (3) 마지막 N개의 레이어만 unfreeze
        for layer_idx in range(total_layers - train_layers, total_layers):
            for param in self.image_encoder.encoder.layer[layer_idx].parameters():
                param.requires_grad = True

        # Pooler나 LayerNorm 등 추가로 학습해야 할 부분 있으면 여기서 풀어줄 수 있음
        if hasattr(self.image_encoder, "layernorm"):
            for param in self.image_encoder.layernorm.parameters():
                param.requires_grad = True

    def freeze_roberta_layers(self, train_layers=12):
        """
        RoBERTa의 마지막 train_layers개 레이어만 Fine-tuning
        """
        # (1) 전체 파라미터 동결
        for param in self.text_encoder.parameters():
            param.requires_grad = False

        # (2) roberta encoder.layer 총 12층
        total_layers = len(self.text_encoder.encoder.layer)
        # (3) 마지막 N개 레이어 unfreeze
        for layer_idx in range(total_layers - train_layers, total_layers):
            for param in self.text_encoder.encoder.layer[layer_idx].parameters():
                param.requires_grad = True

        # pooler or LMHead 등 추가로 학습하려면 풀어줄 수 있음
        if hasattr(self.text_encoder, "pooler"):
            for param in self.text_encoder.pooler.parameters():
                param.requires_grad = True

    # ------------------
    # forward
    # ------------------
    def forward(self, images, input_ids, attention_mask):
        # --- ViT ---
        image_outputs = self.image_encoder(pixel_values=images)
        # last_hidden_state[:, 0, :] → [CLS]
        image_cls = image_outputs.last_hidden_state[:, 0, :]
        image_embeds = self.image_proj(image_cls)
        image_embeds = F.normalize(image_embeds, p=2, dim=-1)

        # --- RoBERTa ---
        text_outputs = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        text_cls = text_outputs.last_hidden_state[:, 0, :]
        text_embeds = self.text_proj(text_cls)
        text_embeds = F.normalize(text_embeds, p=2, dim=-1)

        return image_embeds, text_embeds

    # ------------------
    # compute_contrastive_loss
    # ------------------
    def compute_contrastive_loss(self, image_embeds, text_embeds):
        logits_per_image = image_embeds @ text_embeds.t() / self.temperature
        logits_per_text = logits_per_image.t()

        batch_size = image_embeds.size(0)
        labels = torch.arange(batch_size, device=self.device)

        loss_i = F.cross_entropy(logits_per_image, labels)
        loss_t = F.cross_entropy(logits_per_text, labels)
        loss = (loss_i + loss_t) / 2.0
        return loss

    # ------------------
    # train step
    # ------------------
    def training_step(self, batch, batch_idx):
        images = batch["image"]
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]

        image_embeds, text_embeds = self(images, input_ids, attention_mask)
        loss = self.compute_contrastive_loss(image_embeds, text_embeds)
        self.log("train_loss", loss, prog_bar=True, on_step=True, on_epoch=True)
        return loss
    
    def on_train_epoch_start(self):
        """Epoch 시작 시 현재 Learning Rate 확인"""
        optimizer = self.optimizers()
        for i, param_group in enumerate(optimizer.param_groups):
            lr = param_group["lr"]
            self.log(f"lr_layer_{i}", lr, prog_bar=True)

    # ------------------
    # validation step
    # ------------------
    def validation_step(self, batch, batch_idx):
        images = batch["image"]
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        image_embeds, text_embeds = self(images, input_ids, attention_mask)

        val_loss = self.compute_contrastive_loss(image_embeds, text_embeds)
        self.log("val_loss_step", val_loss, prog_bar=False)

        return {
            "val_loss": val_loss,
            "image_embeds": image_embeds,
            "text_embeds": text_embeds
        }

    def on_validation_batch_end(self, outputs, batch, batch_idx, dataloader_idx=0):
        self._val_outputs.append(outputs)

    def on_validation_epoch_end(self):
        # 전체 val batch 결과 종합
        val_losses = torch.stack([o["val_loss"] for o in self._val_outputs])
        avg_val_loss = val_losses.mean()
        self.log("val_loss", avg_val_loss, prog_bar=True)

        # Recall@K 계산
        all_image_embeds = torch.cat([o["image_embeds"] for o in self._val_outputs], dim=0)
        all_text_embeds  = torch.cat([o["text_embeds"]  for o in self._val_outputs], dim=0)
        similarity_matrix = all_text_embeds @ all_image_embeds.t()
        recall_at_k = self.compute_recall(similarity_matrix, ks=[1,5,10])

        for k, v in recall_at_k.items():
            self.log(f"val_recall@{k}", v, prog_bar=True)
        self.log("val_recall@5", recall_at_k[5], prog_bar=True)

        self._val_outputs.clear()

    # ------------------
    # test step
    # ------------------
    def test_step(self, batch, batch_idx):
        images = batch["image"]
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        image_embeds, text_embeds = self(images, input_ids, attention_mask)
        return {"image_embeds": image_embeds, "text_embeds": text_embeds}

    def on_test_batch_end(self, outputs, batch, batch_idx, dataloader_idx=0):
        self.test_image_embeds.append(outputs["image_embeds"])
        self.test_text_embeds.append(outputs["text_embeds"])

    def on_test_epoch_end(self):
        all_image_embeds = torch.cat(self.test_image_embeds, dim=0)
        all_text_embeds  = torch.cat(self.test_text_embeds, dim=0)
        similarity_matrix = all_text_embeds @ all_image_embeds.t()
        recall_at_k = self.compute_recall(similarity_matrix, ks=[1,5,10])
        for k, v in recall_at_k.items():
            self.log(f"test_recall@{k}", v, prog_bar=True)
        print(f"[on_test_epoch_end] Test Recall: {recall_at_k}")

        self.test_image_embeds.clear()
        self.test_text_embeds.clear()

    # ------------------
    # recall
    # ------------------
    def compute_recall(self, similarity_matrix, ks=[1,5,10]):
        device = similarity_matrix.device
        n = similarity_matrix.size(0)
        ground_truth = torch.arange(n, device=device)
        sorted_indices = similarity_matrix.argsort(dim=1, descending=True)

        recall_scores = {}
        for k in ks:
            top_k = sorted_indices[:, :k]
            match = (top_k == ground_truth.unsqueeze(1)).any(dim=1)
            recall_scores[k] = match.float().mean().item()
        return recall_scores

    # ------------------
    # optimizer
    # ------------------
    # def configure_optimizers(self):
    #     # optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
    #     ## Layer별 다른 LR 설정 (projection layer에 더 큰 lr)
    #     optimizer = torch.optim.AdamW([
    #         {"params": model.image_encoder.encoder.layer[-2:].parameters(), "lr": 1e-5},
    #         {"params": model.text_encoder.encoder.layer[-2:].parameters(), "lr": 1e-5},
    #         {"params": model.image_proj.parameters(), "lr": 5e-5},
    #         {"params": model.text_proj.parameters(), "lr": 5e-5}
    #     ])
    #     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    #         optimizer, T_max=self.trainer.max_epochs
    #     )
    #     return [optimizer], [scheduler]
    def configure_optimizers(self):
        base_lr = 1e-5  # 기본 Learning Rate
        layerwise_decay = 0.9  # Layer-wise decay 비율

        optimizer_params = []

        # ViT 마지막 6개 레이어에 서로 다른 LR 적용
        total_vit_layers = len(self.image_encoder.encoder.layer)
        for i, layer_idx in enumerate(range(total_vit_layers - 12, total_vit_layers)):
            lr = base_lr * (layerwise_decay ** i)  # layer-wise decay 적용
            optimizer_params.append({"params": self.image_encoder.encoder.layer[layer_idx].parameters(), "lr": lr})

        # RoBERTa 마지막 6개 레이어에 서로 다른 LR 적용
        total_roberta_layers = len(self.text_encoder.encoder.layer)
        for i, layer_idx in enumerate(range(total_roberta_layers - 12, total_roberta_layers)):
            lr = base_lr * (layerwise_decay ** i)
            optimizer_params.append({"params": self.text_encoder.encoder.layer[layer_idx].parameters(), "lr": lr})

        # Projection Layer는 별도로 더 큰 학습률 적용 (기본적으로 새로 학습되는 부분)
        optimizer_params.append({"params": self.image_proj.parameters(), "lr": 5e-5})
        optimizer_params.append({"params": self.text_proj.parameters(), "lr": 5e-5})

        optimizer = torch.optim.AdamW(optimizer_params, lr=base_lr, weight_decay=1e-4)

        # Learning Rate Scheduler (Cosine Decay)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=self.trainer.max_epochs, eta_min=1e-7)

        return [optimizer], [scheduler]



## Train

In [None]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

# DataModule 생성
data_module = Flickr30KDataModule(
    train_dataset_hf=train_dataset,
    valid_dataset_hf=valid_dataset,
    test_dataset_hf=test_dataset,
    batch_size=128,  # contrastive learning -> batch size
    num_workers=4
)
data_module.setup("fit")

# 모델 초기화
model = ImageTextLightningModel(
    image_encoder_name="google/vit-base-patch16-224",
    text_encoder_name="roberta-base",
    embed_dim=256,
    temperature=0.07,
    learning_rate=1e-5,
    vit_train_layers=12,        # ViT 마지막 2개 레이어만 학습
    roberta_train_layers=12     # RoBERTa 마지막 2개 레이어만 학습
)

# 로거와 콜백 설정
logger = TensorBoardLogger(
    save_dir="ImageRetrieveLogs",
    name="ImageRetrieve_6th"
)
checkpoint_callback = ModelCheckpoint(
    monitor="val_recall@5",
    mode="max",
    dirpath="checkpoints_7th",
    filename="best-checkpoint",
    save_top_k=3,
    save_last=True
)
early_stopping_callback = EarlyStopping(
    monitor="val_recall@5",
    patience=20,
    mode="max"
)

trainer = pl.Trainer(
    max_epochs=200,   # 에폭 수 (하드웨어 성능 따라 조정)
    accelerator="gpu",
    devices=1,
    precision="16-mixed",
    logger=logger,
    callbacks=[checkpoint_callback, early_stopping_callback]
)

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# 모델 학습
trainer.fit(model, data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [4]

  | Name          | Type         | Params | Mode 
-------------------------------------------------------
0 | image_encoder | ViTModel     | 86.4 M | eval 
1 | text_encoder  | RobertaModel | 124 M  | eval 
2 | image_proj    | Linear       | 196 K  | train
3 | text_proj     | Linear       | 196 K  | train
-------------------------------------------------------
86.0 M    Trainable params
125 M     Non-trainable params
211 M     Total params
845.714   Total estimated model params size (MB)
2         Modules in train mode
455       Modules in eval mode


Epoch 0:  21%|██████████████████████▌                                                                                      | 47/227 [00:13<00:51,  3.49it/s, v_num=1, train_loss_step=3.430]

## Test 및 체크포인트 수정

In [34]:
from pytorch_lightning import Trainer

# 저장된 체크포인트 파일 경로
checkpoint_path = "/home/gpu_04/jw2020/ImageRetrieving/checkpoints/best-checkpoint-v2.ckpt"

# 모델 로드
model = ImageTextLightningModel.load_from_checkpoint(checkpoint_path)

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
# test
trainer.test(model, data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [4]


Testing DataLoader 0: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  5.64it/s][on_test_epoch_end] Test Recall: {1: 0.49500003457069397, 5: 0.8080000281333923, 10: 0.8950000405311584}
Testing DataLoader 0: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  5.57it/s]


[{'test_recall@1': 0.49500003457069397,
  'test_recall@5': 0.8080000281333923,
  'test_recall@10': 0.8950000405311584}]