# 라이브러리 import

In [1]:
import torch
import torchvision.models as models
from torch.utils.data import DataLoader, Dataset
from torch import nn
import torch.optim as optim
import cv2
from transformers import AutoTokenizer, BertModel
import numpy as np
import pandas as pd
import warnings
from tqdm.notebook import tqdm
from datetime import datetime
import os
from sklearn.model_selection import train_test_split
import random
from sentence_transformers import SentenceTransformer
import time
import torch.nn.functional as F
warnings.filterwarnings('ignore')
os.environ['TOKENIZERS_PARALLELISM'] = 'False'

# 바꿀만한 변수

In [2]:
num_epoch = 1
margin = 10
learning_rate = 0.005
user_name = "june16"
best_model_name = './weights/'+user_name+'sBERT_best.pth' 
last_model_name = './weights/'+user_name+'sBERT_last.pth'
early_stop = 40

# 학습 파라미터 및 loss 정의

In [3]:
class CosineSimilarityLoss(nn.Module):
    def __init__(self):
        super(CosineSimilarityLoss, self).__init__()
        self.cosine_sim = nn.CosineSimilarity(dim=1)

    def forward(self, x1, x2):
        return margin * (1 - self.cosine_sim(x1, x2).mean()) # 0~1사이면 loss가 너무 적어 학습이 안될 수도 있으니 곱해주는 겁니다.

In [4]:
gpu = "cuda:0"
device = torch.device(gpu)
criterion = CosineSimilarityLoss()
seed = 42  # 랜덤시드 고정
torch.manual_seed(seed)
train_batch = 135

# 데이터 불러오기

In [5]:
df = pd.read_csv("data.csv")[['image','caption']]
df.head()

Unnamed: 0,image,caption
0,./data/Images/1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,./data/Images/1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,./data/Images/1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,./data/Images/1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,./data/Images/1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


# Train, validation, Test split(60, 20, 20비율로) 

In [6]:
train, test = train_test_split(df, test_size=0.4, random_state=42)
valid, test = train_test_split(test, test_size=0.5, random_state=42)

# 파이토치 데이터로더

In [7]:
class Flickr8k(Dataset):
  def __init__(self,df):
    super().__init__()
    self.data = df.to_numpy().tolist()

  def __len__(self):
    return len(self.data)

  def __getitem__(self,index):
    img, cap = self.data[index]
    img = cv2.resize(cv2.imread(img),(320,320))
    return {"image":img.transpose(2,0,1), "caption":cap}


train_dataset = Flickr8k(train)
valid_dataset = Flickr8k(valid)
test_dataset = Flickr8k(test)
del df
num_cores = os.cpu_count()
train_dataloader = DataLoader(train_dataset, batch_size=train_batch, shuffle=True,num_workers=num_cores)
valid_dataloader = DataLoader(valid_dataset, batch_size=train_batch, shuffle=False,num_workers=num_cores)
test_dataloader = DataLoader(test_dataset, batch_size=train_batch, shuffle=False,num_workers=num_cores)

# 이미지 임베딩 모델 정의

In [8]:
class ImageEmbedder(nn.Module):
    def __init__(self):
        super(ImageEmbedder, self).__init__()
        # ResNet50 불러오기
        self.resnet = models.resnet50(pretrained=True)
        # Backbone으로 활용하기 위해 last layer 제거.
        self.resnet.fc = nn.Identity()
        # BERT의 Output과 차원을 맞추기 위해 mlp 추가
        self.mlp = nn.Sequential(
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Linear(1024,768),
            nn.ReLU(),
            nn.Linear(768,384)
        )
    def forward(self, x):
        x = self.resnet(x)
        x = self.mlp(x)
        return x

# 이미지 임베딩, BERT모델 선언

In [9]:
image_embedder = ImageEmbedder().to(gpu)
sentence_embedder = model = SentenceTransformer('paraphrase-MiniLM-L6-v2').to(gpu)
model.eval()
sentence_embedder.eval()
warnings.filterwarnings('ignore')

# 학습 시작

In [10]:
optimizer = optim.Adam(image_embedder.parameters(), lr=learning_rate) # 경사하강법을 바꾸는 방법도...?
best = float("inf")
print("train start")
current_time = datetime.now()
early_stop_cnt = 0
print("시작 시간:", current_time)
for epoch in range(num_epoch):
    train_mse = 0
    val_loss = 0
    image_embedder.train()
    pbar = tqdm(enumerate(train_dataloader), total=len(train_dataloader), leave=False)
    for idx, batch in pbar:
        pbar.set_postfix(cos=margin-(train_mse/max(1,idx)))
        img, cap = batch['image'].float().to(gpu), batch['caption']
        img = image_embedder(img)
        with torch.no_grad():
            cap = torch.Tensor(model.encode(cap)).to(gpu)
        loss = criterion(img, cap)


        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_mse += loss.item()
        
        del img, cap, batch, loss
        torch.cuda.empty_cache()
        

    train_mse /= len(train_dataloader)
    del pbar
    image_embedder.eval()
    with torch.no_grad():
        pbar2 = tqdm(enumerate(valid_dataloader), total=len(valid_dataloader), leave=False)
        for idx, batch in pbar2:
            pbar2.set_postfix(cos=margin-(val_loss/max(1,idx)))
            img, cap = batch['image'].float().to(gpu), batch['caption']
            img = image_embedder(img)
            cap = torch.Tensor(model.encode(cap)).to(gpu)
            val_loss += criterion(img, cap).item()
            del img, cap, batch
            torch.cuda.empty_cache()

        val_loss /= len(valid_dataloader)
        torch.cuda.empty_cache()
        print(f"[epoch:{epoch+1}] train_loss: {train_mse:.5f} val_loss: {val_loss:.5f}", end= " ")
        if val_loss < best:
            print('best!')
            torch.save(image_embedder.state_dict(), best_model_name)
            best = val_loss
            early_stop_cnt = 0
        else:
            print()
            early_stop_cnt += 1
        del pbar2
    torch.save(image_embedder.state_dict(), last_model_name)
    if early_stop_cnt >= early_stop:
        print("early stop!")
        break
current_time = datetime.now()
print("종료 시간:", current_time)   

train start
시작 시간: 2024-06-20 14:41:39.183439


  0%|          | 0/180 [00:00<?, ?it/s]

  0%|          | 0/60 [00:00<?, ?it/s]

[epoch:1] train_loss: 7.21859 val_loss: 7.03393 best!
종료 시간: 2024-06-20 14:44:32.023485


# 데이터베이스에 이미지 다 올리기(Test데이터만)

In [11]:
image_embedder.load_state_dict(torch.load(best_model_name))
image_embedder.eval()
with torch.no_grad():
    pbar2 = tqdm(enumerate(test_dataloader), total=len(test_dataloader), leave=False)
    first=True
    for idx, batch in pbar2:
        img = batch['image'].float().to(gpu)
        img = image_embedder(img)
        cap = batch['caption']
        cap = torch.Tensor(model.encode(cap)).to("cuda:1")
        if first:
            first=False
            database = img
            database = database.to("cuda:1")
            text_database = cap
            text_database = text_database.to("cuda:1")
        else:
            img = img.to("cuda:1")
            database = torch.cat((database,img),dim=0)
            text_database = torch.cat((text_database,cap),dim=0)

  0%|          | 0/60 [00:00<?, ?it/s]

# Hit Rate 세팅

In [12]:
k_for_topk = [1,5, 10, 20, 30, 50, 75, 100]

# Hit Rate(자연어 -> 이미지)

In [13]:
with torch.no_grad():
    for hit_k in k_for_topk:
        hit_rate = 0
        for idx in tqdm(range(len(test)),leave=False):
            cap = torch.Tensor(model.encode(list(test[['caption']].iloc[idx])))
            cap = cap.to("cuda:1")
            distance = F.cosine_similarity(cap, database)
            distance = distance.view(-1, 1).T
            _, indices = torch.topk(distance, hit_k)
            if idx in indices:
                hit_rate += 1
            del cap, distance, indices
        print(f"Hit@{hit_k}: {(hit_rate/len(test))*100:.2f}%")
        torch.cuda.empty_cache()

  0%|          | 0/8091 [00:00<?, ?it/s]

Hit@1: 0.02%


  0%|          | 0/8091 [00:00<?, ?it/s]

Hit@5: 0.25%


  0%|          | 0/8091 [00:00<?, ?it/s]

Hit@10: 0.42%


  0%|          | 0/8091 [00:00<?, ?it/s]

Hit@20: 0.96%


  0%|          | 0/8091 [00:00<?, ?it/s]

Hit@30: 1.20%


  0%|          | 0/8091 [00:00<?, ?it/s]

Hit@50: 2.10%


  0%|          | 0/8091 [00:00<?, ?it/s]

Hit@75: 3.35%


  0%|          | 0/8091 [00:00<?, ?it/s]

Hit@100: 4.36%


# Hit Rate (이미지 -> 이미지)

In [14]:
with torch.no_grad():
    for hit_k in k_for_topk:
        hit_rate = 0
        for idx in tqdm(range(len(test)),leave=False):
            img = cv2.resize(cv2.imread(list(test[['image']].iloc[idx])[0]),(320,320)).transpose(2,0,1)
            img = torch.Tensor(img).unsqueeze(0).float().to("cuda:0")
            img = image_embedder(img).to("cuda:1")
            distance = F.cosine_similarity(img, database)
            distance = distance.view(-1, 1).T
            _, indices = torch.topk(distance, hit_k)
            if idx in indices:
                hit_rate += 1
        print(f"Hit@{hit_k}: {(hit_rate/len(test))*100:.2f}%")

  0%|          | 0/8091 [00:00<?, ?it/s]

Hit@1: 66.58%


  0%|          | 0/8091 [00:00<?, ?it/s]

Hit@5: 99.83%


  0%|          | 0/8091 [00:00<?, ?it/s]

Hit@10: 100.00%


  0%|          | 0/8091 [00:00<?, ?it/s]

Hit@20: 100.00%


  0%|          | 0/8091 [00:00<?, ?it/s]

Hit@30: 100.00%


  0%|          | 0/8091 [00:00<?, ?it/s]

Hit@50: 100.00%


  0%|          | 0/8091 [00:00<?, ?it/s]

Hit@75: 100.00%


  0%|          | 0/8091 [00:00<?, ?it/s]

Hit@100: 100.00%


# Hit Rate (이미지 -> 자연어)

In [15]:
with torch.no_grad():
    for hit_k in k_for_topk:
        hit_rate = 0
        for idx in tqdm(range(len(test)),leave=False):
            img = cv2.resize(cv2.imread(list(test[['image']].iloc[idx])[0]),(320,320)).transpose(2,0,1)
            img = torch.Tensor(img).unsqueeze(0).float().to("cuda:0")
            img = image_embedder(img).to("cuda:1")
            distance = F.cosine_similarity(img, text_database)
            distance = distance.view(-1, 1).T
            _, indices = torch.topk(text_database, hit_k)
            if idx in indices:
                hit_rate += 1
        print(f"Hit@{hit_k}: {(hit_rate/len(test))*100:.2f}%")

  0%|          | 0/8091 [00:00<?, ?it/s]

Hit@1: 4.49%


  0%|          | 0/8091 [00:00<?, ?it/s]

Hit@5: 4.71%


  0%|          | 0/8091 [00:00<?, ?it/s]

Hit@10: 4.73%


  0%|          | 0/8091 [00:00<?, ?it/s]

Hit@20: 4.73%


  0%|          | 0/8091 [00:00<?, ?it/s]

Hit@30: 4.75%


  0%|          | 0/8091 [00:00<?, ?it/s]

Hit@50: 4.75%


  0%|          | 0/8091 [00:00<?, ?it/s]

Hit@75: 4.75%


  0%|          | 0/8091 [00:00<?, ?it/s]

Hit@100: 4.75%


# 속도 측정

In [16]:
del train_dataloader, valid_dataloader, test_dataloader

In [17]:
database = database.T.to("cuda:1")
for hit_k in [1, 5, 10, 20, 50, 100]:
    speed = 0
    for idx in tqdm(range(len(test)),leave=False):
        inp = list(test[['caption']].iloc[idx])
        s = time.time()
        cap = torch.Tensor(model.encode(inp)).to("cuda:1")
        distance = torch.matmul(cap,database)
        _, indices = torch.topk(distance, hit_k)
        e = time.time()
        speed += ( e - s )
    print(f"K: {hit_k}, query process time(avg): {1000 * (speed / len(test)):.2f} ms")

  0%|          | 0/8091 [00:00<?, ?it/s]

K: 1, query process time(avg): 5.91 ms


  0%|          | 0/8091 [00:00<?, ?it/s]

K: 5, query process time(avg): 6.08 ms


  0%|          | 0/8091 [00:00<?, ?it/s]

K: 10, query process time(avg): 5.36 ms


  0%|          | 0/8091 [00:00<?, ?it/s]

K: 20, query process time(avg): 6.12 ms


  0%|          | 0/8091 [00:00<?, ?it/s]

K: 50, query process time(avg): 5.20 ms


  0%|          | 0/8091 [00:00<?, ?it/s]

K: 100, query process time(avg): 5.13 ms


# MRR(자연어 -> 이미지)

In [None]:
with torch.no_grad():
    mrr_scores = []
    for idx in tqdm(range(len(test)), leave=False):
        query_caption = torch.Tensor(model.encode([test['caption'].iloc[idx]])).to("cuda:1")
        distances = F.cosine_similarity(query_caption, database)
        _, indices = torch.topk(distances, len(test), largest=True)
        true_idx = indices == idx
        rank = (true_idx.nonzero(as_tuple=True)[0] + 1).float()
        if rank.nelement() > 0:
            mrr_scores.append(1.0 / rank.item())
        else:
            mrr_scores.append(0)
    mean_mrr = np.mean(mrr_scores)
    print(f"MRR (Text to Image): {mean_mrr:.4f}")

# MRR (이미지 -> 이미지)

In [None]:
with torch.no_grad():
    mrr_scores = []
    for idx in tqdm(range(len(test)), leave=False):
        query_image_path = test['image'].iloc[idx]
        query_image = cv2.resize(cv2.imread(query_image_path), (320, 320)).transpose(2,0,1)
        query_image = torch.Tensor(query_image).unsqueeze(0).float().to("cuda:0")
        query_image = image_embedder(query_image).to("cuda:1")
        distances = F.cosine_similarity(query_image, database)
        _, indices = torch.topk(distances, len(test), largest=True)
        true_idx = indices == idx
        rank = (true_idx.nonzero(as_tuple=True)[0] + 1).float()
        if rank.nelement() > 0:
            mrr_scores.append(1.0 / rank.item())
        else:
            mrr_scores.append(0)
    mean_mrr = np.mean(mrr_scores)
    print(f"MRR (Image to Image): {mean_mrr:.4f}")

# MRR (이미지 -> 자연어)

In [None]:
with torch.no_grad():
    mrr_scores = []
    for idx in tqdm(range(len(test)), leave=False):
        query_image_path = test['image'].iloc[idx]
        query_image = cv2.resize(cv2.imread(query_image_path), (320, 320)).transpose(2,0,1)
        query_image = torch.Tensor(query_image).unsqueeze(0).float().to("cuda:0")
        query_image = image_embedder(query_image).to("cuda:1")
        distances = F.cosine_similarity(query_image, text_database)
        _, indices = torch.topk(distances, len(test), largest=True)
        true_idx = indices == idx
        rank = (true_idx.nonzero(as_tuple=True)[0] + 1).float()
        if rank.nelement() > 0:
            mrr_scores.append(1.0 / rank.item())
        else:
            mrr_scores.append(0)
    mean_mrr = np.mean(mrr_scores)
    print(f"MRR (Image to Text): {mean_mrr:.4f}")