In [None]:
!pip install torch Pillow matplotlib accelerate bitsandbytes
#!pip install textgrad
!pip install transformers==4.49.0
!pip install torch
!pip install torchmetrics
!pip install torch_optimizer
!pip install hpsv2

In [None]:
import torch
from transformers import (
    VisionEncoderDecoderModel,
    ViTImageProcessor,
    AutoTokenizer,
    AutoModelForCausalLM,
    BlipForConditionalGeneration,
    BlipProcessor,
    BitsAndBytesConfig,
    AutoProcessor,
    AutoModelForImageTextToText
)
from peft import get_peft_model, LoraConfig
from huggingface_hub import login
from PIL import Image
import requests
import matplotlib.pyplot as plt
import numpy as np
import io
from torch.utils.data import Dataset, DataLoader

# Use the GPU when one is available; every model below is placed on this device
# so that the CPU fallback actually works (the loads used to hard-code "cuda").
device = "cuda" if torch.cuda.is_available() else "cpu"


# BLIP captioning model
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

# ViT-GPT2 captioning model
vit_model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning").to(device)
vit_processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
vit_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Florence-2
florence_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Florence-2-large",  # base or large
    trust_remote_code=True,
    torch_dtype=torch.float16  # or float32
).to(device)

florence_processor = AutoProcessor.from_pretrained(
    "microsoft/Florence-2-large",
    trust_remote_code=True
)

In [None]:
def generate_caption(image, model_type="blip"):
    """Generate a caption for a PIL image with one of the pre-loaded models.

    Args:
        image: PIL image; converted to RGB when it is in any other mode.
        model_type: "blip" (BLIP base captioner) or "vit-gpt2" (ViT-GPT2
            captioner decoded with 4-beam search).

    Returns:
        The decoded caption string (special tokens removed).

    Raises:
        ValueError: if ``model_type`` is not one of the supported names.
            Previously an unknown name silently returned ``None``.
    """
    supported = ("blip", "vit-gpt2")
    if model_type not in supported:
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected one of {supported}"
        )

    if image.mode != "RGB":
        image = image.convert("RGB")

    if model_type == "blip":
        # Send the inputs to wherever the model lives instead of assuming "cuda".
        inputs = blip_processor(image, return_tensors="pt").to(blip_model.device)
        outputs = blip_model.generate(**inputs, max_length=50)
        return blip_processor.decode(outputs[0], skip_special_tokens=True)

    # model_type == "vit-gpt2"
    pixel_values = vit_processor(images=image, return_tensors="pt").pixel_values.to(vit_model.device)
    output_ids = vit_model.generate(pixel_values, max_length=50, num_beams=4)
    return vit_tokenizer.decode(output_ids[0], skip_special_tokens=True)
#textgrad
def florence_feedback(caption, image):
    """Ask Florence-2 to rewrite ``caption`` for ``image``.

    Builds a free-form instruction prompt that embeds the current caption plus
    the image size and channel count, runs Florence-2 generation on the
    image + prompt, and returns the decoded text.

    Args:
        caption: the caption to improve.
        image: PIL image the caption describes.

    Returns:
        The generated text with surrounding whitespace stripped.
    """
    prompt = f"""Image Caption Enhancement Task:
Original Caption: {caption}
Image Characteristics: {image.size} | Channels: {len(image.getbands())}

Improvement Requests:
1. Accurately identifying each objects.
2. Describing each as if cropped.
3. Considering what the image conveys even when seen out of focus.
4. Improve natural language flow checking sentence before descriptions.

Improved Caption:"""
    # Florence-2 takes image + text as input and generates text.
    inputs = florence_processor(images=image, text=prompt, return_tensors="pt").to(florence_model.device)
    # Cast floating-point inputs to the model's own dtype. The previous
    # unconditional .half() only matched a float16 model and would break if
    # the model were loaded in float32 instead.
    model_dtype = florence_model.dtype
    inputs = {
        name: tensor.to(model_dtype) if torch.is_floating_point(tensor) else tensor
        for name, tensor in inputs.items()
    }
    outputs = florence_model.generate(**inputs, max_new_tokens=100)
    # Decode through the processor's tokenizer rather than processor.decode.
    improved_caption = florence_processor.tokenizer.decode(outputs[0], skip_special_tokens=True)

    return improved_caption.strip()

In [None]:
"""
#평가 시스템
from torchmetrics.multimodal import CLIPScore

class CaptionEvaluator:
    def __init__(self):
        self.clip_metric = CLIPScore(model_name_or_path="openai/clip-vit-base-patch16")

    def evaluate(self, image, caption):
        # 이미지 전처리
        processed_image = image.resize((224,224))
        img_tensor = torch.tensor(np.array(processed_image)).permute(2,0,1).unsqueeze(0)

        # CLIP 점수 계산
        score = self.clip_metric(img_tensor, [caption])
        return score.item()
"""

from torchmetrics.multimodal import CLIPScore
import torch
from transformers import AutoProcessor, AutoModel, BlipProcessor, BlipForQuestionAnswering
from PIL import Image
import numpy as np
import hpsv2

class CaptionEvaluator:
    """Scores an (image, caption) pair with several image-text metrics.

    CLIPScore is always computed. HPS-v2, PickScore and a BLIP-VQA keyword
    overlap score are best-effort: when a back-end cannot be loaded or fails
    at scoring time, that metric is simply left out of the result.

    NOTE(review): the metrics are not on a common scale (for instance
    ``vqa_score`` is a fraction in [0, 1]), so the plain mean stored in
    ``composite_score`` is dominated by the larger-valued metrics - confirm
    this is intended.
    """

    # Token limit applied to the caption before it reaches the PickScore
    # (CLIP-based) text encoder; longer captions are truncated, not rejected.
    _PICK_MAX_TEXT_TOKENS = 77

    def __init__(self):
        """Load CLIPScore and, best-effort, the optional scoring back-ends."""
        self.clip_metric = CLIPScore(model_name_or_path="openai/clip-vit-base-patch16")

        # HPS-v2: only availability is probed here; scoring imports it on use.
        try:
            import hpsv2  # noqa: F401
            self.hps_available = True
        except ImportError:
            print("HPS-V2 모델을 로드할 수 없습니다. pip install hpsv2 필요")
            self.hps_available = False

        # PickScore model
        try:
            self.pick_processor = AutoProcessor.from_pretrained("yuvalkirstain/PickScore_v1")
            self.pick_model = AutoModel.from_pretrained("yuvalkirstain/PickScore_v1")
            self.pick_available = True
        except Exception as exc:  # was a bare except that hid the reason
            print("Pick Score 모델을 로드할 수 없습니다.", exc)
            self.pick_available = False

        # BLIP VQA model used by calculate_vqa_score
        try:
            self.vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
            self.vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
            self.vqa_available = True
        except Exception as exc:  # was a bare except that hid the reason
            print("VQA 모델을 로드할 수 없습니다.", exc)
            self.vqa_available = False

    def calculate_hps_score(self, image, caption):
        """Return the HPS-v2.1 score as a float, or None when unavailable.

        ``hpsv2.score`` hands back a sequence of scores; the first entry is
        unwrapped so the value can be averaged and formatted like the others.
        """
        if not self.hps_available:
            return None

        try:
            import hpsv2

            raw = hpsv2.score(image, caption, hps_version="v2.1")
            values = np.asarray(raw, dtype=float).ravel()
            return float(values[0]) if values.size else None
        except Exception as e:
            print(f"HPS-V2 점수 계산 오류: {e}")
            return None

    def calculate_pick_score(self, image, caption):
        """Return the PickScore image-text logit, or None when unavailable."""
        if not self.pick_available:
            return None

        try:
            # Truncate the text so long captions do not overflow the text
            # encoder (which previously turned into a caught error -> None).
            inputs = self.pick_processor(
                images=image,
                text=caption,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=self._PICK_MAX_TEXT_TOKENS,
            )

            with torch.no_grad():
                outputs = self.pick_model(**inputs)
                # Single image x single text -> one logit.
                score = outputs.logits_per_image.item()

            return score
        except Exception as e:
            print(f"Pick Score 계산 오류: {e}")
            return None

    def calculate_vqa_score(self, image, caption):
        """Return the fraction of fixed VQA questions whose answer overlaps the caption.

        Four fixed questions are put to the BLIP VQA model. A question counts
        as matched when any answer word longer than three characters occurs
        as a substring of the lower-cased caption. Returns None when the VQA
        model is unavailable or scoring fails.
        """
        if not self.vqa_available:
            return None

        try:
            # Fixed question set (not derived from the caption).
            questions = [
                "What is in this image?",
                "What is the main object in the image?",
                "What color is the main object?",
                "What is happening in this image?"
            ]

            caption_lower = caption.lower()
            correct_answers = 0

            for question in questions:
                inputs = self.vqa_processor(image, question, return_tensors="pt")

                with torch.no_grad():
                    outputs = self.vqa_model.generate(**inputs, max_length=20)
                    answer = self.vqa_processor.decode(outputs[0], skip_special_tokens=True)

                # Simple keyword matching; words of <= 3 characters are ignored.
                answer_words = answer.lower().split()
                if any(word in caption_lower for word in answer_words if len(word) > 3):
                    correct_answers += 1

            return correct_answers / len(questions)

        except Exception as e:
            print(f"VQA Score 계산 오류: {e}")
            return None

    def evaluate(self, image, caption):
        """Compute all available metrics for ``caption`` against ``image``.

        Returns:
            dict with ``clip_score``, whichever of ``hps_score`` /
            ``pick_score`` / ``vqa_score`` could be computed, and
            ``composite_score`` (plain mean of the metrics present).
        """
        # The tensor layout below assumes three channels, so normalise
        # RGBA / grayscale / palette images first.
        rgb_image = image if image.mode == "RGB" else image.convert("RGB")

        processed_image = rgb_image.resize((224,224))
        img_tensor = torch.tensor(np.array(processed_image)).permute(2,0,1).unsqueeze(0)

        clip_score = self.clip_metric(img_tensor, [caption])
        results = {'clip_score': clip_score.item()}

        optional_metrics = (
            ('hps_score', self.calculate_hps_score),
            ('pick_score', self.calculate_pick_score),
            ('vqa_score', self.calculate_vqa_score),
        )
        for key, scorer in optional_metrics:
            value = scorer(rgb_image, caption)
            if value is not None:
                results[key] = value

        # Mean of the metrics that were actually computed (clip_score is
        # always present, so this never divides by zero).
        available_scores = list(results.values())
        results['composite_score'] = sum(available_scores) / len(available_scores)

        return results



In [None]:
# Fine-tuning
from torch_optimizer import RAdam

# Training device: CUDA when present, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# A second ViT-GPT2 instance, loaded from the same checkpoint as its image
# processor and tokenizer; this is the copy that gets fine-tuned.
_FINETUNE_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"
processor = ViTImageProcessor.from_pretrained(_FINETUNE_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(_FINETUNE_CHECKPOINT)
model = VisionEncoderDecoderModel.from_pretrained(_FINETUNE_CHECKPOINT)
model = model.to(device)

# RAdam over every model parameter.
optimizer = RAdam(model.parameters(), lr=5e-5)
# 학습용 데이터셋 구축
class CaptionDataset(Dataset):
    """Pairs images with captions and yields them as model-ready tensors.

    Args:
        images: sequence of images accepted by ``processor``.
        captions: one caption per image; when omitted or empty, every image
            is paired with an empty string.
        processor: callable returning an object with ``pixel_values``.
        tokenizer: callable returning an object with ``input_ids``.
    """

    def __init__(self, images, captions=None, processor=None, tokenizer=None):
        self.images = images
        if captions:
            self.captions = captions
        else:
            self.captions = [""] * len(images)
        self.processor = processor
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of images in the dataset."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(pixel_values, input_ids)`` for item ``idx``, size-1 dims squeezed out."""
        pixel_batch = self.processor(images=self.images[idx], return_tensors="pt")
        pixels = pixel_batch.pixel_values.squeeze()

        # Captions are padded / truncated to exactly 50 tokens.
        token_batch = self.tokenizer(
            self.captions[idx],
            padding="max_length",
            truncation=True,
            max_length=50,
            return_tensors="pt",
        )
        token_ids = token_batch.input_ids.squeeze()

        return pixels, token_ids

def fine_tune_model(image, improved_caption):
    """Run one optimisation step of ``model`` on a single (image, caption) pair.

    The caption is tokenised to a fixed length of 50. Padding positions are
    replaced with -100 in the labels so the loss ignores them; previously the
    padding tokens were part of the target and contributed to the loss.

    Args:
        image: image accepted by the module-level ``processor``.
        improved_caption: target caption text.

    Returns:
        The loss of this step as a Python float, or 0.0 when the caption
        produced no real tokens (the update is skipped in that case).
    """
    model.train()
    pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)

    encoded = tokenizer(
        improved_caption,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=50,
    )
    labels = encoded.input_ids.to(device)
    attention_mask = encoded.attention_mask.to(device)

    # attention_mask == 0 marks padding; -100 excludes those positions from the loss.
    labels = labels.masked_fill(attention_mask == 0, -100)

    # With every position masked there is nothing to learn from; skip the
    # step rather than back-propagating an undefined loss.
    if not bool((labels != -100).any()):
        return 0.0

    outputs = model(pixel_values=pixel_values, labels=labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    return loss.item()

#이미지 로드

import os
import zipfile
import requests
from PIL import Image

# 1. Download the COCO 2017 validation image archive (skipped when the zip exists)
url = "http://images.cocodataset.org/zips/val2017.zip"
zip_path = "val2017.zip"
img_dir = "val2017"
MAX_IMAGES = 10  # how many images to load; raise (e.g. to 100) for a larger run

if not os.path.exists(zip_path):
    print("Downloading COCO 2017 val images...")
    # Stream into a temporary file and rename only on success, so an
    # interrupted download is never mistaken for a complete archive.
    partial_path = zip_path + ".part"
    with requests.get(url, stream=True, timeout=60) as r:
        r.raise_for_status()  # do not save an HTTP error page as the zip
        with open(partial_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
    os.replace(partial_path, zip_path)

# 2. Extract the archive
if not os.path.exists(img_dir):
    print("Extracting images...")
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(".")

# 3. Collect the image paths; sorted so the selected subset is the same on every run
img_files = sorted(
    os.path.join(img_dir, fname) for fname in os.listdir(img_dir) if fname.endswith(".jpg")
)

# 4. Load the first MAX_IMAGES images as RGB PIL images
images = []
for img_path in img_files[:MAX_IMAGES]:
    # convert() returns an independent in-memory image, so the file handle
    # can be closed as soon as the with-block ends.
    with Image.open(img_path) as img:
        images.append(img.convert("RGB"))

print(f"총 {len(images)}장의 이미지를 불러왔습니다.")

import torchvision.transforms as transforms

# Kept because it is a module-level name; the training loop no longer uses it.
to_pil = transforms.ToPILImage()

# Initial BLIP captions for the loaded images.
captions = [generate_caption(img, model_type="blip") for img in images]
dataset = CaptionDataset(images, captions, processor, tokenizer)
loader = DataLoader(dataset, batch_size=1, shuffle=True)

# Refine each BLIP caption with Florence-2 once, up front. Training only
# updates `model` (the ViT-GPT2 copy), so BLIP and Florence-2 see the same
# inputs every epoch and there is no need to re-run them inside the loop.
improved_captions = [
    florence_feedback(initial_caption, img)
    for img, initial_caption in zip(images, captions)
]

# Train on the ORIGINAL PIL images. The loop used to iterate `dataset` and turn
# the processor's output tensors back into PIL images with ToPILImage; those
# tensors are preprocessed model inputs rather than displayable images, so the
# captioners and the fine-tuning step were being fed distorted pictures.
NUM_EPOCHS = 3
for epoch in range(NUM_EPOCHS):
    epoch_losses = [
        fine_tune_model(img, improved_caption)
        for img, improved_caption in zip(images, improved_captions)
    ]
    if epoch_losses:
        mean_loss = sum(epoch_losses) / len(epoch_losses)
        print(f"epoch {epoch + 1}/{NUM_EPOCHS} - mean loss: {mean_loss:.4f}")


In [None]:
def optimize_caption(image, max_iter=3, evaluator=None):
    """Iteratively refine a BLIP caption with Florence-2, keeping the best-scoring one.

    ``CaptionEvaluator.evaluate`` returns a dict of metrics, which cannot be
    ordered with ``>`` or formatted with ``:.2f``. Captions are therefore
    ranked by a single number taken from that dict (``composite_score``,
    falling back to ``clip_score``).

    Args:
        image: PIL image to caption.
        max_iter: number of Florence-2 refinement rounds; 0 returns the
            plain BLIP caption.
        evaluator: optional pre-built evaluator to reuse; a new
            ``CaptionEvaluator`` is created when omitted.

    Returns:
        The caption with the highest score seen.
    """
    def _scalar_score(result):
        # Accept either a metrics dict or a bare number.
        if isinstance(result, dict):
            value = result.get("composite_score", result.get("clip_score"))
            return float("-inf") if value is None else float(value)
        return float(result)

    if evaluator is None:
        evaluator = CaptionEvaluator()

    # Initial caption
    best_caption = generate_caption(image, model_type="blip")
    best_score = _scalar_score(evaluator.evaluate(image, best_caption))

    # Iterative refinement: each round rewrites the current best caption.
    for i in range(max_iter):
        new_caption = florence_feedback(best_caption, image)
        current_score = _scalar_score(evaluator.evaluate(image, new_caption))

        if current_score > best_score:
            best_score = current_score
            best_caption = new_caption
            print(f"updated at {i+1}: {new_caption} | score: {current_score:.2f}")

    return best_caption

In [None]:
# 테스트 이미지 로드
def load_image(url, timeout=30):
    """Download an image over HTTP and return it as an RGB PIL image.

    Args:
        url: address of the image.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The decoded image, converted to RGB.

    Raises:
        requests.HTTPError: when the server answers with an error status
            (previously the error page was handed to PIL as image data).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Normalise to RGB: CaptionEvaluator.evaluate lays the pixels out as three channels.
    return Image.open(io.BytesIO(response.content)).convert("RGB")
"""
test_image = load_image("https://images.unsplash.com/photo-1583512603805-3cc6b41f3edb")

# 최적화 실행
final_caption = optimize_caption(test_image)
print("\n최종 결과:", final_caption)

# 결과 시각화
plt.figure(figsize=(10,10))
plt.imshow(test_image)
plt.title(f"최적화된 캡션:\n{final_caption}", wrap=True)
plt.axis('off')
plt.show()
"""

In [None]:
def generate_caption_finetuned(image):
    """Caption ``image`` with the module-level ViT-GPT2 ``model``.

    Puts the model in eval mode, decodes with 4-beam search (max length 50)
    under ``torch.no_grad()``, and returns the stripped caption text.
    """
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    model.eval()

    encoded = processor(images=image, return_tensors="pt")
    pixels = encoded.pixel_values.to(target_device)

    with torch.no_grad():
        generated_ids = model.generate(pixels, max_length=50, num_beams=4)

    text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return text.strip()

In [None]:
def comparative_analysis(image):
    """Caption ``image`` with every model, score each caption and plot the comparison.

    Rows produced: "blip", "vit-gpt2", "Florence-2" (the BLIP caption after
    one Florence-2 rewrite) and "finetuned" (the fine-tuned ViT-GPT2).
    The figure shows the original image followed by one panel per row,
    titled with its scores and annotated with its caption.

    Args:
        image: PIL image to analyse.
    """
    # CaptionEvaluator.evaluate lays the pixels out as three channels.
    if image.mode != "RGB":
        image = image.convert("RGB")

    evaluator = CaptionEvaluator()

    def _score_row(caption):
        # One results entry: the caption plus every metric ('N/A' when missing).
        result = evaluator.evaluate(image, caption)
        return {
            "caption": caption,
            "CLIP_Score": result.get('clip_score', 'N/A'),
            "HPS_Score": result.get('hps_score', 'N/A'),
            "Pick_Score": result.get('pick_score', 'N/A'),
            "VQA_Score": result.get('vqa_score', 'N/A'),
            "Composite_Score": result.get('composite_score', 'N/A'),
        }

    def _fmt(value):
        # Two decimals for numbers; anything else (e.g. the 'N/A' placeholder)
        # is shown as-is instead of crashing the ':.2f' format.
        try:
            return f"{float(value):.2f}"
        except (TypeError, ValueError):
            return str(value)

    results = {}
    for model_name in ("blip", "vit-gpt2"):
        results[model_name] = _score_row(generate_caption(image, model_name))

    # This row used to call optimize_caption(image, max_iter=0), which performs
    # no Florence-2 round and therefore just repeated the BLIP caption. Run one
    # Florence-2 rewrite of the BLIP caption so the row reflects Florence-2.
    florence_caption = florence_feedback(results["blip"]["caption"], image)
    results["Florence-2"] = _score_row(florence_caption)

    # Fine-tuned model
    results["finetuned"] = _score_row(generate_caption_finetuned(image))

    # Visualisation: original image first, then one panel per model.
    fig, axs = plt.subplots(1, len(results) + 1, figsize=(25, 5))
    axs[0].imshow(image)
    axs[0].set_title("original image")
    axs[0].axis('off')

    for idx, (name, data) in enumerate(results.items(), 1):
        title = (
            f"{name}\n"
            f"CLIP_Score: {_fmt(data['CLIP_Score'])}\n"
            f"Pick_Score: {_fmt(data['Pick_Score'])}\n"
            f"VQA_Score: {_fmt(data['VQA_Score'])}\n"
            f"Composite_Score: {_fmt(data['Composite_Score'])}\n"
        )
        axs[idx].imshow(image)
        axs[idx].set_title(title)
        axs[idx].text(0, -50, data['caption'], wrap=True)
        axs[idx].axis('off')

    plt.tight_layout()
    plt.show()

In [None]:
# Run the same comparison on every test image. This replaces seven
# copy-pasted cells that differed only in the URL.
TEST_IMAGE_URLS = [
    "https://pbs.twimg.com/media/Gq_RCtfWUAEsxoC?format=jpg&name=large",
    "https://pbs.twimg.com/media/GrDxQPibgAAz862?format=jpg&name=medium",
    "https://pbs.twimg.com/media/GksdDm_XAAAx705?format=jpg&name=large",
    "https://pbs.twimg.com/media/GrYrYt3bAAEdp-z?format=jpg&name=large",
    "https://pbs.twimg.com/media/GtVfY2EWcAAdxzS?format=jpg&name=large",
    "https://pbs.twimg.com/media/GtOTH4wakAElbhy?format=jpg&name=medium",
    "https://pbs.twimg.com/media/GtVL6gnX0AA7zEq?format=jpg&name=large",
]

for test_image_url in TEST_IMAGE_URLS:
    try:
        test_image = load_image(test_image_url)
    except (requests.RequestException, OSError) as exc:
        # A dead link or undecodable payload should not stop the remaining images.
        print(f"skipping {test_image_url}: {exc}")
        continue

    comparative_analysis(test_image)