# 1. Setup Dependencies

## 1.1 Import Library

In [10]:
import logging
from typing import Optional

import torch
from PIL import Image
from transformers import AutoProcessor, Blip2ForConditionalGeneration

In [11]:
# Report whether a CUDA device is usable and, if so, which one.
# NOTE: torch.cuda.is_available is a function and must be *called*; the bare
# function object is always truthy, which would make get_device_name(0) run
# (and raise) on machines without a CUDA device.
print(f"Cuda available: {torch.cuda.is_available()}")
print(f"Cuda device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'N/A'}")

Cuda available: True
Cuda device: NVIDIA GeForce GTX 1060


In [12]:
# Module-level logger, named after the current module.
logger = logging.getLogger(__name__)
# Configure the root logger at INFO level (basicConfig's default handler setup).
logging.basicConfig(level=logging.INFO)

In [17]:
class BLIP2Model:
    """Wrapper around a Hugging Face BLIP-2 model for image captioning.

    Attributes set by ``__init__``:
        processor: ``AutoProcessor`` loaded for the chosen model name.
        model: ``Blip2ForConditionalGeneration`` instance, moved to ``device``.
        device: ``"cuda"`` when CUDA is available, otherwise ``"cpu"``.
        dtype: ``torch.float16`` on CUDA, ``torch.float32`` on CPU.
    """

    def __init__(self, model_name: str = "Salesforce/blip2-opt-2.7b"):
        """
        Initialize BLIP-2 model and processor.

        Args:
            model_name: Hugging Face model identifier
                Options:
                - "Salesforce/blip2-opt-2.7b"
                - "Salesforce/blip2-opt-6.7b"
                - "Salesforce/blip2-flan-t5-xl"
        """

        logger.info(f"Loading model: {model_name}")

        # is_available must be called: the bare function object is always
        # truthy, which would select "cuda"/float16 even on CPU-only hosts.
        use_cuda = torch.cuda.is_available()
        self.device = "cuda" if use_cuda else "cpu"
        self.dtype = torch.float16 if use_cuda else torch.float32

        self.processor = AutoProcessor.from_pretrained(model_name)
        self.model = Blip2ForConditionalGeneration.from_pretrained(
            model_name,
            torch_dtype=self.dtype,
        )
        self.model.to(self.device)

        logger.info(f"Model loaded succesfully on: {self.device}")

    def generate_caption(
        self,
        image: Image.Image,
        prompt: Optional[str] = None,
        max_length: int = 50,
    ) -> str:
        """
        Generate a caption for a single image.

        Args:
            image: PIL Image object
            prompt: Optional text prompt (e.g., "a photo of"); ``None`` or an
                empty string means the image is captioned without a prompt
            max_length: Value forwarded to ``model.generate`` as ``max_length``

        Returns:
            Generated caption as a string, stripped of surrounding whitespace

        Raises:
            Exception: any error raised while preprocessing, generating or
                decoding is logged and re-raised unchanged.
        """
        # Build the processor call once; the text prompt is only passed when
        # one was actually supplied.
        processor_kwargs = {"images": image, "return_tensors": "pt"}
        if prompt:
            processor_kwargs["text"] = prompt

        try:
            # Move the inputs to the model's device, using the dtype chosen
            # in __init__ so inputs and model weights agree.
            inputs = self.processor(**processor_kwargs).to(self.device, self.dtype)

            generated_ids = self.model.generate(
                **inputs,
                max_length=max_length,
            )

            # Only one image is processed, so take the first decoded string.
            caption = self.processor.batch_decode(
                generated_ids,
                skip_special_tokens=True,
            )[0].strip()
        except Exception as e:
            logger.error(f"Error generating caption: {str(e)}")
            raise

        logger.info(f" generated captions: {caption}")
        return caption

    def generate_caption_from_path(
        self,
        image_path: str,
        prompt: Optional[str] = None,
        max_length: int = 50,
    ) -> str:
        """
        Generate caption from image file path.

        Args:
            image_path: Path to image file
            prompt: Optional text prompt
            max_length: Maximum length of generated caption

        Returns:
            Generated caption as string

        Raises:
            Exception: errors from opening/converting the image are logged
                and re-raised; errors from caption generation propagate from
                ``generate_caption`` (which logs them itself).
        """
        # Keep the try body limited to image loading so that generation
        # failures are not misreported as image-loading errors.
        try:
            # The context manager closes the underlying file; convert()
            # returns a separate in-memory RGB image that outlives it.
            with Image.open(image_path) as source:
                image = source.convert("RGB")
        except Exception as e:
            logger.error(f"Error loading image from {image_path} : {str(e)}")
            raise

        return self.generate_caption(image, prompt, max_length)
            
        