# Multimodal LLM for Image and Text-Based Question Answering

This notebook implements a lightweight multimodal LLM for visual question answering (VQA) that can run on 4GB VRAM.

**Model Used:** BLIP (Salesforce) base checkpoints (`Salesforce/blip-vqa-base` and `Salesforce/blip-image-captioning-base`) - small enough for low VRAM usage

**Features:**
- Image understanding and analysis
- Visual question answering
- Image captioning
- Multi-question Q&A sessions about an image (each question is answered independently; no conversation history is kept)

## 1. Install Required Libraries

In [None]:
# Install required packages
!pip install transformers accelerate pillow torch torchvision bitsandbytes -q

## 2. Import Libraries

In [None]:
import torch
from PIL import Image
import requests
from io import BytesIO
from transformers import BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering
import warnings
warnings.filterwarnings('ignore')

# Report the runtime environment: PyTorch build, CUDA visibility and, when a
# GPU is present, the name and memory capacity of device 0.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is the device's full capacity, not the amount currently
    # free, so it is labelled "Total" rather than "Available".
    print(f"Total VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

## 3. Load Lightweight Multimodal Model

We'll use BLIP (Bootstrapping Language-Image Pre-training) which is efficient for 4GB VRAM.

In [None]:
# Pick the GPU when PyTorch can see one; otherwise everything runs on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Visual Question Answering: processor and model come from the same checkpoint.
print("Loading BLIP VQA model...")
vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
vqa_model = vqa_model.to(device)

# Image captioning: a separate checkpoint with its own processor.
print("Loading BLIP Captioning model...")
caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
caption_model = caption_model.to(device)

print("Models loaded successfully!")

## 4. Helper Functions

In [None]:
def load_image(image_source, timeout=30):
    """
    Load an image from a URL or a local path and return it in RGB mode.

    Args:
        image_source: URL string (http:// or https://) or local file path
        timeout: Seconds to wait for the HTTP server before giving up
            (only used when image_source is a URL)

    Returns:
        PIL Image object converted to 'RGB'

    Raises:
        requests.exceptions.RequestException: On timeout, connection failure,
            or a 4xx/5xx HTTP response
    """
    if image_source.startswith(('http://', 'https://')):
        # Without a timeout, requests can block indefinitely on a stalled server.
        response = requests.get(image_source, timeout=timeout)
        # Surface the HTTP error itself instead of letting PIL try to decode
        # an error page as image data.
        response.raise_for_status()
        image = Image.open(BytesIO(response.content)).convert('RGB')
    else:
        # convert() returns a new image, so the file handle can be closed
        # as soon as the conversion is done.
        with Image.open(image_source) as source_image:
            image = source_image.convert('RGB')
    return image


def generate_caption(image, max_length=50):
    """
    Produce a caption for an image with the captioning model.

    Args:
        image: PIL Image, or a string path/URL that is loaded via load_image
        max_length: Value forwarded to the model's generate() as max_length

    Returns:
        Decoded caption string (special tokens removed)
    """
    # Strings are treated as a path/URL; anything else is used as-is.
    picture = load_image(image) if isinstance(image, str) else image

    model_inputs = caption_processor(picture, return_tensors="pt").to(device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        token_ids = caption_model.generate(**model_inputs, max_length=max_length)

    return caption_processor.decode(token_ids[0], skip_special_tokens=True)


def answer_question(image, question, max_length=50):
    """
    Answer a single question about an image with the VQA model.

    Args:
        image: PIL Image, or a string path/URL that is loaded via load_image
        question: Question string passed to the processor alongside the image
        max_length: Value forwarded to the model's generate() as max_length

    Returns:
        Decoded answer string (special tokens removed)
    """
    # Strings are treated as a path/URL; anything else is used as-is.
    picture = load_image(image) if isinstance(image, str) else image

    model_inputs = vqa_processor(picture, question, return_tensors="pt").to(device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        token_ids = vqa_model.generate(**model_inputs, max_length=max_length)

    return vqa_processor.decode(token_ids[0], skip_special_tokens=True)


def analyze_image(image, questions=None, generate_cap=True):
    """
    Run captioning and/or question answering on one image.

    Args:
        image: PIL Image, or a string path/URL that is loaded via load_image
        questions: Optional list of questions; skipped when empty or None
        generate_cap: When true, a caption is generated

    Returns:
        Dictionary that contains 'caption' (if generate_cap is true) and
        'qa', a list of {'question', 'answer'} dicts (if questions is non-empty)
    """
    # Resolve a path/URL once so the caption and every question share one image.
    picture = load_image(image) if isinstance(image, str) else image

    report = {}

    if generate_cap:
        report['caption'] = generate_caption(picture)

    if questions:
        report['qa'] = [
            {'question': item, 'answer': answer_question(picture, item)}
            for item in questions
        ]

    return report


# Status message printed after the helper-function definitions above have run.
print("Helper functions defined successfully!")

## 5. Example 1: Image Captioning

In [None]:
# Example image URL (a dog on the beach)
image_url = "https://images.unsplash.com/photo-1544568100-847a948585b9?w=800"

# Load and display image
# `image` stays defined after this cell and is reused by the Visual Question
# Answering example that follows.
image = load_image(image_url)
# The preview is resized to a fixed 400x300 regardless of the original aspect
# ratio. `display` is not imported in this notebook - presumably the
# IPython/Jupyter built-in.
display(image.resize((400, 300)))

# Generate caption
# The already-loaded PIL image is passed, so no second download happens.
caption = generate_caption(image)
print(f"\n Caption: {caption}")

## 6. Example 2: Visual Question Answering

In [None]:
# Use the same image
# Requires `image` to have been defined by the captioning example cell.
display(image.resize((400, 300)))

# Ask questions about the image
questions = [
    "What animal is in the image?",
    "What is the color of the dog?",
    "Where is the dog?",
    "What is the dog doing?",
]

print("\n Visual Question Answering:\n")
# Each question is sent to the VQA helper in its own call; no state is
# carried over from one question to the next.
for question in questions:
    answer = answer_question(image, question)
    print(f"Q: {question}")
    print(f"A: {answer}\n")

## 7. Example 3: Multi-Image Analysis

In [None]:
# Analyze multiple images: caption each one, then ask the same generic questions.
image_urls = [
    "https://images.unsplash.com/photo-1506905925346-21bda4d32df4?w=800",  # Mountain landscape
    "https://images.unsplash.com/photo-1518791841217-8f162f1e1131?w=800",  # Cat
]

# The questions do not depend on the image, so the list is built once
# instead of on every loop iteration.
generic_questions = [
    "What is in this image?",
    "What colors are dominant?",
]

for idx, url in enumerate(image_urls, 1):
    print(f"\n{'='*60}")
    print(f"Image {idx}")
    print(f"{'='*60}\n")

    # Load once and reuse the PIL image for the caption and every question.
    img = load_image(url)
    display(img.resize((400, 300)))

    # Generate caption
    cap = generate_caption(img)
    print(f"\n📝 Caption: {cap}")

    # Ask generic questions
    print("\n🔍 Q&A:")
    for q in generic_questions:
        a = answer_question(img, q)
        print(f"Q: {q}")
        print(f"A: {a}")

## 8. Example 4: Interactive Q&A Session

In [None]:
# Load an image for interactive Q&A
test_image_url = "https://images.unsplash.com/photo-1551963831-b3b1ca40c98e?w=800"  # Breakfast
test_image = load_image(test_image_url)

print("Image for analysis:")
display(test_image.resize((400, 300)))

# First, get a caption to understand what's in the image
print("\n Image Caption:")
print(generate_caption(test_image))

# Interactive Q&A
print("\n" + "="*60)
print("Interactive Q&A Session")
print("="*60)
print("\nYou can ask questions about this image.")
print("Type 'quit' to exit.\n")

# For notebook, we'll demonstrate with predefined questions
# In a real interactive scenario, you would use input()
demo_questions = [
    "What food items are visible?",
    "Is this breakfast or dinner?",
    "What is the main dish?",
    "Are there any fruits?",
]

# Each question is answered independently against the same loaded image;
# no conversation history is passed between calls.
for q in demo_questions:
    print(f"\n👤 User: {q}")
    answer = answer_question(test_image, q)
    print(f"🤖 Assistant: {answer}")

## 9. Example 5: Custom Image Analysis

In [None]:
# You can upload your own image or use a local file path
# For demonstration, we'll use another URL

custom_image_url = "https://images.unsplash.com/photo-1485965120184-e220f721d03e?w=800"  # Bicycle

# Comprehensive analysis
custom_questions = [
    "What is the main object in this image?",
    "What color is it?",
    "Is this indoors or outdoors?",
    "What is the weather like?",
    "Are there any people?",
]

# Download the image once and reuse it for both the analysis and the preview;
# passing the URL to analyze_image would fetch the same file a second time.
img = load_image(custom_image_url)

results = analyze_image(
    img,
    questions=custom_questions,
    generate_cap=True
)

# Display image
display(img.resize((400, 300)))

# Display results
print("\n" + "="*60)
print("Comprehensive Image Analysis")
print("="*60)

print(f"\n Caption: {results['caption']}\n")

print(" Question & Answer:")
for qa in results['qa']:
    print(f"\nQ: {qa['question']}")
    print(f"A: {qa['answer']}")

## 10. Memory Monitoring

In [None]:
# Report GPU memory counters for device 0, or say so when running on the CPU.
if not torch.cuda.is_available():
    print("Running on CPU - No GPU memory to monitor")
else:
    print("\n GPU Memory Statistics:")
    # Byte counts are divided by 1024**3 so the figures are shown in GB.
    print(f"Allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
    print(f"Cached: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")
    print(f"Max Allocated: {torch.cuda.max_memory_allocated(0) / 1024**3:.2f} GB")

## 11. Advanced: Batch Processing

In [None]:
def batch_analyze_images(image_sources, question):
    """
    Caption several images and ask each of them the same question.

    A failure on one image does not stop the batch: the exception text is
    recorded under 'error' for that image and processing continues.

    Args:
        image_sources: List of image URLs or paths
        question: Question to ask about each image

    Returns:
        List with one dict per source, in input order. Successful entries hold
        'index', 'source', 'caption', 'question' and 'answer'; failed entries
        hold 'index', 'source' and 'error'. 'index' is 1-based.
    """
    collected = []

    for position, src in enumerate(image_sources, start=1):
        try:
            picture = load_image(src)
            described = generate_caption(picture)
            reply = answer_question(picture, question)
        except Exception as exc:
            # Best-effort batch: keep the error message and move on.
            entry = {
                'index': position,
                'source': src,
                'error': str(exc)
            }
        else:
            entry = {
                'index': position,
                'source': src,
                'caption': described,
                'question': question,
                'answer': reply
            }
        collected.append(entry)

    return collected


# Run the batch helper over two sample images with one shared question.
batch_images = [
    "https://images.unsplash.com/photo-1568605114967-8130f3a36994?w=800",  # House
    "https://images.unsplash.com/photo-1469474968028-56623f02e42e?w=800",  # Nature
]

batch_results = batch_analyze_images(batch_images, "What is the main subject?")

print("\n Batch Processing Results:\n")
for result in batch_results:
    # Failed entries carry an 'error' key instead of caption/answer fields.
    if 'error' in result:
        print(f"Image {result['index']}: Error - {result['error']}\n")
        continue

    print(f"Image {result['index']}:")
    print(f"  Caption: {result['caption']}")
    print(f"  Q: {result['question']}")
    print(f"  A: {result['answer']}\n")

## 12. Cleanup

In [None]:
# Clear GPU cache if needed
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("GPU cache cleared")

# Final summary of what the notebook demonstrated.
print("\n✅ Notebook execution complete!")
print("\nKey Features Implemented:")
print("- Image captioning")
print("- Visual question answering")
print("- Multi-image analysis")
print("- Batch processing")
print("- Memory-efficient design for 4GB VRAM")