# 🧠 Generative AI Assignment – BLIP-2 Mini Project

## - Image captioning with decoding strategies
## - Ask-the-Image Mini-App (Speech → Text → Image → Answer → Speech)

In [1]:
# 📦 Install Required Packages
!pip install transformers torch torchvision accelerate datasets evaluate sacrebleu nltk openai-whisper pyttsx3

## 🖼️ Image Captioning with BLIP (Salesforce/blip-image-captioning-base)
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import requests
import torch

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pyttsx3
  Downloading pyttsx3-2.98-py3-none-any.whl.metadata (3.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvi

In [6]:
# Use the GPU when one is present, otherwise fall back to CPU so the cell
# still runs (slowly) on a runtime without CUDA instead of raising.
caption_device = "cuda" if torch.cuda.is_available() else "cpu"

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(caption_device)

def describe_image(image_url, strategy="greedy", top_k=50, top_p=0.9, temperature=1.0):
    """Download an image and caption it with the module-level BLIP model.

    Args:
        image_url: HTTP(S) URL of the image to caption.
        strategy: Decoding strategy: "greedy", "top-k" or "top-p".
        top_k: Passed to ``generate`` as ``top_k`` when strategy is "top-k".
        top_p: Passed to ``generate`` as ``top_p`` when strategy is "top-p".
        temperature: Passed to ``generate`` for both sampling strategies.

    Returns:
        The decoded caption string with special tokens removed.

    Raises:
        ValueError: If ``strategy`` is not one of the supported names.
        requests.HTTPError: If the image download returns an error status.
    """
    # Resolve the strategy first: an unsupported name must fail with a clear
    # message before any download or model work happens (previously it fell
    # through every branch and crashed on an unbound ``output``).
    if strategy == "greedy":
        generate_kwargs = {}
    elif strategy == "top-k":
        generate_kwargs = {"do_sample": True, "top_k": top_k, "temperature": temperature}
    elif strategy == "top-p":
        generate_kwargs = {"do_sample": True, "top_p": top_p, "temperature": temperature}
    else:
        raise ValueError(
            f"Unknown strategy {strategy!r}; expected 'greedy', 'top-k' or 'top-p'"
        )

    # The timeout stops the cell hanging on a dead host, raise_for_status
    # surfaces HTTP errors instead of handing an error page to PIL, and the
    # context manager releases the connection once the pixels are decoded.
    with requests.get(image_url, stream=True, timeout=30) as response:
        response.raise_for_status()
        image = Image.open(response.raw).convert('RGB')

    # Follow whatever device the model was placed on instead of assuming CUDA.
    inputs = processor(images=image, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, **generate_kwargs)

    return processor.decode(output[0], skip_special_tokens=True)

# 🔍 Try Image Description
url = "https://live.staticflickr.com/65535/53578340955_b22196ca12_4k.jpg"
# Top-p decoding samples tokens at random; seeding torch first makes the
# caption repeatable when the notebook is re-run on the same setup.
torch.manual_seed(0)
describe_image(url, strategy="top-p")

'a teddy bear dances under an umbrella outside'

In [15]:
## 🎤 Ask-the-Image Mini-App
import whisper
from transformers import BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering
from PIL import Image
import torch
import pyttsx3

# Load models
# Half precision is only requested together with a GPU; on a CPU-only runtime
# the model is loaded in float32 on the CPU instead of failing on .to("cuda").
vqa_device = "cuda" if torch.cuda.is_available() else "cpu"
vqa_dtype = torch.float16 if vqa_device == "cuda" else torch.float32

asr_model = whisper.load_model("small")
# torch_dtype is a weight-loading option; the processor only tokenises text and
# preprocesses images, so it is loaded without it.
vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base", torch_dtype=vqa_dtype).to(vqa_device)

In [10]:
!sudo apt-get update && sudo apt-get install espeak ffmpeg libespeak1
!pip install pyttsx3

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
0% [Connecting to archive.ubuntu.com (91.189.91.81)] [Connecting to security.ub0% [Connecting to archive.ubuntu.com (91.189.91.81)] [Connecting to security.ub                                                                               Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,607 kB]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:9 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,908 kB]
Hit:10 https://ppa.la

In [17]:
# ⏺️ Transcribe speech
def transcribe_audio(audio_path):
    """Transcribe an audio file with the module-level Whisper model.

    Args:
        audio_path: Path of the audio file handed to ``asr_model.transcribe``.

    Returns:
        The value of the ``'text'`` entry of the transcription result, with
        surrounding whitespace removed.
    """
    result = asr_model.transcribe(audio_path)
    # The raw transcript comes back with a leading space (it showed up as
    # "Q:  What ..." when printed), so trim it before handing it on.
    return result['text'].strip()

# 🖼️ Answer the image-question
def ask_image(image_path, question):
    """Answer a natural-language question about an image with the BLIP VQA model.

    Args:
        image_path: Path of the image file to open.
        question: The question to ask about the image.

    Returns:
        The decoded answer string with special tokens removed.

    Raises:
        ValueError: If ``question`` is not a string or is empty/whitespace-only.
    """
    # Reject a blank question (e.g. a silent recording) up front rather than
    # generating an answer to nothing.
    if not isinstance(question, str) or not question.strip():
        raise ValueError("question must be a non-empty string")

    # Close the file handle as soon as the pixels are converted.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert('RGB')

    # Format the question properly
    # NOTE(review): this "Question: ... Answer:" wrapper looks like the BLIP-2
    # prompt style; confirm against the blip-vqa-base model card whether the
    # bare question should be passed instead.
    question = f"Question: {question.strip()} Answer:"

    # Move the inputs to the model's device AND cast the floating-point tensors
    # (pixel values) to the model's dtype: the model is loaded in float16, and
    # float32 pixel values would not match its weights.
    inputs = vqa_processor(image, question, return_tensors="pt").to(vqa_model.device, vqa_model.dtype)
    out = vqa_model.generate(**inputs)
    return vqa_processor.decode(out[0], skip_special_tokens=True)

# 🗣️ Speak text
def speak(text):
    """Say ``text`` aloud using a newly initialised pyttsx3 engine.

    Args:
        text: The text queued for speech.
    """
    tts = pyttsx3.init()
    tts.say(text)
    # Process the queued utterance before returning.
    tts.runAndWait()

# 📂 Upload your audio.wav and image.jpg using the file browser
audio_file, image_file = "audio.wav", "image.jpg"

# 🔁 Full Pipeline
# Spoken question -> text -> answer about the image, then show both.
question = transcribe_audio(audio_file)
answer = ask_image(image_file, question)
print(f"Q: {question}")
print(f"A: {answer}")

Q:  What is in this picture? Is there a dog in the image?
A: no
