In [None]:
#Install Required Libraries

In [3]:
!pip install torch torchvision torchaudio
!pip install transformers
!pip install datasets
!pip install sentencepiece
!pip install kaggle



In [5]:
pip install googletrans==4.0.0-rc1



In [6]:
# ✅ STEP 1: Install Required Libraries (only once)
!pip install kagglehub transformers torch torchvision torchaudio sentencepiece

# ✅ STEP 2: Import Required Libraries
import os
import json
from PIL import Image
import kagglehub
from transformers import MarianMTModel, MarianTokenizer
from transformers import BlipProcessor, BlipForQuestionAnswering

# ✅ STEP 3: Download Dataset using kagglehub
dataset_path = kagglehub.dataset_download("aniketvp68/vqav2-train")
print("Dataset path:", dataset_path)

Dataset path: /kaggle/input/vqav2-train


In [None]:
import os

for root, dirs, files in os.walk(dataset_path):
    print(f"\n📂 Directory: {root}")
    for file in files:
        print(" -", file)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 - COCO_train2014_000000314174.jpg
 - COCO_train2014_000000356783.jpg
 - COCO_train2014_000000078684.jpg
 - COCO_train2014_000000376557.jpg
 - COCO_train2014_000000293074.jpg
 - COCO_train2014_000000284155.jpg
 - COCO_train2014_000000003992.jpg
 - COCO_train2014_000000523638.jpg
 - COCO_train2014_000000191327.jpg
 - COCO_train2014_000000549330.jpg
 - COCO_train2014_000000308752.jpg
 - COCO_train2014_000000196971.jpg
 - COCO_train2014_000000258272.jpg
 - COCO_train2014_000000426429.jpg
 - COCO_train2014_000000327944.jpg
 - COCO_train2014_000000491991.jpg
 - COCO_train2014_000000384740.jpg
 - COCO_train2014_000000175734.jpg
 - COCO_train2014_000000285804.jpg
 - COCO_train2014_000000330766.jpg
 - COCO_train2014_000000545181.jpg
 - COCO_train2014_000000477104.jpg
 - COCO_train2014_000000236197.jpg
 - COCO_train2014_000000383816.jpg
 - COCO_train2014_000000271842.jpg
 - COCO_train2014_000000543684.jpg
 - COCO_train2014_0000001

In [7]:
# ✅ STEP 4: Load Questions and Annotations
questions_path = os.path.join(dataset_path, "v2_Questions_Train_mscoco", "v2_OpenEnded_mscoco_train2014_questions.json")

# If you also find the annotations file in another subdirectory, update this accordingly:
annotations_path = os.path.join(dataset_path, "v2_Annotations_Train_mscoco", "v2_mscoco_train2014_annotations.json")


with open(questions_path, 'r') as f:
    questions_data = json.load(f)

with open(annotations_path, 'r') as f:
    annotations_data = json.load(f)

In [8]:
# ✅ STEP 5: Load Translation Models

# English ➝ Hindi
en_hi_model_name = 'Helsinki-NLP/opus-mt-en-hi'
en_hi_tokenizer = MarianTokenizer.from_pretrained(en_hi_model_name)
en_hi_model = MarianMTModel.from_pretrained(en_hi_model_name)

def translate_to_hindi(text):
    tokens = en_hi_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    translated = en_hi_model.generate(**tokens)
    return en_hi_tokenizer.decode(translated[0], skip_special_tokens=True)

# Hindi ➝ English
hi_en_model_name = 'Helsinki-NLP/opus-mt-hi-en'
hi_en_tokenizer = MarianTokenizer.from_pretrained(hi_en_model_name)
hi_en_model = MarianMTModel.from_pretrained(hi_en_model_name)

def translate_hi_to_en(text):
    tokens = hi_en_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    translated = hi_en_model.generate(**tokens)
    return hi_en_tokenizer.decode(translated[0], skip_special_tokens=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [9]:
# ✅ STEP 6: Load BLIP VQA Model
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
blip_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [10]:
# ✅ STEP 7: Hindi VQA Function
def vqa_inference(image_path, question_hindi):
    # Step 1: Translate Hindi → English
    question_en = translate_hi_to_en(question_hindi)

    # Step 2: Load image
    image = Image.open(image_path).convert("RGB")

    # Step 3: Process with BLIP
    inputs = processor(image, question_en, return_tensors="pt")
    out = blip_model.generate(**inputs)

    # Step 4: Translate Answer back to Hindi
    answer_en = processor.decode(out[0], skip_special_tokens=True)
    answer_hi = translate_to_hindi(answer_en)

    return answer_hi

In [14]:
# ✅ STEP 8: Run Example Inference with Bilingual Output (Hindi + English)

from googletrans import Translator
import os

# Initialize Google Translator
translator = Translator()

# Use one sample from dataset
sample_question = questions_data['questions'][19]  # First question
image_id = sample_question['image_id']
filename = f"COCO_train2014_{image_id:012}.jpg"

# ✅ Corrected image path (based on actual directory structure)
image_path = os.path.join(dataset_path, "train2014", "train2014", filename)

# Hindi input question (translate the English version manually or write your own)
question_hindi = "इस चित्र में कौन है?"

# Optional: Confirm file exists
if os.path.exists(image_path):
    print("✅ Image found:", image_path)

    # Run inference (returns Hindi answer from Hindi question)
    answer_hindi = vqa_inference(image_path, question_hindi)

    # Translate both question and answer to English
    question_en = translator.translate(question_hindi, dest='en').text
    answer_en = translator.translate(answer_hindi, dest='en').text

    # Print bilingual output
    print("📌 प्रश्न (Hindi):", question_hindi)
    print("📌 Question (English):", question_en)
    print("✅ उत्तर (Hindi):", answer_hindi)
    print("✅ Answer (English):", answer_en)

else:
    print("❌ Image not found:", image_path)

✅ Image found: /kaggle/input/vqav2-train/train2014/train2014/COCO_train2014_000000393224.jpg
📌 प्रश्न (Hindi): इस चित्र में कौन है?
📌 Question (English): Who is in this picture?
✅ उत्तर (Hindi): आदमी
✅ Answer (English): Man


In [None]:
#Code for Custom Image Test1

In [23]:
from googletrans import Translator
import os

# ✅ Initialize translator
translator = Translator()

# ✅ Custom image path (your own unseen image)
image_path = "/content/Count chlidren.png"  # Replace with your file path
question_hindi = "बिस्तर पर कितने बच्चे हैं?"  # Change this to your question

if os.path.exists(image_path):
    print("✅ Custom Image found:", image_path)

    # Run Hindi question inference
    answer_hindi = vqa_inference(image_path, question_hindi)

    # Translate both for bilingual output
    question_en = translator.translate(question_hindi, dest='en').text
    answer_en = translator.translate(answer_hindi, dest='en').text

    print("\n🖼️ Custom Image Inference")
    print("📌 प्रश्न (Hindi):", question_hindi)
    print("📌 Question (English):", question_en)
    print("✅ उत्तर (Hindi):", answer_hindi)
    print("✅ Answer (English):", answer_en)
else:
    print("❌ Image not found:", image_path)

✅ Custom Image found: /content/Count chlidren.png

🖼️ Custom Image Inference
📌 प्रश्न (Hindi): बिस्तर पर कितने बच्चे हैं?
📌 Question (English): How many children are on the bed?
✅ उत्तर (Hindi): 2
✅ Answer (English): 2


In [None]:
#Code for Custom Image Test2

In [22]:
from googletrans import Translator
import os

# ✅ Initialize translator
translator = Translator()

# ✅ Custom image path (your own unseen image)
image_path = "/content/Mustache made of.png"  # Replace with your file path
question_hindi = "मूंछ किस चीज़ से बनी होती है?"  # Change this to your question

if os.path.exists(image_path):
    print("✅ Custom Image found:", image_path)

    # Run Hindi question inference
    answer_hindi = vqa_inference(image_path, question_hindi)

    # Translate both for bilingual output
    question_en = translator.translate(question_hindi, dest='en').text
    answer_en = translator.translate(answer_hindi, dest='en').text

    print("\n🖼️ Custom Image Inference")
    print("📌 प्रश्न (Hindi):", question_hindi)
    print("📌 Question (English):", question_en)
    print("✅ उत्तर (Hindi):", answer_hindi)
    print("✅ Answer (English):", answer_en)
else:
    print("❌ Image not found:", image_path)

✅ Custom Image found: /content/Mustache made of.png

🖼️ Custom Image Inference
📌 प्रश्न (Hindi): मूंछ किस चीज़ से बनी होती है?
📌 Question (English): What is the mustache made of?
✅ उत्तर (Hindi): केला
✅ Answer (English): Banned


In [None]:
#Code for Custom Image Test3

In [24]:
from googletrans import Translator
import os

# ✅ Initialize translator
translator = Translator()

# ✅ Custom image path (your own unseen image)
image_path = "/content/Girl walking.png"  # Replace with your file path
question_hindi = "क्या लड़की साइकिल को चलाकर ले जा रही है?"  # Change this to your question

if os.path.exists(image_path):
    print("✅ Custom Image found:", image_path)

    # Run Hindi question inference
    answer_hindi = vqa_inference(image_path, question_hindi)

    # Translate both for bilingual output
    question_en = translator.translate(question_hindi, dest='en').text
    answer_en = translator.translate(answer_hindi, dest='en').text

    print("\n🖼️ Custom Image Inference")
    print("📌 प्रश्न (Hindi):", question_hindi)
    print("📌 Question (English):", question_en)
    print("✅ उत्तर (Hindi):", answer_hindi)
    print("✅ Answer (English):", answer_en)
else:
    print("❌ Image not found:", image_path)

✅ Custom Image found: /content/Girl walking.png

🖼️ Custom Image Inference
📌 प्रश्न (Hindi): क्या लड़की साइकिल को चलाकर ले जा रही है?
📌 Question (English): Is the girl carrying a bicycle?
✅ उत्तर (Hindi): हाँ
✅ Answer (English): Yes
