In [1]:
!pip install torch torchvision transformers pillow



In [2]:
import os
# Turn off the Hugging Face Hub symlink warning. This cell runs before the
# transformers / huggingface_hub imports below — presumably the flag is read
# when those libraries are imported (TODO confirm).
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

In [3]:
import torch
from transformers import CLIPProcessor, CLIPModel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from PIL import Image

In [4]:
from huggingface_hub import login

# Never store an access token in the notebook itself: anyone who can read the
# file (or its version-control history) can use it. The token is taken from the
# HF_TOKEN environment variable; when that variable is unset, login() receives
# None and asks for the token interactively instead.
# NOTE(review): the token that used to be hard-coded here is compromised and
# should be revoked in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"))

In [5]:
# Run on a CUDA GPU when one is available, otherwise fall back to the CPU.
# Every model and input tensor in this notebook is moved with `.to(device)`.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [6]:
# One checkpoint id shared by the CLIP weights and the matching processor,
# so the two can never drift apart.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
clip_model = clip_model.to(device)
clip_processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

Loading weights:   0%|          | 0/398 [00:00<?, ?it/s]

[1mCLIPModel LOAD REPORT[0m from: openai/clip-vit-base-patch32
Key                                  | Status     |  | 
-------------------------------------+------------+--+-
vision_model.embeddings.position_ids | UNEXPECTED |  | 
text_model.embeddings.position_ids   | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
The image processor of type `CLIPImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. 


In [7]:
def generate_tags(image_path, candidate_labels=None, threshold=0.20):
    """Return the candidate labels that CLIP considers likely for an image.

    The image is scored against every candidate label, the scores are turned
    into a probability distribution with a softmax over the labels, and each
    label whose probability is strictly greater than ``threshold`` is kept.

    Args:
        image_path: Path of the image file to tag.
        candidate_labels: Text labels to score the image against. When
            ``None`` the built-in list of 18 everyday labels is used.
        threshold: Minimum probability (exclusive) for a label to be
            reported. Defaults to 0.20.

    Returns:
        A list of the selected labels, in the order they appear in
        ``candidate_labels``. The list is empty when no label exceeds the
        threshold.

    Raises:
        ValueError: If ``candidate_labels`` is given but empty.
    """
    if candidate_labels is None:
        candidate_labels = [
            "person", "dog", "cat", "car", "street", "tree",
            "building", "food", "computer", "phone",
            "beach", "mountain", "indoor", "outdoor",
            "animal", "man", "woman", "child"
        ]
    labels = list(candidate_labels)
    if not labels:
        raise ValueError("candidate_labels must contain at least one label")

    # The context manager closes the underlying file deterministically;
    # convert() returns an independent in-memory copy that outlives it.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    inputs = clip_processor(
        text=labels,
        images=image,
        return_tensors="pt",
        padding=True
    ).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = clip_model(**inputs)
        # Row 0 is the single image; softmax normalises across the labels.
        probs = outputs.logits_per_image.softmax(dim=1)[0]

    return [
        label
        for label, prob in zip(labels, probs.tolist())
        if prob > threshold   # confidence threshold
    ]

In [8]:
# One checkpoint id shared by the tokenizer and the seq2seq model.
LLM_CHECKPOINT = "google/flan-t5-small"

tokenizer = AutoTokenizer.from_pretrained(LLM_CHECKPOINT)
llm_model = AutoModelForSeq2SeqLM.from_pretrained(LLM_CHECKPOINT)
llm_model = llm_model.to(device)

Loading weights:   0%|          | 0/190 [00:00<?, ?it/s]



In [9]:
def generate_response(prompt, max_new_tokens=120, temperature=0.9, top_p=0.95,
                      do_sample=True):
    """Generate a text answer for ``prompt`` with the seq2seq language model.

    Args:
        prompt: Full prompt text to feed to the model.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature; only used when ``do_sample`` is True.
        top_p: Nucleus-sampling probability mass; only used when ``do_sample``
            is True.
        do_sample: Sample from the distribution (True, the default) or decode
            deterministically (False).

    Returns:
        The decoded answer string with special tokens removed.

    Note:
        With ``do_sample=True`` the output varies between calls; seed torch
        (``torch.manual_seed``) beforehand if reproducible answers are needed.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
    }
    if do_sample:
        # Sampling-only settings are forwarded only when sampling is enabled,
        # so deterministic decoding is not handed flags it does not use.
        generation_kwargs["temperature"] = temperature
        generation_kwargs["top_p"] = top_p

    outputs = llm_model.generate(**inputs, **generation_kwargs)

    # A single prompt was passed in, so the first sequence is the answer.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [10]:
# Running log of the chat session: the interactive loop appends one
# {"user": ..., "assistant": ...} dict per exchange. It is only recorded —
# build_prompt does not feed earlier turns back to the model.
conversation_history = []

def build_prompt(tags, question):
    """Build the instruction prompt for a question about a tagged image.

    Args:
        tags: Iterable of label strings detected in the image. May be empty.
        question: The user's question, inserted verbatim.

    Returns:
        The prompt string: a short system instruction, the comma-separated
        detected tags, the question, and a request for a full-sentence answer.
    """
    # With no tags the join is empty and the prompt would read
    # "The image contains: ." — state explicitly that nothing was recognized.
    tag_text = ", ".join(tags) or "no recognizable objects"

    prompt = f"""
You are an AI assistant that answers questions about an image.

The image contains: {tag_text}.

If the user asks to describe the image, summarize what is visible using the detected objects.

If the user asks about something not in the detected objects, politely say it is not detected.

Question: {question}

Answer in a clear full sentence:
"""
    return prompt

In [None]:
image_path = "test_images/cat1.jpg"  # change if needed

# Tag the image once; every question in the session reuses the same tags.
tags = generate_tags(image_path)
print("Detected image tags:", tags)

while True:
    try:
        question = input("Ask about the image (type 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the kernel was interrupted while waiting:
        # end the session cleanly instead of surfacing a traceback.
        break

    if not question:
        # Nothing was typed; ask again rather than querying the model.
        continue

    # Surrounding whitespace was stripped above, so "exit " also quits.
    if question.lower() == "exit":
        break

    prompt = build_prompt(tags, question)
    response = generate_response(prompt)

    print("Assistant:", response)

    # Keep a record of the exchange.
    conversation_history.append({
        "user": question,
        "assistant": response
    })

Detected image tags: ['cat']


Ask about the image (type 'exit' to quit):  What animal is in the picture?


Assistant: cat


Ask about the image (type 'exit' to quit):  Is this indoor or outdoor?


Assistant: indoor
