[Reference](https://blog.gopenai.com/small-model-big-impact-ibm-granite-vision-dominates-document-understanding-547a505a0874)

```
python3 -m venv venv
source venv/bin/activate

pip install --upgrade pip
```

In [1]:
pip install 'transformers>=4.49' Pillow torch huggingface_hub



In [2]:
# app.py

import os
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from huggingface_hub import hf_hub_download
import torch

def run_vision_inference_with_conversation(model_name: str = "ibm-granite/granite-vision-3.3-2b"):
    """Run an interactive question-and-answer session about an example image.

    Loads the processor and model for ``model_name``, downloads the file
    ``example.png`` from the same Hugging Face repository, then repeatedly
    reads a question from standard input, sends the image plus the question
    to the model, and prints the decoded output.

    Setup failures (model/processor load, image download) are reported on
    stdout and end the function early. A failure while handling a single
    question is reported and the chat loop moves on to the next question.

    Args:
        model_name (str): The name of the Hugging Face model to use.

    Returns:
        None. All results and diagnostics are printed.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    print(f"Attempting to load model and processor: {model_name}")
    try:
        processor = AutoProcessor.from_pretrained(model_name)
        model = AutoModelForVision2Seq.from_pretrained(model_name).to(device)
        print("Model and processor loaded successfully.")
    except Exception as e:
        # Top-level boundary of an interactive script: report and bail out
        # instead of showing the user a traceback.
        print(f"Error loading model or processor: {e}")
        print("Please ensure you have 'transformers' and 'torch' (or 'tensorflow') installed.")
        print("You might also need to log in to Hugging Face with 'huggingface-cli login'.")
        return

    print(f"Downloading example image from Hugging Face Hub for model: {model_name}")
    try:
        # Download an example image provided with the model from the Hugging Face Hub
        img_path = hf_hub_download(repo_id=model_name, filename='example.png')
        print(f"Example image downloaded to: {img_path}")
    except Exception as e:
        print(f"Error downloading example image: {e}")
        print("Please check your internet connection or the model's repository for 'example.png'.")
        return

    print("\n--- Interactive Chat ---")
    print("Type your questions about the image. Type 'quit' or 'exit' to end the session.")

    # Main loop for interactive chat
    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed/piped stdin or Ctrl-C at the prompt: leave the session
            # the same way an explicit 'quit' does instead of crashing.
            print("\nExiting application. Goodbye!")
            break

        if user_input.lower() in ["quit", "exit"]:
            print("Exiting application. Goodbye!")
            break

        if not user_input:
            # Nothing was asked; do not spend a generation on an empty prompt.
            continue

        # Every turn is a fresh single-message conversation: the same image
        # (the path returned by hf_hub_download) plus the new question.
        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "url": img_path},
                    {"type": "text", "text": user_input},
                ],
            },
        ]

        print("Applying chat template and preparing inputs...")
        try:
            inputs = processor.apply_chat_template(
                conversation,
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                return_tensors="pt"
            ).to(device)
            print("Inputs prepared successfully.")
        except Exception as e:
            print(f"Error applying chat template or preparing inputs: {e}")
            continue  # Continue to next iteration if input preparation fails

        print("Generating response from the model...")
        try:
            output = model.generate(**inputs, max_new_tokens=100)
            generated_text = processor.decode(output[0], skip_special_tokens=True)

            print("\n--- Model Output ---")
            print(generated_text)
            print("--------------------\n")
        except Exception as e:
            print(f"An error occurred during inference: {e}")
            print("This could be due to memory constraints or other issues during generation.\n")


# Script entry point: start the interactive session with the default model
# when this file is executed directly rather than imported.
if __name__ == "__main__":
    run_vision_inference_with_conversation()