### Text-Image Analyzer Gradio app using the Llama 3.2-11B-Vision model. This demo requires an A100 GPU.

In [7]:
!pip3 install -U transformers bitsandbytes accelerate peft -q

In [None]:
!pip3 install gradio -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.1/18.1 MB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.7/318.7 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.6/94.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m141.9/141.9 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [10]:
import gradio as gr
import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor

# Load the Llama 3.2 Vision Model
# Module-level cache: holds the (model, processor) pair after the first load so
# that repeated calls do not re-instantiate the checkpoint.
_LLAMA_CACHE = {}


def load_llama_model():
    """Return the Llama 3.2 11B Vision model and its processor.

    The pair is loaded on the first call and memoized; subsequent calls return
    the same objects instead of loading the checkpoint again.

    Returns:
        tuple: ``(model, processor)`` where ``model`` is the
        ``MllamaForConditionalGeneration`` instance loaded in bfloat16 with
        ``device_map="auto"`` and ``processor`` is the matching
        ``AutoProcessor``.
    """
    if "bundle" not in _LLAMA_CACHE:
        model_id = "meta-llama/Llama-3.2-11B-Vision"

        # Load model and processor
        model = MllamaForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            device_map="auto",
        )
        processor = AutoProcessor.from_pretrained(model_id)

        _LLAMA_CACHE["bundle"] = (model, processor)

    return _LLAMA_CACHE["bundle"]

# Function to generate predictions for text and image
def process_input(text, image=None):
    """Run the Llama 3.2 Vision model on a text prompt and an optional image.

    Args:
        text: Prompt text. ``None`` is treated as an empty string.
        image: Optional PIL image. When it is ``None`` the model is prompted
            with text only.

    Returns:
        str: The decoded first generated sequence with special tokens removed.
    """
    model, processor = load_llama_model()

    # Guard against a missing prompt so the literal string "None" is never
    # formatted into the model input.
    text = text or ""

    if image is not None:
        # Only normalise the colour mode here. Resizing is left to the
        # processor; forcing 224x224 would distort the aspect ratio and
        # discard detail before the processor's own preprocessing.
        vision_input = image.convert("RGB")

        prompt = f"<|image|><|begin_of_text|>{text}"

        # Process image and text together. Keyword arguments make the
        # image/text roles explicit regardless of positional order.
        inputs = processor(
            images=vision_input, text=prompt, return_tensors="pt"
        ).to(model.device)
    else:
        # If no image is uploaded, just process the text. The prompt must be
        # passed as `text=`: the processor's first positional parameter is
        # the image input, so a positional string would be treated as an image.
        prompt = f"<|begin_of_text|>{text}"
        inputs = processor(text=prompt, return_tensors="pt").to(model.device)

    # Generate output from the model (at most 100 new tokens)
    outputs = model.generate(**inputs, max_new_tokens=100)

    # Decode the first sequence into readable text
    decoded_output = processor.decode(outputs[0], skip_special_tokens=True)

    return decoded_output

# Gradio Interface Setup
def demo():
    """Assemble the Gradio UI around ``process_input`` and launch it.

    The interface takes a multi-line text box and a PIL-typed image upload as
    inputs and shows the model's answer in a multi-line text box.
    """
    # Components are constructed inline, in the order: text input, image
    # input (type="pil" yields PIL Image objects), then the output box.
    gr.Interface(
        fn=process_input,
        inputs=[
            gr.Textbox(label="Text Input", placeholder="Enter text here", lines=5),
            gr.Image(label="Upload an Image", type="pil"),
        ],
        outputs=gr.Textbox(label="Model Output", lines=5),
        title="Llama 3.2 Multimodal Text-Image Analyzer",
        description="Upload an image and/or provide text for analysis using the Llama 3.2 Vision Model.",
    ).launch()

# Run the demo: build and launch the Gradio interface when this code is
# executed as the main module rather than imported.
if __name__ == "__main__":
    demo()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://0fe7a691d7eb981c7d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)
