In [4]:
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image

# Prompt tests

In [6]:
from pprint import pprint

# Fetch the processor for the Idefics2 checkpoint and print the chat template
# it ships with, so the prompt layout can be inspected.
processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b")
chat_template = processor.chat_template
pprint(chat_template)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


("{% for message in messages %}{{message['role'].capitalize()}}{% if "
 "message['content'][0]['type'] == 'image' %}{{':'}}{% else %}{{': '}}{% endif "
 "%}{% for line in message['content'] %}{% if line['type'] == 'text' "
 "%}{{line['text']}}{% elif line['type'] == 'image' %}{{ '<image>' }}{% endif "
 '%}{% endfor %}<end_of_utterance>\n'
 "{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}")


In [None]:
# Hand-indented copy of the chat template printed above. Adjacent string
# literals are concatenated by Python, so the indentation is purely visual and
# the resulting value is one continuous Jinja template.
IDEFICS2_CHAT_TEMPLATE = (
    "{% for message in messages %}"
        "{{message['role'].capitalize()}}"
        # No space after the colon when the turn starts with an image.
        "{% if message['content'][0]['type'] == 'image' %}"
            "{{':'}}"
        "{% else %}"
            "{{': '}}"
        "{% endif %}"
        "{% for line in message['content'] %}"
            "{% if line['type'] == 'text' %}"
                "{{line['text']}}"
            "{% elif line['type'] == 'image' %}"
                "{{ '<image>' }}"
            "{% endif %}"
        "{% endfor %}"
        "<end_of_utterance>\n"
    "{% endfor %}"
    "{% if add_generation_prompt %}"
        "{{ 'Assistant:' }}"
    "{% endif %}"
)
IDEFICS2_CHAT_TEMPLATE

In [8]:
def prompt(question: str, images: list | None = None) -> str:
    messages = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": "You are a friendly assistant which answers user querstions \
                    based on text and image inputs."}
            ]
        }
    ]
    user_message = {"role": "user", "content": []}
    if images:
        for img in images:
            user_message["content"].append({"type": "image"})
    user_message["content"].append({"type": "text", "text": question})
    messages.append(user_message)
    return processor.apply_chat_template(messages, add_generation_prompt=True)

In [9]:
# Render a text-only prompt to check what the chat template produces.
greeting_prompt = prompt("Hello, how are you?")
pprint(greeting_prompt)

('System: You are a friendly assistant which answers user '
 'querstions                     based on text and image '
 'inputs.<end_of_utterance>\n'
 'User: Hello, how are you?<end_of_utterance>\n'
 'Assistant:')


In [11]:
# Load the Idefics2 checkpoint, then move the model to the CUDA device.
model = AutoModelForVision2Seq.from_pretrained("HuggingFaceM4/idefics2-8b")
model = model.to("cuda")

Loading checkpoint shards: 100%|██████████| 7/7 [04:09<00:00, 35.63s/it]


## Text only

In [13]:
# Encode the text-only prompt and move every tensor to the CUDA device.
text_prompt = prompt("Hello, how are you?")
inputs = processor(text=text_prompt, return_tensors="pt")
inputs = {name: tensor.to("cuda") for name, tensor in inputs.items()}

# Generate up to 500 new tokens and decode them with special tokens stripped.
generated_ids = model.generate(**inputs, max_new_tokens=500)
generated_texts = processor.batch_decode(
    generated_ids,
    skip_special_tokens=True,
)

pprint(generated_texts)

['System: You are a friendly assistant which answers user '
 'querstions                     based on text and image inputs. \n'
 'User: Hello, how are you? \n'
 'Assistant: I am doing well, thank you for asking! How can I assist you '
 'today?']


## Multimodal

In [16]:
img1 = load_image("https://www.derwesten.de/wp-content/uploads/sites/8/2022/06/Champions-League-Sieger.jpg")

# The same image list feeds both the prompt (one placeholder per image) and
# the processor (the actual image data).
single_image = [img1]
image_prompt = prompt("What can be seen in the picture?", images=single_image)
inputs = processor(text=image_prompt, images=single_image, return_tensors="pt")
inputs = {name: tensor.to("cuda") for name, tensor in inputs.items()}

# Generate up to 500 new tokens and decode them with special tokens stripped.
generated_ids = model.generate(**inputs, max_new_tokens=500)
generated_texts = processor.batch_decode(
    generated_ids,
    skip_special_tokens=True,
)

pprint(generated_texts)

['System: You are a friendly assistant which answers user '
 'querstions                     based on text and image inputs. \n'
 'User: What can be seen in the picture? \n'
 'Assistant: People holding a trophy and wearing yellow jerseys.']


In [17]:
img2 = load_image("https://www.agon-sportsworld.de/media/image/48/29/48/BC1164.jpg")

# Ask about both images at once; img1 comes from the previous cell.
image_pair = [img1, img2]
comparison_prompt = prompt("What is the difference in the pictures?", images=image_pair)
inputs = processor(text=comparison_prompt, images=image_pair, return_tensors="pt")
inputs = {name: tensor.to("cuda") for name, tensor in inputs.items()}

# Generate up to 500 new tokens and decode them with special tokens stripped.
generated_ids = model.generate(**inputs, max_new_tokens=500)
generated_texts = processor.batch_decode(
    generated_ids,
    skip_special_tokens=True,
)

pprint(generated_texts)

['System: You are a friendly assistant which answers user '
 'querstions                     based on text and image inputs. \n'
 'User: What is the difference in the pictures? \n'
 'Assistant: In the first image, the men are holding up the trophy while in '
 'the second image, the men are holding the trophy over their heads.']


# Model init

In [None]:
class MultimodalModel:
    def __init__(self, model_id: str, device: torch.device | str | None = None) -> None:
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if not device else device
        self.model = AutoModelForVision2Seq.from_pretrained(model_id).to(self.device)
        self.processor = AutoProcessor.from_pretrained(model_id)

    