In [1]:
import gc

import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image

  from .autonotebook import tqdm as notebook_tqdm


# Prompt tests

In [2]:
from pprint import pprint
# Load the processor for the chat-tuned checkpoint and show the Jinja chat template it carries.
# `processor` is a notebook-level global that the `prompt` helper and the cells below rely on.
processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b-chatty")
pprint(processor.chat_template)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


("{% for message in messages %}{{message['role'].capitalize()}}{% if "
 "message['content'][0]['type'] == 'image' %}{{':'}}{% else %}{{': '}}{% endif "
 "%}{% for line in message['content'] %}{% if line['type'] == 'text' "
 "%}{{line['text']}}{% elif line['type'] == 'image' %}{{ '<image>' }}{% endif "
 '%}{% endfor %}<end_of_utterance>\n'
 "{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}")


In [3]:
# Hand-indented copy of the chat template printed by the previous cell, split so that the Jinja
# nesting is readable. Adjacent string literals are concatenated, so the value is the template as a
# single string; the indentation below is purely visual and is not part of it.
readable_chat_template = (
    "{% for message in messages %}"
        "{{message['role'].capitalize()}}"
        # A turn that starts with an image gets "Role:" with no trailing space.
        "{% if message['content'][0]['type'] == 'image' %}"
            "{{':'}}"
        "{% else %}{{': '}}"
        "{% endif %}"
        "{% for line in message['content'] %}"
            "{% if line['type'] == 'text' %}"
                "{{line['text']}}"
            "{% elif line['type'] == 'image' %}"
                "{{ '<image>' }}"
            "{% endif %}"
        "{% endfor %}"
        "<end_of_utterance>\n"
    "{% endfor %}"
    "{% if add_generation_prompt %}"
        "{{ 'Assistant:' }}{% endif %}"
)

# Keep the bare expression last so the cell still displays the assembled template.
readable_chat_template

"{% for message in messages %}{{message['role'].capitalize()}}{% if message['content'][0]['type'] == 'image' %}{{':'}}{% else %}{{': '}}{% endif %}{% for line in message['content'] %}{% if line['type'] == 'text' %}{{line['text']}}{% elif line['type'] == 'image' %}{{ '<image>' }}{% endif '%}{% endfor %}<end_of_utterance>\n{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"

In [4]:
def prompt(question: str, images: list | None = None) -> str:
    messages = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": "You are a friendly assistant which answers user querstions \
                    based on text and image inputs."}
            ]
        }
    ]
    user_message = {"role": "user", "content": []}
    if images:
        for img in images:
            user_message["content"].append({"type": "image"})
    user_message["content"].append({"type": "text", "text": question})
    messages.append(user_message)
    return processor.apply_chat_template(messages, add_generation_prompt=True)

In [5]:
# Render a text-only prompt to inspect how the chat template lays out the system and user turns.
pprint(prompt("Hello, how are you?"))

('System: You are a friendly assistant which answers user '
 'querstions                     based on text and image '
 'inputs.<end_of_utterance>\n'
 'User: Hello, how are you?<end_of_utterance>\n'
 'Assistant:')


In [6]:
# Load the vision-to-sequence model and move it to "cuda" (this cell requires a CUDA device).
# NOTE(review): the processor above was loaded from "HuggingFaceM4/idefics2-8b-chatty" while the
# weights here come from "HuggingFaceM4/idefics2-8b" — confirm that this mismatch is intended.
model = AutoModelForVision2Seq.from_pretrained("HuggingFaceM4/idefics2-8b").to("cuda")

Loading checkpoint shards: 100%|██████████| 7/7 [04:06<00:00, 35.25s/it]


## Text only

In [7]:
# Encode the chat-formatted, text-only prompt and move every returned tensor to the GPU.
encoded = processor(text=prompt("Hello, how are you?"), return_tensors="pt")
inputs = {name: tensor.to("cuda") for name, tensor in encoded.items()}

# Generate up to 500 new tokens, then decode each returned sequence without special tokens.
generated_ids = model.generate(max_new_tokens=500, **inputs)
generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

pprint(generated_texts)

['System: You are a friendly assistant which answers user '
 'querstions                     based on text and image inputs. \n'
 'User: Hello, how are you? \n'
 'Assistant: I am doing well, thank you for asking! How can I assist you '
 'today?']


## Multimodal

In [8]:
img1 = load_image(
    "https://www.derwesten.de/wp-content/uploads/sites/8/2022/06/Champions-League-Sieger.jpg"
)

# The prompt gets one image placeholder; the image itself is handed to the processor separately.
single_image_prompt = prompt("What can be seen in the picture?", images=[img1])
encoded = processor(text=single_image_prompt, images=[img1], return_tensors="pt")
inputs = {name: tensor.to("cuda") for name, tensor in encoded.items()}

# Generate up to 500 new tokens, then decode each returned sequence without special tokens.
generated_ids = model.generate(max_new_tokens=500, **inputs)
generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

pprint(generated_texts)

['System: You are a friendly assistant which answers user '
 'querstions                     based on text and image inputs. \n'
 'User: What can be seen in the picture? \n'
 'Assistant: People holding a trophy and wearing yellow jerseys.']


In [9]:
img2 = load_image("https://www.agon-sportsworld.de/media/image/48/29/48/BC1164.jpg")

# Two placeholders in the prompt, matched by the two images passed to the processor in the same order.
two_image_prompt = prompt("What is the difference in the pictures?", images=[img1, img2])
encoded = processor(text=two_image_prompt, images=[img1, img2], return_tensors="pt")
inputs = {name: tensor.to("cuda") for name, tensor in encoded.items()}

# Generate up to 500 new tokens, then decode each returned sequence without special tokens.
generated_ids = model.generate(max_new_tokens=500, **inputs)
generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

pprint(generated_texts)

['System: You are a friendly assistant which answers user '
 'querstions                     based on text and image inputs. \n'
 'User: What is the difference in the pictures? \n'
 'Assistant: In the first image, the men are holding up the trophy while in '
 'the second image, the men are holding the trophy over their heads.']


In [10]:
# The decoded output above echoes the prompt before the answer, so skip the prompt tokens.
# Take the offset from the encoded prompt of the previous cell instead of hard-coding its length,
# so the slice stays correct if the question or the images change.
prompt_length = inputs["input_ids"].shape[-1]
processor.decode(generated_ids[0, prompt_length:], skip_special_tokens=True)

'In the first image, the men are holding up the trophy while in the second image, the men are holding the trophy over their heads.'

In [11]:
# Drop the notebook's reference to the model, then collect garbage and release PyTorch's cached
# CUDA blocks so the memory is available again before another model is loaded below.
del model
gc.collect()
torch.cuda.empty_cache()

# Model class

In [12]:
class MultimodalModel:
    def __init__(self, model_id: str, device: torch.device | str | None = None) -> None:
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if not device else device
        self.model = AutoModelForVision2Seq.from_pretrained(model_id).to(self.device)
        self.processor = AutoProcessor.from_pretrained(model_id)

    def __construct_prompt(self, text: str, images: list | None = None) -> str:
        messages = [      
            {
                "role": "system",
                "content": [
                    {
                        "type": "text", 
                        "text": "You are a friendly assistant which answers user questions " \
                        + "based on text and image inputs. Be concise."
                    }
                ]
            }
        ]
        user_message = {"role": "user", "content": []}
        if images:
            for img in images:
                user_message["content"].append({"type": "image"})
        user_message["content"].append({"type": "text", "text": text})
        messages.append(user_message)
        return self.processor.apply_chat_template(messages, add_generation_prompt=True)

    def __encode_prompt(self, prompt: str, images: list | None = None) -> dict:
        return self.processor(
            text=prompt, 
            images=images, 
            return_tensors="pt"
        )

    @torch.no_grad()
    def __inference(self, inputs: dict, max_new_tokens: int = 500, **kwargs) -> torch.Tensor:
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        
        if "max_new_tokens" not in kwargs:
            kwargs["max_new_tokens"] = max_new_tokens

        return self.model.generate(**inputs, **kwargs)

    def generate(self, text: str, *, images: list | None = None, **kwargs) -> str:
        prompt = self.__construct_prompt(text, images)
        encoded_prompt = self.__encode_prompt(text, images)
        end_user_prompt = encoded_prompt["input_ids"].shape[-1]
        outputs = self.__inference(encoded_prompt, **kwargs)
        return self.processor.decode(outputs[0, end_user_prompt:], skip_special_tokens=True)

In [13]:
# Rebind `model` (deleted above) to the wrapper class, which loads the model and its processor
# from the same checkpoint id.
model = MultimodalModel("HuggingFaceM4/idefics2-8b-chatty")

Loading checkpoint shards: 100%|██████████| 7/7 [04:17<00:00, 36.72s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
from pprint import pprint

# `do_sample=True` makes generation stochastic; seed PyTorch's RNG so that re-running this cell
# on the same setup produces the same answer.
torch.manual_seed(0)

pprint(
    model.generate(
        "What is your favorite color?", 
        do_sample=True, 
        num_beams=3, 
        top_p=0.85, 
        temperature=1.5, 
        no_repeat_ngram_size=3, 
        length_penalty=-1.0,
        max_new_tokens=150
    )
)

('This is a simple question, right? Well, not for everyone. There are people '
 "who can't choose a favorite color because they love all of them equally. And "
 'then there are those who have a hard time deciding between two or three '
 "colors that they like the most. It's not just a matter of personal "
 "preference, it's a reflection of one's personality. So, what does your "
 "choice of color say about you? Let's find out.\n"
 '\n'
 '\n'
 '1. Red\n'
 'People who like red are passionate and energetic. They have a fiery '
 'personality and are not afraid to speak their mind. They are assertive and '
 'have a strong personality. Red is also associated with love and romance, so '
 'people who like')
