In [1]:
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image

  from .autonotebook import tqdm as notebook_tqdm


# Prompt tests

In [None]:
from pprint import pprint

# Load the processor published under the idefics2-8b-chatty checkpoint and
# display the chat template attached to it.
chat_checkpoint = "HuggingFaceM4/idefics2-8b-chatty"
processor = AutoProcessor.from_pretrained(chat_checkpoint)
pprint(processor.chat_template)

In [None]:
# Hand-formatted copy of a Jinja chat template, split over several lines for
# readability (presumably the one printed by the previous cell -- verify).
# Adjacent string literals are concatenated by Python, so the source
# indentation below is NOT part of the resulting template string.
("{% for message in messages %}"
    # "User" / "Assistant" / "System" prefix; no space after the colon when
    # the turn starts with an image.
    "{{message['role'].capitalize()}}"
    "{% if message['content'][0]['type'] == 'image' %}"
        "{{':'}}"
    "{% else %}{{': '}}"
    "{% endif %}"
    # Emit each content item: raw text, or an <image> placeholder.
    "{% for line in message['content'] %}"
        "{% if line['type'] == 'text' %}"
            "{{line['text']}}"
        "{% elif line['type'] == 'image' %}"
            "{{ '<image>' }}"
        "{% endif %}"
    "{% endfor %}"
    "<end_of_utterance>\n"
"{% endfor %}"
# Optionally open the assistant turn so generation continues from there.
"{% if add_generation_prompt %}"
    "{{ 'Assistant:' }}{% endif %}")

In [None]:
def prompt(question: str, images: list | None = None) -> str:
    messages = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": "You are a friendly assistant which answers user querstions \
                    based on text and image inputs."}
            ]
        }
    ]
    user_message = {"role": "user", "content": []}
    if images:
        for img in images:
            user_message["content"].append({"type": "image"})
    user_message["content"].append({"type": "text", "text": question})
    messages.append(user_message)
    return processor.apply_chat_template(messages, add_generation_prompt=True)

In [None]:
# Render a text-only prompt to inspect what the chat template produces.
text_only_prompt = prompt("Hello, how are you?")
pprint(text_only_prompt)

In [None]:
# Load the checkpoint, then move the model onto the GPU.
# NOTE(review): this is "idefics2-8b" while the processor above comes from
# "idefics2-8b-chatty" -- confirm the mismatch is intentional.
model = AutoModelForVision2Seq.from_pretrained("HuggingFaceM4/idefics2-8b")
model = model.to("cuda")

## Text only

In [None]:
# Encode the text-only prompt (no images are handed to the processor here)
# and move every returned tensor onto the GPU.
inputs = processor(
    text=prompt("Hello, how are you?"),
    return_tensors="pt",
)
inputs = dict((name, tensor.to("cuda")) for name, tensor in inputs.items())


# Generate at most 500 new tokens, then decode the returned sequences.
generated_ids = model.generate(max_new_tokens=500, **inputs)
generated_texts = processor.batch_decode(
    generated_ids,
    skip_special_tokens=True,
)

pprint(generated_texts)

## Multimodal

In [None]:
img1 = load_image("https://www.derwesten.de/wp-content/uploads/sites/8/2022/06/Champions-League-Sieger.jpg")

# The prompt gets one image placeholder; the same image is also passed to the
# processor so it can be encoded alongside the text.
inputs = processor(
    text=prompt("What can be seen in the picture?", images=[img1]),
    images=[img1],
    return_tensors="pt",
)
inputs = dict((name, tensor.to("cuda")) for name, tensor in inputs.items())


# Generate at most 500 new tokens, then decode the returned sequences.
generated_ids = model.generate(max_new_tokens=500, **inputs)
generated_texts = processor.batch_decode(
    generated_ids,
    skip_special_tokens=True,
)

pprint(generated_texts)

In [None]:
img2 = load_image("https://www.agon-sportsworld.de/media/image/48/29/48/BC1164.jpg")

# Two image placeholders in the prompt, and both images passed to the
# processor in the same order.
inputs = processor(
    text=prompt("What is the difference in the pictures?", images=[img1, img2]),
    images=[img1, img2],
    return_tensors="pt",
)
inputs = dict((name, tensor.to("cuda")) for name, tensor in inputs.items())


# Generate at most 500 new tokens, then decode the returned sequences.
generated_ids = model.generate(max_new_tokens=500, **inputs)
generated_texts = processor.batch_decode(
    generated_ids,
    skip_special_tokens=True,
)

pprint(generated_texts)

In [None]:
# Decode only the tokens that follow the prompt. The offset is taken from the
# encoded prompt itself (same approach as `MultimodalModel.generate`) instead
# of a hard-coded number that is only valid for one specific prompt.
prompt_length = inputs["input_ids"].shape[-1]
processor.decode(generated_ids[0, prompt_length:], skip_special_tokens=True)

In [None]:
# Drop the notebook's reference to the model loaded for the prompt tests, then
# ask PyTorch to release cached CUDA memory it no longer needs.
# Note: re-running this cell without re-creating `model` raises NameError.
del model
torch.cuda.empty_cache()

# Model class

In [2]:
class MultimodalModel:
    def __init__(self, model_id: str, device: torch.device | str | None = None) -> None:
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if not device else device
        self.model = AutoModelForVision2Seq.from_pretrained(model_id).to(self.device)
        self.processor = AutoProcessor.from_pretrained(model_id)

    def __construct_prompt(self, text: str, images: list | None = None) -> str:
        messages = [      
            {
                "role": "system",
                "content": [
                    {
                        "type": "text", 
                        "text": "You are a friendly assistant which answers user questions " \
                        + "based on text and image inputs. Be concise."
                    }
                ]
            }
        ]
        user_message = {"role": "user", "content": []}
        if images:
            for img in images:
                user_message["content"].append({"type": "image"})
        user_message["content"].append({"type": "text", "text": text})
        messages.append(user_message)
        return self.processor.apply_chat_template(messages, add_generation_prompt=True)

    def __encode_prompt(self, prompt: str, images: list | None = None) -> dict:
        return self.processor(
            text=prompt, 
            images=images, 
            return_tensors="pt"
        )

    @torch.no_grad()
    def __inference(self, inputs: dict, max_new_tokens: int = 500, **kwargs) -> torch.Tensor:
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        
        if "max_new_tokens" not in kwargs:
            kwargs["max_new_tokens"] = max_new_tokens
        outputs = self.model.generate(**inputs, **kwargs)
        del inputs
        torch.cuda.empty_cache()
        return outputs

    def generate(self, text: str, *, images: list | None = None, **kwargs) -> str:
        prompt = self.__construct_prompt(text, images)
        encoded_prompt = self.__encode_prompt(prompt, images)
        end_user_prompt = encoded_prompt["input_ids"].shape[-1]
        outputs = self.__inference(encoded_prompt, **kwargs)
        return self.processor.decode(outputs[0, end_user_prompt:], skip_special_tokens=True)

In [3]:
# Instantiate the wrapper for the idefics2-8b-chatty checkpoint; the device is
# left to the class default (CUDA when available, otherwise CPU).
model = MultimodalModel(model_id="HuggingFaceM4/idefics2-8b-chatty")

Loading checkpoint shards: 100%|██████████| 7/7 [04:09<00:00, 35.63s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Text only

In [4]:
from pprint import pprint

# Generation settings forwarded unchanged to the model's `generate` call.
text_only_settings = dict(
    do_sample=True,
    num_beams=3,
    top_p=0.85,
    temperature=1.5,
    no_repeat_ngram_size=3,
    length_penalty=-1.0,
    max_new_tokens=150,
)

text_only_reply = model.generate("Hello, how are you?", **text_only_settings)
pprint(text_only_reply)

"I'm doing well, thank you for asking! How can I assist you today?"


## Multimodal

In [5]:
# Two example images to compare.
img1 = load_image("https://www.derwesten.de/wp-content/uploads/sites/8/2022/06/Champions-League-Sieger.jpg")
img2 = load_image("https://www.agon-sportsworld.de/media/image/48/29/48/BC1164.jpg")

# Ask the model to compare both images. Output is sampled (`do_sample=True`),
# so the answer varies between runs.
# NOTE(review): `length_penalty` is documented for beam-based generation; with
# `num_beams=1` it presumably has no effect -- confirm and drop if so.
pprint(
    model.generate(
        "What is the difference between these two images?",
        images=[img1, img2],
        do_sample=True, 
        num_beams=1, 
        top_p=0.85, 
        temperature=1.5, 
        no_repeat_ngram_size=3, 
        length_penalty=-1.0,
        max_new_tokens=150
    )
)



('The key difference between these two images is the number of people being '
 'lifted. In the first image, the group of friends includes the man and a '
 'buddy, while in the second image, a man is hoisted over others with the '
 'trophy.')
