In [3]:
%pip install -U transformers==4.32.0 \
             datasets==2.14.4 \
             diffusers==0.20.0 \
             accelerate==0.21.0 \
             torch==2.0.1 \
             torchvision==0.15.2 \
             sentencepiece==0.1.99

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting transformers==4.32.0
  Obtaining dependency information for transformers==4.32.0 from https://files.pythonhosted.org/packages/ae/95/283a1c004430bd2a9425d6937fc545dd49a4e4592feb76be0299a14e2378/transformers-4.32.0-py3-none-any.whl.metadata
  Downloading transformers-4.32.0-py3-none-any.whl.metadata (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.5/118.5 kB[0m [31m78.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets==2.14.4
  Obtaining dependency information for datasets==2.14.4 from https://files.pythonhosted.org/packages/66/f8/38298237d18d4b6a8ee5dfe390e97bed5adb8e01ec6f9680c0ddf3066728/datasets-2.14.4-py3-none-any.whl.metadata
  Downloading datasets-2.14.4-py3-none-any.whl.metadata (19 kB)
Collecting sentencepiece==0.1.99
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# TODO:  Read and summarize this: 

https://huggingface.co/docs/transformers/main/en/model_doc/idefics#transformers.IdeficsImageProcessor

and

https://arxiv.org/pdf/2306.16527.pdf


## Highlights
Based on Flamingo, trained on open dataset.

## Dataset
We introduce the OBELICS dataset, an open web-scale filtered dataset of interleaved image-text documents comprising 141 million web pages extracted from Common Crawl, 353 million associated images, and 115 billion text tokens. We describe the dataset creation process, present comprehensive filtering rules, and provide an analysis of the dataset’s content.

## Model 
We train an 80 billion parameters vision and language model on the dataset and obtain competitive performance on various multimodal benchmarks. We release the code to reproduce the dataset along with the dataset itself.

In [4]:
import torch
from transformers import IdeficsForVisionText2Text, AutoProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"

model_name = "HuggingFaceM4/idefics-9b-instruct"
model = IdeficsForVisionText2Text.from_pretrained(model_name, torch_dtype=torch.bfloat16).to(device)
processor = AutoProcessor.from_pretrained(model_name)

# Generation args
exit_condition = processor.tokenizer("<end_of_utterance>", add_special_tokens=False).input_ids
bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# Zero-shot inference

![](https://hips.hearstapps.com/hmg-prod/images/dog-puns-1581708208.jpg)

In [69]:
url = "https://hips.hearstapps.com/hmg-prod/images/dog-puns-1581708208.jpg"
img = processor.image_processor.fetch_images([url])[0]

prompts = [
    "\nUser:",
    img,
    "Describe this image.\nAssistant: ",
]

inputs = processor(prompts, return_tensors="pt", debug=True).to(device)

generated_ids = model.generate(**inputs, eos_token_id=exit_condition, bad_words_ids=bad_words_ids, max_length=100)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
generated_text

full_text='<s>\nUser:<fake_token_around_image><image><fake_token_around_image>Describe this image.\nAssistant:'


"<s> \nUser:<fake_token_around_image><image><fake_token_around_image> Describe this image.\nAssistant: The image features a small dog wearing a pair of black glasses, giving it a unique and adorable appearance. The dog is positioned in the center of the frame, and its glasses cover a significant portion of its face. The dog appears to be looking directly at the camera, capturing the viewer's attention. The background is relatively simple, with no other objects or elements distract"

# One-shot Inference to guide the description using a complete example

![](https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg)
![](https://hips.hearstapps.com/hmg-prod/images/dog-puns-1581708208.jpg)

In [14]:
url = "https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg"
img = processor.image_processor.fetch_images([url])[0]

# Either use img or url
prompts = [
    "User:",
    img,
    "Describe this image."
    "Assistant: An image of two kittens in grass."
    "User:",
    "https://hips.hearstapps.com/hmg-prod/images/dog-puns-1581708208.jpg",
    "Describe this image.",
    "Assistant: "
]

inputs = processor(prompts, return_tensors="pt", debug=True).to(device)

generated_ids = model.generate(**inputs, eos_token_id=exit_condition, bad_words_ids=bad_words_ids, max_length=100)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
generated_text

full_text='<s>User:<fake_token_around_image><image><fake_token_around_image>Describe this image.Assistant: An image of two kittens in grass.User:<fake_token_around_image><image><fake_token_around_image>Describe this image.<end_of_utterance>Assistant:'


'User: Describe this image.Assistant: An image of two kittens in grass.User: Describe this image. Assistant: A dog wearing glasses and a tank top.'

# Show special characters injected around the images

In [6]:
generated_ids = model.generate(**inputs, eos_token_id=exit_condition, bad_words_ids=bad_words_ids, max_length=100)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
generated_text

'<s> User:<fake_token_around_image><image><fake_token_around_image> Describe this image.Assistant: An image of two kittens in grass.User:<fake_token_around_image><image><fake_token_around_image> Describe this image.<end_of_utterance> Assistant: A dog wearing glasses and a tank top.<end_of_utterance>'

# Ask Questions About Text in the Image

![](img/movie-premiere.png)

In [16]:
# from PIL import Image
# #url = "https://dsoaws.s3.amazonaws.com/gaia/11_multimodal/img/movie-premiere.png"
# #img = processor.image_processor.fetch_images([url])[0]
# img = Image.open("img/movie-premiere.png") 

# prompts = [
#     "User: ",
#     #img,
#     "img/movie-premiere.png",
#     "Describe this image.",
#     "Assistant: ",
# ]

# inputs = processor(prompts, return_tensors="pt", debug=True).to(device)

# generated_ids = model.generate(**inputs, eos_token_id=exit_condition, bad_words_ids=bad_words_ids, max_length=100)
# generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
# generated_text

full_text='<s>User:<end_of_utterance>img/movie-premiere.png<end_of_utterance>Describe this image.<end_of_utterance>Assistant:'


"<s> User:<end_of_utterance> img/movie-premiere.png<end_of_utterance> Describe this image.<end_of_utterance> Assistant: I'm sorry, but I cannot describe this image as I am a text-based program and I do not have the capability to view images.<end_of_utterance>"

In [25]:
#url = "https://dsoaws.s3.amazonaws.com/gaia/11_multimodal/img/movie-premiere.png"
#url = "img/car-duck.png"
#img = processor.image_processor.fetch_images([url])[0]

img = Image.open("img/movie-premiere.png") 

prompts = [
    "User: ",
    img,
    "When does this movie premiere.",
    "Assistant: ",
]

inputs = processor(prompts, return_tensors="pt").to(device)

generated_ids = model.generate(**inputs, max_length=100) # eos_token_id=exit_condition, bad_words_ids=bad_words_ids, max_length=100)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_text)

User: When does this movie premiere. Assistant: The movie premieres on June 24.


![](img/happy-car-chris.png)

In [36]:
from PIL import Image
img = Image.open("img/happy-car-chris.png") 

prompts = [
    "User: ",
    img,
    "Describe this image.",
    "Assistant: ",
]

inputs = processor(prompts, return_tensors="pt", debug=True).to(device)

generated_ids = model.generate(**inputs, eos_token_id=exit_condition, bad_words_ids=bad_words_ids, max_length=100)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
generated_text

full_text='<s>User:<fake_token_around_image><image><fake_token_around_image>Describe this image.<end_of_utterance>Assistant:'


'<s> User:<fake_token_around_image><image><fake_token_around_image> Describe this image.<end_of_utterance> Assistant: A man is sitting on the hood of a white sports car.<end_of_utterance>'

In [40]:
from PIL import Image
img = Image.open("img/happy-car-chris.png") 

prompts = [
    "User: ",
    img,
    "Who makes this car?",
    "Assistant: ",
]

inputs = processor(prompts, return_tensors="pt", debug=True).to(device)

generated_ids = model.generate(**inputs, eos_token_id=exit_condition, bad_words_ids=bad_words_ids, max_length=100)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
generated_text

full_text='<s>User:<fake_token_around_image><image><fake_token_around_image>Who makes this car?<end_of_utterance>Assistant:'


'<s> User:<fake_token_around_image><image><fake_token_around_image> Who makes this car?<end_of_utterance> Assistant: The car is made by Porsche.<end_of_utterance>'

![](img/baby-groot-toy.jpg)

In [32]:
img = Image.open("img/baby-groot-toy.jpg") 

prompts = [
    "User: ",
    img,
    "Which movie is this character from?",
    "Assistant: ",
]

inputs = processor(prompts, return_tensors="pt").to(device)

generated_ids = model.generate(**inputs, max_length=100) # eos_token_id=exit_condition, bad_words_ids=bad_words_ids)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_text)

User: Which movie is this character from? Assistant: This character is from the movie Guardians of the Galaxy.


# Chain of thought

![](img/baby-groot-toy.jpg)

In [35]:
# This image is from https://www.amazon.com/Hot-Toys-Marvel-Guardians-Life-Size/dp/B07257N92P
img = Image.open("img/baby-groot-toy.jpg") 

prompts = [
    "User: ",
    img,
    "Who produced the movie that features this character?",
    "Assistant: ",
]

inputs = processor(prompts, return_tensors="pt").to(device)

generated_ids = model.generate(**inputs, max_length=100) #, eos_token_id=exit_condition, bad_words_ids=bad_words_ids)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_text)

User: Who produced the movie that features this character? Assistant: Marvel Studios produced the movie that features this character.


![](img/margherita-pizza.jpg)

In [None]:
# This image is from https://eu.ooni.com/blogs/recipes/margherita-pizza

img = Image.open("img/margherita-pizza.jpg") 

prompts = [
    "User: ",
    img,
    "What is this and how do I make this?",
    "Assistant: ",
]

inputs = processor(prompts, return_tensors="pt").to(device)

generated_ids = model.generate(**inputs, max_length=1000, eos_token_id=exit_condition, bad_words_ids=bad_words_ids)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_text)