# Install Required Libraries


In [1]:
!pip install transformers==4.40.2 sentencepiece==0.1.99 decord==0.6.0

#SentencePiece is an unsupervised text tokenizer and detokenizer
#to provide smooth experiences

Collecting transformers==4.40.2
  Downloading transformers-4.40.2-py3-none-any.whl.metadata (137 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/138.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece==0.1.99
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting decord==0.6.0
  Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl.metadata (422 bytes)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.40.2)
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.40.2-py3-none-any.whl (9.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m88.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sentencepiece-0.1.99-cp310-cp310-manylin

# Define Weighted Mean Pooling
In this block, we define the `weighted_mean_pooling` function, which calculates `weighted mean pooling` on the model’s `hidden states`.


In [13]:
import torch

def weighted_mean_pooling(hidden, attention_mask):
    # Apply cumulative sum to the attention mask to compute weighted pooling
    attention_mask_ = attention_mask * attention_mask.cumsum(dim=1)

    # Compute the sum of hidden states weighted by attention and then normalize
    s = torch.sum(hidden * attention_mask_.unsqueeze(-1).float(), dim=1)
    d = attention_mask_.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

# Define the Encoding Function
Here, we define `encode`, a function that can handle both text and image inputs to generate embeddings.


In [3]:
import torch.nn.functional as F

@torch.no_grad()
def encode(text_or_image_list):
    # Check or if input is text image, then prepare inputs accordingly
    if isinstance(text_or_image_list[0], str):
        inputs = {
            "text": text_or_image_list,
            'image': [None] * len(text_or_image_list),
            'tokenizer': tokenizer
        }
    else:
        inputs = {
            "text": [''] * len(text_or_image_list),
            'image': text_or_image_list,
            'tokenizer': tokenizer
        }
    # Forward pass through the model
    outputs = model(**inputs)
    attention_mask = outputs.attention_mask
    hidden = outputs.last_hidden_state

    # Apply weighted mean pooling and normalize the result
    reps = weighted_mean_pooling(hidden, attention_mask)
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

# Load VisRAG-Ret
This block loads the `VisRAG-Ret` model and tokenizer from Hugging Face. We also specify `torch.float16` as the data type for compatibility with T4 GPUs.

In [4]:
# Load Model and Tokenizer
from transformers import AutoModel, AutoTokenizer

# Load the VisRAG-Ret model and tokenizer
model_name_or_path = "openbmb/VisRAG-Ret"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
# Since the T4 GPU doesn't support torch.bfloat16, we use torch.float16 instead.
model = AutoModel.from_pretrained(model_name_or_path, torch_dtype=torch.float16, trust_remote_code=True).cuda()
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/3.38k [00:00<?, ?B/s]

tokenizer.py:   0%|          | 0.00/983 [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/openbmb/VisRAG-Ret:
- tokenizer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer.model:   0%|          | 0.00/1.99M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/765 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/6.20M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

configuration_minicpm.py:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/openbmb/VisRAG-Ret:
- configuration_minicpm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_visrag_ret.py:   0%|          | 0.00/4.61k [00:00<?, ?B/s]

modeling_minicpm.py:   0%|          | 0.00/71.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/openbmb/VisRAG-Ret:
- modeling_minicpm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_minicpmv.py:   0%|          | 0.00/21.0k [00:00<?, ?B/s]



resampler.py:   0%|          | 0.00/5.61k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/openbmb/VisRAG-Ret:
- resampler.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/openbmb/VisRAG-Ret:
- modeling_minicpmv.py
- resampler.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/openbmb/VisRAG-Ret:
- modeling_visrag_ret.py
- modeling_minicpm.py
- modeling_minicpmv.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

VisRAG_Ret(
  (llm): MiniCPMForCausalLM(
    (model): MiniCPMModel(
      (embed_tokens): Embedding(122753, 2304)
      (layers): ModuleList(
        (0-39): 40 x MiniCPMDecoderLayer(
          (self_attn): MiniCPMSdpaAttention(
            (q_proj): Linear(in_features=2304, out_features=2304, bias=False)
            (k_proj): Linear(in_features=2304, out_features=2304, bias=False)
            (v_proj): Linear(in_features=2304, out_features=2304, bias=False)
            (o_proj): Linear(in_features=2304, out_features=2304, bias=False)
            (rotary_emb): MiniCPMRotaryEmbedding()
          )
          (mlp): MiniCPMMLP(
            (gate_proj): Linear(in_features=2304, out_features=5760, bias=False)
            (up_proj): Linear(in_features=2304, out_features=5760, bias=False)
            (down_proj): Linear(in_features=5760, out_features=2304, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): MiniCPMRMSNorm()
          (post_attention_layernorm): M

# Prepare Input Query and Download Test Images
This block defines sample queries and downloads test images for evaluating the model's capability to match queries with relevant images.


In [5]:
from PIL import Image
import requests
from io import BytesIO

# Define sample query
queries = ["What does a dog look like?"]
INSTRUCTION = "Represent this query for retrieving relevant documents: "
queries = [INSTRUCTION + query for query in queries]

# Download sample images
print("Downloading images...")
passages = [
    Image.open(BytesIO(requests.get(
        'https://github.com/OpenBMB/VisRAG/raw/refs/heads/master/scripts/demo/retriever/test_image/cat.jpeg'
    ).content)).convert('RGB'),
    Image.open(BytesIO(requests.get(
        'https://github.com/OpenBMB/VisRAG/raw/refs/heads/master/scripts/demo/retriever/test_image/dog.jpg'
    ).content)).convert('RGB')
]
print("Images downloaded.")


Downloading images...
Images downloaded.


# Compute Embeddings and Calculate Similarity Scores
In this section, we encode the queries and images, then compute similarity scores between the query embedding and each image embedding.


In [6]:
# Encode the queries and images to get embeddings
embeddings_query = encode(queries)
embeddings_doc = encode(passages)

# Calculate similarity scores
scores = (embeddings_query @ embeddings_doc.T)
print("Similarity scores:", scores.tolist())  # [[0.25753140449523926, 0.3385779857635498]], higher score for the dog image


Similarity scores: [[0.2575945258140564, 0.3385988473892212]]


# Use VisRAG-Gen for Generation with Image
Finally, we use the `MiniCPM-V-2` model to generate a response based on the image that best matches the query.


In [7]:
# Load VisRAG-Gen model and tokenizer for generation
# Since the T4 GPU doesn't support torch.bfloat16, we use torch.float16 instead.
model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True, torch_dtype=torch.float16).to(device='cuda', dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True)
model.eval()

# Choose the best matching image (dog) based on similarity scores: [[0.25753140449523926, 0.3385779857635498]]


config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

configuration_minicpm.py:   0%|          | 0.00/10.9k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM-V-2:
- configuration_minicpm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_minicpmv.py:   0%|          | 0.00/20.4k [00:00<?, ?B/s]

modeling_minicpm.py:   0%|          | 0.00/71.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM-V-2:
- modeling_minicpm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


resampler.py:   0%|          | 0.00/36.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM-V-2:
- resampler.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM-V-2:
- modeling_minicpmv.py
- modeling_minicpm.py
- resampler.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.37k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/1.99M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/6.20M [00:00<?, ?B/s]

IndexError: list index out of range

In [9]:
image = passages[0]  # The image representing a dog
msgs = [{'role': 'user', 'content': queries[0]}]

# Generate response based on the query and image
res, context, _ = model.chat(
    image=image,
    msgs=msgs,
    context=None,
    tokenizer=tokenizer,
    sampling=True,
    temperature=0.7
)
print("Generated response:", res)

Generated response: The image features a grey tabby cat with large, brown eyes and long whiskers. The background is plain white which accentuates the feline's appearance prominently in this portrait-style photograph.
