In [1]:
import os
from pathlib import Path
from byaldi import RAGMultiModalModel

# Optional: provide a Hugging Face token before the checkpoint is loaded.
# NOTE(review): presumably only needed when the download requires authentication — confirm.
# Prefer exporting HF_TOKEN in the shell over pasting a real token into the notebook.
# os.environ["HF_TOKEN"] = "YOUR_HF_TOKEN"

# Initialize RAGMultiModalModel from the "vidore/colpali" checkpoint.
# The resulting `model` object is what the indexing and search cells below operate on.
model = RAGMultiModalModel.from_pretrained("vidore/colpali")

  from .autonotebook import tqdm as notebook_tqdm


Verbosity is set to 1 (active). Pass verbose=0 to make quieter.


Downloading shards: 100%|██████████| 2/2 [00:00<00:00, 10.68it/s]
`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.12it/s]


In [2]:
# Let's get everyone's favourite paper in here
!wget https://arxiv.org/pdf/1706.03762
!mkdir docs
!mv 1706.03762 docs/attention.pdf
!cp -r docs/attention.pdf docs/attention_with_a_mustache.pdf

--2024-09-05 14:56:31--  https://arxiv.org/pdf/1706.03762
Resolving arxiv.org (arxiv.org)... 151.101.67.42, 151.101.3.42, 151.101.131.42, ...
Connecting to arxiv.org (arxiv.org)|151.101.67.42|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2215244 (2.1M) [application/pdf]
Saving to: ‘1706.03762’


2024-09-05 14:56:32 (32.9 MB/s) - ‘1706.03762’ saved [2215244/2215244]

mkdir: cannot create directory ‘docs’: File exists


In [3]:
# Index everything under docs/ and run a sanity-check query against the fresh index.
docs_dir = Path("docs/")
index_name = "attention_index"

# One metadata dict per directory entry, tagged with the entry's file name.
# NOTE(review): assumes the library walks docs/ in the same order os.listdir
# returns it, so that metadata[i] lines up with document i — confirm.
metadata = []
for file_name in os.listdir("docs"):
    metadata.append({"filename": file_name})

model.index(
    input_path=docs_dir,
    index_name=index_name,
    store_collection_with_index=False,
    metadata=metadata,
    overwrite=True,
)

# The BLEU tables sit on pages 8 and 9. Both the PDF and its evil mustached twin
# are indexed, so each relevant page should show up twice with similar scores.
query = "what's the BLEU score of this new strange method?"
results = model.search(query, k=5)

print(f"Search results for '{query}':")
for result in results:
    print(f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}")

print("Test completed successfully!")

overwrite is on. Deleting existing index attention_index to build a new one.
Indexing file: docs/attention.pdf
Added page 1 of document 0 to index.
Added page 2 of document 0 to index.
Added page 3 of document 0 to index.
Added page 4 of document 0 to index.
Added page 5 of document 0 to index.
Added page 6 of document 0 to index.
Added page 7 of document 0 to index.
Added page 8 of document 0 to index.
Added page 9 of document 0 to index.
Added page 10 of document 0 to index.
Added page 11 of document 0 to index.
Added page 12 of document 0 to index.
Added page 13 of document 0 to index.
Added page 14 of document 0 to index.
Added page 15 of document 0 to index.
Index exported to .byaldi/attention_index
Indexing file: docs/attention_with_a_mustache.pdf
Added page 1 of document 1 to index.
Added page 2 of document 1 to index.
Added page 3 of document 1 to index.
Added page 4 of document 1 to index.
Added page 5 of document 1 to index.
Added page 6 of document 1 to index.
Added page 7 o

In [4]:
%%timeit
# Time one top-3 search, reusing the `model` and `query` defined in the cells above.
model.search(query, k=3)

62.8 ms ± 1.17 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [2]:
# Let's load the index now, to ensure the results are still the same.
# NOTE(review): the import is repeated here, presumably so this cell can run on a
# fresh kernel without re-executing the indexing cells — confirm.
from byaldi import RAGMultiModalModel

# Rebinds `model` to an instance created from the saved "attention_index" index.
model = RAGMultiModalModel.from_index("attention_index")

Verbosity is set to 1 (active). Pass verbose=0 to make quieter.


Downloading shards: 100%|██████████| 2/2 [00:00<00:00, 17.65it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.12it/s]


In [5]:
# Run the same query against the reloaded index; `query` comes from the indexing cell.
results = model.search(query, k=5)

header = f"Search results for '{query}':"
print(header)
for result in results:
    line = f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}"
    print(line)

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Search results for 'what's the BLEU score of this new strange method?':
Doc ID: 0, Page: 1, Score: 19.875
Doc ID: 3, Page: 8, Score: 19.75
Doc ID: 4, Page: 8, Score: 19.75
Doc ID: 3, Page: 9, Score: 19.125
Doc ID: 4, Page: 9, Score: 19.125


## FILTER BASED ON METADATA

In [9]:
# Same query, restricted to documents whose metadata has this exact filename.
results = model.search(
    query,
    k=5,
    filter_metadata={"filename": "attention.pdf"},
)

# Show the doc_id -> metadata mapping held by the wrapped model, so the filter
# value can be checked against what is actually stored.
print("Metadata information: ", model.model.doc_id_to_metadata)
print(f"Search results for '{query}':")
for result in results:
    print(f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}")

Metadata information:  {0: {'filename': 'attention_table.png'}, 1: {'filename': 'product_c.png'}, 2: {'filename': 'financial_report.pdf'}, 3: {'filename': 'attention_with_a_mustache.pdf'}, 4: {'filename': 'attention.pdf'}}
Search results for 'what's the BLEU score of this new strange method?':
Doc ID: 4, Page: 8, Score: 19.75
Doc ID: 4, Page: 9, Score: 19.125
Doc ID: 4, Page: 1, Score: 17.125
Doc ID: 4, Page: 7, Score: 17.0
Doc ID: 4, Page: 11, Score: 16.75


In [5]:
# Same flow, but with the collection stored together with the index: simpler VLM
# integration, at the cost of memory/storage.
from pathlib import Path
from byaldi import RAGMultiModalModel

# Only the original paper goes into this index.
pdf_path = Path("docs/attention.pdf")
index_name = "attention_index_with_collection"

model = RAGMultiModalModel.from_pretrained("vidore/colpali")

# Keep the collection with the index so each search result carries a base64
# payload, which the loop below inspects.
model.index(
    input_path=pdf_path,
    index_name=index_name,
    store_collection_with_index=True,
    overwrite=True,
)

# Page 6 holds the answer to this query.
query = "How does the positional encoding thing work?"
results = model.search(query, k=3)

print(f"Search results for '{query}':")
base_64s = set()
for result in results:
    print(f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}")
    print(f"Base64: {result.base64[:50]}...")
    # Every returned page must bring a payload not seen on an earlier result.
    assert result.base64 not in base_64s
    print("Base64 is unique!")
    base_64s.add(result.base64)
print("Test completed successfully!")

Verbosity is set to 1 (active). Pass verbose=0 to make quieter.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

overwrite is on. Deleting existing index attention_index_with_collection to build a new one.
Added page 1 of document 0 to index.
Added page 2 of document 0 to index.
Added page 3 of document 0 to index.
Added page 4 of document 0 to index.
Added page 5 of document 0 to index.
Added page 6 of document 0 to index.
Added page 7 of document 0 to index.
Added page 8 of document 0 to index.
Added page 9 of document 0 to index.
Added page 10 of document 0 to index.
Added page 11 of document 0 to index.
Added page 12 of document 0 to index.
Added page 13 of document 0 to index.
Added page 14 of document 0 to index.
Added page 15 of document 0 to index.
Index exported to .byaldi/attention_index_with_collection
Index exported to .byaldi/attention_index_with_collection
Search results for 'How does the positional encoding thing work?':
Doc ID: 0, Page: 6, Score: 18.875
Base64: iVBORw0KGgoAAAANSUhEUgAABqQAAAiYCAIAAAA+NVHkAAEAAE...
Base64 is unique!
Doc ID: 0, Page: 3, Score: 18.625
Base64: iVBORw0

In [7]:
# Append one more document to the existing index. It happens to be the very same
# PDF, but nothing needs to tell the model that.
model.add_to_index(
    pdf_path,
    store_collection_with_index=True,
)

Added page 1 of document 1 to index.
Added page 2 of document 1 to index.
Added page 3 of document 1 to index.
Added page 4 of document 1 to index.
Added page 5 of document 1 to index.
Added page 6 of document 1 to index.
Added page 7 of document 1 to index.
Added page 8 of document 1 to index.
Added page 9 of document 1 to index.
Added page 10 of document 1 to index.
Added page 11 of document 1 to index.
Added page 12 of document 1 to index.
Added page 13 of document 1 to index.
Added page 14 of document 1 to index.
Added page 15 of document 1 to index.
Index exported to .byaldi/attention_index_with_collection


In [12]:
# Query again now that the same PDF has been added to the index a second time.
results = model.search(query, k=3)

print(f"Search results for '{query}':")
for result in results:
    summary = f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}"
    preview = f"Base64: {result.base64[:50]}..."
    print(summary)
    print(preview)

print("Test completed successfully!")

Search results for 'How does the positional encoding thing work?':
Doc ID: 1, Page: 6, Score: 18.875
Base64: iVBORw0KGgoAAAANSUhEUgAABqQAAAiYCAIAAAA+NVHkAAEAAE...
Doc ID: 0, Page: 6, Score: 18.875
Base64: iVBORw0KGgoAAAANSUhEUgAABqQAAAiYCAIAAAA+NVHkAAEAAE...
Doc ID: 0, Page: 3, Score: 18.625
Base64: iVBORw0KGgoAAAANSUhEUgAABqQAAAiYCAIAAAA+NVHkAAEAAE...
Test completed successfully!
