# Sherpa Digital Brain Technical Task

## Setup Instructions

*Git instructions*

## Task

*Task instructions*


In [None]:
import os

# Fetch the task repository so its additional folders are available to this notebook.
# NOTE(review): the /content prefix is hardcoded — presumably the Colab workspace root;
# confirm before running this cell in any other environment.
REPO_URL = "https://github.com/Charter-AI/sherpa-digital-brain-technical-task.git"
REPO_NAME = "sherpa-digital-brain-technical-task"
REPO_PATH = f"/content/{REPO_NAME}"

# Clone only if the repo folder doesn't already exist; otherwise update the existing checkout.
# Both branches finish with the working directory set to REPO_PATH.
if not os.path.exists(REPO_PATH):
    print(f"Cloning {REPO_URL} into {REPO_PATH}")
    # `git clone` creates the checkout inside the current directory, so move to /content first
    os.chdir('/content')
    !git clone $REPO_URL
    # NOTE(review): if the clone fails, REPO_PATH will not exist and this chdir raises
    os.chdir(REPO_PATH)
else:
    print(f"Repo '{REPO_NAME}' already exists at {REPO_PATH}. \nSkipping clone and pulling instead...")
    # `git pull` operates on the repository in the current directory
    os.chdir(REPO_PATH)
    !git pull

# Confirm it worked: show where we ended up and what is there
print("Current directory:", os.getcwd())
print("Files in current directory:", os.listdir('.'))

In [1]:
# Install the notebook's dependencies into the kernel's environment.
# NOTE(review): none of these installs pin a version, so a re-run may resolve different releases.
%pip install numpy
%pip install torch

# System package install. apt-get needs root: run as an unprivileged user it cannot
# acquire the dpkg frontend lock and the install fails.
# NOTE(review): poppler-utils is presumably needed to render PDF pages during indexing — confirm.
!apt-get install -y poppler-utils

# NOTE(review): flash-attn is installed after torch above — presumably it needs torch present
# at install time; TODO confirm.
%pip install byaldi
%pip install docx2pdf
%pip install flash-attn

ERROR! Session/line number was not unique in database. History logging moved to new session 36

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
E: Could not open lock file /var/lib/dpkg/lock-frontend - open (13: Permission denied)
E: Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), are you root?
Collecting byaldi
  Obtaining dependency information for byaldi from https://files.pythonho

In [None]:
# Import dependencies
import numpy as np
import os
import sys
import torch
from byaldi import RAGMultiModalModel
import docx2pdf



In [None]:
# Treat the parent of the current working directory as the project root and make it
# importable, so that other files/folders of the project can be reached from this notebook.
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, os.pardir))

# Register the root only once so that re-running this cell does not add duplicates
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

In [None]:
# Byaldi functions
# Index name used by the helpers below whenever the caller does not pass `index_name`
default_index = "default_index"

def initialise_rag_model(index_name: str = default_index) -> RAGMultiModalModel:
    """
    Initialises the RAG model, using `<project_root>/.byaldi` as the index root.

    If an index named `index_name` already exists under that root, the model is
    loaded from it. Otherwise the generic `vidore/colqwen2-v1.0` vision model is
    loaded; no index is created here, documents still have to be indexed afterwards.

    Parameters:
    - `index_name`: Name of the index folder to look for under the index root

    Returns the loaded `RAGMultiModalModel`.
    """
    # Use CUDA if available, otherwise use CPU
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Give byaldi the same index root that the existence check below inspects.
    # byaldi's default root is the relative ".byaldi" (resolved against the current
    # working directory), which would not match the project-root based check.
    index_root = os.path.join(project_root, ".byaldi")
    index_path = os.path.join(index_root, index_name)

    if os.path.isdir(index_path):
        # byaldi's loader for an existing index is `from_index`
        return RAGMultiModalModel.from_index(
            index_name,
            index_root=index_root,
            device=device
        )

    return RAGMultiModalModel.from_pretrained(
        "vidore/colqwen2-v1.0",
        index_root=index_root,
        device=device
    )
    
def index_documents(rag_model: RAGMultiModalModel, index_name: str = default_index, docs_folder_name: str = "documents"):
    """
    Indexes the documents in `<project_root>/<docs_folder_name>` for the RAG model.

    The index is saved as `index_name` under the index root that `rag_model` was
    created with. If an index with that name already exists, it will be overwritten.

    Parameters:
    - `rag_model`: Model used to build the index
    - `index_name`: Name under which the index is stored
    - `docs_folder_name`: Folder, relative to the project root, holding the documents

    Raises `FileNotFoundError` if the documents folder does not exist.
    """
    docs_path = os.path.join(project_root, docs_folder_name)

    # Fail early with a clear message instead of letting the indexer hit a missing path
    if not os.path.exists(docs_path):
        raise FileNotFoundError(f"Documents folder {docs_path} does not exist")

    rag_model.index(
        input_path=docs_path,
        index_name=index_name,
        store_collection_with_index=True,    # Store base64 encodings of documents with index
        max_image_height=2048,
        max_image_width=2048,
        overwrite=True
    )

def query_rag_model(rag_model: RAGMultiModalModel, query: str, k: int = 5):
    """
    Queries the RAG model with `query` and returns the `k` most relevant pages.

    Parameters:
    - `rag_model`: Model with a loaded or freshly built index
    - `query`: Text to search for
    - `k`: Maximum number of results to return

    Returns a list of dictionaries, each containing the following keys:
    - `doc_id`: ID of document
    - `page_num`: Page number of image within document
    - `score`: Score of document
    - `metadata`: Metadata attached to image
    - `base64`: Base64 encoded image (presumably only populated when the index was
      built with `store_collection_with_index=True` — confirm)
    """
    # byaldi exposes retrieval as `search`; the model has no `query` method
    results = rag_model.search(query, k=k)

    # Copy each result's attributes so callers can edit the dictionaries freely
    # without mutating the underlying result objects
    return [dict(vars(result)) for result in results]
    