### Install dependencies....

In [1]:
!pwd

/content


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -r requirements_01.txt

Collecting boto3 (from -r requirements.txt (line 2))
  Downloading boto3-1.39.14-py3-none-any.whl.metadata (6.7 kB)
Collecting PyMuPDF>=1.24.0 (from -r requirements.txt (line 6))
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting tabula-py>=2.9.0 (from -r requirements.txt (line 7))
  Downloading tabula_py-2.10.0-py3-none-any.whl.metadata (7.6 kB)
Collecting faiss-cpu>=1.8.0 (from -r requirements.txt (line 10))
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting python-dotenv>=1.0.0 (from -r requirements.txt (line 15))
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting botocore<1.40.0,>=1.39.14 (from boto3->-r requirements.txt (line 2))
  Downloading botocore-1.39.14-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3->-r requirements.txt (line 2))
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Coll

In [6]:
import boto3
import tabula
import faiss
import json
import base64
import pymupdf
import requests
import os
import logging
import numpy as np
import warnings
from tqdm import tqdm
from botocore.exceptions import ClientError
from langchain_text_splitters import RecursiveCharacterTextSplitter
from IPython import display


logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)

warnings.filterwarnings("ignore")

### Data Loading

In [8]:
url = "https://jmedicalcasereports.biomedcentral.com/counter/pdf/10.1186/s13256-025-05102-8.pdf?_gl=1*1pvoysr*_up*MQ..*_gs*MQ..&gclid=CjwKCAjw1ozEBhAdEiwAn9qbzX0vXIRxOZdvBmvUB-BQ1Ahy6bT3qPQX7qmweskDHcsjI6B0J9w-hxoCBNcQAvD_BwE&gbraid=0AAAAApIOJzq7IKzPIUG82kbMm9k0x1z3S"

# Set the filename and filepath
filename = "Tuberculosis of the elbow joint.pdf"
filepath = os.path.join("data", filename)

# Create the data directory if it doesn't exist
os.makedirs("data", exist_ok=True)

# Download the file
response = requests.get(url)
if response.status_code == 200:
    with open(filepath, 'wb') as file:
        file.write(response.content)
    print(f"File downloaded successfully: {filepath}")
else:
    print(f"Failed to download the file. Status code: {response.status_code}")

File downloaded successfully: data/Tuberculosis of the elbow joint.pdf


In [9]:
# Display the PDF file
# display.IFrame(filepath, width=1000, height=600)

### Data Extraction

In [10]:
# Create the directories
def create_directories(base_dir):
    directories = ["images", "text", "tables", "page_images"]
    for dir in directories:
        os.makedirs(os.path.join(base_dir, dir), exist_ok=True)

# Process tables
def process_tables(doc, page_num, base_dir, items):
    try:
        tables = tabula.read_pdf(filepath, pages=page_num + 1, multiple_tables=True)
        if not tables:
            return
        for table_idx, table in enumerate(tables):
            table_text = "\n".join([" | ".join(map(str, row)) for row in table.values])
            table_file_name = f"{base_dir}/tables/{os.path.basename(filepath)}_table_{page_num}_{table_idx}.txt"
            with open(table_file_name, 'w') as f:
                f.write(table_text)
            items.append({"page": page_num, "type": "table", "text": table_text, "path": table_file_name})
    except Exception as e:
        print(f"Error extracting tables from page {page_num}: {str(e)}")

# Process text chunks
def process_text_chunks(text, text_splitter, page_num, base_dir, items):
    chunks = text_splitter.split_text(text)
    for i, chunk in enumerate(chunks):
        text_file_name = f"{base_dir}/text/{os.path.basename(filepath)}_text_{page_num}_{i}.txt"
        with open(text_file_name, 'w') as f:
            f.write(chunk)
        items.append({"page": page_num, "type": "text", "text": chunk, "path": text_file_name})


def process_images(page, page_num, base_dir, items):
    images = page.get_images()
    for idx, image in enumerate(images):
        xref = image[0]
        pix = pymupdf.Pixmap(doc, xref)
        image_name = f"{base_dir}/images/{os.path.basename(filepath)}_image_{page_num}_{idx}_{xref}.png"
        pix.save(image_name)
        with open(image_name, 'rb') as f:
            encoded_image = base64.b64encode(f.read()).decode('utf8')
        items.append({"page": page_num, "type": "image", "path": image_name, "image": encoded_image})

# Process page images
def process_page_images(page, page_num, base_dir, items):
    pix = page.get_pixmap()
    page_path = os.path.join(base_dir, f"page_images/page_{page_num:03d}.png")
    pix.save(page_path)
    with open(page_path, 'rb') as f:
        page_image = base64.b64encode(f.read()).decode('utf8')
    items.append({"page": page_num, "type": "page", "path": page_path, "image": page_image})

In [12]:
doc = pymupdf.open(filepath)
num_pages = len(doc)
base_dir = "data"

# Creating the directories
create_directories(base_dir)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=200, length_function=len)
items = []

# Process each page of the PDF
for page_num in tqdm(range(num_pages), desc="Processing PDF pages"):
    page = doc[page_num]
    text = page.get_text()
    process_tables(doc, page_num, base_dir, items)
    process_text_chunks(text, text_splitter, page_num, base_dir, items)
    process_images(page, page_num, base_dir, items)
    process_page_images(page, page_num, base_dir, items)


Processing PDF pages: 100%|██████████| 9/9 [00:30<00:00,  3.36s/it]


In [13]:
# Looking at the first text item
[i for i in items if i['type'] == 'text'][0]

{'page': 0,
 'type': 'text',
 'text': 'Manske\xa0et\xa0al. Journal of Medical Case Reports           (2025) 19:88  \nhttps://doi.org/10.1186/s13256-025-05102-8\nCASE REPORT\nOpen Access\n© The Author(s) 2025. Open Access This article is licensed under a Creative Commons Attribution 4.0 International License, which \npermits use, sharing, adaptation, distribution and reproduction in any medium or format, as long as you give appropriate credit to the \noriginal author(s) and the source, provide a link to the Creative Commons licence, and indicate if changes were made. The images or \nother third party material in this article are included in the article’s Creative Commons licence, unless indicated otherwise in a credit line',
 'path': 'data/text/Tuberculosis of the elbow joint.pdf_text_0_0.txt'}

In [14]:
# Looking at the first table item
[i for i in items if i['type'] == 'table'][0]

{'page': 3,
 'type': 'table',
 'text': 'tively, only one was found to be populated with staphylo- | joint effusion. Owing to the extremely slow growth of the\ncoccus epidermidis (1/3). In the first sample collection, no | bacteria and the extensive resistance testing, we did not\nmycobacteria were detected. Following the recommen- | receive the final resistogram (Fig. 5) until 4 months after\ndation of our colleagues from the department of infecti- | initial presentation.\nology, we did not start tuberculostatic therapy owing to | Thereafter, an interdisciplinary case discussion took\nmissing results from the histopathological and microbio- | place. The idea of another surgical treatment to reduce\nlogical examinations. The following days were unevent- | the bacterial load was depraved. In consideration of the\nful. The inserted drainage was removed when the exudate | finally available resistogram and a joint effusion that was\nregressed. The inflammatory parameters normalized. The | n

In [15]:
# Looking at the first image item
[i for i in items if i['type'] == 'image'][0]

{'page': 0,
 'type': 'image',
 'path': 'data/images/Tuberculosis of the elbow joint.pdf_image_0_0_505.png',
 'image': 'iVBORw0KGgoAAAANSUhEUgAAAHMAAAB3CAIAAAC/sOinAAAACXBIWXMAAA7EAAAOxAGVKw4bAAAPLklEQVR4nO2dC1fiSBCF5///HmbWF6CAiMpDFFQkyFNBEVRAQUd393ZXJ+mEDi8hBrCOe87s7O45k29vV1Vfisqv/35iMfFrhv/mXynm/gdamZiULCB+fnx8/LXEBw/8vsz68/NzoX/iZYmJyILX+/t7r9d7enp65IFfdDsd/E6/3x8MBvingE6IQfZHy/9NQhaYwK7VapVLpfxV/iqHv/LXhetKuXx7c9tsNkG512WIBV9O9gfueLI49WAHjmenZ4lEIh6PJ5PJ9En64vxcy2v4/UajAe7dTrf/+vr2NmD54ScLjyXLBNvvN+qNy4vLo8PD/UgkEg7v70diB7FkIgHWudwVtFy/vX14eHh+fn59ef1RLsUYskidEGO5VE6nT4F1b28vGAjuBoOhvb1oNBo/Pj5Nn+Yuc6ViCZmB4HLlvjG4663cMWSRCtqtlqZpyABQK7D6d3bwEwgEQqEQwWXKvcwx5dbrgEtpgSl3vdPCKLJUu+4aSAUXx8fH0Knf799isbm9vYVfQ8L7+/vHTLlpXbk3RlqAcv+ucVoYRRapAH1VrVbLnGUOY4fgCLVubm5ubGzgLwAmuAfR6LGUFki5P2lhFFneFTzjmINaLBbbDe7u7GyD7D88NnS4Rs49Oz010kKLpYXOOhc0R7JGG4vWNX1ygmYAhWt7extAgfXPn98yXCg3ak8Lt/a0sGbKdSQLlb2+vjbvm1o+n0qlYgcHMtnfPGzKpbQwVNA6Ii2sWUFzJAsQvW4XdHDpSqWSOtktoPwthU25KGjWVkwUtJeXl3UraGqyeHJKs

### Generating Multimodal Embeddings

In [16]:
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [17]:
def get_image_embedding(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        return model.get_image_features(**inputs).cpu()

def get_text_embedding(text_str):
    inputs = processor(text=[text_str], return_tensors="pt", truncation=True).to(device)
    with torch.no_grad():
        return model.get_text_features(**inputs).cpu()


In [18]:
image_dir = "./data/images"
text_dir = "./data/text"
table_dir = "./data/tables"

image_embeddings = {}
text_embeddings = {}
table_embeddings = {}

# Process images
for file in tqdm(os.listdir(image_dir), desc="Images"):
    if file.endswith(".png"):
        path = os.path.join(image_dir, file)
        embedding = get_image_embedding(path)
        image_embeddings[file] = embedding

# Process regular texts
for file in tqdm(os.listdir(text_dir), desc="Text Files"):
    if file.endswith(".txt"):
        path = os.path.join(text_dir, file)
        with open(path, "r", encoding="utf-8") as f:
            content = f.read()
        embedding = get_text_embedding(content)
        text_embeddings[file] = embedding

# Process tables (assuming plain-text tables like CSV-style rows)
for file in tqdm(os.listdir(table_dir), desc="Tables"):
    if file.endswith(".txt"):
        path = os.path.join(table_dir, file)
        with open(path, "r", encoding="utf-8") as f:
            content = f.read()
        embedding = get_text_embedding(content)
        table_embeddings[file] = embedding



Images:   0%|          | 0/8 [00:00<?, ?it/s][A
Images:  12%|█▎        | 1/8 [00:01<00:07,  1.04s/it][A
Images:  38%|███▊      | 3/8 [00:01<00:01,  3.18it/s][A
Images:  62%|██████▎   | 5/8 [00:01<00:00,  3.40it/s][A
Images:  75%|███████▌  | 6/8 [00:01<00:00,  3.64it/s][A
Images: 100%|██████████| 8/8 [00:02<00:00,  3.79it/s]

Text Files:   0%|          | 0/72 [00:00<?, ?it/s][A
Text Files:   1%|▏         | 1/72 [00:00<00:21,  3.23it/s][A
Text Files:   3%|▎         | 2/72 [00:00<00:21,  3.18it/s][A
Text Files:   4%|▍         | 3/72 [00:00<00:17,  3.95it/s][A
Text Files:   7%|▋         | 5/72 [00:00<00:09,  7.12it/s][A
Text Files:  11%|█         | 8/72 [00:01<00:05, 11.81it/s][A
Text Files:  17%|█▋        | 12/72 [00:01<00:03, 15.59it/s][A
Text Files:  19%|█▉        | 14/72 [00:01<00:03, 14.93it/s][A
Text Files:  22%|██▏       | 16/72 [00:01<00:05, 11.16it/s][A
Text Files:  29%|██▉       | 21/72 [00:01<00:02, 17.89it/s][A
Text Files:  36%|███▌      | 26/72 [00:01<00:01, 24

In [19]:
# Check one embedding from each dictionary (if not empty)

if image_embeddings:
    first_image_embedding = next(iter(image_embeddings.values()))
    print("Image embedding vector size:", first_image_embedding.shape)

if text_embeddings:
    first_text_embedding = next(iter(text_embeddings.values()))
    print("Text embedding vector size:", first_text_embedding.shape)

if table_embeddings:
    first_table_embedding = next(iter(table_embeddings.values()))
    print("Table embedding vector size:", first_table_embedding.shape)


Image embedding vector size: torch.Size([1, 512])
Text embedding vector size: torch.Size([1, 512])
Table embedding vector size: torch.Size([1, 512])


### Creating VectorDB

In [20]:
import torch
import numpy as np
import faiss

# Helper function to convert dict of tensors to list of numpy arrays
def convert_embeddings_to_numpy(embedding_dict):
    return [embedding.squeeze(0).cpu().numpy() for embedding in embedding_dict.values()]

# Convert and merge all embeddings
image_vecs = convert_embeddings_to_numpy(image_embeddings)
text_vecs = convert_embeddings_to_numpy(text_embeddings)
table_vecs = convert_embeddings_to_numpy(table_embeddings)

# Combine all into a single array
all_embeddings_np = np.array(image_vecs + text_vecs + table_vecs, dtype=np.float32)

# Confirm shape
print("Final stacked shape:", all_embeddings_np.shape)  # Should be (N, 512)

# Create and populate FAISS index
index = faiss.IndexFlatL2(512)
index.add(all_embeddings_np)

print("FAISS index created with", index.ntotal, "vectors.")


Final stacked shape: (86, 512)
FAISS index created with 86 vectors.


In [23]:
import google.generativeai as genai
from PIL import Image
import io

import os
from google.colab import userdata

api_key = userdata.get("GOOGLE_API_KEY")
if not api_key:
    raise ValueError("API key not found in Colab secrets. Please set GOOGLE_API_KEY.")
genai.configure(api_key=api_key)

def invoke_gemini_multimodal(prompt, matched_items):
    """
    Use Gemini 1.5 Flash for multimodal RAG.
    Accepts a prompt and a list of matched_items with type: 'text', 'table', or 'image'.
    """

    # Prepare content for Gemini
    gemini_content = []

    # Add the retrieved content (text or image)
    for item in matched_items:
        if item['type'] in ['text', 'table']:
            gemini_content.append(item['text'])
        elif item['type'] == 'image':
            image_data = item['image']  # bytes
            image = Image.open(io.BytesIO(image_data))
            gemini_content.append(image)

    # Append the user query
    gemini_content.append(prompt)

    # Create multimodal model instance
    model = genai.GenerativeModel("gemini-1.5-flash")

    # Generate response
    response = model.generate_content(gemini_content, generation_config={
        "temperature": 0.7,
        "top_p": 0.95,
        "max_output_tokens": 512,
    })

    return response.text


### Test The RAG Pipeline

In [24]:
# User Query
query = "How arthritis and elbow joint pain is related to tuberculosis?"

# Generate embeddings for the query using CLIP (text input)
query_embedding = get_text_embedding(query)  # This returns shape (1, 512)

# Ensure it's in the correct format for FAISS (float32 and reshaped)
query_embedding_np = np.array(query_embedding, dtype=np.float32).reshape(1, -1)

# Search for the nearest neighbors in the vector database
distances, result = index.search(query_embedding_np, k=5)


In [25]:
# Check the result (matched chunks)
result.flatten()

array([42, 54, 14, 10, 73])

In [26]:
# Retrieve the matched items
matched_items = [{k: v for k, v in items[index].items() if k != 'embedding'} for index in result.flatten()]

# Generate RAG response with Amazon Nova
response = invoke_gemini_multimodal(query, matched_items)

display.Markdown(response)

This case report describes a patient with tuberculosis (TB) affecting the elbow joint, causing arthritis and elbow pain.  The connection is that the TB infection spread to the elbow joint, resulting in inflammation (synovitis, as shown in the MRI, Fig 4), and subsequent pain and limited range of motion.  The osteolysis (bone loss) suspected in the initial x-ray (Fig 1) is further evidence of the destructive nature of the infection in the joint.
