In [1]:
from transformers import LayoutLMv2Processor, LayoutLMv2ForTokenClassification
from PIL import Image
import torch

# Load LayoutLM model and processor
processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
model = LayoutLMv2ForTokenClassification.from_pretrained("microsoft/layoutlmv2-base-uncased")

Some weights of LayoutLMv2ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv2-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The warning above is not relevant for our particular usecase, since we are not using the model to make predictions or inferences.

In [2]:
from pdf2image import convert_from_path
from PIL import Image


# Transform PDF into list of images for each page
pdf = "medical_files/medicare_doc.pdf"
all_images = convert_from_path(pdf, 300)


In [3]:
from parser.image_processing import process_image_in_segments
import pytesseract
import torch


def image_to_text(image):
    return pytesseract.image_to_string(image)


def process_image_with_layout(image):
    image = image.convert("RGB")
    encoding = processor(image, return_tensors="pt")
    
    with torch.no_grad():
        outputs = model(**encoding)
    
    # post-process outputs (e.g., get tokena dn bounding boxes)
    tokens = processor.tokenizer.convert_ids_to_tokens( encoding["input_ids"][0].tolist() )
    boxes = encoding["bbox"][0].tolist()
    
    return tokens, boxes


def segment_image(image, processor, max_tokens=512):
    width, height = image.size
    segment_size = height // 2
    
    segments = list()
    for top in range(0, height, segment_size):
        box = (0, top, width, top + segment_size)
        segment = image.crop(box)
        
        tokens = processor(segment, return_tensors="pt")
        
        if tokens["input_ids"].size(1) > max_tokens:
            yield segment_image(segment, processor, max_tokens)
        else:
            yield segment

In [5]:
from datasets import Dataset


# Create and save pdf dataset to disk
data = {
    "text": [i["text"] for i in structured_data], 
    "bbox": [i["bbox"] for i in structured_data],
    "tokens": [i["tokens"] for i in structured_data],
    "title": [f"Medicare Doc - page {i['page_number']}" for i in structured_data]
}
dataset = Dataset.from_dict(data)
# dataset.save_to_disk(dataset_path)

print(f"Title: {data['title'][1]}")
print(f"Text: {data['text'][1][0:50]}...truncated...")
print(f"bbox: {data['bbox'][1][0:10]}...truncated...")
print(f"Tokens: {data['tokens'][1][0:10]}...truncated...")

Title: Medicare Doc - page 2
Text: Medicare strives to improve access to affordable t...truncated...
bbox: [[0, 0, 0, 0], [91, 408, 171, 429], [178, 408, 235, 429], [178, 408, 235, 429], [242, 409, 259, 429], [267, 408, 339, 432], [346, 413, 404, 429], [411, 409, 428, 429], [435, 407, 527, 429], [534, 409, 631, 429]]...truncated...
Tokens: ['[CLS]', 'medicare', 'strive', '##s', 'to', 'improve', 'access', 'to', 'affordable', 'treatments']...truncated...


In [6]:
from langchain.embeddings import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

def embed_text(text_data):
    embeddings = embedding_model.embed_documents([text_data["text"]])[0]
    return {"embeddings": embeddings}

dataset = dataset.map(embed_text)
print(dataset)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'bbox', 'tokens', 'title', 'embeddings'],
    num_rows: 4
})


In [7]:
import faiss
import numpy as np


# Define the FAISS index
dimension = len(dataset[0]["embeddings"])  # dimension of embeddings
index = faiss.IndexFlatL2(dimension)
index.add(np.vstack(dataset["embeddings"]))
index_path = "test_medicare_index"
faiss.write_index(index, index_path)

# Save the index to disk
dataset.add_faiss_index(column="embeddings")
dataset_path = "test_medicare_dataset"
# Drop the FAISS index before saving the dataset to disk
dataset.drop_index("embeddings")
dataset.save_to_disk(dataset_path)
# dataset.add_faiss_index(column="embeddings")
# dataset.get_index("embeddings").save(index_path)

  0%|          | 0/1 [00:00<?, ?it/s]

Saving the dataset (0/1 shards):   0%|          | 0/4 [00:00<?, ? examples/s]

In [8]:

def retrieve_pages(index, query, embedding_model, page_mapping, k=5):
    query_embeddings = embedding_model.embed_documents([query])
    query_embedding = np.array(query_embeddings[0]).reshape(1, -1)
    distance, indices = index.search(query_embedding, k)
    valid_indices = [idx for idx in indices[0] if idx != -1]
    pages = [page_mapping[idx] for idx in valid_indices]
    return pages


# Save the page mapping for retrieval
page_mapping = {i: entry for i, entry in enumerate(structured_data)}
    
query = "Better Mental health care"
relevant_pages = retrieve_pages(index, query, embedding_model, page_mapping)
print(relevant_pages)

[{'text': 'meaical equipment penef#it, tTnat INSULIN Is Coverea unaer Meqicare Fart B. YOU WONT\npay more than $35 for a month’s supply and the Medicare deductible no longer\napplies. Go to pages 39 and 88.\n\nRecommended adult vaccines are also now available at no cost to you. Go to page 50.\n\nChanges to telehealth coverage\n\nYou can still get telehealth services at any location in the U.S., including your home,\nuntil the end of 2024. After that, you must be in an office or medical facility located in\na rural area to get most telehealth services. There are some exceptions, like for mental\nhealth services. Go to page 51.\n\nManaging and treating chronic pain\nMedicare now covers monthly services to treat chronic pain if you’ve been living with it\nfor more than 3 months. Go to page 34.\n\nBetter mental health care\n\nMedicare now covers intensive outpatient program services provided by hospitals,\ncommunity mental health centers, and other locations if you need mental health care.

In [9]:
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration

# Initialize RAG model

tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")

retriever = RagRetriever.from_pretrained(
    "facebook/rag-sequence-nq", 
    index_name="custom",
    passages_path=dataset_path,
    index_path=index_path,
)

rag_model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

In [10]:
from langchain_community.llms import LlamaCpp
from langchain.prompts import ChatPromptTemplate
import torch


# Initialize BioMistral LLM
llm = LlamaCpp(
    model_path= "BioMistral-7B.Q4_K_M.gguf",
    temperature=0.2,
    max_tokens=1024,
    top_p=1
)

def calculate_token_length(text, tokenizer):
    inputs = tokenizer(text, return_tensors="pt")
    return inputs.input_ids.size(1)


def format_context(relevant_pages, tokenizer, max_length):
    print("Start check of segment size....")
    context = list()

    for page in relevant_pages:
        text = page["text"]
        bbox = page["bbox"]
        tokens = page["tokens"]
        
    if len(tokens):
        context_entry = f"Text: {text}\nTokens: {tokens}\nBbox: {bbox}\n"
        
        
    combined_context = "\n".join(context)
    return combined_context

def process_chunk(chunk, question):
    prompt = f"Context: {chunk}\n\nQuestion: {question}\n\nAnswer:"
    t_size = calculate_token_length(prompt, tokenizer)
    print(f"The size of this prompt is {t_size} >>>>")
    response = llm(prompt)
    return response


def generate_answer(question, n_docs=5):
    # Tokenize the question
    inputs = tokenizer(question, return_tensors="pt")
    # Generate question hidden states using the RAG model's question encoder
    outputs = rag_model.question_encoder(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask)
    question_hidden_states = outputs[0].detach().numpy()
    
    # Retrieve relevant documents
    relevant_pages = retrieve_pages(index, question, embedding_model, page_mapping, k=n_docs)
    
    # Calculate the maximum length for the context
    max_context_length = 512
    
    # Segment the context
    context_segments = segment_context(relevant_pages, tokenizer, max_context_length)
    
    # # Process each chunk and gather responses
    # combined_response = []
    # for chunk in context_segments:
    #     response = process_chunk(chunk, question)
    #     combined_response.append(response)
    
    # # Combine the responses to form the final answer
    # final_answer = "\n".join(combined_response)
    # return final_answer


llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from BioMistral-7B.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = hub
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count u32             

In [11]:
question = "What is the maximum amount of money I can pay for a one-month supply of insulin?"
answer = generate_answer(question)
print(answer)

NameError: name 'segment_context' is not defined