STEPS:
1. Open a PDF file and extract its text.
2. Format the text.
3. Split the text into chunks and turn the chunks into embeddings.
4. Build a retrieval system that runs a vector search over the embeddings for a query.
5. Create a prompt that incorporates the retrieved text.
6. Generate an answer to the query.

In [None]:
!pip install -r requirements.txt

In [None]:
#import pdf document
import fitz
from tqdm.auto import tqdm
pdf_path="/content/simple-local-rag/human-nutrition-text.pdf"

def process_text(text):
  """Flatten newlines into spaces and trim surrounding whitespace."""
  single_line = " ".join(text.split("\n"))
  return single_line.strip()

def open_pdf(pdf_path, page_offset=41):
  """Read a PDF and collect each page's cleaned text with simple size statistics.

  Args:
    pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).
    page_offset: Value subtracted from the zero-based page index to produce
      the stored ``page_number``. Defaults to 41, the offset this notebook
      uses for its PDF (presumably the number of front-matter pages; confirm
      before reusing with another document).

  Returns:
    A list with one dict per page containing ``page_number``,
    ``page_char_count``, ``page_word_count``, ``page_sentence_count_raw``,
    ``page_token_count`` and the cleaned ``text``.
  """
  page_text = []
  # The context manager closes the document even if text extraction fails.
  with fitz.open(pdf_path) as pdf:
    # enumerate() hides the length from tqdm, so pass the page count explicitly.
    for page_num, page in tqdm(enumerate(pdf), total=len(pdf)):
      text = process_text(page.get_text())
      page_text.append(
          {"page_number": page_num - page_offset,
           "page_char_count": len(text),
           "page_word_count": len(text.split(" ")),
           "page_sentence_count_raw": len(text.split(". ")),
           "page_token_count": len(text) / 4,  # rough estimate: ~4 characters per token
           "text": text}
      )
  return page_text

page_text_list=open_pdf(pdf_path)


In [None]:
len(page_text_list)

In [None]:
import random

random.sample(page_text_list, k=2)

In [None]:
#convert to sentences using spaCy
from spacy.lang.en import English

# Blank English pipeline with only the rule-based sentence splitter attached.
sentenser = English()
sentenser.add_pipe("sentencizer")

for page in tqdm(page_text_list):
  # Convert every detected sentence span to a plain string in one pass.
  doc_sentences = [str(sent) for sent in sentenser(page["text"]).sents]
  page["sentences"] = doc_sentences
  page["sentence_count"] = len(doc_sentences)

random.sample(page_text_list,k=1)


In [None]:
import pandas as pd
df=pd.DataFrame(page_text_list)
df.describe().round(1)

In [None]:
#text(1 page all sentences) > chunk(10 sentences) > sentence

def split_list(givenList, splitsize):
  """Cut ``givenList`` into consecutive slices of at most ``splitsize`` items."""
  pieces = []
  for start in range(0, len(givenList), splitsize):
    pieces.append(givenList[start:start + splitsize])
  return pieces

# Number of sentences grouped into one chunk.
chunk_size = 10
for page in tqdm(page_text_list):
  page_chunks = split_list(page["sentences"], chunk_size)
  # Store the sentence groups and how many of them the page produced.
  page.update(chunks=page_chunks, chunk_count=len(page_chunks))

random.sample(page_text_list, k=1)

In [None]:
import re
#joining all chunk sentences
chunk_list=[]


for item in page_text_list:
  for chunk in item["chunks"]:
    chunk_dict={}
    chunk_dict["page_number"]=item["page_number"]

    joined_chunk="".join(chunk).replace("  ", " ").strip()
    joined_chunk=re.sub( r'\.([A-Z])', r'. \1', joined_chunk)
    chunk_dict["chunk"]=joined_chunk

    chunk_dict["chunk_char_count"]=len(joined_chunk)
    chunk_dict["chunk_word_count"]=len(joined_chunk.split(" "))
    chunk_dict["chunk_char_count"]=len(joined_chunk)/4
    chunk_list.append(chunk_dict)

len(chunk_list)


In [None]:
#removing small chunks
min_tokens=30

df=pd.DataFrame(chunk_list)
big_chunks_list=df[df["chunk_char_count"]>min_tokens].to_dict(orient="records")
len(big_chunks_list)


In [None]:
from sentence_transformers import SentenceTransformer, util
# Leaving `device` unset lets sentence-transformers use a GPU when one is
# available and fall back to CPU otherwise, instead of failing on "cuda".
embed_model=SentenceTransformer(model_name_or_path="all-mpnet-base-v2")

# Encode all chunks in batches rather than one call per chunk; encode()
# returns one embedding row per input text, in the same order.
chunk_texts=[item["chunk"] for item in big_chunks_list]
chunk_embeddings=embed_model.encode(chunk_texts, batch_size=32, show_progress_bar=True)

for item, embedding in zip(big_chunks_list, chunk_embeddings):
  item["embeddings"]=embedding

In [None]:
# Save embeddings to file
# Persist the chunk records (text, statistics and embedding) as a CSV file so
# the retrieval part can be run without recomputing the embeddings.
# NOTE(review): each embedding array is written as its string form; the
# loading cell parses that string back into numbers.
text_chunks_and_embeddings_df = pd.DataFrame(big_chunks_list)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

END OF EMBEDDINGS PART


---

PART-2 Retrieval

In [None]:
import pandas as pd

embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

In [None]:
import torch
import numpy as np

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The CSV holds each embedding as text; strip the surrounding brackets and
# parse the whitespace-separated numbers back into a NumPy array.
text_chunks_and_embedding_df_load["embeddings"]= text_chunks_and_embedding_df_load["embeddings"].apply(lambda x: np.fromstring(x.strip("[]"),sep=" "))

# Row-wise records (used to look up chunk text / page number by index) and a
# single float32 tensor of all embeddings placed on `device`.
new_chunk_list= text_chunks_and_embedding_df_load.to_dict(orient="records")
embeddings = torch.tensor(np.array(text_chunks_and_embedding_df_load["embeddings"].tolist()), dtype=torch.float32).to(device)
embeddings.shape

In [None]:
query="Protein Deficiency"

from sentence_transformers import SentenceTransformer, util
# Load the embedding model on the same device that holds the `embeddings`
# tensor, so this cell also works on CPU-only machines.
embed_model=SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device=device)

qembed=embed_model.encode(query, convert_to_tensor=True)

# Cosine similarity (range -1 to 1) is the usual measure for text similarity.
# Per the original note, this embedding model returns normalized vectors, in
# which case the plain dot product gives the same ranking and is used here.
dot_prod=util.dot_score(a=qembed, b=embeddings)[0]

print(dot_prod.shape)

In [None]:
import textwrap

def print_wrapped(text, wrap_length=80):
    """Print ``text`` re-flowed so that no line exceeds ``wrap_length`` characters."""
    print(textwrap.fill(text, width=wrap_length))

In [None]:
print("Query", query)
# Five highest dot-product scores together with their positions.
top_results = torch.topk(dot_prod, k=5)

for score, indx in zip(top_results.values, top_results.indices):
  print("score", score)
  matched_chunk = new_chunk_list[indx]
  print_wrapped(matched_chunk["chunk"])
  print("Page", matched_chunk["page_number"])
  print("-"*10, "\n")


In [None]:
def retrieval_pipeline(
    query,
    embeddings=embeddings,
    embed_model=embed_model,
    top_k=5,
):
  """Embed ``query`` and return the best-matching stored embeddings.

  Args:
    query: Text to search for.
    embeddings: Tensor of stored chunk embeddings to score against; defaults
      to the notebook's ``embeddings`` as it exists when this cell runs.
    embed_model: Model whose ``encode`` method embeds the query.
    top_k: Maximum number of results to return.

  Returns:
    A ``(scores, indices)`` pair as returned by ``torch.topk`` over the
    dot-product scores; ``indices`` are positions into ``embeddings``.
  """
  qembed = embed_model.encode(query, convert_to_tensor=True)
  dot_prod = util.dot_score(a=qembed, b=embeddings)[0]

  # torch.topk raises when k exceeds the number of scores, so cap it.
  k = min(top_k, dot_prod.shape[0])
  top_results = torch.topk(dot_prod, k=k)
  return top_results[0], top_results[1]

def print_retrieved_chunks(
    query,
    embeddings=embeddings,
    embed_model=embed_model,
    top_k=5,
):
  """Run the retrieval pipeline for ``query`` and print each hit's score, text and page."""
  top_scores, top_indices = retrieval_pipeline(query, embeddings, embed_model, top_k)
  print("Query", query)
  for chunk_score, chunk_index in zip(top_scores, top_indices):
    print("score", chunk_score)
    hit = new_chunk_list[chunk_index]
    print_wrapped(hit["chunk"])
    print("Page", hit["page_number"])


In [None]:
print_retrieved_chunks("Kwashiorkor")

In [None]:
!nvidia-smi


In [None]:
import torch
# Total memory of GPU 0 in bytes, converted to whole gibibytes.
gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = round(gpu_memory_bytes / 1024**3)
print(f"Available GPU memory: {gpu_memory_gb} GB")

In [None]:
# Pick a Gemma variant and precision from the GPU memory measured above.
# Start from the smallest, quantized configuration so both names are always
# defined, including on the low-memory branch that only prints a warning.
use_quantization_config = True
model_id = "google/gemma-2b-it"

if gpu_memory_gb < 5.1:
    print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
elif gpu_memory_gb < 8.1:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
    use_quantization_config = True
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
    use_quantization_config = False
    model_id = "google/gemma-2b-it"
else:
    # Plain else so a rounded value of exactly 19 is handled too.
    print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma 7B in 4-bit or float16 precision.")
    use_quantization_config = False
    model_id = "google/gemma-7b-it"

print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {model_id}")

In [None]:
from google.colab import userdata
hf_api=userdata.get('HF_api')

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available

from transformers import BitsAndBytesConfig

# 4-bit quantization settings; only passed to the model when
# use_quantization_config is True.
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)

# Flash Attention 2 needs the package installed and a GPU with compute
# capability >= 8; otherwise use PyTorch scaled-dot-product attention.
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
  attn_implementation = "flash_attention_2"
else:
  attn_implementation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")

# Load the tokenizer and the model once, authenticating with the HF token.
# (`device` is not a from_pretrained argument; placement is handled below.)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id,
                                          token=hf_api)

llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id,
                                                 torch_dtype=torch.float16, # datatype to use, we want float16
                                                 quantization_config=quantization_config if use_quantization_config else None,
                                                 low_cpu_mem_usage=True, # keep CPU RAM usage low while loading the weights
                                                 attn_implementation=attn_implementation, # which attention version to use
                                                 token=hf_api)

if not use_quantization_config: # quantization takes care of device setting automatically, so if it's not used, send model to GPU
    llm_model.to("cuda")



In [None]:
llm_model

In [None]:
def get_model_mem_size(model: torch.nn.Module):
    """
    Get how much memory a PyTorch model takes up.

    Sums the byte size of every parameter and buffer of ``model``.

    See: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822

    Returns:
        A dict with ``model_mem_bytes`` (exact), plus ``model_mem_mb`` and
        ``model_mem_gb`` rounded to two decimals.
    """
    # Get model parameters and buffer sizes
    mem_params = sum(param.nelement() * param.element_size() for param in model.parameters())
    mem_buffers = sum(buf.nelement() * buf.element_size() for buf in model.buffers())

    # Calculate various model sizes
    model_mem_bytes = mem_params + mem_buffers # in bytes
    model_mem_mb = model_mem_bytes / (1024**2) # in megabytes
    model_mem_gb = model_mem_bytes / (1024**3) # in gigabytes

    return {"model_mem_bytes": model_mem_bytes,
            "model_mem_mb": round(model_mem_mb, 2),
            "model_mem_gb": round(model_mem_gb, 2)}

get_model_mem_size(llm_model)

In [None]:
input_text = "What are the macronutrients, and what roles do they play in the human body?"
print(f"Input text:\n{input_text}")

# Single-turn conversation in the role/content format the chat template expects
user_message = {"role": "user", "content": input_text}
dialogue_template = [user_message]

# Render the conversation to a raw prompt string (no tokenization yet) and
# append the marker that tells the model to start its reply
prompt = tokenizer.apply_chat_template(
    conversation=dialogue_template,
    tokenize=False,
    add_generation_prompt=True,
)
print(f"\nPrompt (formatted):\n{prompt}")

In [None]:
# Tokenize the formatted prompt into PyTorch tensors and move them to the GPU.
tok_ip=tokenizer(prompt, return_tensors="pt").to("cuda")

# Generate up to 128 new tokens for the prompt.
tok_op=llm_model.generate(
    input_ids=tok_ip["input_ids"],
    attention_mask=tok_ip["attention_mask"],
    max_new_tokens=128,
    )

In [None]:
# Full decoded sequence (prompt + answer) without special tokens.
output=tokenizer.decode(tok_op[0], skip_special_tokens=True)
# generate() returns the prompt tokens followed by the new tokens, so slicing
# off the prompt length isolates the answer. This is more reliable than
# string-replacing the prompt, which stops matching once special tokens are
# stripped.
prompt_token_count=tok_ip["input_ids"].shape[1]
output_replaced=tokenizer.decode(tok_op[0][prompt_token_count:], skip_special_tokens=True)
print(f"Output text:\n{output_replaced}")

In [None]:
# Sample nutrition questions used to exercise the retrieval and generation cells.
qlist=[
    "How often should infants be breastfed?",
    "What are symptoms of pellagra?",
    "How does saliva help with digestion?",
    "What is the RDI for protein per day?",
    "water soluble vitamins",
]

In [None]:
import random
# Pick one of the sample questions defined in `qlist`.
query = random.choice(qlist)

print(f"Query: {query}")
# retrieval_pipeline is this notebook's retrieval function; it returns the
# top scores and the matching indices into the stored embeddings.
scores, indices = retrieval_pipeline(query=query,
                                     embeddings=embeddings)
scores, indices

In [None]:
#AUGMENTATION

def format_prompt(query, context_items):
  """Build the chat-formatted, context-augmented prompt for ``query``.

  Args:
    query: The user question.
    context_items: Retrieved chunk records; each must hold its text under
      the ``"chunk"`` key.

  Returns:
    The prompt string produced by ``tokenizer.apply_chat_template``
    (not tokenized, with the generation prompt appended).
  """
  # Chunk records in this notebook store their text under "chunk".
  context = "- " + "\n- ".join([item["chunk"] for item in context_items])

  base_prompt = """Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.
Use the following examples as reference for the ideal answer style.
\nExample 1:
Query: What are the fat-soluble vitamins?
Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a critical role in calcium absorption and bone health. Vitamin E acts as an antioxidant, protecting cells from damage. Vitamin K is essential for blood clotting and bone metabolism.
\nExample 2:
Query: What are the causes of type 2 diabetes?
Answer: Type 2 diabetes is often associated with overnutrition, particularly the overconsumption of calories leading to obesity. Factors include a diet high in refined sugars and saturated fats, which can lead to insulin resistance, a condition where the body's cells do not respond effectively to insulin. Over time, the pancreas cannot produce enough insulin to manage blood sugar levels, resulting in type 2 diabetes. Additionally, excessive caloric intake without sufficient physical activity exacerbates the risk by promoting weight gain and fat accumulation, particularly around the abdomen, further contributing to insulin resistance.
\nExample 3:
Query: What is the importance of hydration for physical performance?
Answer: Hydration is crucial for physical performance because water plays key roles in maintaining blood volume, regulating body temperature, and ensuring the transport of nutrients and oxygen to cells. Adequate hydration is essential for optimal muscle function, endurance, and recovery. Dehydration can lead to decreased performance, fatigue, and increased risk of heat-related illnesses, such as heat stroke. Drinking sufficient water before, during, and after exercise helps ensure peak physical performance and recovery.
\nNow use the following context items to answer the user query:
{context}
\nRelevant passages: <extract relevant passages from the context here>
User query: {query}
Answer:"""

  # Fill the {context} and {query} placeholders of the template.
  base_prompt = base_prompt.format(context=context, query=query)

  # Create prompt template
  dialogue_template = [
      {"role": "user",
       "content": base_prompt}
  ]

  # Apply chat template
  prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                         tokenize=False,
                                         add_generation_prompt=True)
  return prompt

In [None]:
# Pick one of the sample questions defined in `qlist`.
query = random.choice(qlist)
print(f"Query: {query}")

# Get relevant resources (top scores and indices into the stored embeddings)
scores, indices = retrieval_pipeline(query=query,
                                     embeddings=embeddings)

# Create a list of context items from the loaded chunk records
context_items = [new_chunk_list[i] for i in indices]

# Format prompt with context items
prompt = format_prompt(query=query,
                       context_items=context_items)
print(prompt)

In [None]:
def ask(query,
        temperature=0.7,
        max_new_tokens=512,
        format_answer_text=True,
        return_answer_only=True):
    """Answer ``query`` with retrieval-augmented generation.

    Retrieves the most similar chunks, builds a prompt around them and
    samples an answer from ``llm_model``.

    Args:
        query: The user question.
        temperature: Sampling temperature passed to ``generate``.
        max_new_tokens: Maximum number of tokens to generate.
        format_answer_text: When True, strip the prompt, the ``<bos>``/``<eos>``
            markers and a boilerplate lead-in from the decoded text.
        return_answer_only: When True return just the answer text; otherwise
            return ``(answer_text, context_items)``.

    Returns:
        The answer string, or a tuple of the answer string and the list of
        context items (each with an added ``"score"`` entry).
    """
    # Get just the scores and indices of top related results
    scores, indices = retrieval_pipeline(query=query,
                                         embeddings=embeddings)

    # Create a list of context items with their score attached. Each record
    # is copied so the shared chunk records in new_chunk_list stay unmodified.
    context_items = []
    for score, index in zip(scores, indices):
        item = dict(new_chunk_list[index])
        item["score"] = score.cpu() # return score back to CPU
        context_items.append(item)

    # Format the prompt with context items
    prompt = format_prompt(query=query,
                           context_items=context_items)

    # Tokenize the prompt and place it on the same device as the model
    input_ids = tokenizer(prompt, return_tensors="pt").to(llm_model.device)

    # Generate an output of tokens
    outputs = llm_model.generate(**input_ids,
                                 temperature=temperature,
                                 do_sample=True,
                                 max_new_tokens=max_new_tokens)

    # Turn the output tokens into text
    output_text = tokenizer.decode(outputs[0])

    if format_answer_text:
        # Replace special tokens and unnecessary help message
        output_text = output_text.replace(prompt, "").replace("<bos>", "").replace("<eos>", "").replace("Sure, here is the answer to the user query:\n\n", "")

    # Only return the answer without the context items
    if return_answer_only:
        return output_text

    return output_text, context_items

In [None]:
# With return_answer_only=False, ask() returns (answer_text, context_items);
# unpack it so `answer` is the plain string that print_wrapped expects.
answer, context_items = ask(query=query, temperature=1, max_new_tokens=256, return_answer_only=False)

In [None]:
print_wrapped(answer)