## Data Preprocessing

In [None]:
import fitz
from tqdm.auto import tqdm

pdf_path = "ln_human_anat_final.pdf"

def text_formatter(text: str) -> str:
    """Flatten newlines into spaces and trim surrounding whitespace.

    Args:
        text: Raw text, e.g. the text extracted from one PDF page.

    Returns:
        The text with every newline replaced by a single space and with
        leading/trailing whitespace removed.
    """
    # Splitting on "\n" and re-joining with " " swaps each newline for a space.
    without_newlines = " ".join(text.split("\n"))

    # Further text clean-up steps can be added here.
    return without_newlines.strip()

def open_and_read_pdf(pdf_path: str, page_offset: int = 10) -> list[dict]:
    """Read a PDF and collect the cleaned text and simple statistics per page.

    Args:
        pdf_path: Path of the PDF file to open.
        page_offset: Value subtracted from the zero-based page index to obtain
            the stored "page_number" (defaults to 10, the offset this notebook
            has always used).

    Returns:
        One dict per page with the keys "page_number", "page_char_count",
        "page_word_count", "page_setence_count_raw", "page_token_count" and
        "text". The misspelled "page_setence_count_raw" key is kept as-is so
        existing consumers of these records keep working.
    """
    pages_and_texts = []
    # The context manager closes the document handle even if extraction fails.
    with fitz.open(pdf_path) as doc:
        # enumerate() hides the length from tqdm, so pass the page count explicitly.
        for page_index, page in tqdm(enumerate(doc), total=len(doc)):
            text = text_formatter(text=page.get_text())
            pages_and_texts.append({"page_number": page_index - page_offset,
                                    "page_char_count": len(text),
                                    "page_word_count": len(text.split(" ")),
                                    "page_setence_count_raw": len(text.split(". ")),
                                    "page_token_count": len(text) / 4, # 1 token = ~4 characters
                                    "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

In [None]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()


In [4]:
from spacy.lang.en import English

# Language pipeline built from spaCy's English class.
nlp = English()
 
# Add the "sentencizer" component so that processed docs expose `.sents`.
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x1eb154936d0>

In [None]:
# Split every page's text into sentences and record how many were found.
for item in tqdm(pages_and_texts):
    sentence_spans = nlp(item["text"]).sents
    # Keep plain strings rather than the objects yielded by `.sents`.
    item["sentences"] = [str(span) for span in sentence_spans]
    item["page_sentence_count_spacy"] = len(item["sentences"])

In [None]:
import random
random.sample(pages_and_texts, k=1)

In [None]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

## Chunking

In [8]:
num_chunk_list_size = 10  # default number of sentences per chunk

def chunk_text(input_text: list[str], chunk_size: int = num_chunk_list_size) -> list[list[str]]:
    """Split a list of sentences into consecutive chunks of at most `chunk_size`.

    Args:
        input_text: Sentences to group, in order.
        chunk_size: Maximum number of sentences per chunk; must be >= 1.

    Returns:
        A list of chunks. Every chunk has `chunk_size` items except possibly
        the last one. An empty input gives an empty list.

    Raises:
        ValueError: If `chunk_size` is smaller than 1.
    """
    # A negative step would make range() empty and silently drop every sentence;
    # zero already raised ValueError, so fail the same way for both.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    return [input_text[start:start + chunk_size] for start in range(0, len(input_text), chunk_size)]

In [9]:
# Group each page's sentences into chunks and store the chunk count next to them.
for item in tqdm(pages_and_texts):
    chunks = chunk_text(item["sentences"])
    item.update(sentence_chunks=chunks, sentence_chunks_num=len(chunks))

  0%|          | 0/428 [00:00<?, ?it/s]

In [None]:
rnd = random.sample(pages_and_texts, k=1)
rnd_chunk = rnd[0]["sentence_chunks"]
rnd_chunk


In [11]:
import re

# One record per sentence chunk: source page, chunk text and simple size statistics.
page_chunks = []

for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        # Join with a space: the sentence strings are assumed to carry no trailing
        # whitespace (spaCy `Span.text`), so an empty separator fuses neighbours
        # such as "fast?Gastric" that the ".A" patch below cannot repair.
        joined_sentence = " ".join(sentence_chunk)
        # Collapse every whitespace run; a single replace("  ", " ") pass left
        # runs of three or more spaces partially intact.
        joined_sentence = re.sub(r"\s+", " ", joined_sentence).strip()
        # ".A" -> ". A" for any full stop directly followed by a capital letter.
        joined_sentence = re.sub(r'\.([A-Z])', r'. \1', joined_sentence)

        page_chunks.append({
            "page_no": item["page_number"],
            "chunk_sentences": joined_sentence,
            "character_count": len(joined_sentence),
            "word_count": len(joined_sentence.split(" ")),
            "token_count": len(joined_sentence) / 4,  # 1 token = ~4 characters
        })

  0%|          | 0/428 [00:00<?, ?it/s]

In [None]:
random.sample(page_chunks, k=1)

In [None]:
# Keep only chunks whose estimated token count is strictly above the threshold.
min_token = 30
df = pd.DataFrame(page_chunks)
has_enough_tokens = df["token_count"].gt(min_token)
pages_chunks_over_min_token = df.loc[has_enough_tokens].to_dict(orient="records")
pages_chunks_over_min_token[:2]

In [None]:
# Plain chunk texts, in the same order as pages_chunks_over_min_token.
text_chunks = [item["chunk_sentences"] for item in pages_chunks_over_min_token]
# Preview only; displaying the whole list floods the notebook output.
text_chunks[:5]

## Embedding

In [None]:
from sentence_transformers import SentenceTransformer

import torch  # imported here because this cell runs before the later `import torch` cell

# Fall back to CPU so the cell also runs on machines without a CUDA GPU.
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"

model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2")
model.to(embedding_device)

# Encode all chunks in batches rather than one encode() call per chunk.
chunk_texts = [item["chunk_sentences"] for item in pages_chunks_over_min_token]
chunk_embeddings = model.encode(chunk_texts, batch_size=32, show_progress_bar=True)

# Attach each embedding vector to its chunk record (row i belongs to chunk i).
for item, embedding in zip(pages_chunks_over_min_token, chunk_embeddings):
    item["embedding"] = embedding

In [16]:
pages_chunks_over_min_token[419]["embedding"].shape

(768,)

## Saving to file (PyTorch tensors) / saving to a vector database using ChromaDB

### 1. Saving to file (PyTorch tensors)

In [23]:
# Persist the chunk records (including their embeddings) as a CSV file.
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df = pd.DataFrame(data=pages_chunks_over_min_token)
text_chunks_and_embeddings_df.to_csv(path_or_buf=embeddings_df_save_path, index=False)

In [None]:
import torch
import numpy as np

device = "cuda" if torch.cuda.is_available() else "cpu"

text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

# The CSV holds each embedding as text ("[v1 v2 ...]"); strip the brackets and
# parse the whitespace-separated numbers back into a NumPy array.
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(
    lambda x: np.fromstring(x.strip("[]"), sep=" ")
)

# Stack the per-chunk vectors into one float32 tensor (one row per chunk) on the chosen device.
embeddings = torch.tensor(
    np.stack(text_chunks_and_embedding_df["embedding"].tolist(), axis=0),
    dtype=torch.float32,
).to(device)

pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

# Preview only: showing every record (with its embedding) floods the output.
pages_and_chunks[:2]

## ChromaDB implementation for storing and retrieving

In [65]:
import chromadb
from chromadb.utils import embedding_functions

# Path handed to the persistent Chroma client below.
CHROMA_DATA_PATH = "chroma/"
# Sentence-transformers model name used for the Chroma embedding function.
EMBED_MODEL = "all-mpnet-base-v2"
# Name of the collection that holds the text chunks.
COLLECTION_NAME = "text_and_chunks"

# Persistent client rooted at CHROMA_DATA_PATH.
client = chromadb.PersistentClient(path=CHROMA_DATA_PATH)

In [None]:
# Embedding function Chroma uses for both stored documents and query texts.
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBED_MODEL
)

# get_or_create_collection keeps this cell re-runnable: with a persistent
# client, create_collection fails once the collection already exists on disk.
collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_func,
    metadata={"hnsw:space": "cosine"},  # cosine distance for the index
)

In [None]:
# Store every chunk text under a positional id ("id0", "id1", ...).
documents = [item["chunk_sentences"] for item in pages_chunks_over_min_token]
# upsert instead of add: the ids are the same on every run, so re-running this
# cell against the persistent collection overwrites entries rather than re-adding them.
collection.upsert(
    documents=documents,
    ids=[f"id{i}" for i in range(len(documents))],
)

In [73]:
# Example query: fetch the five closest chunks together with their distances.
query_results = collection.query(
    query_texts=["What are enzymes?"],
    n_results=5,
    include=["documents", "distances"],
)

In [72]:
query_results["embeddings"]

In [74]:
query_results["documents"]

[['Human Anatomy and Physiology 330  along the alimentary canal, and finally eliminates the digestive wastes from the body. Chewing or mastication, swallowing or deglutition, peristalsis, and defecation are the main processes of mechanical digestion. Chemical digestion breaks down large, nonabsorbable food molecules−molecules that are able to pass through the intestinal mucosa into blood and lymph. Chemical digestion consists of numerous chemical reactions catalyzed by enzymes in saliva, gastric juice, pancreatic juice, and intestinal juice.  Carbohydrate Digestion  Very little digestion of carbohydrates (starches and sugars) occurs before food reaches the small intestine. Salivary amylase usually has little time to do its work because so many of us swallow our food so fast. Gastric juice contains no carbohydrate-digesting enzymes. But after the food reaches the small intestine, pancreatic and intestinal juice enzymes digest the starches and sugars. A pancreatic enzyme (amylase) starts

In [84]:
def chroma_retreiver(query: str, collection=collection, n_results: int = 5):
    """Fetch the documents most similar to `query` from a Chroma collection.

    Args:
        query: Natural-language query text.
        collection: Chroma collection to search; defaults to the notebook's
            `collection` as it existed when this function was defined.
        n_results: Number of documents to retrieve (defaults to 5).

    Returns:
        The "documents" entry of the query result: a list with one inner list
        of document strings for the single query text.
    """
    query_results = collection.query(
        query_texts=[query],
        include=["documents"],  # only the documents are returned to the caller
        n_results=n_results,
    )
    return query_results["documents"]
    

In [85]:
chroma_retreiver("What are enzymes?")

[['Human Anatomy and Physiology 330  along the alimentary canal, and finally eliminates the digestive wastes from the body. Chewing or mastication, swallowing or deglutition, peristalsis, and defecation are the main processes of mechanical digestion. Chemical digestion breaks down large, nonabsorbable food molecules−molecules that are able to pass through the intestinal mucosa into blood and lymph. Chemical digestion consists of numerous chemical reactions catalyzed by enzymes in saliva, gastric juice, pancreatic juice, and intestinal juice.  Carbohydrate Digestion  Very little digestion of carbohydrates (starches and sugars) occurs before food reaches the small intestine. Salivary amylase usually has little time to do its work because so many of us swallow our food so fast. Gastric juice contains no carbohydrate-digesting enzymes. But after the food reaches the small intestine, pancreatic and intestinal juice enzymes digest the starches and sugars. A pancreatic enzyme (amylase) starts

In [27]:
embeddings.shape

torch.Size([559, 768])

In [29]:
import textwrap

def print_wrapped(text, wrap_length=80):
    """Print `text` re-flowed so that no line exceeds `wrap_length` characters."""
    print(textwrap.fill(text, width=wrap_length))

## Retriever from scratch for the database from csv

In [78]:
from sentence_transformers import util

def retriever(query: str, embeddings: torch.Tensor, model: SentenceTransformer=model, return_amount: int=5):
    """Find the stored embeddings most similar to a query.

    Args:
        query: Natural-language query text.
        embeddings: 2-D tensor with one embedding per row.
        model: Sentence-transformers model used to embed the query; defaults to
            the notebook's `model` as it existed when this function was defined.
        return_amount: Maximum number of matches to return.

    Returns:
        A `(scores, indices)` pair of 1-D tensors: the cosine similarities of the
        best matches in descending order, and their row indices in `embeddings`.
        Fewer than `return_amount` entries are returned when `embeddings` has
        fewer rows.
    """
    # Put the query vector on the same device as the stored embeddings so the
    # similarity computation does not fail on a CPU/GPU mismatch.
    query_embedding = model.encode(query, convert_to_tensor=True).to(embeddings.device)
    similarity_scores = util.cos_sim(query_embedding, embeddings)[0]

    # torch.topk raises when k exceeds the number of candidates, so clamp it.
    top_k = min(return_amount, similarity_scores.shape[0])
    scores, indices = torch.topk(input=similarity_scores, k=top_k)

    return scores, indices

def print_top_results_and_scores(query: str, embeddings: torch.tensor, pages_and_chunks: list[dict]=pages_and_chunks, return_amount: int=5):
    """Print score, wrapped text and page number for the best matches of `query`.

    Args:
        query: Natural-language query text.
        embeddings: Embedding tensor handed on to `retriever`.
        pages_and_chunks: Chunk records aligned with the rows of `embeddings`;
            each needs "chunk_sentences" and "page_no" entries.
        return_amount: Number of matches requested from `retriever`.
    """
    top_scores, top_indices = retriever(query=query, embeddings=embeddings, return_amount=return_amount)

    for position in range(len(top_indices)):
        match = pages_and_chunks[top_indices[position]]
        print(f"Score: {top_scores[position]:.4f}")
        print("Text:")
        print_wrapped(match["chunk_sentences"])
        print(f"Page number: {match['page_no']}")
        print("\n")

In [None]:
query="what are enzymes"
# retrieve_relevant_resources(query=query, embeddings=embeddings) 
print_top_results_and_scores(query=query, embeddings=embeddings)

## Generator

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig

# 4-bit quantized weights with float16 compute.
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)

attn_implementation = "sdpa" # scaled dot product attention
print(f"Using attention implementation: {attn_implementation}") 

model_id = "google/gemma-2b-it"

# Load tokenizer and model from the same id so the two cannot drift apart.
tokenizer = AutoTokenizer.from_pretrained(model_id)
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id,
                                                 torch_dtype=torch.float16,
                                                 quantization_config= quantization_config,
                                                 low_cpu_mem_usage=False, 
                                                 attn_implementation=attn_implementation)

In [91]:
query = "What is the functions of the mouth?"

In [80]:
def prompt_formatter(query: str,
                     context_items: list[dict]) -> str:
    """Build the chat-formatted prompt for a query and its retrieved chunks.

    Args:
        query: The user question.
        context_items: Retrieved chunk records; each needs a "chunk_sentences" entry.

    Returns:
        The prompt string produced by the tokenizer's chat template (not
        tokenized, generation prompt appended).
    """
    # Each retrieved chunk becomes its own "- " bullet line.
    passages = [entry["chunk_sentences"] for entry in context_items]
    context = "- " + "\n- ".join(passages)

    base_prompt = """
        Based on the following context items, please answer the query.
        Give yourself room to think by extracting relevant passages from the context before answering the query.
        Don't return the thinking, only return the answer.
        Make sure your answers are as explanatory as possible.
        \nNow use the following context items to answer the user query:
        {context}
        \nRelevant passages: <extract relevant passages from the context here>
        User query: {query}
        Answer:
    """
    # The whole filled-in template is sent as a single user turn.
    user_message = {"role": "user",
                    "content": base_prompt.format(context=context, query=query)}

    return tokenizer.apply_chat_template(conversation=[user_message],
                                         tokenize=False,
                                         add_generation_prompt=True)

# Retrieve the best-matching chunks for the current query and show the prompt built from them.
scores, indices = retriever(query=query, embeddings=embeddings)
context_items = [pages_and_chunks[chunk_index] for chunk_index in indices]

prompt = prompt_formatter(query=query, context_items=context_items)
print(prompt)

<bos><start_of_turn>user
Based on the following context items, please answer the query.
        Give yourself room to think by extracting relevant passages from the context before answering the query.
        Don't return the thinking, only return the answer.
        Make sure your answers are as explanatory as possible.
        
Now use the following context items to answer the user query:
        - Human Anatomy and Physiology 330  along the alimentary canal, and finally eliminates the digestive wastes from the body. Chewing or mastication, swallowing or deglutition, peristalsis, and defecation are the main processes of mechanical digestion. Chemical digestion breaks down large, nonabsorbable food molecules−molecules that are able to pass through the intestinal mucosa into blood and lymph. Chemical digestion consists of numerous chemical reactions catalyzed by enzymes in saliva, gastric juice, pancreatic juice, and intestinal juice.  Carbohydrate Digestion  Very little digestion of c

In [95]:
def prompt_formatter_chroma(query:str , context_items: list[list]):
    """Build the chat-formatted prompt from Chroma query results.

    Args:
        query: The user question.
        context_items: Nested list of document strings, as returned by
            `chroma_retreiver` (one inner list per query text).

    Returns:
        The prompt string produced by the tokenizer's chat template (not
        tokenized, generation prompt appended).
    """
    # Flatten the per-query result lists into one list of passages.
    flattened = []
    for result_group in context_items:
        flattened.extend(result_group)
    context = "- " + "\n- ".join(flattened)

    base_prompt = """
        Based on the following context items, please answer the query.
        Give yourself room to think by extracting relevant passages from the context before answering the query.
        Don't return the thinking, only return the answer.
        Make sure your answers are as explanatory as possible.
        \nNow use the following context items to answer the user query:
        {context}
        User query: {query}
        Answer:
    """
    # The whole filled-in template is sent as a single user turn.
    user_message = {"role": "user",
                    "content": base_prompt.format(context=context, query=query)}

    return tokenizer.apply_chat_template(conversation=[user_message],
                                         tokenize=False,
                                         add_generation_prompt=True)

# Same prompt construction, this time fed from the Chroma retriever.
context_items = chroma_retreiver(query)

prompt_formatter_chroma(query=query, context_items=context_items)

"<bos><start_of_turn>user\nBased on the following context items, please answer the query.\n        Give yourself room to think by extracting relevant passages from the context before answering the query.\n        Don't return the thinking, only return the answer.\n        Make sure your answers are as explanatory as possible.\n        \nNow use the following context items to answer the user query:\n        - Human Anatomy and Physiology 316  The Mouth  The mouth, also called the oral cavity, is where a substance begins its travels through the digestive tract (Figure 11-4). The mouth has three digestive functions:  1. To receive food, a process called ingestion  2. To prepare food for digestion  3. To begin the digestion of starch. In to this space projects a muscular organ, the tongue, which is used for chewing and swallowing, and is one of the principal organs of speech. The tongue has on its surface a number of special organs, called taste buds, by means of which taste sensations (bi

In [96]:
def ask(query: str,
        temperature: float=0.7,
        max_new_tokens:int=256,
        ):
    """Answer a query with the LLM, grounded in chunks retrieved from Chroma.

    Args:
        query: The user question.
        temperature: Sampling temperature passed to `generate`.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        A `(output_text, context_items)` tuple: the decoded generation with the
        prompt and the "<bos>"/"<eos>" markers removed, and the Chroma documents
        that were retrieved for this query and placed in the prompt.
    """
    context_items_chroma = chroma_retreiver(query)

    prompt = prompt_formatter_chroma(query=query, context_items=context_items_chroma)

    # Tokenize onto whichever device the LLM lives on instead of assuming CUDA.
    input_ids = tokenizer(prompt, return_tensors="pt").to(llm_model.device)

    outputs = llm_model.generate(**input_ids, temperature=temperature, do_sample=True, max_new_tokens=max_new_tokens)

    output_text = tokenizer.decode(outputs[0])

    # The decoded sequence starts with the prompt; strip it and the special markers.
    output_text = output_text.replace(prompt, "").replace("<bos>", "").replace("<eos>", "")

    # Return the context retrieved for THIS query, not the notebook-level
    # `context_items` left over from whichever cell ran last.
    return output_text, context_items_chroma

In [97]:
ask(query=query,
    temperature=0.2)

("Sure, here's the answer to the user's query:\n\nThe mouth serves three main functions in the digestive tract:\n\n1. **Receiving food:** The mouth receives food through the process of ingestion.\n2. **Preparing food for digestion:** The mouth prepares food by mixing it with saliva to moisten it and facilitate chewing and swallowing.\n3. **Beginning the digestion of starch:** The mouth is the first part of the digestive tract to break down starches into simpler molecules that can be absorbed by the body.",
 [['Human Anatomy and Physiology 316  The Mouth  The mouth, also called the oral cavity, is where a substance begins its travels through the digestive tract (Figure 11-4). The mouth has three digestive functions:  1. To receive food, a process called ingestion  2. To prepare food for digestion  3. To begin the digestion of starch. In to this space projects a muscular organ, the tongue, which is used for chewing and swallowing, and is one of the principal organs of speech. The tongue 