In [1]:
import pandas as pd
import numpy as np
import os
import requests
import torch
import fitz
import random
import re
import psutil
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from sentence_transformers import util, SentenceTransformer
from spacy.lang.en import English
from time import perf_counter as timer 
import textwrap
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available 
from transformers import BitsAndBytesConfig



In [2]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

Using device: mps


## Import Data & Process

In [3]:
pdf_path = "human-nutrition-text.pdf"

if not os.path.exists(pdf_path):
  print("File doesn't exist, downloading...")

  url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

  filename = pdf_path

  response = requests.get(url)

  if response.status_code == 200:
      with open(filename, "wb") as file:
          file.write(response.content)
      print(f"The file has been downloaded and saved as {filename}")
  else:
      print(f"Failed to download the file. Status code: {response.status_code}")
else:
  print(f"File {pdf_path} exists.")

File human-nutrition-text.pdf exists.


In [4]:
def text_formatter(text: str) -> str:
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip()

    return cleaned_text

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.

    Returns:
        list[dict]: A list of dictionaries, each containing the page number
        (adjusted), character count, word count, sentence count, token count, and the extracted text
        for each page.
    """
    doc = fitz.open(pdf_path)  
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)): 
        text = page.get_text() 
        text = text_formatter(text)
        pages_and_texts.append({"page_number": page_number - 41,  
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4,  # https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

0it [00:00, ?it/s]

[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [5]:
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,147,3,199.25,Contents Preface University of Hawai‘i at Mā...


In [6]:
nlp = English()
nlp.add_pipe("sentencizer")

for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)
    
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [7]:
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text,sentences,page_sentence_count_spacy
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition,[Human Nutrition: 2020 Edition],1
1,-40,0,1,1,0.0,,[],0
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...,[Human Nutrition: 2020 Edition UNIVERSITY OF...,1
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...,[Human Nutrition: 2020 Edition by University o...,1
4,-37,797,147,3,199.25,Contents Preface University of Hawai‘i at Mā...,[Contents Preface University of Hawai‘i at M...,2


In [8]:
num_sentence_chunk_size = 10

def split_list(input_list: list,
               slice_size: int = num_sentence_chunk_size) -> list[list[str]]:
    
    return [input_list[i:i+slice_size] for i in range(0, len(input_list), slice_size)]

In [9]:
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [10]:
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text,sentences,page_sentence_count_spacy,sentence_chunks,num_chunks
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition,[Human Nutrition: 2020 Edition],1,[[Human Nutrition: 2020 Edition]],1
1,-40,0,1,1,0.0,,[],0,[],0
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...,[Human Nutrition: 2020 Edition UNIVERSITY OF...,1,[[Human Nutrition: 2020 Edition UNIVERSITY O...,1
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...,[Human Nutrition: 2020 Edition by University o...,1,[[Human Nutrition: 2020 Edition by University ...,1
4,-37,797,147,3,199.25,Contents Preface University of Hawai‘i at Mā...,[Contents Preface University of Hawai‘i at M...,2,[[Contents Preface University of Hawai‘i at ...,1


In [11]:
pages_and_chunks = []

for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) 
        chunk_dict["sentence_chunk"] = joined_sentence_chunk
        
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters
        
        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)

  0%|          | 0/1208 [00:00<?, ?it/s]

1843

In [12]:
df = pd.DataFrame(pages_and_chunks)
df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count
0,-41,Human Nutrition: 2020 Edition,29,4,7.25
1,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0
2,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5
3,-37,Contents Preface University of Hawai‘i at Māno...,766,116,191.5
4,-36,Lifestyles and Nutrition University of Hawai‘i...,941,144,235.25


In [13]:
min_token_length = 30

for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f"chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}")

chunk token count: 16.0 | Text: Accessed January 20, 2018. 1032 | The Effect of New Technologies
chunk token count: 17.0 | Text: Figure 2.15 Major Respiratory Structures The Respiratory System | 99
chunk token count: 9.5 | Text: 742 | Building Healthy Eating Patterns
chunk token count: 10.75 | Text: Accessed December 10, 2017. 880 | Childhood
chunk token count: 28.75 | Text: Bouayed, J. and T. Bohn. (2010). Exogenous Antioxidants—Double-Edged Swords in Cellular Redox MyPlate Planner | 753


In [14]:
pages_and_chunks_over_min_token_length = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")


In [15]:
random.sample(pages_and_chunks_over_min_token_length, k=1)

[{'page_number': 107,
  'sentence_chunk': 'in the endocrine system are the pituitary, thyroid, parathyroid, adrenals, thymus, pineal, pancreas, ovaries, and testes. The glands secrete hormones, which are biological molecules that regulate cellular processes in other target tissues, so they require transportation by the circulatory system. Adequate nutrition is critical for the functioning of all the glands in the endocrine system. A protein deficiency impairs gonadal-hormone release, preventing reproduction. Athletic teenage girls with very little body fat often do not menstruate. Children who are malnourished usually do not produce enough growth hormone and fail to reach normal height for their age group. Probably the most popularized connection between nutrition and the functions of the endocrine system is that unhealthy dietary patterns are linked to obesity and the development of Type 2 diabetes. The Centers for Disease Control and Prevention (CDC) estimates that twenty-six million

## EMBEDDING THE TEXT CHUNCKS

In [16]:
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device = device)



In [17]:
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_length]

text_chunk_embeddings = embedding_model.encode(
    text_chunks,
    batch_size=32,
    # convert_to_tensor=True
)

for i, item in tqdm(enumerate(pages_and_chunks_over_min_token_length)):
    item["embedding"] = text_chunk_embeddings[i]

print(pages_and_chunks_over_min_token_length[1]["embedding"])

0it [00:00, ?it/s]

[ 5.52156121e-02  5.92140220e-02 -1.66167002e-02 -2.04602312e-02
  6.92423135e-02  3.51346135e-02 -1.87619645e-02  3.21568623e-02
  7.78691024e-02 -8.06521624e-03  2.60772090e-02  1.17807147e-04
  2.36337334e-02  6.99440809e-03  1.76008643e-06 -3.82591551e-03
  3.45729385e-03  1.16404612e-02  1.01687415e-02  4.95471694e-02
 -5.18356450e-02  1.88298319e-02  4.51909713e-02  4.23135012e-02
 -4.12121825e-02  4.93987091e-03  3.25199589e-02 -1.81734581e-02
  8.84532649e-03 -6.44744113e-02 -5.04505588e-03  1.74673516e-02
 -1.65685068e-03 -8.50824863e-02  2.46762693e-06 -1.69053245e-02
  1.09408684e-02  3.01258136e-02 -6.66744560e-02  6.21617064e-02
  3.50563154e-02 -2.47929841e-02 -1.59021597e-02  2.37372015e-02
  3.93133499e-02  4.06050757e-02  4.51445505e-02 -5.83526306e-03
 -1.52490400e-02  8.62988178e-03 -1.96103961e-03 -3.10199130e-02
 -3.25587057e-02  2.62657250e-03  3.76190506e-02  3.28164026e-02
 -1.42093375e-02  1.82541087e-02  4.01719799e-03 -4.65871580e-02
  4.56143077e-03  5.84205

## Save embedding to files

In [18]:
# text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_length)
# embedding_df_save_path = "text_chunks_and_embeddings_df.csv"
# text_chunks_and_embeddings_df.to_csv(embedding_df_save_path, index=False)

In [19]:
# text_chunks_and_embeddings_df_load = pd.read_csv(embedding_df_save_path)
# text_chunks_and_embeddings_df_load.head()
# can use an vector database for storage if embedding database is really large

## RAG - Search and Answer

In [20]:
text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

embeddings = torch.tensor(np.array(text_chunks_and_embedding_df["embedding"].tolist()), dtype=torch.float32).to(device)
embeddings.shape

torch.Size([1680, 768])

In [21]:
text_chunks_and_embedding_df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0,"[0.0674242601, 0.0902282521, -0.00509550329, -..."
1,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5,"[0.0552156121, 0.059214022, -0.0166167002, -0...."
2,-37,Contents Preface University of Hawai‘i at Māno...,766,116,191.5,"[0.0279801711, 0.0339814015, -0.020642681, 0.0..."
3,-36,Lifestyles and Nutrition University of Hawai‘i...,941,144,235.25,"[0.0682566911, 0.0381274782, -0.00846852828, -..."
4,-35,The Cardiovascular System University of Hawai‘...,998,152,249.5,"[0.0330264345, -0.00849768426, 0.00957158767, ..."


## Functionising the semantic search pipeline

In [22]:
def print_wrapped(text, width=80):
    import textwrap
    print(textwrap.fill(text, width=width))
    
def retrieve_relevant_resources(query: str,
                                embeddings: torch.tensor,
                                model: SentenceTransformer=embedding_model,
                                n_resources_to_return: int=5,
                                print_time: bool=True):
    
    query_embedding = model.encode(query, convert_to_tensor=True)
    
    start_time = timer() 
    dot_scores = util.dot_score(query_embedding, embeddings)[0]  # do not need to normalise and use L2 normilisation as the values are already normalised.
    end_time = timer()
    
    if print_time:
        print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

    scores, indices = torch.topk(input=dot_scores, 
                                 k=n_resources_to_return)

    return scores, indices


def retrieve_and_print_top_results(query: str,
                                   embeddings: torch.tensor,
                                   model: SentenceTransformer = embedding_model,
                                   pages_and_chunks: list[dict]= pages_and_chunks,
                                   n_resources_to_return: int = 5,
                                   print_time: bool = True):
    
    if model is None:
        raise ValueError("A SentenceTransformer model must be provided.")
    
    if pages_and_chunks is None:
        raise ValueError("The list 'pages_and_chunks' containing the text chunks must be provided.")
    
    query_embedding = model.encode(query, convert_to_tensor=True)
    
    start_time = timer()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = timer()
    
    if print_time:
        print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time - start_time:.5f} seconds.")
    
    scores, indices = torch.topk(input=dot_scores, k=n_resources_to_return)
    
    print(f"Query: {query}\n")
    print("Results:")
    
    for score, index in zip(scores, indices):
        print(f"Score: {score:.4f}")
        print_wrapped(pages_and_chunks[index]["sentence_chunk"])
        print(f"Page number: {pages_and_chunks[index]['page_number']}\n")
    
    return scores, indices

In [23]:
query = "foods high in fibre"

scores, indices = retrieve_and_print_top_results(query=query,
                                              embeddings=embeddings)
scores, indices

[INFO] Time taken to get scores on 1680 embeddings: 0.05164 seconds.
Query: foods high in fibre

Results:
Score: 0.6866
Dietary fiber is categorized as either water-soluble or insoluble. Some examples
of soluble fibers are inulin, pectin, and guar gum and they are found in peas,
beans, oats, barley, and rye. Cellulose and lignin are insoluble fibers and a
few dietary sources of them are whole-grain foods, flax, cauliflower, and
avocados. Cellulose is the most abundant fiber in plants, making up the cell
walls and providing structure. Soluble fibers are more easily accessible to
bacterial enzymes in the large intestine so they can be broken down to a greater
extent than insoluble fibers, but even some breakdown of cellulose and other
insoluble fibers occurs. The last class of fiber is functional fiber. Functional
fibers have been added to foods and have been shown to provide health benefits
to humans. Functional fibers may be extracted from plants and purified or
synthetically made. An 

(tensor([0.6866, 0.6661, 0.5586, 0.5434, 0.5024], device='mps:0'),
 tensor([ 360,  418,  358, 1047,  376], device='mps:0'))

## Loading an LLM locally

In [24]:
use_quantization_config = True  # or False, depending on your use case
attn_implementation = "sdpa"

model_id = "google/gemma-2b-it"
print(f"[INFO] Using model_id: {model_id}")

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id,
                                                 torch_dtype= torch.float16,
                                                 low_cpu_mem_usage = False,
                                                 attn_implementation=attn_implementation)

if not use_quantization_config:
    llm_model.to(device)

[INFO] Using model_id: google/gemma-2b-it




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [25]:
def get_model_num_params(model: torch.nn.Module):
    return sum([param.numel() for param in model.parameters()])

get_model_num_params(llm_model)

2506172416

In [26]:
def get_model_mem_size(model: torch.nn.Module):
    """
    Get how much memory a PyTorch model takes up.

    See: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822
    """
    mem_params = sum([param.nelement() * param.element_size() for param in model.parameters()])
    mem_buffers = sum([buf.nelement() * buf.element_size() for buf in model.buffers()])

    model_mem_bytes = mem_params + mem_buffers 
    model_mem_mb = model_mem_bytes / (1024**2) 
    model_mem_gb = model_mem_bytes / (1024**3) 

    return {"model_mem_bytes": model_mem_bytes,
            "model_mem_mb": round(model_mem_mb, 2),
            "model_mem_gb": round(model_mem_gb, 2)}

get_model_mem_size(llm_model)

{'model_mem_bytes': 5079453696, 'model_mem_mb': 4844.14, 'model_mem_gb': 4.73}

## GENERATE TEXT WITH OUR LLM LOCALLY

In [32]:
gpt4_questions = [
    "What are the macronutrients, and what roles do they play in the human body?",
    "How do vitamins and minerals differ in their roles and importance for health?",
    "Describe the process of digestion and absorption of nutrients in the human body.",
    "What role does fibre play in digestion? Name five fibre containing foods.",
    "Explain the concept of energy balance and its importance in weight management.",
    "How many times a day should a baby be breastfed?",
    "What are the signs of pellagra?",
    "What role does saliva play in digestion?",
    "How much protein should a person consume daily?",
    "What are examples of water-soluble vitamins?"
]

query_list = gpt4_questions 

In [33]:
def prompt_formatter(query: str, 
                     context_items: list[dict]) -> str:
    """
    Augments query with text-based context from context_items.
    """
    context = "- " + "\n- ".join([item["sentence_chunk"] for item in context_items])


    base_prompt = """Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible for all the options below.
\nNow use the following context items to answer the user query:
{context}
\nRelevant passages: <extract relevant passages from the context here>
User query: {query}
Answer:
\nIf you do not know, return general information regarding the query using the context items:
{context}"""

    base_prompt = base_prompt.format(context=context, query=query)

    dialogue_template = [
        {"role": "user",
        "content": base_prompt}
    ]

    prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                          tokenize=False,
                                          add_generation_prompt=True)
    return prompt

In [34]:
def ask(query, 
        temperature=0.7,
        max_new_tokens=512,
        format_answer_text=True, 
        return_answer_only=True):
    """
    Takes a query, finds relevant resources/context and generates an answer to the query based on the relevant resources.
    """
    
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings)
    
    context_items = [pages_and_chunks[i] for i in indices]

    for i, item in enumerate(context_items):
        item["score"] = scores[i].cpu() 
        
    prompt = prompt_formatter(query=query,
                              context_items=context_items)
    
    input_ids = tokenizer(prompt, return_tensors="pt").to("cpu")

    outputs = llm_model.generate(**input_ids,
                                 temperature=temperature,
                                 do_sample=True,
                                 max_new_tokens=max_new_tokens)
    
    output_text = tokenizer.decode(outputs[0])

    if format_answer_text:
        output_text = output_text.replace(prompt, '').replace("<bos>", "").replace("<eos>", "").replace("Sure, here is the answer to the user query:\n\n", "")

    if return_answer_only:
        return output_text
    
    return output_text, context_items

In [35]:
query = random.choice(query_list)

print(f"Query: {query}")

answer, context_items = ask(query=query, 
                            temperature=0.7,
                            max_new_tokens=512,
                            return_answer_only=False)

print(f"Answer:\n")
print(answer)

for item in context_items:
    page_number = item['page_number']
    sentence_chunk = item['sentence_chunk']
    print(f"Page Number: {page_number}")
    print(f"Text Chunk: {sentence_chunk}\n")

Query: Describe the process of digestion and absorption of nutrients in the human body.
[INFO] Time taken to get scores on 1680 embeddings: 0.00741 seconds.
Answer:

Sure, here's the answer to your query:

The process of digestion and absorption of nutrients in the human body involves several steps that break down food into smaller components that can be absorbed by the body. The digestive system is composed of various organs, including the mouth, pharynx, esophagus, stomach, small intestine, large intestine (colon), rectum, and anus.

The process begins with the initial breakdown of food in the mouth, where salivary amylase breaks down starch into smaller molecules. The food then passes through the pharynx and esophagus to the stomach, where it is further broken down by enzymes from the gastric glands and secretions from the pancreas.

The digestive system then moves to the small intestine, where the food is further broken down into even smaller molecules. The small intestine is the l