### Import PDF Document

In [5]:
import os
import requests

#get PDF document path
pdf_path = "human-nutrition-text.pdf"

#download pdf
if not os.path.exists(pdf_path):
    print(f"[INFO] File doesn't exist, downloading...")

    #enter URL of pdf
    url="https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

    #The local file name to save file
    filename = pdf_path

    #send a GET request to URL
    response = requests.get(url)

    #check if req successful
    if response.status_code == 200:
        #open the file and save it
        with open(filename, "wb") as file:
            file.write(response.content)
        print(f"[INFO] the file has been downloaded and saved as {filename}")
    else:
        print(f"[INFO] failed to download file")
else:
    print(f"File {pdf_path} exists")

File human-nutrition-text.pdf exists


Opening pdf using pymupdf library

In [6]:
import fitz
from tqdm.auto import tqdm

def text_formatter(text: str)->str:
    """minor text formatting"""
    cleaned_text = text.replace("\n", " ").strip()

    return cleaned_text
    
def open_and_read_pdf(pdf_path: str)->list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text=text)
        pages_and_texts.append({"page_number": page_number - 41,
                               "page_char_count": len(text),
                               "word_count": len(text.split(" ")),
                               "page_sentence_count_raw": len(text.split(". ")),
                               "page_token_count": len(text)/4,
                               "text": text
                              })
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

0it [00:00, ?it/s]

[{'page_number': -41,
  'page_char_count': 29,
  'word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [7]:
import random

random.sample(pages_and_texts, k=3)

[{'page_number': 54,
  'page_char_count': 0,
  'word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''},
 {'page_number': 467,
  'page_char_count': 1548,
  'word_count': 271,
  'page_sentence_count_raw': 14,
  'page_token_count': 387.0,
  'text': 'as building blocks instead. This happens when a cell is in positive- energy balance. For example, the citric-acid-cycle intermediate,  α-ketoglutarate can be anabolically processed to the amino acids  glutamate or glutamine if they are required. The human body is  capable of synthesizing eleven of the twenty amino acids that make  up proteins. The metabolic pathways of amino acid synthesis are  all inhibited by the specific amino acid that is the end-product of a  given pathway. Thus, if a cell has enough glutamine it turns off its  synthesis.  Anabolic pathways are regulated by their end-products, but even  more so by the energy state of the cell. When there is ample energy,  bigger molecules, such as protein

In [8]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,145,2,199.25,Contents Preface University of Hawai‘i at Mā...


In [9]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0
std,348.86,560.38,95.76,6.19,140.1
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.0,134.0,4.0,190.5
50%,562.5,1231.5,214.5,10.0,307.88
75%,864.25,1603.5,271.0,14.0,400.88
max,1166.0,2308.0,429.0,32.0,577.0


In [11]:
#split pages into sentences.
from spacy.lang.en import English

nlp = English()

#add a sentencizer pipeline
nlp.add_pipe("sentencizer")

#create a document instance
doc = nlp("This is a sentence. This is another sentence. I love Tharuniyaa.")

assert len(list(doc.sents)) == 3

list(doc.sents)

[This is a sentence., This is another sentence., I love Tharuniyaa.]

In [77]:
pages_and_texts[69]

{'page_number': 28,
 'page_char_count': 966,
 'word_count': 163,
 'page_sentence_count_raw': 11,
 'page_token_count': 241.5,
 'text': '3. Ovo-vegetarian. This type of vegetarian diet includes eggs but  not dairy products.  4. Vegan. This type of vegetarian diet does not include dairy,  eggs, or any type of animal product or animal by-product.  Learning Activities  Technology Note: The second edition of the Human  Nutrition Open Educational Resource (OER) textbook  features interactive learning activities.  These activities are  available in the web-based textbook and not available in the  downloadable versions (EPUB, Digital PDF, Print_PDF, or  Open Document).  Learning activities may be used across various mobile  devices, however, for the best user experience it is strongly  recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome.    An interactive or media element has been  excluded from this version of the text. You can  view it onl

In [12]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)

    item["sentences"] = [str(sentence) for sentence in item["sentences"]]

    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [13]:
random.sample(pages_and_texts, k = 1)

[{'page_number': 672,
  'page_char_count': 701,
  'word_count': 137,
  'page_sentence_count_raw': 6,
  'page_token_count': 175.25,
  'text': 'Dietary Reference Intakes for Zinc  Table 11.4 Dietary Reference Intakes for Zinc  Age Group  RDA(mg/day)  UL(mg/ day)  Infant (0–6 months)  2*  4  Infants (6–12 months)  3  5  Children (1–3 years)  3  7  Children (4–8 years)  5  12  Children (9–13 years)  8  23  Adolescents (14–18 years)  11 (males), 9  (females)  34  Adults (19 + years)  11 (males), 8  (females)  40  * denotes Adequate Intake  Fact Sheet for Health Professionals: Zinc. National Institute of  Health, Office of Dietary Supplements. https://ods.od.nih.gov/ factsheets/Zinc-HealthProfessional/. Updated February 11, 2016.  Accessed November 10, 2017.  Dietary Sources of Zinc  Table 11.5 Zinc Content of Various Foods  672  |  Zinc',
  'sentences': ['Dietary Reference Intakes for Zinc  Table 11.4 Dietary Reference Intakes for Zinc  Age Group  RDA(mg/day)  UL(mg/ day)  Infant (0–6 month

In [14]:
df = pd.DataFrame(pages_and_texts)
df.describe().round()

Unnamed: 0,page_number,page_char_count,word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.0,1148.0,198.0,10.0,287.0,10.0
std,349.0,560.0,96.0,6.0,140.0,6.0
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,261.0,762.0,134.0,4.0,190.0,5.0
50%,562.0,1232.0,214.0,10.0,308.0,10.0
75%,864.0,1604.0,271.0,14.0,401.0,15.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0


In [15]:
### chunking text
num_sentences_chunk_size = int(df["page_sentence_count_spacy"].mean())

#recursively split lists of texts to chunk sized lists

def split_list(input_list: list[str],
               slice_size: int)-> list[list[str]]:
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

test_list = list(range(25))

split_list(test_list, num_sentences_chunk_size)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [16]:
#loop through pages and texts and split sentences into chunks

for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list = item["sentences"], slice_size = num_sentences_chunk_size)

    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [17]:
random.sample(pages_and_texts, k=1)

[{'page_number': 84,
  'page_char_count': 1372,
  'word_count': 247,
  'page_sentence_count_raw': 12,
  'page_token_count': 343.0,
  'text': '“Blood Flow  Through the  Heart” by  OpenStax  College / CC  BY 3.0  The cardiovascular system is one of the eleven organ systems of  the human body. Its main function is to transport nutrients to cells  and wastes from cells (Figure 2.12 “Cardiovascular Transportation  of Nutrients”). This system consists of the heart, blood, and blood  vessels. The heart pumps the blood, and the blood is the  transportation fluid. The transportation route to all tissues, a highly  intricate blood-vessel network, comprises arteries, veins, and  capillaries. Nutrients absorbed in the small intestine travel mainly to  the liver through the hepatic portal vein. From the liver, nutrients  travel upward through the inferior vena cava blood vessel to the  heart. The heart forcefully pumps the nutrient-rich blood first to  the lungs to pick up some oxygen and then to a

In [18]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0,10.32,1.53
std,348.86,560.38,95.76,6.19,140.1,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.0,134.0,4.0,190.5,5.0,1.0
50%,562.5,1231.5,214.5,10.0,307.88,10.0,1.0
75%,864.25,1603.5,271.0,14.0,400.88,15.0,2.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0,3.0


### Splitting each chunk into it's own item

In [19]:
import re

#split each chunk into it's own item

pages_and_chunks = []

for item in tqdm(pages_and_texts):
    for sentence_chunk in item['sentence_chunks']:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        #join the sentences together into a paragraph-like structure

        joined_sentence_chunk = "".join(sentence_chunk).replace("  "," ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r' \1', joined_sentence_chunk)


        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4

        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)

  0%|          | 0/1208 [00:00<?, ?it/s]

1843

In [20]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 329,
  'sentence_chunk': 'Learning Activities Technology Note: The second edition of the Human Nutrition Open Educational Resource (OER) textbook features interactive learning activities.\xa0 These activities are available in the web-based textbook and not available in the downloadable versions (EPUB, Digital PDF, Print_PDF, or Open Document). Learning activities may be used across various mobile devices, however, for the best user experience it is strongly recommended that users complete these activities using a desktop or laptop computer and in Google Chrome. \xa0 An interactive or media element has been excluded from this version of the text You can view it online here: http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=225 An interactive or media element has been excluded from this version of the text You can view it online here: Digestion and Absorption of Lipids | 329',
  'chunk_char_count': 856,
  'chunk_word_count': 126,
  'chunk_token_count': 214.0}]

In [21]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,731.11,112.33,182.78
std,347.79,445.65,71.22,111.41
min,-41.0,12.0,3.0,3.0
25%,280.5,313.5,44.0,78.38
50%,586.0,745.0,114.0,186.25
75%,890.0,1112.0,173.0,278.0
max,1166.0,1824.0,297.0,456.0


In [22]:
# Show random chunks with under 30 tokens in length
min_token_length = 30

for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

chunk token count: 12.25 | Text: PART VI CHAPTER 6 PROTEIN Chapter 6 Protein | 357
chunk token count: 6.5 | Text: Fat-Soluble Vitamins | 539
chunk token count: 20.5 | Text: http://chl-pacific.org/wp-content/uploads/2011/08/ Novotny-et- Toddler Years | 859
chunk token count: 10.5 | Text: The Major Types of Foodborne Illness | 993
chunk token count: 12.5 | Text: PART VII CHAPTER 7 ALCOHOL Chapter 7 Alcohol | 429


In [23]:
#filter out df for rows under 30 tokens

pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52.5}]

### Embedding text chunks

In [24]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                     device="cpu")
# create a list of sentences
sentences = ["Transformers library provides an easy way of embedding.",
            "Sentences can be embedded one by one or in a list.",
            "I like horses!"]

embeddings = embedding_model.encode(sentences)

embeddings_dict = dict(zip(sentences, embeddings))

for sentence, embedding in embeddings_dict.items():
    print(f'Sentence: {sentence}')
    print(f'Embedding: {embedding}')

Sentence: Transformers library provides an easy way of embedding.
Embedding: [-5.57340011e-02  6.40663179e-03  4.91124252e-03  2.10787468e-02
  2.60457527e-02 -3.93987866e-03  2.07133256e-02  1.94761076e-03
  1.50001450e-02 -5.51366583e-02  2.70182043e-02  6.25496283e-02
 -3.87967750e-02  1.28000220e-02  3.05092148e-02 -6.30558804e-02
  1.99427288e-02  1.75520759e-02 -4.22759317e-02 -2.30977051e-02
 -1.51005005e-02 -3.94923054e-03  7.30106533e-02 -1.42055079e-02
  4.02332330e-03  2.79319994e-02 -4.20499593e-02 -4.00918648e-02
  4.42362614e-02  3.44449957e-03 -1.98470131e-02 -1.96271986e-02
  7.59227425e-02  6.29687356e-03  1.36537187e-06  9.31025203e-03
 -2.30744444e-02  2.87997536e-02  1.91687066e-02  2.41094120e-02
  5.69810271e-02 -8.54929164e-03 -2.02619471e-02  1.32503612e-02
 -9.55634937e-03 -1.86347850e-02  4.25398462e-02  6.16819132e-03
  8.44275653e-02 -3.61137800e-02 -1.27371456e-02 -2.70189848e-02
  7.39516038e-03  1.82170682e-02  8.67250666e-04  2.67479476e-02
 -5.63373370e

In [25]:
embeddings[0].shape

(768,)

In [26]:
embedding = embedding_model.encode("My favourite animal is the cow")
embedding

array([-8.87764711e-03,  8.35835114e-02, -2.81862803e-02, -3.71655039e-02,
        2.18684189e-02,  5.61196283e-02, -7.55760372e-02,  1.01799155e-02,
        1.48691228e-02, -2.08466798e-02, -2.89396364e-02,  4.55776379e-02,
       -3.03733237e-02, -1.41894128e-02, -1.61682982e-02, -3.85182947e-02,
        3.52857485e-02,  5.25924517e-03, -2.22115181e-02,  3.23855244e-02,
       -2.36056726e-02,  4.16592658e-02, -1.16740977e-02, -2.24949457e-02,
       -1.69876087e-02,  8.03107116e-03, -3.82865430e-03, -2.72515453e-02,
        2.71817148e-02,  2.64698267e-02, -6.16850033e-02, -8.03409293e-02,
        2.93563437e-02, -2.60433536e-02,  1.26088594e-06,  7.68714305e-03,
       -2.78717466e-02,  5.80591755e-03,  4.93111610e-02, -4.50255349e-03,
        3.92271951e-03,  1.44862225e-02, -1.33295488e-02,  1.35614304e-02,
        1.89375672e-02,  6.01764061e-02,  4.52522524e-02,  1.80459451e-02,
       -9.52361450e-02,  2.16227174e-02, -3.90326465e-03, -2.29934510e-02,
       -3.01052202e-02, -

In [27]:
%%time

##embedding_model.to("cpu")

##for item in tqdm(pages_and_chunks_over_min_token_len):
    ##item["embedding"] = embedding_model.encode(item["sentence_chunk"])

CPU times: total: 0 ns
Wall time: 0 ns


In [28]:
import torch
print(torch.cuda.is_available())

True


In [29]:
%%time

text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

CPU times: total: 0 ns
Wall time: 0 ns


In [30]:
%%time
embedding_model.to("cuda")
#embed all texts in batches
text_chunk_embeddings = embedding_model.encode(text_chunks, batch_size=32,
                                              convert_to_tensor=True)
text_chunk_embeddings
for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

  0%|          | 0/1680 [00:00<?, ?it/s]

CPU times: total: 6min 22s
Wall time: 1min 29s


In [31]:
pages_and_chunks_over_min_token_len[419]

{'page_number': 277,
 'sentence_chunk': 'often. • Calm your “sweet tooth” by eating fruits, such as berries or an apple. • Replace sugary soft drinks with seltzer water, tea, or a small amount of 100 percent fruit juice added to water or soda water. The Food Industry: Functional Attributes of Carbohydrates and the Use of Sugar Substitutes In the food industry, both fast-releasing and slow-releasing carbohydrates are utilized to give foods a wide spectrum of functional attributes, including increased sweetness, viscosity, bulk, coating ability, solubility, consistency, texture, body, and browning capacity The differences in chemical structure between the different carbohydrates confer their varied functional uses in foods. Starches, gums, and pectins are used as thickening agents in making jam, cakes, cookies, noodles, canned products, imitation cheeses, and a variety of other foods Molecular gastronomists use slow- releasing carbohydrates, such as alginate, to give shape and texture to

In [32]:
#save embeddings to a file
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [33]:
#import saved csv file
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0,[ 6.74242899e-02 9.02281404e-02 -5.09547861e-...
1,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5,[ 5.52156307e-02 5.92139401e-02 -1.66167300e-...
2,-37,Contents Preface University of Hawai‘i at Māno...,765,114,191.25,[ 2.79099271e-02 3.32786553e-02 -2.14719474e-...
3,-36,Lifestyles and Nutrition University of Hawai‘i...,940,142,235.0,[ 6.72774166e-02 3.66581492e-02 -8.32148921e-...
4,-35,The Cardiovascular System University of Hawai‘...,998,152,249.5,[ 3.30264568e-02 -8.49767309e-03 9.57160536e-...


For larger embeddings try using a vector database for storage instead of torch.tensor

In [34]:
#perform a similarity search(vector search / semantic search)
import random
import torch
import numpy as np
import pandas as pd

device = "cuda" if torch.cuda.is_available() else "cpu"

text_chunks_and_embeddings_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

#convert embedding column to np.array(if it got converted to strings)
text_chunks_and_embeddings_df["embedding"] = text_chunks_and_embeddings_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

#convert embeddings to torch.tensor
embeddings = torch.tensor(np.stack(text_chunks_and_embeddings_df["embedding"].to_list(), axis=0), dtype=torch.float32).to("cuda")

#convert the df to list of dicts
pages_and_chunks = text_chunks_and_embeddings_df.to_dict(orient="records")

text_chunks_and_embeddings_df

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.00,"[0.0674242899, 0.0902281404, -0.00509547861, -..."
1,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.50,"[0.0552156307, 0.0592139401, -0.01661673, -0.0..."
2,-37,Contents Preface University of Hawai‘i at Māno...,765,114,191.25,"[0.0279099271, 0.0332786553, -0.0214719474, 0...."
3,-36,Lifestyles and Nutrition University of Hawai‘i...,940,142,235.00,"[0.0672774166, 0.0366581492, -0.00832148921, -..."
4,-35,The Cardiovascular System University of Hawai‘...,998,152,249.50,"[0.0330264568, -0.00849767309, 0.00957160536, ..."
...,...,...,...,...,...,...
1675,1164,Flashcard Images Note: Most images in the flas...,1298,176,324.50,"[0.0175535046, -0.0117120268, -0.0118186092, -..."
1676,1164,Hazard Analysis Critical Control Points reused...,373,51,93.25,"[0.0390881523, -0.0577164255, 0.0135394614, -0..."
1677,1165,ShareAlike 11 Organs reused “Pancreas Organ An...,1277,173,319.25,"[0.0727669522, 0.0187518969, -0.00522322766, -..."
1678,1165,Sucrose reused “Figure 03 02 05” by OpenStax B...,408,59,102.00,"[0.0963975936, -0.0226673596, 0.0103908964, 0...."


In [35]:
embeddings.shape

torch.Size([1680, 768])

In [36]:
from sentence_transformers import util, SentenceTransformer, CrossEncoder

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                     device=device)

Small semantic search pipeline

Note: to use dot comparison, ensure vector sizes are of same shape and tensors/vectors are in the same data type

In [37]:
# define the query
query = "Good foods for protein"
print(f'{query}')

#embed the query - use the same model
query_embedding = embedding_model.encode(query, convert_to_tensor=True).to("cuda")

#get similarity scores with dot product (use cosine similarity if output's aren't normalised)

from time import perf_counter as timer

start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
end_time = timer()

print(f"[INFO] time taken to get scores on {len(embeddings)} embeddings: {end_time - start_time:.5f} seconds.")

#get top-k scores let's say 5
top_results_dot_product = torch.topk(dot_scores, k=5)
top_results_dot_product

Good foods for protein
[INFO] time taken to get scores on 1680 embeddings: 0.00134 seconds.


torch.return_types.topk(
values=tensor([0.7636, 0.7510, 0.6724, 0.6614, 0.6509], device='cuda:0'),
indices=tensor([611, 616, 620, 615, 618], device='cuda:0'))

In [38]:
# larger_embeddings = torch.randn(100*embeddings.shape[0], 768).to(device)

# start_time = timer()
# dot_scores = util.dot_score(a=query_embedding, b=larger_embeddings)[0]
# end_time = timer()

# print(f"[INFO] time taken to get scores on {len(larger_embeddings)} embeddings: {end_time - start_time:.5f} seconds.")


In [39]:
import textwrap

def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

In [40]:
print("Results: ")
# Loop through zipped together scores and indices from torch.topk
for score, idx in zip(top_results_dot_product[0], top_results_dot_product[1]):
    print(f"Score: {score:.4f}")
    print("Text: ")
    print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
    print(f"Page number: {pages_and_chunks[idx]['page_number']}")
    print("\n")

Results: 
Score: 0.7636
Text: 
Dietary Sources of Protein The protein food group consists of foods made from
meat, seafood, poultry, eggs, soy, dry beans, peas, and seeds According to the
Harvard School of Public Health, “animal protein and vegetable protein probably
have the same effects on health It’s the protein package that’s likely to make a
difference.”1 1. Protein: The Bottom Line Harvard School of Public Proteins,
Diet, and Personal Choices | 411
Page number: 411


Score: 0.7510
Text: 
Additionally, a person should consume 8 ounces of cooked seafood every week
(typically as two 4-ounce servings) to assure they are getting the healthy
omega-3 fatty acids that have been linked to a lower risk for heart disease
Another tip is choosing to eat dry beans, peas, or soy products as a main dish
Some of the menu choices include chili with kidney and pinto beans, hummus on
pita bread, and black bean enchiladas You could also enjoy nuts in a variety of
ways You can put them on a salad, in 

Note: We could potentially improve the order of these results with a reranking model. A model that has been trained specifically to take search results eg. the top 25 semantic results and rank them in order from most likely to least likely.

In [41]:
# code to test implementation
import fitz

pdf_path = "human-nutrition-text.pdf"

doc = fitz.open(pdf_path)
page = doc.load_page(411 + 41)

img = page.get_pixmap(dpi=300)

#save image(optional)
img.save("test1.png")
doc.close()

#convert the pixmap to numpy array

img_array = np.frombuffer(img.samples_mv,
                         dtype=np.uint8).reshape((img.h, img.w, img.n))

#alternatively, use matplotlib
import matplotlib.pyplot as plt

plt.figure(figsize=(13, 10))
plt.imshow(img_array)
plt.title(f"Query: {query} | Most relevant page: ")
plt.axis("off")
plt.show()

ModuleNotFoundError: No module named 'matplotlib'

# similarity measures: DOT product and cosine similarity

In [42]:
import torch

def dot_product(vector1, vector2):
    return torch.dot(vector1, vector2)

def cosine_similarity(vector1, vector2):
    dot_product = torch.dot(vector1, vector2)

    #get euclidean/l2 norm

    norm_vector1 = torch.sqrt(torch.sum(vector1**2))
    norm_vector2 = torch.sqrt(torch.sum(vector2**2))

    return dot_product/(norm_vector1 * norm_vector2)

vector1 = torch.tensor([1, 2, 3], dtype=torch.float32)
vector2 = torch.tensor([1, 2, 3], dtype=torch.float32)
vector3 = torch.tensor([4, 5, 6], dtype=torch.float32)
vector4 = torch.tensor([-1, -2, -3], dtype=torch.float32)

#calculate dot product
print("Dot product between vector1 and vector2: ", torch.dot(vector1, vector2))
print("Dot product between vector1 and vector2: ", torch.dot(vector1, vector3))
print("Dot product between vector1 and vector2: ", torch.dot(vector1, vector4))

#cosine similarity - normalise for magnitude
print("Cosine similarity between vector1 and vector2 ", cosine_similarity(vector1, vector2))
print("Cosine similarity between vector1 and vector2 ", cosine_similarity(vector1, vector3))
print("Cosine similarity between vector1 and vector2 ", cosine_similarity(vector1, vector4))

Dot product between vector1 and vector2:  tensor(14.)
Dot product between vector1 and vector2:  tensor(32.)
Dot product between vector1 and vector2:  tensor(-14.)
Cosine similarity between vector1 and vector2  tensor(1.0000)
Cosine similarity between vector1 and vector2  tensor(0.9746)
Cosine similarity between vector1 and vector2  tensor(-1.0000)


### Functionizing semantic search pipeline

In [43]:
def retrieve_relevant_resources(query: str,
                               embeddings: torch.tensor=embeddings,
                               model: SentenceTransformer=embedding_model,
                               n_resources_to_return: int=5,
                               print_time: bool=True):
    """
    Embeds a query with model and returns topk scores and indices from embeddings
    """

    #embed the query

    query_embedding = model.encode(query, convert_to_tensor=True)

    #get dot product scores on embeddings
    start_time = timer()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = timer()

    if print_time:
        print(f"[INFO] time taken to get scores on ({len(embeddings)}) embeddings: {end_time - start_time:.5f} seconds.")

    scores, indices = torch.topk(input=dot_scores, k = n_resources_to_return)

    return scores, indices

def print_top_results_and_scores(query: str,
                                embeddings: torch.tensor = embeddings,
                                pages_and_chunks: list[dict]=pages_and_chunks,
                                n_resources_to_return: int = 5):
    """
    find relevant passages and print them along with scores
    """

    scores, indices = retrieve_relevant_resources(query=query)
    for score, idx in zip(scores, indices):
        print(f"Score: {score:.4f}")
        print("Text: ")
        print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
        print(f"Page number: {pages_and_chunks[idx]['page_number']}")
        print("\n")

In [44]:
query="foods high in fiber"
retrieve_relevant_resources(query=query)
print_top_results_and_scores(query=query)

[INFO] time taken to get scores on (1680) embeddings: 0.00022 seconds.
[INFO] time taken to get scores on (1680) embeddings: 0.00013 seconds.
Score: 0.6964
Text: 
• Change it up a bit and experience the taste and satisfaction of other whole
grains such as barley, quinoa, and bulgur. • Eat snacks high in fiber, such as
almonds, pistachios, raisins, and air-popped popcorn. Add an artichoke and green
peas to your dinner plate more 276 | Carbohydrates and Personal Diet Choices
Page number: 276


Score: 0.6699
Text: 
Dietary fiber is categorized as either water-soluble or insoluble. Some examples
of soluble fibers are inulin, pectin, and guar gum and they are found in peas,
beans, oats, barley, and rye Cellulose and lignin are insoluble fibers and a few
dietary sources of them are whole-grain foods, flax, cauliflower, and avocados
Cellulose is the most abundant fiber in plants, making up the cell walls and
providing structure Soluble fibers are more easily accessible to bacterial
enzymes in

## Setting up a local LLM

In [45]:
### get gpu availability
import torch
gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
print(f"Available gpu memory: {(gpu_memory_bytes/(2**30))}")

Available gpu memory: 5.99951171875


In [46]:
!nvidia-smi

Fri May  9 14:44:20 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 576.02                 Driver Version: 576.02         CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   46C    P8             14W /   40W |    1551MiB /   6144MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [47]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available

# create a quantization config(requires bitsandbytes + accelerate)
from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                        bnb_4bit_compute_dtype=torch.float16)

In [62]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
model_name = "Google/gemma-3-1b-it"

#load the tokenizer model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name,
                            torch_dtype=torch.float16,
                                            )

model.to("cuda")

Gemma3ForCausalLM(
  (model): Gemma3TextModel(
    (embed_tokens): Gemma3TextScaledWordEmbedding(262144, 1152, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma3DecoderLayer(
        (self_attn): Gemma3Attention(
          (q_proj): Linear(in_features=1152, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1152, out_features=256, bias=False)
          (v_proj): Linear(in_features=1152, out_features=256, bias=False)
          (o_proj): Linear(in_features=1024, out_features=1152, bias=False)
          (q_norm): Gemma3RMSNorm((256,), eps=1e-06)
          (k_norm): Gemma3RMSNorm((256,), eps=1e-06)
        )
        (mlp): Gemma3MLP(
          (gate_proj): Linear(in_features=1152, out_features=6912, bias=False)
          (up_proj): Linear(in_features=1152, out_features=6912, bias=False)
          (down_proj): Linear(in_features=6912, out_features=1152, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma3RMSNorm((11

In [63]:
def get_model_params(model: torch.nn.Module):
    return sum([param.numel() for param in model.parameters()])

get_model_params(model)

999885952

In [64]:
def get_model_mem_size(model: torch.nn.Module):
    #get model parameters and buffer sizer
    mem_param = sum([param.nelement()* param.element_size() for param in model.parameters()])
    mem_buffers = sum([buf.nelement() * buf.element_size() for buf in model.buffers()])

    model_mem_bytes = mem_param + mem_buffers
    model_mem_mb = model_mem_bytes / (1024**2)
    model_mem_gb = model_mem_bytes / (1024**3)

    return {
        "model_mem_gb": round(model_mem_gb, 2),
        "model_mem_bytes": round(model_mem_bytes, 2)
    }
get_model_mem_size(model)

{'model_mem_gb': 1.86, 'model_mem_bytes': 1999772930}

In [73]:
input_text = "How long should infants be breastfed for?"
print(f'Input text: \n {input_text}')

dialogue_template = [
    {
        "role": "user",
        "content": input_text
    }
]

prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                      tokenize=False,
                                      add_generation_prompt=True)

print(f'{prompt}')

Input text: 
 How long should infants be breastfed for?
<bos><start_of_turn>user
How long should infants be breastfed for?<end_of_turn>
<start_of_turn>model



In [76]:
%%time

#tokenize the input text (turn it into numbers and send it to gpu)
input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

#generate output
outputs = model.generate(**input_ids,
                        max_new_tokens=256)

print(f"Model output: \n: {outputs}\n")

Model output: 
: tensor([[     2,      2,    105,   2364,    107,   3910,   1440,   1374,  45033,
            577,  16489,  42168,    573, 236881,    106,    107,    105,   4368,
            107,  19058, 236764,   1531, 236789, 236751,   2910,   1003,  79033,
          12032, 236888,   2085, 236789, 236751,    496,   2606,    529,  15601,
            532,   2958,  12989,    506,    623,  63914, 236775,   3861,    529,
          79033, 236764,    840,   1590, 236789, 236751,    496,  25890,    529,
           1144,    506,  11848,   6816,   5840, 236787,    108,   1018,    818,
          14040, 111506,    753,    669,   4109,   5554,  19922,    568,  62770,
          62902,    108, 236829,   5213, 236825,  89335,  53121,    669,   4109,
           5554,  19922,    568,  62770, 236768,  37414,  79033,    573,   5213,
         236825,   3794,   1018, 168009, 236761,   1174,    563,    506, 107833,
            529,    910,   9106, 236761,    108,   1018,  11355,    563, 236743,
         23

In [77]:
outputs_decoded = tokenizer.decode(outputs[0])
print(f"{outputs_decoded}\n")

<bos><bos><start_of_turn>user
How long should infants be breastfed for?<end_of_turn>
<start_of_turn>model
Okay, let's talk about breastfeeding duration! There's a lot of debate and research surrounding the "ideal" length of breastfeeding, but here's a breakdown of what the experts generally recommend:

**The Current Recommendation - The World Health Organization (WHO)**

* **6 Months:** The World Health Organization (WHO) recommends breastfeeding for **6 months** consecutively. This is the cornerstone of their advice.

**Why is 6 Months So Important?**

* **Nutritional Benefits:** Breast milk provides optimal nutrition for a newborn's developing gut, immune system, and brain.  It's packed with antibodies, enzymes, and other nutrients that are crucial for healthy growth and development.
* **Immune System Development:** Breast milk contains antibodies and other immune factors that help protect infants from infections.
* **Brain Development:**  Studies show that breastfeeding is linked to

In [101]:
query_list = [
    "What are the essential nutrients needed for a balanced diet?",
    "How does protein intake affect muscle growth and recovery?",
    "What are the best sources of plant-based protein?",
    "How do different types of carbohydrates impact blood sugar levels?",
    "What are the benefits and risks of intermittent fasting?",
    "Which vitamins and minerals are most important for immune health?",
    "How can I optimize my diet for better gut health?",
    "What are common nutritional deficiencies and their symptoms?",
    "How does hydration impact metabolism and overall health?",
    "What are some science-backed strategies for sustainable weight loss?"
]

In [102]:
import random

query = random.choice(query_list)

print(f"query: {query}")

#get just the score and indices of top related results

scores, indices = retrieve_relevant_resources(query=query, embeddings=embeddings)

scores, indices

query: What are the best sources of plant-based protein?
[INFO] time taken to get scores on (1680) embeddings: 0.00012 seconds.


(tensor([0.7075, 0.7022, 0.6947, 0.6856, 0.6577], device='cuda:0'),
 tensor([611, 618, 617, 616, 620], device='cuda:0'))

### Augumenting our prompt with context items

In [109]:
def prompt_formatter(query: str,
                    context_items: list[dict])-> str:
    context = "- " + "\n\n- ".join([item["sentence_chunk"] for item in context_items])

    base_prompt = """Based on the following context items, please answer the query. Ignore page numbers denoted by a number followed by |
context items:
{context}
Query: {query}
Answer:
    """
    prompt = base_prompt.format(context = context, query = query)
    return prompt

query = random.choice(query_list)

scores, indices = retrieve_relevant_resources(query=query, embeddings=embeddings)

context_items = [pages_and_chunks[i] for i in indices]

prompt = prompt_formatter(query=query, 
                         context_items = context_items)
for i, item in enumerate(context_items):
    item["score"] = scores[i].cpu()
print(prompt)

[INFO] time taken to get scores on (1680) embeddings: 0.00029 seconds.
Based on the following context items, please answer the query. Ignore page numbers denoted by a number followed by |
context items:
- Adequacy An adequate diet is one that favors nutrient-dense foods Nutrient- dense foods are defined as foods that contain many essential nutrients per calorie Nutrient-dense foods are the opposite of “empty-calorie” foods, such as sugary carbonated beverages, which are also called “nutrient-poor.”Nutrient-dense foods include fruits and vegetables, lean meats, poultry, fish, low-fat dairy products, and whole grains Choosing more nutrient-dense foods will facilitate weight loss, while simultaneously providing all necessary nutrients. Balance Balance the foods in your diet Achieving balance in your diet entails not consuming one nutrient at the expense of another For example, calcium is essential for healthy teeth and bones, but too much calcium will interfere with iron absorption Most f

In [110]:
context_items

[{'page_number': 31,
  'sentence_chunk': 'Adequacy An adequate diet is one that favors nutrient-dense foods Nutrient- dense foods are defined as foods that contain many essential nutrients per calorie Nutrient-dense foods are the opposite of “empty-calorie” foods, such as sugary carbonated beverages, which are also called “nutrient-poor.”Nutrient-dense foods include fruits and vegetables, lean meats, poultry, fish, low-fat dairy products, and whole grains Choosing more nutrient-dense foods will facilitate weight loss, while simultaneously providing all necessary nutrients. Balance Balance the foods in your diet Achieving balance in your diet entails not consuming one nutrient at the expense of another For example, calcium is essential for healthy teeth and bones, but too much calcium will interfere with iron absorption Most foods that are good sources of iron are poor sources of calcium, so in order to get the necessary amounts of calcium and iron from your diet, a proper balance betwe

In [107]:
%%time
input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

outputs = model.generate(**input_ids,
                        temperature=0.7,
                        max_new_tokens=256,
                        do_sample=True
                        )

output_text = tokenizer.decode(outputs[0])

print(f"query: {query}")
print(f"RAG answer: {output_text.replace(prompt, '')}")


query: How do different types of carbohydrates impact blood sugar levels?
RAG answer: <bos>Different types of carbohydrates impact blood sugar levels differently.
    Fast-releasing carbohydrates, such as fruits, fruit juices, and dairy products, cause a rapid rise in blood glucose levels.
    Slow-releasing carbohydrates, such as whole grains, beans, and starchy vegetables, cause a slower rise in blood glucose levels.
    The glycemic response of a carbohydrate-containing food is measured by the glycemic index (GI).
    The GI is a numerical value that ranks foods based on how quickly they raise blood glucose levels.
    Higher GI foods cause a spike in blood glucose levels.
    Low-GI foods have a slower rise in blood glucose levels.

The text states that the GI is a measure of the effects of a carbohydrate-containing food on blood-glucose levels.

Final Answer: The final answer is $\boxed{Different types of carbohydrates impact blood sugar levels differently.}$
<end_of_turn>
CPU tim